entari-plugin-hyw 4.0.0rc4__py3-none-any.whl → 4.0.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +216 -75
- entari_plugin_hyw/assets/card-dist/index.html +70 -79
- entari_plugin_hyw/browser/__init__.py +10 -0
- entari_plugin_hyw/browser/engines/base.py +13 -0
- entari_plugin_hyw/browser/engines/bing.py +95 -0
- entari_plugin_hyw/browser/engines/duckduckgo.py +137 -0
- entari_plugin_hyw/browser/engines/google.py +155 -0
- entari_plugin_hyw/browser/landing.html +172 -0
- entari_plugin_hyw/browser/manager.py +153 -0
- entari_plugin_hyw/browser/service.py +304 -0
- entari_plugin_hyw/card-ui/src/App.vue +526 -182
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +7 -11
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +33 -30
- entari_plugin_hyw/card-ui/src/types.ts +9 -0
- entari_plugin_hyw/definitions.py +155 -0
- entari_plugin_hyw/history.py +111 -33
- entari_plugin_hyw/misc.py +34 -0
- entari_plugin_hyw/modular_pipeline.py +384 -0
- entari_plugin_hyw/render_vue.py +326 -239
- entari_plugin_hyw/search.py +95 -708
- entari_plugin_hyw/stage_base.py +92 -0
- entari_plugin_hyw/stage_instruct.py +345 -0
- entari_plugin_hyw/stage_instruct_deepsearch.py +104 -0
- entari_plugin_hyw/stage_summary.py +164 -0
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/METADATA +4 -4
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/RECORD +28 -16
- entari_plugin_hyw/pipeline.py +0 -1219
- entari_plugin_hyw/prompts.py +0 -47
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/pipeline.py
DELETED
@@ -1,1219 +0,0 @@
import asyncio
import html
import json
import re
import time
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Optional, Tuple

from loguru import logger
from openai import AsyncOpenAI

from .search import SearchService
from .image_cache import get_cached_images
from .prompts import (
    SUMMARY_SP,
    INSTRUCT_SP,
)

@asynccontextmanager
async def _null_async_context():
    yield None


class ProcessingPipeline:
    """
    Core pipeline (vision -> instruct/search -> agent).
    """

    def __init__(self, config: Any):
        self.config = config
        self.search_service = SearchService(config)
        self.client = AsyncOpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
        self.all_web_results = []  # Cache for search results
        self.current_mode = "standard"  # standard | agent
        # Global ID counter for all types (unified numbering)
        self.global_id_counter = 0
        # Background tasks for async image search (not blocking agent)
        self._image_search_tasks: List[asyncio.Task] = []
        self._search_error: Optional[str] = None  # Track critical search errors

        self.web_search_tool = {
            "type": "function",
            "function": {
                "name": "internal_web_search",
                "description": "Search the web for text.",
                "parameters": {
                    "type": "object",
                    "properties": {"query": {"type": "string"}},
                    "required": ["query"],
                },
            },
        }
        self.crawl_page_tool = {
            "type": "function",
            "function": {
                "name": "crawl_page",
                "description": "使用 Crawl4AI 抓取网页并返回 Markdown 文本。",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "url": {"type": "string"},
                    },
                    "required": ["url"],
                },
            },
        }
        self.refuse_answer_tool = {
            "type": "function",
            "function": {
                "name": "refuse_answer",
                "description": "拒绝回答问题。当用户问题涉及敏感、违规、不适宜内容时调用此工具,立即终止流程并返回拒绝回答的图片。",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "reason": {"type": "string", "description": "拒绝回答的原因(展示给用户)"},
                    },
                    "required": [],
                },
            },
        }
        # Flag to indicate refuse_answer was called
        self._should_refuse = False
        self._refuse_reason = ""

    async def execute(
        self,
        user_input: str,
        conversation_history: List[Dict],
        model_name: str = None,
        images: List[str] = None,
        vision_model_name: str = None,
        selected_vision_model: str = None,
    ) -> Dict[str, Any]:
        """
        New Pipeline Flow:
        1) Instruct: Images go directly here, decides web_search/crawl_page/refuse.
        2) Auto-Fetch: Automatically fetch first 4 search result pages.
        3) Screenshot: Render fetched pages as screenshots.
        4) Summary: Receives user images + page screenshots for final answer.
        """
        start_time = time.time()
        stats = {"start_time": start_time, "tool_calls_count": 0}
        usage_totals = {"input_tokens": 0, "output_tokens": 0}
        active_model = model_name or self.config.model_name

        current_history = conversation_history
        # Reset globals
        self.all_web_results = []
        self.global_id_counter = 0
        self._should_refuse = False
        self._refuse_reason = ""
        self._image_search_tasks = []

        try:
            logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")

            trace: Dict[str, Any] = {
                "instruct": None,
                "search": None,
                "fetch": None,
                "summary": None,
            }

            # --- 1. Instruct Stage (with images if provided) ---
            instruct_start = time.time()
            instruct_model = getattr(self.config, "instruct_model_name", None) or active_model
            instruct_text, search_payloads, instruct_trace, instruct_usage, search_time = await self._run_instruct_stage(
                user_input=user_input,
                images=images,  # Pass images directly to instruct
                model=instruct_model,
            )

            # Check refuse
            if self._should_refuse:
                return {
                    "llm_response": "",
                    "structured_response": {},
                    "stats": stats,
                    "model_used": active_model,
                    "conversation_history": current_history,
                    "refuse_answer": True,
                    "refuse_reason": self._refuse_reason
                }

            # Check for critical search errors
            if self._search_error:
                return {
                    "llm_response": "",
                    "structured_response": {},
                    "stats": stats,
                    "model_used": active_model,
                    "conversation_history": current_history,
                    "refuse_answer": True,
                    "refuse_reason": f"搜索服务异常: {self._search_error} 请联系管理员。"
                }

            usage_totals["input_tokens"] += instruct_usage.get("input_tokens", 0)
            usage_totals["output_tokens"] += instruct_usage.get("output_tokens", 0)

            instruct_cost = 0.0
            i_in_price = float(getattr(self.config, "instruct_input_price", None) or getattr(self.config, "input_price", 0.0) or 0.0)
            i_out_price = float(getattr(self.config, "instruct_output_price", None) or getattr(self.config, "output_price", 0.0) or 0.0)
            if i_in_price > 0 or i_out_price > 0:
                instruct_cost = (instruct_usage.get("input_tokens", 0) / 1_000_000 * i_in_price) + (instruct_usage.get("output_tokens", 0) / 1_000_000 * i_out_price)

            instruct_trace["cost"] = instruct_cost
            trace["instruct"] = instruct_trace

            # --- 2. Auto-Fetch Stage (Automatically fetch first 4 search results) ---
            fetch_start = time.time()
            fetch_trace = {}
            page_screenshots: List[str] = []  # Base64 screenshots of fetched pages

            fetch_urls = []
            search_items = [r for r in self.all_web_results if r.get("_type") == "search"]
            if search_items:
                # Group search results by query
                query_groups = {}
                for r in search_items:
                    q = r.get("query", "default")
                    if q not in query_groups:
                        query_groups[q] = []
                    query_groups[q].append(r)

                raw_fetch_urls = []
                # If multiple queries, take top 3 from each
                if len(query_groups) > 1:
                    logger.info(f"Pipeline: Multiple search queries detected ({len(query_groups)}). Taking top 3 from each.")
                    for q, items in query_groups.items():
                        for item in items[:3]:
                            if item.get("url"):
                                raw_fetch_urls.append(item.get("url"))
                else:
                    # Single query, take top 8
                    raw_fetch_urls = [r.get("url") for r in search_items[:8] if r.get("url")]

                # Deduplicate while preserving order and filter blocked domains
                final_fetch_urls = []
                for url in raw_fetch_urls:
                    if url and url not in final_fetch_urls:
                        final_fetch_urls.append(url)

                fetch_urls = final_fetch_urls

            # Check if search was performed but no URLs were available for fetching
            has_search_call = False
            if instruct_trace and "tool_calls" in instruct_trace:
                has_search_call = any(tc.get("name") in ["web_search", "internal_web_search"] for tc in instruct_trace["tool_calls"])

            if has_search_call and not fetch_urls:
                return {
                    "llm_response": "",
                    "structured_response": {},
                    "stats": stats,
                    "model_used": active_model,
                    "conversation_history": current_history,
                    "refuse_answer": True,
                    "refuse_reason": "搜索结果为空或全部被过滤,无法生成回答。"
                }

            if fetch_urls:
                logger.info(f"Pipeline: Auto-fetching up to {len(fetch_urls)} pages (keeping fastest 5): {fetch_urls}")

                # Execute fetch and get screenshots
                await self._run_auto_fetch_with_screenshots(fetch_urls)

                fetch_trace = {
                    "model": "Auto",
                    "urls_fetched": fetch_urls,
                    "time": time.time() - fetch_start,
                    "cost": 0.0,
                }
                trace["fetch"] = fetch_trace

            # Always collect screenshots from ALL page results (search auto-fetch + direct URL crawl)
            fetch_items = [r for r in self.all_web_results if r.get("_type") == "page"]
            for r in fetch_items:
                if r.get("screenshot_b64"):
                    page_screenshots.append(r["screenshot_b64"])

            if fetch_trace:
                fetch_trace["screenshots_count"] = len(page_screenshots)

            # --- 3. Summary Stage (with user images + page screenshots only) ---
            summary_start = time.time()
            summary_model = active_model

            # Combine user images and page screenshots for summary
            all_summary_images: List[str] = []
            if images:
                all_summary_images.extend(images)
            all_summary_images.extend(page_screenshots)

            summary_content, summary_usage, summary_trace_info = await self._run_summary_stage(
                user_input=user_input,
                images=all_summary_images if all_summary_images else None,
                has_page_screenshots=bool(page_screenshots),
                model=summary_model
            )

            usage_totals["input_tokens"] += summary_usage.get("input_tokens", 0)
            usage_totals["output_tokens"] += summary_usage.get("output_tokens", 0)

            summary_cost = 0.0
            s_in_price = float(getattr(self.config, "input_price", 0.0) or 0.0)
            s_out_price = float(getattr(self.config, "output_price", 0.0) or 0.0)
            if s_in_price > 0 or s_out_price > 0:
                summary_cost = (summary_usage.get("input_tokens", 0) / 1_000_000 * s_in_price) + (summary_usage.get("output_tokens", 0) / 1_000_000 * s_out_price)

            trace["summary"] = {
                "model": summary_model,
                "system_prompt": summary_trace_info.get("prompt", ""),
                "output": summary_content,
                "usage": summary_usage,
                "time": time.time() - summary_start,
                "cost": summary_cost,
                "images_count": len(all_summary_images)
            }

            # --- Result Assembly ---
            stats["total_time"] = time.time() - start_time
            structured = self._parse_tagged_response(summary_content)
            final_content = structured.get("response") or summary_content

            billing_info = {
                "input_tokens": usage_totals["input_tokens"],
                "output_tokens": usage_totals["output_tokens"],
                "total_cost": instruct_cost + summary_cost
            }

            # Build stages_used
            stages_used = []

            # Get page info
            fetch_items = [r for r in self.all_web_results if r.get("_type") == "page"]
            crawled_pages_ui = []
            for r in fetch_items:
                domain = ""
                try:
                    from urllib.parse import urlparse
                    domain = urlparse(r.get("url", "")).netloc
                except: pass
                crawled_pages_ui.append({
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                    "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
                })

            # Extract images from pages
            extracted_images = []
            seen_imgs = set()
            junk_keywords = ["icon", "logo", "badge", "avatar", "button", "social", "footer", "header", "banner", "license", "by-nc", "hosted_by", "pixel", "tracker", "ad", "ads", "advert", "promotion", "shop", "store", "group", "join", "qr", "qrcode", "weibo", "weixin", "douyin", "xiaohongshu", "bilibili", "official", "follow", "subscribe", "app"]

            for r in fetch_items:
                if "images" in r:
                    for img_url in r["images"]:
                        if img_url not in seen_imgs:
                            # Filter junk images
                            lower_url = img_url.lower()
                            if any(k in lower_url for k in junk_keywords):
                                continue

                            extracted_images.append({
                                "title": r.get("title", "Image"),
                                "url": img_url,
                                "thumbnail": img_url,
                                "domain": r.get("domain", "")
                            })
                            seen_imgs.add(img_url)

            # Instruct Stage (with crawled pages and images)
            if trace.get("instruct"):
                i = trace["instruct"]
                # Total time = instruct + search + fetch (until summary starts)
                instruct_total_time = (i.get("time", 0) or 0) + search_time
                if trace.get("fetch"):
                    instruct_total_time += trace["fetch"].get("time", 0)

                stages_used.append({
                    "name": "Instruct",
                    "model": i.get("model"),
                    "icon_config": "openai",
                    "provider": "Instruct",
                    "time": instruct_total_time,
                    "cost": i.get("cost", 0),
                    "has_images": bool(images),
                    "crawled_pages": crawled_pages_ui,  # Add crawled pages here
                    "image_references": extracted_images[:9]  # Add images here
                })

            # Summary Stage
            if trace.get("summary"):
                s = trace["summary"]
                stages_used.append({
                    "name": "Summary",
                    "model": s.get("model"),
                    "icon_config": "openai",
                    "provider": "Summary",
                    "time": s.get("time", 0),
                    "cost": s.get("cost", 0),
                    "images_count": s.get("images_count", 0)
                })

            # Construct final trace markdown
            trace_markdown = self._render_trace_markdown(trace)

            # Update history
            current_history.append({"role": "user", "content": user_input or "..."})
            current_history.append({"role": "assistant", "content": final_content})

            # Schedule async cache task (fire and forget - doesn't block return)
            cache_data = {
                "user_input": user_input,
                "trace": trace,
                "trace_markdown": trace_markdown,
                "page_screenshots": page_screenshots,
                "final_content": final_content,
                "stages_used": stages_used,
            }
            asyncio.create_task(self._cache_run_async(cache_data))

            return {
                "llm_response": final_content,
                "structured_response": structured,
                "stats": stats,
                "model_used": active_model,
                "conversation_history": current_history,
                "trace_markdown": trace_markdown,
                "billing_info": billing_info,
                "stages_used": stages_used,
            }

        except Exception as e:
            logger.exception("Pipeline Critical Failure")
            # Cancel all background image tasks on error
            if hasattr(self, '_image_search_tasks') and self._image_search_tasks:
                for task in self._image_search_tasks:
                    if not task.done(): task.cancel()
                try:
                    await asyncio.wait(self._image_search_tasks, timeout=0.1)
                except Exception: pass
                self._image_search_tasks = []

            return {
                "llm_response": f"I encountered a critical error: {e}",
                "stats": stats,
                "error": str(e),
            }

    def _parse_tagged_response(self, text: str) -> Dict[str, Any]:
        """Parse response and auto-infer references from citations and markdown images.
        """
        parsed = {"response": "", "references": [], "page_references": [], "image_references": [], "flow_steps": []}
        if not text:
            return parsed

        import re

        # 1. Strip trailing reference/source list
        body_text = text
        ref_list_pattern = re.compile(r'(?:\n\s*|^)\s*(?:#{1,3}|\*\*)\s*(?:References|Citations|Sources|参考资料|引用)[\s\S]*$', re.IGNORECASE | re.MULTILINE)
        body_text = ref_list_pattern.sub('', body_text)

        remaining_text = body_text.strip()

        # 2. Unwrap JSON if necessary
        try:
            if remaining_text.strip().startswith("{") and "action" in remaining_text:
                data = json.loads(remaining_text)
                if isinstance(data, dict) and "action_input" in data:
                    remaining_text = data["action_input"]
        except Exception:
            pass

        # 3. Identify all citations [N] and direct markdown images ![]()
        cited_ids = []
        body_pattern = re.compile(r'\[(\d+)\]')
        for match in body_pattern.finditer(remaining_text):
            try:
                cited_ids.append(int(match.group(1)))
            except ValueError: pass

        # Also find direct URLs in ![]()
        direct_image_urls = []
        img_pattern = re.compile(r'!\[.*?\]\((.*?)\)')
        for match in img_pattern.finditer(remaining_text):
            url = match.group(1).strip()
            if url and not url.startswith('['):  # Not a [N] citation
                direct_image_urls.append(url)

        # 4. Build Citation Maps and Reference Lists
        unified_id_map = {}
        # Keep track of what we've already added to avoid duplicates
        seen_urls = set()

        # id_order needs to be unique and preserve appearance order
        id_order = []
        for id_val in cited_ids:
            if id_val not in id_order:
                id_order.append(id_val)

        # Process [N] citations first to determine numbering
        for old_id in id_order:
            result_item = next((r for r in self.all_web_results if r.get("_id") == old_id), None)
            if not result_item: continue

            url = result_item.get("url", "")
            item_type = result_item.get("_type", "")

            entry = {
                "title": result_item.get("title", ""),
                "url": url,
                "domain": result_item.get("domain", "")
            }

            if item_type == "search":
                parsed["references"].append(entry)
                unified_id_map[old_id] = len(parsed["references"]) + len(parsed["page_references"])
                seen_urls.add(url)
            elif item_type == "page":
                parsed["page_references"].append(entry)
                unified_id_map[old_id] = len(parsed["references"]) + len(parsed["page_references"])
                seen_urls.add(url)
            elif item_type == "image":
                entry["thumbnail"] = result_item.get("thumbnail", "")
                if url not in seen_urls:
                    parsed["image_references"].append(entry)
                    seen_urls.add(url)
                # Note: Images cited as [N] might be used in text like ![alt]([N])
                # We'll handle this in replacement

        # Now handle direct image URLs from ![]() that weren't cited as [N]
        for url in direct_image_urls:
            if url in seen_urls: continue
            # Find in all_web_results
            result_item = next((r for r in self.all_web_results if (r.get("url") == url or r.get("image") == url) and r.get("_type") == "image"), None)
            if result_item:
                entry = {
                    "title": result_item.get("title", ""),
                    "url": url,
                    "domain": result_item.get("domain", ""),
                    "thumbnail": result_item.get("thumbnail", "")
                }
                parsed["image_references"].append(entry)
                seen_urls.add(url)

        # 5. Replacement Logic
        # Define image replacement map separately to handle ![alt]([N])
        image_url_map = {}  # old_id -> raw_url
        for old_id in id_order:
            item = next((r for r in self.all_web_results if r.get("_id") == old_id), None)
            if item and item.get("_type") == "image":
                image_url_map[old_id] = item.get("url", "")

        def refined_replace(text):
            # First, handle ![alt]([N]) specifically
            # We want to replace the [N] with the actual URL so the markdown renders
            def sub_img_ref(match):
                alt = match.group(1)
                ref = match.group(2)
                inner_match = body_pattern.match(ref)
                if inner_match:
                    oid = int(inner_match.group(1))
                    if oid in image_url_map:
                        return f"![{alt}]({image_url_map[oid]})"
                return match.group(0)

            text = re.sub(r'!\[(.*?)\]\((.*?)\)', sub_img_ref, text)

            # Then handle normal [N] replacements
            def sub_norm_ref(match):
                oid = int(match.group(1))
                if oid in unified_id_map:
                    return f"[{unified_id_map[oid]}]"
                if oid in image_url_map:
                    return ""  # Remove standalone image citations like [5] if they aren't in ![]()
                return ""  # Remove hallucinated or invalid citations like [99] if not found in results

            return body_pattern.sub(sub_norm_ref, text)

        final_text = refined_replace(remaining_text)
        parsed["response"] = final_text.strip()
        return parsed

    async def _safe_route_tool(self, tool_call):
        """Wrapper for safe concurrent execution of tool calls."""
        try:
            return await asyncio.wait_for(self._route_tool(tool_call), timeout=30.0)
        except asyncio.TimeoutError:
            return "Error: Tool execution timed out (30s limit)."
        except Exception as e:
            return f"Error: Tool execution failed: {e}"

    async def _route_tool(self, tool_call):
        """Execute tool call and return result."""
        name = tool_call.function.name
        args = json.loads(html.unescape(tool_call.function.arguments))

        if name == "internal_web_search" or name == "web_search":
            query = args.get("query")
            try:
                web = await self.search_service.search(query)
            except Exception as e:
                logger.error(f"Failed to execute search: {e}")
                self._search_error = str(e)
                raise e

            # Filter blocked domains removed per user request (handled in search query)

            # Cache results and assign global IDs
            for item in web:
                self.global_id_counter += 1
                item["_id"] = self.global_id_counter
                item["_type"] = "search"
                item["query"] = query
                self.all_web_results.append(item)

            return json.dumps({"web_results_count": len(web), "status": "cached_for_prompt"}, ensure_ascii=False)

        if name == "internal_image_search":
            query = args.get("query")
            # Start image search in background (non-blocking)
            # Images are for UI rendering only, not passed to LLM
            async def _background_image_search():
                try:
                    images = await self.search_service.image_search(query)
                    # Cache results and assign global IDs for UI rendering
                    for item in images:
                        self.global_id_counter += 1
                        item["_id"] = self.global_id_counter
                        item["_type"] = "image"
                        item["query"] = query
                        item["is_image"] = True
                        self.all_web_results.append(item)
                    logger.info(f"Background image search completed: {len(images)} images for query '{query}'")
                except (asyncio.CancelledError, Exception) as e:
                    # Silently handle cancellation or minor errors in background pre-warming
                    if isinstance(e, asyncio.CancelledError):
                        logger.debug(f"Background image search cancelled for query '{query}'")
                    else:
                        logger.error(f"Background image search failed for query '{query}': {e}")

            task = asyncio.create_task(_background_image_search())
            self._image_search_tasks.append(task)

            # Return immediately without waiting for search to complete
            return json.dumps({"image_results_count": 0, "status": "searching_in_background"}, ensure_ascii=False)

        if name == "crawl_page":
            url = args.get("url")
            logger.info(f"[Tool] Crawling page: {url}")
            # Returns Dict: {content, title, url}
            result_dict = await self.search_service.fetch_page(url)

            # Cache the crawled content with global ID
            self.global_id_counter += 1

            # Generate screenshot for direct URL crawl (so LLM can see it)
            screenshot_b64 = await self._render_page_screenshot(
                title=result_dict.get("title", "Page"),
                url=url,
                content=result_dict.get("content", "")[:4000]
            )

            cached_item = {
                "_id": self.global_id_counter,
                "_type": "page",
                "title": result_dict.get("title", "Page"),
                "url": result_dict.get("url", url),
                "content": result_dict.get("content", ""),
                "domain": "",
                "is_crawled": True,
                "screenshot_b64": screenshot_b64,  # Add screenshot
            }
            try:
                from urllib.parse import urlparse
                cached_item["domain"] = urlparse(url).netloc
            except:
                pass

            self.all_web_results.append(cached_item)

            return json.dumps({"crawl_status": "success", "title": cached_item["title"], "content_length": len(result_dict.get("content", ""))}, ensure_ascii=False)

        if name == "set_mode":
            mode = args.get("mode", "standard")
            self.current_mode = mode
            return f"Mode set to {mode}"

        if name == "refuse_answer":
            reason = args.get("reason", "")
            self._should_refuse = True
            self._refuse_reason = reason
            logger.info(f"[Tool] refuse_answer called. Reason: {reason}")
            return "Refuse answer triggered. Pipeline will terminate early."

        return f"Unknown tool {name}"


    async def _safe_llm_call(self, messages, model, tools=None, tool_choice=None, client: Optional[AsyncOpenAI] = None, extra_body: Optional[Dict[str, Any]] = None):
        try:
            return await asyncio.wait_for(
                self._do_llm_request(messages, model, tools, tool_choice, client=client or self.client, extra_body=extra_body),
                timeout=120.0,
            )
        except asyncio.TimeoutError:
            logger.error("LLM Call Timed Out")
            return type("obj", (object,), {"content": "Error: The model took too long to respond.", "tool_calls": None})(), {"input_tokens": 0, "output_tokens": 0}
        except Exception as e:
            logger.error(f"LLM Call Failed: {e}")
            return type("obj", (object,), {"content": f"Error: Model failure ({e})", "tool_calls": None})(), {"input_tokens": 0, "output_tokens": 0}

    async def _do_llm_request(self, messages, model, tools, tool_choice, client: AsyncOpenAI, extra_body: Optional[Dict[str, Any]] = None):
        try:
            payload_debug = json.dumps(messages)
            logger.info(f"LLM Request Payload Size: {len(payload_debug)} chars")
        except Exception:
            pass

        t0 = time.time()
        logger.info("LLM Request SENT to API...")
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
            tool_choice=tool_choice,
            temperature=self.config.temperature,
            extra_body=extra_body,
        )
        logger.info(f"LLM Request RECEIVED after {time.time() - t0:.2f}s")

        usage = {"input_tokens": 0, "output_tokens": 0}
        if hasattr(response, "usage") and response.usage:
            usage["input_tokens"] = getattr(response.usage, "prompt_tokens", 0) or 0
            usage["output_tokens"] = getattr(response.usage, "completion_tokens", 0) or 0

        return response.choices[0].message, usage



    async def _run_instruct_stage(
        self, user_input: str, images: List[str] = None, model: str = None
    ) -> Tuple[str, List[str], Dict[str, Any], Dict[str, int], float]:
        """Returns (instruct_text, search_payloads, trace_dict, usage_dict, search_time).

        Images are now passed directly here (merged vision stage).
        """
        # Instruct has access to: web_search, crawl_page, refuse_answer
        tools = [self.web_search_tool, self.crawl_page_tool, self.refuse_answer_tool]
        tools_desc = "- internal_web_search: 搜索文本\n- crawl_page: 获取网页内容\n- refuse_answer: 拒绝回答(敏感/违规内容)"

        prompt = INSTRUCT_SP.format(user_msgs=user_input or "", tools_desc=tools_desc)

        client = self._client_for(
            api_key=getattr(self.config, "instruct_api_key", None),
            base_url=getattr(self.config, "instruct_base_url", None),
        )

        # Build user content - multimodal if images provided
        if images:
            user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_input or "..."}]
            for img_b64 in images:
                url = f"data:image/png;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
                user_content.append({"type": "image_url", "image_url": {"url": url}})
        else:
            user_content = user_input or "..."

        history: List[Dict[str, Any]] = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_content},
        ]

        response, usage = await self._safe_llm_call(
            messages=history,
            model=model,
            tools=tools,
            tool_choice="auto",
            client=client,
            extra_body=getattr(self.config, "instruct_extra_body", None),
        )

        search_payloads: List[str] = []
        instruct_trace: Dict[str, Any] = {
            "model": model,
            "base_url": getattr(self.config, "instruct_base_url", None) or self.config.base_url,
            "prompt": prompt,
            "user_input": user_input or "",
            "has_images": bool(images),
            "images_count": len(images) if images else 0,
            "tool_calls": [],
            "tool_results": [],
            "output": "",
        }

        search_time = 0.0

        if response.tool_calls:
            plan_dict = response.model_dump() if hasattr(response, "model_dump") else response
            history.append(plan_dict)

            tasks = [self._safe_route_tool(tc) for tc in response.tool_calls]

            st = time.time()
            results = await asyncio.gather(*tasks)
            search_time = time.time() - st

            for i, result in enumerate(results):
                tc = response.tool_calls[i]
                history.append(
                    {"tool_call_id": tc.id, "role": "tool", "name": tc.function.name, "content": str(result)}
                )
                instruct_trace["tool_calls"].append(self._tool_call_to_trace(tc))
                instruct_trace["tool_results"].append({"name": tc.function.name, "content": str(result)})

                if tc.function.name in ["web_search", "internal_web_search"]:
                    search_payloads.append(str(result))

            instruct_trace["output"] = ""
            instruct_trace["usage"] = usage
            return "", search_payloads, instruct_trace, usage, search_time

        instruct_trace["output"] = (response.content or "").strip()
        instruct_trace["usage"] = usage
        return "", search_payloads, instruct_trace, usage, 0.0

    async def _run_auto_fetch_with_screenshots(self, urls: List[str]):
        """
        Automatically fetch URLs and generate screenshots of their content.
        Stops after getting the first 5 successful results (fastest wins).
        Screenshots are stored as base64 in the cached items.
        """
        if not urls:
            return

        # Get config
        fetch_timeout = float(getattr(self.config, "fetch_timeout", 15.0))
        max_results = int(getattr(self.config, "fetch_max_results", 5))

        async def _fetch_and_screenshot(url: str):
            try:
                # Fetch page content
                result_dict = await self.search_service.fetch_page(url)

                self.global_id_counter += 1

                # Generate screenshot from page content
                screenshot_b64 = await self._render_page_screenshot(
                    title=result_dict.get("title", "Page"),
                    url=url,
                    content=result_dict.get("content", "")[:4000]  # Limit content for screenshot
                )

                cached_item = {
                    "_id": self.global_id_counter,
                    "_type": "page",
                    "title": result_dict.get("title", "Page"),
                    "url": result_dict.get("url", url),
                    "content": result_dict.get("content", ""),
                    "images": result_dict.get("images", []),
                    "domain": "",
                    "is_crawled": True,
                    "screenshot_b64": screenshot_b64,
                }
                try:
                    from urllib.parse import urlparse
                    cached_item["domain"] = urlparse(url).netloc
                except:
                    pass

                return cached_item
            except Exception as e:
                logger.error(f"Failed to fetch/screenshot {url}: {e}")
                return None

        async def _fetch_with_timeout(url: str):
            """Wrapper to apply timeout to each fetch operation."""
            try:
                return await asyncio.wait_for(_fetch_and_screenshot(url), timeout=fetch_timeout)
            except asyncio.TimeoutError:
                logger.warning(f"Fetch timeout ({fetch_timeout}s) exceeded for: {url}")
                return None

        # Create tasks for all URLs (track url -> task mapping)
        url_to_task = {url: asyncio.create_task(_fetch_with_timeout(url)) for url in urls}
        tasks = list(url_to_task.values())
        first_url = urls[0] if urls else None
        first_task = url_to_task.get(first_url) if first_url else None

        # Collect first N successful results (fastest wins)
        collected_results = {}  # url -> result
        successful_count = 0
        for coro in asyncio.as_completed(tasks):
            try:
                result = await coro
                if result:
                    # Find which URL this result belongs to
                    result_url = result.get("url", "")
                    collected_results[result_url] = result
                    successful_count += 1
                    # Only break if we have enough AND first URL is done (or failed)
                    first_done = first_url in collected_results or (first_task and first_task.done())
                    if successful_count >= max_results and first_done:
                        logger.info(f"Got {max_results} successful results, cancelling remaining tasks")
                        break
            except Exception as e:
                logger.warning(f"Fetch task failed: {e}")

        # Ensure first URL task completes (if not already) before cancelling others
        if first_task and not first_task.done():
            logger.info("Waiting for first URL to complete...")
            try:
                result = await first_task
                if result:
                    collected_results[result.get("url", first_url)] = result
            except Exception as e:
                logger.warning(f"First URL fetch failed: {e}")

        # Cancel remaining tasks
        for task in tasks:
            if not task.done():
                task.cancel()

        # Wait briefly for cancellation to propagate
        if any(not t.done() for t in tasks):
            await asyncio.gather(*tasks, return_exceptions=True)

        # Add results in original URL order (not fetch speed order)
        for url in urls:
            if url in collected_results:
                self.all_web_results.append(collected_results[url])

    async def _render_page_screenshot(self, title: str, url: str, content: str) -> Optional[str]:
        """
        Render page content as a simple HTML and take a screenshot.
        Returns base64 encoded image or None on failure.
        Images are compressed to reduce LLM payload size.
        """
        import base64
        import tempfile

        try:
            # Try to use the content renderer if available
            from .render_vue import ContentRenderer

            # Create a simple markdown representation for screenshot
            markdown = f"> 来源: {url}\n\n# {title}\n\n{content}"  # Limit content

            # Use temp file for screenshot
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
                tmp_path = tmp.name

            # Get or create renderer (reuse if possible)
            if not hasattr(self, '_screenshot_renderer'):
                self._screenshot_renderer = ContentRenderer(auto_start=True)
                await self._screenshot_renderer.start(timeout=10000)

            # Await the async render method
            await self._screenshot_renderer.render(
                markdown,
                tmp_path,
                stats={"total_time": 0},
                references=[{"title": title, "url": url, "domain": ""}],
            )

            # Compress image to reduce LLM payload size (~350KB target)
            img_bytes = await self._compress_image(tmp_path, max_width=600, quality=70)

            # Cleanup
            import os
            os.unlink(tmp_path)

            return base64.b64encode(img_bytes).decode("utf-8")

        except Exception as e:
            logger.warning(f"Failed to render page screenshot: {e}")
            return None

    async def _compress_image(self, image_path: str, max_width: int = 400, quality: int = 50) -> bytes:
        """Compress image to reduce size for LLM payload."""
        from io import BytesIO

        try:
            from PIL import Image

            def _compress():
                with Image.open(image_path) as img:
                    # Calculate new height maintaining aspect ratio
                    if img.width > max_width:
                        ratio = max_width / img.width
                        new_height = int(img.height * ratio)
                        img = img.resize((max_width, new_height), Image.Resampling.LANCZOS)

                    # Convert to RGB if necessary
                    if img.mode in ('RGBA', 'P'):
                        img = img.convert('RGB')

                    # Save to buffer with compression
                    buffer = BytesIO()
                    img.save(buffer, format='JPEG', quality=quality, optimize=True)
                    return buffer.getvalue()

            return await asyncio.to_thread(_compress)

        except ImportError:
            # PIL not available, return original
            logger.warning("PIL not available for image compression, using original")
            with open(image_path, 'rb') as f:
                return f.read()

    async def _run_summary_stage(
        self, user_input: str, images: List[str] = None,
        has_page_screenshots: bool = False, model: str = None
    ) -> Tuple[str, Dict[str, int], Dict[str, Any]]:
        """
        Generate final summary using page screenshots only.
        Returns (content, usage, trace_info).
        """

        # Build system prompt
        try:
            language_conf = getattr(self.config, "language", "Simplified Chinese")
            system_prompt = SUMMARY_SP.format(language=language_conf)
        except Exception:
            system_prompt = SUMMARY_SP



        # Build user content - multimodal if images provided
        if images:
            user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_input or "..."}]
            for img_b64 in images:
                url = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
                user_content.append({"type": "image_url", "image_url": {"url": url}})
        else:
            user_content = user_input or "..."

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content}
        ]

        client = self._client_for(
            api_key=getattr(self.config, "summary_api_key", None),
            base_url=getattr(self.config, "summary_base_url", None)
        )

        response, usage = await self._safe_llm_call(
            messages=messages,
            model=model,
            client=client,
            extra_body=getattr(self.config, "summary_extra_body", None)
        )

        return (response.content or "").strip(), usage, {"prompt": system_prompt}

    def _format_fetch_msgs(self) -> str:
        """Format crawled page content for Summary prompt."""
        if not self.all_web_results:
            return ""

        lines = []
        for res in self.all_web_results:
            if res.get("_type") != "page": continue
            idx = res.get("_id")
            title = (res.get("title", "") or "").strip()
            url = res.get("url", "")
            content = (res.get("content", "") or "").strip()
            # Truncate content if too long? For now keep it full or rely on model context
            lines.append(f"Title: {title}\nURL: {url}\nContent:\n{content}\n")

        return "\n".join(lines)

    def _format_search_msgs(self) -> str:
        """Format search snippets only (not crawled pages)."""
        if not self.all_web_results:
            return ""

        lines = []
        for res in self.all_web_results:
            if res.get("_type") != "search": continue  # Only search results
            idx = res.get("_id")
            title = (res.get("title", "") or "").strip()
            url = res.get("url", "")
            content = (res.get("content", "") or "").strip()
            lines.append(f"[{idx}] Title: {title}\nURL: {url}\nSnippet: {content}\n")

        return "\n".join(lines)

    def _format_page_msgs(self) -> str:
        """Format crawled page content (detailed)."""
        if not self.all_web_results:
            return ""

        lines = []
        for res in self.all_web_results:
            if res.get("_type") != "page": continue  # Only page results
            idx = res.get("_id")
            title = (res.get("title", "") or "").strip()
            url = res.get("url", "")
            content = (res.get("content", "") or "").strip()
            lines.append(f"[{idx}] Title: {title}\nURL: {url}\nContent: {content}\n")

        return "\n".join(lines)

    def _format_image_search_msgs(self) -> str:
        if not self.all_web_results:
            return ""

        lines = []
        for res in self.all_web_results:
            if res.get("_type") != "image": continue  # Only image results
            idx = res.get("_id")
            title = res.get("title", "")
            url = res.get("image", "") or res.get("url", "")
            thumb = res.get("thumbnail", "")
            lines.append(f"[{idx}] Title: {title}\nURL: {url}\nThumbnail: {thumb}\n")
        return "\n".join(lines)

    def _client_for(self, api_key: Optional[str], base_url: Optional[str]) -> AsyncOpenAI:
        if api_key or base_url:
            return AsyncOpenAI(base_url=base_url or self.config.base_url, api_key=api_key or self.config.api_key)
        return self.client

    def _tool_call_to_trace(self, tool_call) -> Dict[str, Any]:
        try:
            args = json.loads(html.unescape(tool_call.function.arguments))
        except Exception:
            args = tool_call.function.arguments
        return {"id": getattr(tool_call, "id", None), "name": tool_call.function.name, "arguments": args}

    def _render_trace_markdown(self, trace: Dict[str, Any]) -> str:
        def fence(label: str, content: str) -> str:
            safe = (content or "").replace("```", "``\\`")
            return f"```{label}\n{safe}\n```"

        parts: List[str] = []
        parts.append("# Pipeline Trace\n")

        if trace.get("instruct"):
            t = trace["instruct"]
            parts.append("## Instruct\n")
            parts.append(f"- model: `{t.get('model')}`")
            parts.append(f"- base_url: `{t.get('base_url')}`")
            parts.append(f"- has_images: `{t.get('has_images', False)}`")
            parts.append(f"- images_count: `{t.get('images_count', 0)}`\n")
            parts.append("### Prompt\n")
            parts.append(fence("text", t.get("prompt", "")))
            if t.get("tool_calls"):
                parts.append("\n### Tool Calls\n")
                parts.append(fence("json", json.dumps(t.get("tool_calls"), ensure_ascii=False, indent=2)))
            if t.get("tool_results"):
                parts.append("\n### Tool Results\n")
                parts.append(fence("json", json.dumps(t.get("tool_results"), ensure_ascii=False, indent=2)))
            parts.append("\n### Output\n")
            parts.append(fence("text", t.get("output", "")))
            parts.append("")

        if trace.get("fetch"):
            f = trace["fetch"]
            parts.append("## Auto-Fetch\n")
            parts.append(f"- urls_fetched: `{f.get('urls_fetched', [])}`")
            parts.append(f"- screenshots_count: `{f.get('screenshots_count', 0)}`\n")
            parts.append("")

        if trace.get("summary"):
            s = trace["summary"]
            parts.append("## Summary\n")
            parts.append(f"- model: `{s.get('model')}`\n")
            parts.append("### System Prompt\n")
            parts.append(fence("text", s.get("system_prompt", "")))
            parts.append("\n### Output\n")
            parts.append(fence("text", s.get("output", "")))
            parts.append("")

        return "\n".join(parts).strip() + "\n"

    async def _cache_run_async(self, cache_data: Dict[str, Any]):
        """
        Async background task to cache run data (trace, screenshots) to a folder.
        Saves to data/conversations/{timestamp}_{query}/
        This runs after the response is sent, so it doesn't block the main pipeline.
        """
        import base64
        from datetime import datetime
        from pathlib import Path

        try:
            # Create cache directory: data/conversations/{timestamp}_{query}/
            cache_base = Path(getattr(self.config, "conversations_dir", "data/conversations"))
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            user_input_short = (cache_data.get("user_input", "query") or "query")[:20]
            # Clean filename
            user_input_short = "".join(c if c.isalnum() or c in "._-" else "_" for c in user_input_short)
            cache_dir = cache_base / f"{timestamp}_{user_input_short}"
            cache_dir.mkdir(parents=True, exist_ok=True)

            # Save conversation markdown (includes trace and response)
            conversation_md = f"""# {cache_data.get("user_input", "Query")}

## Response

{cache_data.get("final_content", "")}

---

## Trace

{cache_data.get("trace_markdown", "")}
"""
            conv_path = cache_dir / "conversation.md"
            await asyncio.to_thread(
                conv_path.write_text,
                conversation_md,
                encoding="utf-8"
            )

            # Save page screenshots
            screenshots = cache_data.get("page_screenshots", [])
            for i, screenshot_b64 in enumerate(screenshots):
                if screenshot_b64:
                    screenshot_path = cache_dir / f"page_{i+1}.jpg"
                    img_bytes = base64.b64decode(screenshot_b64)
                    await asyncio.to_thread(screenshot_path.write_bytes, img_bytes)

            logger.debug(f"Conversation cached to: {cache_dir}")

        except Exception as e:
            # Don't fail silently but also don't crash the pipeline
            logger.warning(f"Failed to cache conversation: {e}")

    async def close(self):
        try:
            await self.search_service.close()
        except Exception:
            pass

        # Gracefully handle background tasks completion
        if hasattr(self, '_image_search_tasks') and self._image_search_tasks:
            for task in self._image_search_tasks:
                if not task.done(): task.cancel()
            try:
                # Wait briefly for cancellation to propagate
                await asyncio.wait(self._image_search_tasks, timeout=0.2)
            except Exception: pass
            self._image_search_tasks = []

        # Also cleanup image cache pending tasks if any
        try:
            from .image_cache import get_image_cache
            cache = get_image_cache()
            if cache._pending:
                pending = list(cache._pending.values())
                for task in pending:
                    if not task.done(): task.cancel()
                await asyncio.wait(pending, timeout=0.2)
                cache._pending.clear()
        except Exception: pass

        self.all_web_results = []