entari-plugin-hyw 3.5.0rc7__py3-none-any.whl → 4.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic.

@@ -12,14 +12,8 @@ from openai import AsyncOpenAI
 from .search import SearchService
 from .image_cache import get_cached_images
 from .prompts import (
-    AGENT_SP,
-    AGENT_SP_INSTRUCT_VISION_ADD,
-    AGENT_SP_TOOLS_STANDARD_ADD,
-    AGENT_SP_TOOLS_AGENT_ADD,
-    AGENT_SP_SEARCH_ADD,
+    SUMMARY_SP,
     INSTRUCT_SP,
-    INSTRUCT_SP_VISION_ADD,
-    VISION_SP,
 )
 
 @asynccontextmanager
@@ -42,6 +36,7 @@ class ProcessingPipeline:
         self.global_id_counter = 0
         # Background tasks for async image search (not blocking agent)
         self._image_search_tasks: List[asyncio.Task] = []
+        self._search_error: Optional[str] = None  # Track critical search errors
 
         self.web_search_tool = {
             "type": "function",
@@ -55,33 +50,6 @@ class ProcessingPipeline:
                 },
             },
         }
-        self.image_search_tool = {
-            "type": "function",
-            "function": {
-                "name": "internal_image_search",
-                "description": "Search for images related to a query.",
-                "parameters": {
-                    "type": "object",
-                    "properties": {"query": {"type": "string"}},
-                    "required": ["query"],
-                },
-            },
-        }
-        self.set_mode_tool = {
-            "type": "function",
-            "function": {
-                "name": "set_mode",
-                "description": "设定后续 Agent 的运行模式: standard | agent",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "mode": {"type": "string", "enum": ["standard", "agent"]},
-                        "reason": {"type": "string"},
-                    },
-                    "required": ["mode"],
-                },
-            },
-        }
         self.crawl_page_tool = {
             "type": "function",
             "function": {
@@ -104,7 +72,7 @@ class ProcessingPipeline:
                 "parameters": {
                     "type": "object",
                     "properties": {
-                        "reason": {"type": "string", "description": "拒绝回答的原因(内部记录,不展示给用户)"},
+                        "reason": {"type": "string", "description": "拒绝回答的原因(展示给用户)"},
                     },
                     "required": [],
                 },
@@ -124,669 +92,305 @@ class ProcessingPipeline:
124
92
  selected_vision_model: str = None,
125
93
  ) -> Dict[str, Any]:
126
94
  """
127
- 1) Vision: summarize images once (no image persistence).
128
- 2) Instruct: run web_search and decide whether to grant Playwright MCP tools.
129
- 3) Agent: normally no tools; if granted, allow Playwright MCP tools (max 6 rounds; step 5 nudge, step 6 forced).
95
+ New Pipeline Flow:
96
+ 1) Instruct: Images go directly here, decides web_search/crawl_page/refuse.
97
+ 2) Auto-Fetch: Automatically fetch first 4 search result pages.
98
+ 3) Screenshot: Render fetched pages as screenshots.
99
+ 4) Summary: Receives user images + page screenshots for final answer.
130
100
  """
131
101
  start_time = time.time()
132
102
  stats = {"start_time": start_time, "tool_calls_count": 0}
133
- # Token usage tracking for billing
134
103
  usage_totals = {"input_tokens": 0, "output_tokens": 0}
135
104
  active_model = model_name or self.config.model_name
136
105
 
137
106
  current_history = conversation_history
138
- final_response_content = ""
139
- structured: Dict[str, Any] = {}
140
-
141
- # Reset search cache and ID counter for this execution
107
+ # Reset globals
142
108
  self.all_web_results = []
143
109
  self.global_id_counter = 0
144
- # Reset refuse_answer flag
145
110
  self._should_refuse = False
146
111
  self._refuse_reason = ""
112
+ self._image_search_tasks = []
147
113
 
148
114
  try:
149
115
  logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
150
-
116
+
151
117
  trace: Dict[str, Any] = {
152
- "vision": None,
153
118
  "instruct": None,
154
- "agent": None,
119
+ "search": None,
120
+ "fetch": None,
121
+ "summary": None,
155
122
  }
156
123
 
157
- # Vision stage
158
- vision_text = ""
159
- vision_start = time.time()
160
- vision_time = 0
161
- vision_cost = 0.0
162
- vision_usage = {}
163
- if images:
164
- vision_model = (
165
- selected_vision_model
166
- or vision_model_name
167
- or getattr(self.config, "vision_model_name", None)
168
- or active_model
169
- )
170
- vision_prompt = VISION_SP.format(user_msgs=user_input or "[图片]")
171
- vision_text, vision_usage = await self._run_vision_stage(
172
- user_input=user_input,
173
- images=images,
174
- model=vision_model,
175
- prompt=vision_prompt,
176
- )
177
- # Add vision usage with vision-specific pricing
178
- usage_totals["input_tokens"] += vision_usage.get("input_tokens", 0)
179
- usage_totals["output_tokens"] += vision_usage.get("output_tokens", 0)
180
-
181
- # Calculate Vision Cost
182
- v_in_price = float(getattr(self.config, "vision_input_price", None) or getattr(self.config, "input_price", 0.0) or 0.0)
183
- v_out_price = float(getattr(self.config, "vision_output_price", None) or getattr(self.config, "output_price", 0.0) or 0.0)
184
- if v_in_price > 0 or v_out_price > 0:
185
- vision_cost = (vision_usage.get("input_tokens", 0) / 1_000_000 * v_in_price) + (vision_usage.get("output_tokens", 0) / 1_000_000 * v_out_price)
186
-
187
- vision_time = time.time() - vision_start
188
-
189
- trace["vision"] = {
190
- "model": vision_model,
191
- "base_url": getattr(self.config, "vision_base_url", None) or self.config.base_url,
192
- "prompt": vision_prompt,
193
- "user_input": user_input or "",
194
- "images_count": len(images or []),
195
- "output": vision_text,
196
- "usage": vision_usage,
197
- "time": vision_time,
198
- "cost": vision_cost
199
- }
200
-
201
- # Instruct + pre-search
124
+ # --- 1. Instruct Stage (with images if provided) ---
202
125
  instruct_start = time.time()
203
126
  instruct_model = getattr(self.config, "instruct_model_name", None) or active_model
204
- logger.info(f"Instruct Stage Config: instruct_model_name={getattr(self.config, 'instruct_model_name', None)}, active_model={active_model}, using: {instruct_model}")
205
127
  instruct_text, search_payloads, instruct_trace, instruct_usage, search_time = await self._run_instruct_stage(
206
128
  user_input=user_input,
207
- vision_text=vision_text,
129
+ images=images, # Pass images directly to instruct
208
130
  model=instruct_model,
209
131
  )
210
- # Instruct time excludes search time (search_time is returned separately)
211
- instruct_time = time.time() - instruct_start - search_time
212
132
 
213
- # Calculate Instruct Cost
133
+ # Check refuse
134
+ if self._should_refuse:
135
+ return {
136
+ "llm_response": "",
137
+ "structured_response": {},
138
+ "stats": stats,
139
+ "model_used": active_model,
140
+ "conversation_history": current_history,
141
+ "refuse_answer": True,
142
+ "refuse_reason": self._refuse_reason
143
+ }
144
+
145
+ # Check for critical search errors
146
+ if self._search_error:
147
+ return {
148
+ "llm_response": "",
149
+ "structured_response": {},
150
+ "stats": stats,
151
+ "model_used": active_model,
152
+ "conversation_history": current_history,
153
+ "refuse_answer": True,
154
+ "refuse_reason": f"搜索服务异常: {self._search_error} 请联系管理员。"
155
+ }
156
+
157
+ usage_totals["input_tokens"] += instruct_usage.get("input_tokens", 0)
158
+ usage_totals["output_tokens"] += instruct_usage.get("output_tokens", 0)
159
+
214
160
  instruct_cost = 0.0
215
161
  i_in_price = float(getattr(self.config, "instruct_input_price", None) or getattr(self.config, "input_price", 0.0) or 0.0)
216
162
  i_out_price = float(getattr(self.config, "instruct_output_price", None) or getattr(self.config, "output_price", 0.0) or 0.0)
217
163
  if i_in_price > 0 or i_out_price > 0:
218
164
  instruct_cost = (instruct_usage.get("input_tokens", 0) / 1_000_000 * i_in_price) + (instruct_usage.get("output_tokens", 0) / 1_000_000 * i_out_price)
219
165
 
220
- # Add instruct usage
221
- usage_totals["input_tokens"] += instruct_usage.get("input_tokens", 0)
222
- usage_totals["output_tokens"] += instruct_usage.get("output_tokens", 0)
223
-
224
- instruct_trace["time"] = instruct_time
225
166
  instruct_trace["cost"] = instruct_cost
226
167
  trace["instruct"] = instruct_trace
227
168
 
228
- # Check if refuse_answer was called - terminate early
229
- if self._should_refuse:
230
- logger.info(f"Pipeline: refuse_answer triggered. Reason: {self._refuse_reason}")
231
- stats["total_time"] = time.time() - start_time
232
- return {
233
- "llm_response": "",
234
- "structured_response": {},
235
- "stats": stats,
236
- "model_used": active_model,
237
- "conversation_history": current_history,
238
- "refuse_answer": True,
239
- "refuse_reason": self._refuse_reason,
240
- "stages_used": [],
241
- }
242
-
243
- # Start agent loop
244
- agent_start_time = time.time()
245
- current_history.append({"role": "user", "content": user_input or "..."})
246
-
247
- mode = instruct_trace.get("mode", self.current_mode).lower()
248
- logger.success(f"Instruct Mode: {mode}")
249
- self.current_mode = mode
169
+ # --- 2. Auto-Fetch Stage (Automatically fetch first 4 search results) ---
170
+ fetch_start = time.time()
171
+ fetch_trace = {}
172
+ page_screenshots: List[str] = [] # Base64 screenshots of fetched pages
250
173
 
251
- # Determine max iterations
252
- max_steps = 10 if mode == "agent" else 1
253
-
254
- step = 0
255
- agent_trace_steps: List[Dict[str, Any]] = []
256
- last_system_prompt = ""
257
-
258
- agent_tools: Optional[List[Dict[str, Any]]] = None
259
- if mode == "agent":
260
- agent_tools = [self.web_search_tool, self.image_search_tool, self.crawl_page_tool]
261
-
262
- # Agent loop
263
- while step < max_steps:
264
- step += 1
265
- logger.info(f"Pipeline: Agent step {step}/{max_steps}")
266
-
267
- if step == 5 and mode == "agent":
268
- current_history.append(
269
- {
270
- "role": "system",
271
- "content": "System: [Next Step Final] Please start consolidating the answer; the next step must be the final response.",
272
- }
273
- )
274
-
275
- tools_desc = ""
276
- if agent_tools:
277
- tools_desc = "\n".join([
278
- "- internal_web_search(query): 触发搜索并缓存结果",
279
- "- crawl_page(url): 使用 Crawl4AI 抓取网页返回 Markdown"
280
- ])
281
-
282
- user_msgs_text = user_input or ""
283
-
284
- search_msgs_text = self._format_search_msgs()
285
- # Image search results are NOT passed to LLM - they're for UI rendering only
174
+ fetch_urls = []
175
+ search_items = [r for r in self.all_web_results if r.get("_type") == "search"]
176
+ if search_items:
177
+ # Group search results by query
178
+ query_groups = {}
179
+ for r in search_items:
180
+ q = r.get("query", "default")
181
+ if q not in query_groups:
182
+ query_groups[q] = []
183
+ query_groups[q].append(r)
286
184
 
287
- has_search_results = any(r.get("_type") == "search" for r in self.all_web_results)
288
- has_image_results = any(r.get("_type") == "image" for r in self.all_web_results) # For UI rendering only
289
-
290
- # Build agent system prompt
291
- mode_desc_text = AGENT_SP_TOOLS_AGENT_ADD.format(tools_desc=tools_desc) if mode == "agent" else AGENT_SP_TOOLS_STANDARD_ADD
292
- system_prompt = AGENT_SP.format(
293
- user_msgs=user_msgs_text,
294
- mode=mode,
295
- mode_desc=mode_desc_text,
296
- language=getattr(self.config, "language", "Simplified Chinese")[:128]
297
- )
185
+ raw_fetch_urls = []
186
+ # If multiple queries, take top 3 from each
187
+ if len(query_groups) > 1:
188
+ logger.info(f"Pipeline: Multiple search queries detected ({len(query_groups)}). Taking top 3 from each.")
189
+ for q, items in query_groups.items():
190
+ for item in items[:3]:
191
+ if item.get("url"):
192
+ raw_fetch_urls.append(item.get("url"))
193
+ else:
194
+ # Single query, take top 8
195
+ raw_fetch_urls = [r.get("url") for r in search_items[:8] if r.get("url")]
298
196
 
299
- # Append vision text if available
300
- if vision_text:
301
- system_prompt += AGENT_SP_INSTRUCT_VISION_ADD.format(vision_msgs=vision_text)
197
+ # Deduplicate while preserving order and filter blocked domains
198
+ final_fetch_urls = []
199
+ blocked_domains = getattr(self.config, "fetch_blocked_domains", ["wikipedia.org", "csdn.net", "sohu.com", "sogou.com"])
200
+ if isinstance(blocked_domains, str):
201
+ blocked_domains = [d.strip() for d in blocked_domains.split(",")]
302
202
 
303
- # Append search results (text and page only, NOT images)
304
- page_msgs_text = self._format_page_msgs()
305
- all_search_parts = []
306
- if has_search_results and search_msgs_text:
307
- all_search_parts.append(search_msgs_text)
308
- if page_msgs_text:
309
- all_search_parts.append(page_msgs_text)
310
- # Images are excluded from LLM prompt - they're for UI rendering only
203
+ for url in raw_fetch_urls:
204
+ if url and url not in final_fetch_urls:
205
+ # Check blocklist
206
+ if any(domain in url.lower() for domain in blocked_domains):
207
+ continue
208
+ final_fetch_urls.append(url)
311
209
 
312
- if all_search_parts:
313
- system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs="\n".join(all_search_parts))
314
-
315
- last_system_prompt = system_prompt
210
+ fetch_urls = final_fetch_urls
316
211
 
317
- messages = [{"role": "system", "content": system_prompt}]
318
- messages.extend(current_history)
319
-
320
- tools_for_step = agent_tools if (agent_tools and step < max_steps) else None
321
-
322
- # Debug logging
323
- if tools_for_step:
324
- logger.info(f"[Agent] Tools provided: {[t['function']['name'] for t in tools_for_step]}")
325
- else:
326
- logger.warning(f"[Agent] NO TOOLS provided for step {step} (agent_tools={agent_tools is not None}, step<max={step < max_steps})")
327
-
328
- step_llm_start = time.time()
329
- response, step_usage = await self._safe_llm_call(
330
- messages=messages,
331
- model=active_model,
332
- tools=tools_for_step,
333
- tool_choice="auto" if tools_for_step else None,
334
- extra_body=self.config.extra_body,
335
- )
336
- step_llm_time = time.time() - step_llm_start
337
-
338
- # Debug: Check response
339
- has_tool_calls = response.tool_calls is not None and len(response.tool_calls) > 0
340
- logger.info(f"[Agent] Response has_tool_calls={has_tool_calls}, has_content={bool(response.content)}")
212
+ # Check if search was performed but no URLs were available for fetching
213
+ has_search_call = False
214
+ if instruct_trace and "tool_calls" in instruct_trace:
215
+ has_search_call = any(tc.get("name") in ["web_search", "internal_web_search"] for tc in instruct_trace["tool_calls"])
216
+
217
+ if has_search_call and not fetch_urls:
218
+ return {
219
+ "llm_response": "",
220
+ "structured_response": {},
221
+ "stats": stats,
222
+ "model_used": active_model,
223
+ "conversation_history": current_history,
224
+ "refuse_answer": True,
225
+ "refuse_reason": "搜索结果为空或全部被过滤,无法生成回答。"
226
+ }
227
+
228
+ if fetch_urls:
229
+ logger.info(f"Pipeline: Auto-fetching up to {len(fetch_urls)} pages (keeping fastest 5): {fetch_urls}")
341
230
 
342
- # Accumulate agent usage
343
- usage_totals["input_tokens"] += step_usage.get("input_tokens", 0)
344
- usage_totals["output_tokens"] += step_usage.get("output_tokens", 0)
345
-
346
- if response.tool_calls and tools_for_step:
347
- tool_calls = response.tool_calls
348
- stats["tool_calls_count"] += len(tool_calls)
349
-
350
- # Use model_dump to preserve provider-specific fields (e.g., Gemini's thought_signature)
351
- assistant_msg = response.model_dump(exclude_unset=True) if hasattr(response, "model_dump") else {
352
- "role": "assistant",
353
- "content": response.content,
354
- "tool_calls": [{"id": tc.id, "type": "function", "function": {"name": tc.function.name, "arguments": tc.function.arguments}} for tc in tool_calls]
355
- }
356
- current_history.append(assistant_msg)
357
-
358
- tasks = [self._safe_route_tool(tc) for tc in tool_calls]
359
- tool_start_time = time.time()
360
- results = await asyncio.gather(*tasks)
361
- tool_exec_time = time.time() - tool_start_time
362
-
363
- step_trace = {
364
- "step": step,
365
- "tool_calls": [self._tool_call_to_trace(tc) for tc in tool_calls],
366
- "tool_results": [],
367
- "tool_time": tool_exec_time,
368
- "llm_time": step_llm_time,
369
- "usage": step_usage,
370
- }
371
- for i, result in enumerate(results):
372
- tc = tool_calls[i]
373
- step_trace["tool_results"].append({"name": tc.function.name, "content": str(result)})
374
- current_history.append(
375
- {
376
- "tool_call_id": tc.id,
377
- "role": "tool",
378
- "name": tc.function.name,
379
- "content": str(result),
380
- }
381
- )
382
- agent_trace_steps.append(step_trace)
383
- continue
384
-
385
- final_response_content = response.content or ""
386
- current_history.append({"role": "assistant", "content": final_response_content})
387
- agent_trace_steps.append({
388
- "step": step,
389
- "final": True,
390
- "output": final_response_content,
391
- "llm_time": step_llm_time,
392
- "usage": step_usage
393
- })
394
- break
231
+ # Execute fetch and get screenshots
232
+ await self._run_auto_fetch_with_screenshots(fetch_urls)
395
233
 
396
- if not final_response_content:
397
- final_response_content = "执行结束,但未生成内容。"
234
+ fetch_trace = {
235
+ "model": "Auto",
236
+ "urls_fetched": fetch_urls,
237
+ "time": time.time() - fetch_start,
238
+ "cost": 0.0,
239
+ }
240
+ trace["fetch"] = fetch_trace
398
241
 
399
- structured = self._parse_tagged_response(final_response_content)
400
- final_content = structured.get("response") or final_response_content
242
+ # Always collect screenshots from ALL page results (search auto-fetch + direct URL crawl)
243
+ fetch_items = [r for r in self.all_web_results if r.get("_type") == "page"]
244
+ for r in fetch_items:
245
+ if r.get("screenshot_b64"):
246
+ page_screenshots.append(r["screenshot_b64"])
247
+
248
+ if fetch_trace:
249
+ fetch_trace["screenshots_count"] = len(page_screenshots)
401
250
 
402
- agent_time = time.time() - agent_start_time
251
+ # --- 3. Summary Stage (with user images + page screenshots only) ---
252
+ summary_start = time.time()
253
+ summary_model = active_model
254
+
255
+ # Combine user images and page screenshots for summary
256
+ all_summary_images: List[str] = []
257
+ if images:
258
+ all_summary_images.extend(images)
259
+ all_summary_images.extend(page_screenshots)
403
260
 
404
- # Calculate Agent Cost
405
- agent_cost = 0.0
406
- a_in_price = float(getattr(self.config, "input_price", 0.0) or 0.0)
407
- a_out_price = float(getattr(self.config, "output_price", 0.0) or 0.0)
261
+ summary_content, summary_usage, summary_trace_info = await self._run_summary_stage(
262
+ user_input=user_input,
263
+ images=all_summary_images if all_summary_images else None,
264
+ has_page_screenshots=bool(page_screenshots),
265
+ model=summary_model
266
+ )
408
267
 
409
- agent_input_tokens = usage_totals["input_tokens"] - vision_usage.get("input_tokens", 0) - instruct_usage.get("input_tokens", 0)
410
- agent_output_tokens = usage_totals["output_tokens"] - vision_usage.get("output_tokens", 0) - instruct_usage.get("output_tokens", 0)
268
+ usage_totals["input_tokens"] += summary_usage.get("input_tokens", 0)
269
+ usage_totals["output_tokens"] += summary_usage.get("output_tokens", 0)
411
270
 
412
- if a_in_price > 0 or a_out_price > 0:
413
- agent_cost = (max(0, agent_input_tokens) / 1_000_000 * a_in_price) + (max(0, agent_output_tokens) / 1_000_000 * a_out_price)
414
-
415
- trace["agent"] = {
416
- "model": active_model,
417
- "base_url": self.config.base_url,
418
- "system_prompt": last_system_prompt,
419
- "steps": agent_trace_steps,
420
- "final_output": final_response_content,
421
- "time": agent_time,
422
- "cost": agent_cost
271
+ summary_cost = 0.0
272
+ s_in_price = float(getattr(self.config, "input_price", 0.0) or 0.0)
273
+ s_out_price = float(getattr(self.config, "output_price", 0.0) or 0.0)
274
+ if s_in_price > 0 or s_out_price > 0:
275
+ summary_cost = (summary_usage.get("input_tokens", 0) / 1_000_000 * s_in_price) + (summary_usage.get("output_tokens", 0) / 1_000_000 * s_out_price)
276
+
277
+ trace["summary"] = {
278
+ "model": summary_model,
279
+ "system_prompt": summary_trace_info.get("prompt", ""),
280
+ "output": summary_content,
281
+ "usage": summary_usage,
282
+ "time": time.time() - summary_start,
283
+ "cost": summary_cost,
284
+ "images_count": len(all_summary_images)
423
285
  }
424
- trace_markdown = self._render_trace_markdown(trace)
425
286
 
287
+ # --- Result Assembly ---
426
288
  stats["total_time"] = time.time() - start_time
427
- stats["steps"] = step
428
-
429
- # Calculate billing info correctly by summing up all actual costs
430
- total_cost_sum = vision_cost + instruct_cost
431
- for s in agent_trace_steps:
432
- s_usage = s.get("usage", {})
433
- if s_usage:
434
- s_in_price = float(getattr(self.config, "input_price", 0.0) or 0.0)
435
- s_out_price = float(getattr(self.config, "output_price", 0.0) or 0.0)
436
- total_cost_sum += (s_usage.get("input_tokens", 0) / 1_000_000 * s_in_price) + (s_usage.get("output_tokens", 0) / 1_000_000 * s_out_price)
437
-
289
+ structured = self._parse_tagged_response(summary_content)
290
+ final_content = structured.get("response") or summary_content
291
+
438
292
  billing_info = {
439
293
  "input_tokens": usage_totals["input_tokens"],
440
294
  "output_tokens": usage_totals["output_tokens"],
441
- "total_cost": total_cost_sum,
295
+ "total_cost": instruct_cost + summary_cost
442
296
  }
443
-
444
- # Build stages_used list for UI display
445
- stages_used = []
446
297
 
447
- def infer_icon(model_name: str, base_url: str) -> str:
448
- model_lower = (model_name or "").lower()
449
- url_lower = (base_url or "").lower()
450
- if "deepseek" in model_lower or "deepseek" in url_lower: return "deepseek"
451
- elif "claude" in model_lower or "anthropic" in url_lower: return "anthropic"
452
- elif "gemini" in model_lower or "google" in url_lower: return "google"
453
- elif "gpt" in model_lower or "openai" in url_lower: return "openai"
454
- elif "qwen" in model_lower: return "qwen"
455
- elif "openrouter" in url_lower: return "openrouter"
456
- return "openai"
457
-
458
- def infer_provider(base_url: str) -> str:
459
- url_lower = (base_url or "").lower()
460
- if "openrouter" in url_lower: return "OpenRouter"
461
- elif "openai" in url_lower: return "OpenAI"
462
- elif "anthropic" in url_lower: return "Anthropic"
463
- elif "google" in url_lower: return "Google"
464
- elif "deepseek" in url_lower: return "DeepSeek"
465
- return ""
298
+ # Build stages_used
299
+ stages_used = []
466
300
 
467
- if trace.get("vision"):
468
- v = trace["vision"]
469
- v_model = v.get("model", "")
470
- v_base_url = v.get("base_url", "") or self.config.base_url
471
- stages_used.append({
472
- "name": "Vision",
473
- "model": v_model,
474
- "icon_config": infer_icon(v_model, v_base_url),
475
- "provider": infer_provider(v_base_url),
476
- "time": v.get("time", 0),
477
- "cost": v.get("cost", 0.0)
301
+ # Get page info
302
+ fetch_items = [r for r in self.all_web_results if r.get("_type") == "page"]
303
+ crawled_pages_ui = []
304
+ for r in fetch_items:
305
+ domain = ""
306
+ try:
307
+ from urllib.parse import urlparse
308
+ domain = urlparse(r.get("url", "")).netloc
309
+ except: pass
310
+ crawled_pages_ui.append({
311
+ "title": r.get("title", ""),
312
+ "url": r.get("url", ""),
313
+ "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
478
314
  })
479
315
 
316
+ # Extract images from pages
317
+ extracted_images = []
318
+ seen_imgs = set()
319
+ junk_keywords = ["icon", "logo", "badge", "avatar", "button", "social", "footer", "header", "banner", "license", "by-nc", "hosted_by", "pixel", "tracker", "ad", "ads", "advert", "promotion", "shop", "store", "group", "join", "qr", "qrcode", "weibo", "weixin", "douyin", "xiaohongshu", "bilibili", "official", "follow", "subscribe", "app"]
320
+
321
+ for r in fetch_items:
322
+ if "images" in r:
323
+ for img_url in r["images"]:
324
+ if img_url not in seen_imgs:
325
+ # Filter junk images
326
+ lower_url = img_url.lower()
327
+ if any(k in lower_url for k in junk_keywords):
328
+ continue
329
+
330
+ extracted_images.append({
331
+ "title": r.get("title", "Image"),
332
+ "url": img_url,
333
+ "thumbnail": img_url,
334
+ "domain": r.get("domain", "")
335
+ })
336
+ seen_imgs.add(img_url)
337
+
338
+ # Instruct Stage (with crawled pages and images)
480
339
  if trace.get("instruct"):
481
340
  i = trace["instruct"]
482
- i_model = i.get("model", "")
483
- i_base_url = i.get("base_url", "") or self.config.base_url
341
+ # Total time = instruct + search + fetch (until summary starts)
342
+ instruct_total_time = (i.get("time", 0) or 0) + search_time
343
+ if trace.get("fetch"):
344
+ instruct_total_time += trace["fetch"].get("time", 0)
345
+
484
346
  stages_used.append({
485
347
  "name": "Instruct",
486
- "model": i_model,
487
- "icon_config": infer_icon(i_model, i_base_url),
488
- "provider": infer_provider(i_base_url),
489
- "time": i.get("time", 0),
490
- "cost": i.get("cost", 0.0)
348
+ "model": i.get("model"),
349
+ "icon_config": "openai",
350
+ "provider": "Instruct",
351
+ "time": instruct_total_time,
352
+ "cost": i.get("cost", 0),
353
+ "has_images": bool(images),
354
+ "crawled_pages": crawled_pages_ui, # Add crawled pages here
355
+ "image_references": extracted_images[:9] # Add images here
491
356
  })
492
-
493
- # Show Search stage if we have ANY search results (text OR image)
494
- if (has_search_results or has_image_results) and search_payloads:
495
- # Collect initial search results for the Search stage card
496
- initial_refs = [
497
- {"title": r.get("title", ""), "url": r.get("url", ""), "domain": r.get("domain", "")}
498
- for r in self.all_web_results if r.get("_type") == "search"
499
- ]
500
- initial_images = [
501
- {"title": r.get("title", ""), "url": r.get("url", ""), "thumbnail": r.get("thumbnail", "")}
502
- for r in self.all_web_results if r.get("_type") == "image"
503
- ]
504
-
357
+
358
+ # Summary Stage
359
+ if trace.get("summary"):
360
+ s = trace["summary"]
505
361
  stages_used.append({
506
- "name": "Search",
507
- "model": getattr(self.config, "search_name", "DuckDuckGo"),
508
- "icon_config": "search",
509
- "provider": getattr(self.config, 'search_provider', 'Crawl4AI'),
510
- "time": search_time,
511
- "cost": 0.0,
512
- "references": initial_refs,
513
- "image_references": initial_images
362
+ "name": "Summary",
363
+ "model": s.get("model"),
364
+ "icon_config": "openai",
365
+ "provider": "Summary",
366
+ "time": s.get("time", 0),
367
+ "cost": s.get("cost", 0),
368
+ "images_count": s.get("images_count", 0)
514
369
  })
515
-
516
- # Add Crawler stage if Instruct used crawl_page
517
- if trace.get("instruct"):
518
- instruct_tool_calls = trace["instruct"].get("tool_calls", [])
519
- crawl_calls = [tc for tc in instruct_tool_calls if tc.get("name") == "crawl_page"]
520
- if crawl_calls:
521
- # Build crawled_pages list for UI
522
- crawled_pages = []
523
- for tc in crawl_calls:
524
- url = tc.get("arguments", {}).get("url", "")
525
- # Try to find cached result
526
- found = next((r for r in self.all_web_results if r.get("url") == url and r.get("_type") == "page"), None)
527
- if found:
528
- try:
529
- from urllib.parse import urlparse
530
- domain = urlparse(url).netloc
531
- except:
532
- domain = ""
533
- crawled_pages.append({
534
- "title": found.get("title", "Page"),
535
- "url": url,
536
- "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
537
- })
538
-
539
- stages_used.append({
540
- "name": "Crawler",
541
- "model": "Crawl4AI",
542
- "icon_config": "search",
543
- "provider": "网页抓取",
544
- "time": search_time, # Use existing search_time which includes fetch time
545
- "cost": 0.0,
546
- "crawled_pages": crawled_pages
547
- })
548
-
549
- # --- Granular Agent Stages (Grouped) ---
550
- if trace.get("agent"):
551
- a = trace["agent"]
552
- a_model = a.get("model", "") or active_model
553
- a_base_url = a.get("base_url", "") or self.config.base_url
554
- steps = a.get("steps", [])
555
- agent_icon = infer_icon(a_model, a_base_url)
556
- agent_provider = infer_provider(a_base_url)
557
-
558
- for s in steps:
559
- if "tool_calls" in s:
560
- # 1. Agent Thought Stage (with LLM time)
561
- # Calculate step cost
562
- step_usage = s.get("usage", {})
563
- step_cost = 0.0
564
- if a_in_price > 0 or a_out_price > 0:
565
- step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
566
-
567
- stages_used.append({
568
- "name": "Agent",
569
- "model": a_model,
570
- "icon_config": agent_icon,
571
- "provider": agent_provider,
572
- "time": s.get("llm_time", 0), "cost": step_cost
573
- })
574
-
575
- # 2. Grouped Tool Stages
576
- # Collect results for grouping
577
- search_group_items = []
578
- crawler_group_items = []
579
-
580
- tcs = s.get("tool_calls", [])
581
- trs = s.get("tool_results", [])
582
-
583
- for idx, tc in enumerate(tcs):
584
- t_name = tc.get("name")
585
- # Try to get result content if available
586
- t_res_content = trs[idx].get("content", "") if idx < len(trs) else ""
587
-
588
- if t_name in ["internal_web_search", "web_search", "internal_image_search"]:
589
- # We don't have per-call metadata easily unless we parse the 'result' string (which is JSON dump now for route_tool)
590
- # But search results are cached in self.all_web_results.
591
- # The 'content' of search tool result is basically "cached_for_prompt".
592
- # So we don't need to put items here, just show "Search" container.
593
- # But wait, if we want to show "what was searched", we can parse args.
594
- args = tc.get("arguments", {})
595
- query = args.get("query", "")
596
- if query:
597
- search_group_items.append({"query": query})
598
-
599
- elif t_name == "crawl_page":
600
- # Get URL from arguments, title from result
601
- args = tc.get("arguments", {})
602
- url = args.get("url", "")
603
- title = "Page"
604
- try:
605
- page_data = json.loads(t_res_content)
606
- if isinstance(page_data, dict):
607
- title = page_data.get("title", "Page")
608
- except:
609
- pass
610
-
611
- if url:
612
- try:
613
- domain = urlparse(url).netloc
614
- except:
615
- domain = ""
616
- crawler_group_items.append({
617
- "title": title,
618
- "url": url,
619
- "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
620
- })
621
-
622
- # Append Grouped Stages
623
- if search_group_items:
624
- stages_used.append({
625
- "name": "Search",
626
- "model": getattr(self.config, "search_name", "DuckDuckGo"),
627
- "icon_config": "search",
628
- "provider": "Agent Search",
629
- "time": s.get("tool_time", 0), "cost": 0,
630
- "queries": search_group_items # Render can use this if needed, or just show generic
631
- })
632
-
633
- if crawler_group_items:
634
- stages_used.append({
635
- "name": "Crawler",
636
- "model": "Crawl4AI",
637
- "icon_config": "browser",
638
- "provider": "Page Fetcher",
639
- "time": s.get("tool_time", 0), "cost": 0,
640
- "crawled_pages": crawler_group_items
641
- })
642
370
 
643
- elif s.get("final"):
644
- # Correctly calculate final step cost
645
- step_usage = s.get("usage", {})
646
- step_cost = 0.0
647
- if a_in_price > 0 or a_out_price > 0:
648
- step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
649
-
650
- stages_used.append({
651
- "name": "Agent",
652
- "model": a_model,
653
- "icon_config": agent_icon,
654
- "provider": agent_provider,
655
- "time": s.get("llm_time", 0),
656
- "cost": step_cost
657
- })
658
-
659
- # Assign total time/cost to last Agent stage
660
- # Sum up total time/cost for UI/stats (implicit via loop above)
661
- # No need to assign everything to last agent anymore as we distribute it.
662
-
663
- # --- Final Filter: Only show cited items in workflow cards ---
664
- cited_urls = {ref['url'] for ref in (structured.get("references", []) +
665
- structured.get("page_references", []) +
666
- structured.get("image_references", []))}
667
-
668
- # Find images already rendered in markdown content (to avoid duplicate display)
669
- markdown_image_urls = set()
670
- md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
671
- for match in md_img_pattern.finditer(final_content):
672
- markdown_image_urls.add(match.group(1))
673
-
674
- for s in stages_used:
675
- if "references" in s and s["references"]:
676
- s["references"] = [r for r in s["references"] if r.get("url") in cited_urls]
677
- # Filter out images already shown in markdown content
678
- # Check both url AND thumbnail since either might be used in markdown
679
- if "image_references" in s and s["image_references"]:
680
- s["image_references"] = [
681
- r for r in s["image_references"]
682
- if r.get("url") not in markdown_image_urls and (r.get("thumbnail") or "") not in markdown_image_urls
683
- ]
684
- if "crawled_pages" in s and s["crawled_pages"]:
685
- s["crawled_pages"] = [r for r in s["crawled_pages"] if r.get("url") in cited_urls]
686
-
687
- # Clean up conversation history: Remove tool calls and results to save tokens and avoid ID conflicts
688
- # Keep only 'user' messages and 'assistant' messages without tool_calls (final answers)
689
- cleaned_history = []
690
- for msg in current_history:
691
- if msg.get("role") == "tool":
692
- continue
693
- if msg.get("role") == "assistant" and msg.get("tool_calls"):
694
- continue
695
- cleaned_history.append(msg)
696
-
697
- # Update the reference (since it might be used by caller)
698
- current_history[:] = cleaned_history
699
-
700
- # --- Apply cached images to reduce render time ---
701
- # Collect all image URLs that need caching (avoid duplicates when thumbnail == url)
702
- all_image_urls = set()
703
- for img_ref in structured.get("image_references", []):
704
- if img_ref.get("thumbnail"):
705
- all_image_urls.add(img_ref["thumbnail"])
706
- if img_ref.get("url"):
707
- all_image_urls.add(img_ref["url"])
708
-
709
- for stage in stages_used:
710
- for img_ref in stage.get("image_references", []):
711
- if img_ref.get("thumbnail"):
712
- all_image_urls.add(img_ref["thumbnail"])
713
- if img_ref.get("url"):
714
- all_image_urls.add(img_ref["url"])
715
-
716
- # Also collect image URLs from markdown content
717
- markdown_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
718
- markdown_urls = markdown_img_pattern.findall(final_content)
719
- all_image_urls.update(markdown_urls)
371
+ # Construct final trace markdown
372
+ trace_markdown = self._render_trace_markdown(trace)
720
373
 
721
- # Get cached versions (waits for pending downloads until agent ends)
722
- if all_image_urls:
723
- try:
724
- cached_map = await get_cached_images(list(all_image_urls))
725
-
726
- # Apply cached URLs to structured response
727
- for img_ref in structured.get("image_references", []):
728
- if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
729
- img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
730
- if img_ref.get("url") and img_ref["url"] in cached_map:
731
- img_ref["url"] = cached_map[img_ref["url"]]
732
-
733
- # Apply cached URLs to stages
734
- for stage in stages_used:
735
- for img_ref in stage.get("image_references", []):
736
- if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
737
- img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
738
- if img_ref.get("url") and img_ref["url"] in cached_map:
739
- img_ref["url"] = cached_map[img_ref["url"]]
740
-
741
- # Replace image URLs in markdown content with cached versions
742
- def replace_markdown_img(match):
743
- full_match = match.group(0)
744
- url = match.group(1)
745
- cached_url = cached_map.get(url)
746
- if cached_url and cached_url != url:
747
- return full_match.replace(url, cached_url)
748
- return full_match
749
-
750
- final_content = markdown_img_pattern.sub(replace_markdown_img, final_content)
751
- structured["response"] = markdown_img_pattern.sub(replace_markdown_img, structured.get("response", ""))
752
-
753
- # Log cache stats
754
- from .image_cache import get_image_cache
755
- cache_stats = get_image_cache().get_stats()
756
- logger.info(f"ImageCache stats: {cache_stats}")
757
-
758
- except Exception as e:
759
- logger.warning(f"Failed to apply image cache: {e}")
374
+ # Update history
375
+ current_history.append({"role": "user", "content": user_input or "..."})
376
+ current_history.append({"role": "assistant", "content": final_content})
760
377
 
761
- # Cancel all background image search/download tasks when agent ends
762
- if self._image_search_tasks:
763
- logger.info(f"Cancelling {len(self._image_search_tasks)} background image search tasks")
764
- for task in self._image_search_tasks:
765
- if not task.done():
766
- task.cancel()
767
- # Wait a bit for tasks to handle cancellation gracefully
768
- try:
769
- await asyncio.gather(*self._image_search_tasks, return_exceptions=True)
770
- except Exception:
771
- pass
772
- self._image_search_tasks.clear()
773
-
774
- # Also cancel any pending image downloads in the cache
775
- from .image_cache import get_image_cache
776
- cache = get_image_cache()
777
- if cache._pending:
778
- logger.info(f"Cancelling {len(cache._pending)} pending image downloads")
779
- for task in cache._pending.values():
780
- if not task.done():
781
- task.cancel()
782
- cache._pending.clear()
378
+ # Schedule async cache task (fire and forget - doesn't block return)
379
+ cache_data = {
380
+ "user_input": user_input,
381
+ "trace": trace,
382
+ "trace_markdown": trace_markdown,
383
+ "page_screenshots": page_screenshots,
384
+ "final_content": final_content,
385
+ "stages_used": stages_used,
386
+ }
387
+ asyncio.create_task(self._cache_run_async(cache_data))
783
388
 
784
389
  return {
785
390
  "llm_response": final_content,
786
391
  "structured_response": structured,
787
392
  "stats": stats,
788
393
  "model_used": active_model,
789
- "vision_model_used": (selected_vision_model or getattr(self.config, "vision_model_name", None)) if images else None,
790
394
  "conversation_history": current_history,
791
395
  "trace_markdown": trace_markdown,
792
396
  "billing_info": billing_info,
@@ -799,18 +403,11 @@ class ProcessingPipeline:
             if hasattr(self, '_image_search_tasks') and self._image_search_tasks:
                 for task in self._image_search_tasks:
                     if not task.done(): task.cancel()
-                # Wait briefly for cleanup
-                await asyncio.wait(self._image_search_tasks, timeout=0.1)
-                self._image_search_tasks.clear()
+                try:
+                    await asyncio.wait(self._image_search_tasks, timeout=0.1)
+                except Exception: pass
+                self._image_search_tasks = []
 
-            from .image_cache import get_image_cache
-            cache = get_image_cache()
-            if cache._pending:
-                pending_tasks = list(cache._pending.values())
-                for task in pending_tasks:
-                    if not task.done(): task.cancel()
-                await asyncio.wait(pending_tasks, timeout=0.1)
-                cache._pending.clear()
             return {
                 "llm_response": f"I encountered a critical error: {e}",
                 "stats": stats,
@@ -968,7 +565,26 @@ class ProcessingPipeline:
 
         if name == "internal_web_search" or name == "web_search":
             query = args.get("query")
-            web = await self.search_service.search(query)
+            try:
+                web = await self.search_service.search(query)
+            except Exception as e:
+                logger.error(f"Failed to execute search: {e}")
+                self._search_error = str(e)
+                raise e
+
+            # Filter blocked domains immediately
+            blocked_domains = getattr(self.config, "fetch_blocked_domains", ["wikipedia.org", "csdn.net", "baidu.com"])
+            if isinstance(blocked_domains, str):
+                blocked_domains = [d.strip() for d in blocked_domains.split(",")]
+
+            # Use list comprehension for filtering
+            original_count = len(web)
+            web = [
+                item for item in web
+                if not any(blocked in item.get("url", "").lower() for blocked in blocked_domains)
+            ]
+            if len(web) < original_count:
+                logger.info(f"Filtered {original_count - len(web)} blocked search results.")
 
             # Cache results and assign global IDs
             for item in web:
@@ -1018,6 +634,13 @@ class ProcessingPipeline:
             # Cache the crawled content with global ID
             self.global_id_counter += 1
 
+            # Generate screenshot for direct URL crawl (so LLM can see it)
+            screenshot_b64 = await self._render_page_screenshot(
+                title=result_dict.get("title", "Page"),
+                url=url,
+                content=result_dict.get("content", "")[:4000]
+            )
+
             cached_item = {
                 "_id": self.global_id_counter,
                 "_type": "page",
@@ -1026,6 +649,7 @@ class ProcessingPipeline:
                 "content": result_dict.get("content", ""),
                 "domain": "",
                 "is_crawled": True,
+                "screenshot_b64": screenshot_b64,  # Add screenshot
             }
             try:
                 from urllib.parse import urlparse
@@ -1091,45 +715,38 @@ class ProcessingPipeline:
 
         return response.choices[0].message, usage
 
-    async def _run_vision_stage(self, user_input: str, images: List[str], model: str, prompt: str) -> Tuple[str, Dict[str, int]]:
-        content_payload: List[Dict[str, Any]] = [{"type": "text", "text": user_input or ""}]
-        for img_b64 in images:
-            url = f"data:image/png;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
-            content_payload.append({"type": "image_url", "image_url": {"url": url}})
 
-        client = self._client_for(
-            api_key=getattr(self.config, "vision_api_key", None),
-            base_url=getattr(self.config, "vision_base_url", None),
-        )
-        response, usage = await self._safe_llm_call(
-            messages=[{"role": "system", "content": prompt}, {"role": "user", "content": content_payload}],
-            model=model,
-            client=client,
-            extra_body=getattr(self.config, "vision_extra_body", None),
-        )
-        return (response.content or "").strip(), usage
 
     async def _run_instruct_stage(
-        self, user_input: str, vision_text: str, model: str
+        self, user_input: str, images: List[str] = None, model: str = None
     ) -> Tuple[str, List[str], Dict[str, Any], Dict[str, int], float]:
-        """Returns (instruct_text, search_payloads, trace_dict, usage_dict, search_time)."""
-        # Instruct has access to: web_search, image_search, set_mode, crawl_page, refuse_answer
-        tools = [self.web_search_tool, self.image_search_tool, self.set_mode_tool, self.crawl_page_tool, self.refuse_answer_tool]
-        tools_desc = "- internal_web_search: 搜索文本\n- internal_image_search: 搜索图片\n- crawl_page: 获取网页内容\n- set_mode: 设定standard/agent模式\n- refuse_answer: 拒绝回答(敏感/违规内容)"
+        """Returns (instruct_text, search_payloads, trace_dict, usage_dict, search_time).
+
+        Images are now passed directly here (merged vision stage).
+        """
+        # Instruct has access to: web_search, crawl_page, refuse_answer
+        tools = [self.web_search_tool, self.crawl_page_tool, self.refuse_answer_tool]
+        tools_desc = "- internal_web_search: 搜索文本\n- crawl_page: 获取网页内容\n- refuse_answer: 拒绝回答(敏感/违规内容)"
 
         prompt = INSTRUCT_SP.format(user_msgs=user_input or "", tools_desc=tools_desc)
-
-        if vision_text:
-            prompt = f"{prompt}\\n\\n{INSTRUCT_SP_VISION_ADD.format(vision_msgs=vision_text)}"
 
         client = self._client_for(
             api_key=getattr(self.config, "instruct_api_key", None),
             base_url=getattr(self.config, "instruct_base_url", None),
         )
 
+        # Build user content - multimodal if images provided
+        if images:
+            user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_input or "..."}]
+            for img_b64 in images:
+                url = f"data:image/png;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+                user_content.append({"type": "image_url", "image_url": {"url": url}})
+        else:
+            user_content = user_input or "..."
+
         history: List[Dict[str, Any]] = [
             {"role": "system", "content": prompt},
-            {"role": "user", "content": user_input or "..."},
+            {"role": "user", "content": user_content},
         ]
 
         response, usage = await self._safe_llm_call(
@@ -1147,15 +764,14 @@ class ProcessingPipeline:
             "base_url": getattr(self.config, "instruct_base_url", None) or self.config.base_url,
             "prompt": prompt,
             "user_input": user_input or "",
-            "vision_add": vision_text or "",
+            "has_images": bool(images),
+            "images_count": len(images) if images else 0,
             "tool_calls": [],
             "tool_results": [],
             "output": "",
         }
 
         search_time = 0.0
-        mode = "standard"
-        mode_reason = ""
 
         if response.tool_calls:
             plan_dict = response.model_dump() if hasattr(response, "model_dump") else response
@@ -1177,27 +793,262 @@ class ProcessingPipeline:
 
                 if tc.function.name in ["web_search", "internal_web_search"]:
                     search_payloads.append(str(result))
-                elif tc.function.name == "set_mode":
-                    try:
-                        args = json.loads(html.unescape(tc.function.arguments))
-                    except Exception:
-                        args = {}
-                    mode = args.get("mode", mode)
-                    mode_reason = args.get("reason", "")
-
-            instruct_trace["mode"] = mode
-            if mode_reason:
-                instruct_trace["mode_reason"] = mode_reason
-
+
             instruct_trace["output"] = ""
             instruct_trace["usage"] = usage
             return "", search_payloads, instruct_trace, usage, search_time
 
-        instruct_trace["mode"] = mode
         instruct_trace["output"] = (response.content or "").strip()
         instruct_trace["usage"] = usage
         return "", search_payloads, instruct_trace, usage, 0.0
 
+    async def _run_auto_fetch_with_screenshots(self, urls: List[str]):
+        """
+        Automatically fetch URLs and generate screenshots of their content.
+        Stops after getting the first 5 successful results (fastest wins).
+        Screenshots are stored as base64 in the cached items.
+        """
+        if not urls:
+            return
+
+        # Get config
+        fetch_timeout = float(getattr(self.config, "fetch_timeout", 15.0))
+        max_results = int(getattr(self.config, "fetch_max_results", 5))
+
+        async def _fetch_and_screenshot(url: str):
+            try:
+                # Fetch page content
+                result_dict = await self.search_service.fetch_page(url)
+
+                self.global_id_counter += 1
+
+                # Generate screenshot from page content
+                screenshot_b64 = await self._render_page_screenshot(
+                    title=result_dict.get("title", "Page"),
+                    url=url,
+                    content=result_dict.get("content", "")[:4000]  # Limit content for screenshot
+                )
+
+                cached_item = {
+                    "_id": self.global_id_counter,
+                    "_type": "page",
+                    "title": result_dict.get("title", "Page"),
+                    "url": result_dict.get("url", url),
+                    "content": result_dict.get("content", ""),
+                    "images": result_dict.get("images", []),
+                    "domain": "",
+                    "is_crawled": True,
+                    "screenshot_b64": screenshot_b64,
+                }
+                try:
+                    from urllib.parse import urlparse
+                    cached_item["domain"] = urlparse(url).netloc
+                except:
+                    pass
+
+                return cached_item
+            except Exception as e:
+                logger.error(f"Failed to fetch/screenshot {url}: {e}")
+                return None
+
+        async def _fetch_with_timeout(url: str):
+            """Wrapper to apply timeout to each fetch operation."""
+            try:
+                return await asyncio.wait_for(_fetch_and_screenshot(url), timeout=fetch_timeout)
+            except asyncio.TimeoutError:
+                logger.warning(f"Fetch timeout ({fetch_timeout}s) exceeded for: {url}")
+                return None
+
+        # Create tasks for all URLs (track url -> task mapping)
+        url_to_task = {url: asyncio.create_task(_fetch_with_timeout(url)) for url in urls}
+        tasks = list(url_to_task.values())
+        first_url = urls[0] if urls else None
+        first_task = url_to_task.get(first_url) if first_url else None
+
+        # Collect first N successful results (fastest wins)
+        collected_results = {}  # url -> result
+        successful_count = 0
+        for coro in asyncio.as_completed(tasks):
+            try:
+                result = await coro
+                if result:
+                    # Find which URL this result belongs to
+                    result_url = result.get("url", "")
+                    collected_results[result_url] = result
+                    successful_count += 1
+                    # Only break if we have enough AND first URL is done (or failed)
+                    first_done = first_url in collected_results or (first_task and first_task.done())
+                    if successful_count >= max_results and first_done:
+                        logger.info(f"Got {max_results} successful results, cancelling remaining tasks")
+                        break
+            except Exception as e:
+                logger.warning(f"Fetch task failed: {e}")
+
+        # Ensure first URL task completes (if not already) before cancelling others
+        if first_task and not first_task.done():
+            logger.info("Waiting for first URL to complete...")
+            try:
+                result = await first_task
+                if result:
+                    collected_results[result.get("url", first_url)] = result
+            except Exception as e:
+                logger.warning(f"First URL fetch failed: {e}")
+
+        # Cancel remaining tasks
+        for task in tasks:
+            if not task.done():
+                task.cancel()
+
+        # Wait briefly for cancellation to propagate
+        if any(not t.done() for t in tasks):
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Add results in original URL order (not fetch speed order)
+        for url in urls:
+            if url in collected_results:
+                self.all_web_results.append(collected_results[url])
+
+    async def _render_page_screenshot(self, title: str, url: str, content: str) -> Optional[str]:
+        """
+        Render page content as a simple HTML and take a screenshot.
+        Returns base64 encoded image or None on failure.
+        Images are compressed to reduce LLM payload size.
+        """
+        import base64
+        import tempfile
+
+        try:
+            # Try to use the content renderer if available
+            from .render_vue import ContentRenderer
+
+            # Create a simple markdown representation for screenshot
+            markdown = f"> 来源: {url}\n\n# {title}\n\n{content}"  # Limit content
+
+            # Use temp file for screenshot
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
+                tmp_path = tmp.name
+
+            # Get or create renderer (reuse if possible)
+            if not hasattr(self, '_screenshot_renderer'):
+                self._screenshot_renderer = ContentRenderer(auto_start=True)
+                await self._screenshot_renderer.start(timeout=10000)
+
+            # Await the async render method
+            await self._screenshot_renderer.render(
+                markdown,
+                tmp_path,
+                stats={"total_time": 0},
+                references=[{"title": title, "url": url, "domain": ""}],
+            )
+
+            # Compress image to reduce LLM payload size (~350KB target)
+            img_bytes = await self._compress_image(tmp_path, max_width=600, quality=70)
+
+            # Cleanup
+            import os
+            os.unlink(tmp_path)
+
+            return base64.b64encode(img_bytes).decode("utf-8")
+
+        except Exception as e:
+            logger.warning(f"Failed to render page screenshot: {e}")
+            return None
+
+    async def _compress_image(self, image_path: str, max_width: int = 400, quality: int = 50) -> bytes:
+        """Compress image to reduce size for LLM payload."""
+        from io import BytesIO
+
+        try:
+            from PIL import Image
+
+            def _compress():
+                with Image.open(image_path) as img:
+                    # Calculate new height maintaining aspect ratio
+                    if img.width > max_width:
+                        ratio = max_width / img.width
+                        new_height = int(img.height * ratio)
+                        img = img.resize((max_width, new_height), Image.Resampling.LANCZOS)
+
+                    # Convert to RGB if necessary
+                    if img.mode in ('RGBA', 'P'):
+                        img = img.convert('RGB')
+
+                    # Save to buffer with compression
+                    buffer = BytesIO()
+                    img.save(buffer, format='JPEG', quality=quality, optimize=True)
+                    return buffer.getvalue()
+
+            return await asyncio.to_thread(_compress)
+
+        except ImportError:
+            # PIL not available, return original
+            logger.warning("PIL not available for image compression, using original")
+            with open(image_path, 'rb') as f:
+                return f.read()
+
+    async def _run_summary_stage(
+        self, user_input: str, images: List[str] = None,
+        has_page_screenshots: bool = False, model: str = None
+    ) -> Tuple[str, Dict[str, int], Dict[str, Any]]:
+        """
+        Generate final summary using page screenshots only.
+        Returns (content, usage, trace_info).
+        """
+
+        # Build system prompt
+        try:
+            language_conf = getattr(self.config, "language", "Simplified Chinese")
+            system_prompt = SUMMARY_SP.format(language=language_conf)
+        except Exception:
+            system_prompt = SUMMARY_SP
+
+
+
+        # Build user content - multimodal if images provided
+        if images:
+            user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_input or "..."}]
+            for img_b64 in images:
+                url = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+                user_content.append({"type": "image_url", "image_url": {"url": url}})
+        else:
+            user_content = user_input or "..."
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_content}
+        ]
+
+        client = self._client_for(
+            api_key=getattr(self.config, "summary_api_key", None),
+            base_url=getattr(self.config, "summary_base_url", None)
+        )
+
+        response, usage = await self._safe_llm_call(
+            messages=messages,
+            model=model,
+            client=client,
+            extra_body=getattr(self.config, "summary_extra_body", None)
+        )
+
+        return (response.content or "").strip(), usage, {"prompt": system_prompt}
+
+    def _format_fetch_msgs(self) -> str:
+        """Format crawled page content for Summary prompt."""
+        if not self.all_web_results:
+            return ""
+
+        lines = []
+        for res in self.all_web_results:
+            if res.get("_type") != "page": continue
+            idx = res.get("_id")
+            title = (res.get("title", "") or "").strip()
+            url = res.get("url", "")
+            content = (res.get("content", "") or "").strip()
+            # Truncate content if too long? For now keep it full or rely on model context
+            lines.append(f"Title: {title}\nURL: {url}\nContent:\n{content}\n")
+
+        return "\n".join(lines)
+
     def _format_search_msgs(self) -> str:
         """Format search snippets only (not crawled pages)."""
         if not self.all_web_results:
@@ -1264,23 +1115,13 @@ class ProcessingPipeline:
         parts: List[str] = []
         parts.append("# Pipeline Trace\n")
 
-        if trace.get("vision"):
-            v = trace["vision"]
-            parts.append("## Vision\n")
-            parts.append(f"- model: `{v.get('model')}`")
-            parts.append(f"- base_url: `{v.get('base_url')}`")
-            parts.append(f"- images_count: `{v.get('images_count')}`\n")
-            parts.append("### Prompt\n")
-            parts.append(fence("text", v.get("prompt", "")))
-            parts.append("\n### Output\n")
-            parts.append(fence("text", v.get("output", "")))
-            parts.append("")
-
         if trace.get("instruct"):
             t = trace["instruct"]
             parts.append("## Instruct\n")
             parts.append(f"- model: `{t.get('model')}`")
-            parts.append(f"- base_url: `{t.get('base_url')}`\n")
+            parts.append(f"- base_url: `{t.get('base_url')}`")
+            parts.append(f"- has_images: `{t.get('has_images', False)}`")
+            parts.append(f"- images_count: `{t.get('images_count', 0)}`\n")
             parts.append("### Prompt\n")
             parts.append(fence("text", t.get("prompt", "")))
             if t.get("tool_calls"):
@@ -1293,20 +1134,79 @@ class ProcessingPipeline:
             parts.append(fence("text", t.get("output", "")))
             parts.append("")
 
-        if trace.get("agent"):
-            a = trace["agent"]
-            parts.append("## Agent\n")
-            parts.append(f"- model: `{a.get('model')}`")
-            parts.append(f"- base_url: `{a.get('base_url')}`\n")
+        if trace.get("fetch"):
+            f = trace["fetch"]
+            parts.append("## Auto-Fetch\n")
+            parts.append(f"- urls_fetched: `{f.get('urls_fetched', [])}`")
+            parts.append(f"- screenshots_count: `{f.get('screenshots_count', 0)}`\n")
+            parts.append("")
+
+        if trace.get("summary"):
+            s = trace["summary"]
+            parts.append("## Summary\n")
+            parts.append(f"- model: `{s.get('model')}`\n")
             parts.append("### System Prompt\n")
-            parts.append(fence("text", a.get("system_prompt", "")))
-            parts.append("\n### Steps\n")
-            parts.append(fence("json", json.dumps(a.get("steps", []), ensure_ascii=False, indent=2)))
-            parts.append("\n### Final Output\n")
-            parts.append(fence("text", a.get("final_output", "")))
+            parts.append(fence("text", s.get("system_prompt", "")))
+            parts.append("\n### Output\n")
+            parts.append(fence("text", s.get("output", "")))
+            parts.append("")
 
         return "\n".join(parts).strip() + "\n"
 
+    async def _cache_run_async(self, cache_data: Dict[str, Any]):
+        """
+        Async background task to cache run data (trace, screenshots) to a folder.
+        Saves to data/conversations/{timestamp}_{query}/
+        This runs after the response is sent, so it doesn't block the main pipeline.
+        """
+        import base64
+        from datetime import datetime
+        from pathlib import Path
+
+        try:
+            # Create cache directory: data/conversations/{timestamp}_{query}/
+            cache_base = Path(getattr(self.config, "conversations_dir", "data/conversations"))
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            user_input_short = (cache_data.get("user_input", "query") or "query")[:20]
+            # Clean filename
+            user_input_short = "".join(c if c.isalnum() or c in "._-" else "_" for c in user_input_short)
+            cache_dir = cache_base / f"{timestamp}_{user_input_short}"
+            cache_dir.mkdir(parents=True, exist_ok=True)
+
+            # Save conversation markdown (includes trace and response)
+            conversation_md = f"""# {cache_data.get("user_input", "Query")}
+
+## Response
+
+{cache_data.get("final_content", "")}
+
+---
+
+## Trace
+
+{cache_data.get("trace_markdown", "")}
+"""
+            conv_path = cache_dir / "conversation.md"
+            await asyncio.to_thread(
+                conv_path.write_text,
+                conversation_md,
+                encoding="utf-8"
+            )
+
+            # Save page screenshots
+            screenshots = cache_data.get("page_screenshots", [])
+            for i, screenshot_b64 in enumerate(screenshots):
+                if screenshot_b64:
+                    screenshot_path = cache_dir / f"page_{i+1}.jpg"
+                    img_bytes = base64.b64decode(screenshot_b64)
+                    await asyncio.to_thread(screenshot_path.write_bytes, img_bytes)
+
+            logger.debug(f"Conversation cached to: {cache_dir}")
+
+        except Exception as e:
+            # Don't fail silently but also don't crash the pipeline
+            logger.warning(f"Failed to cache conversation: {e}")
+
     async def close(self):
         try:
             await self.search_service.close()
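
The auto-fetch stage added in 4.0.0rc2 races all candidate URLs with a per-URL timeout and keeps only the fastest successful fetches, then reports them in the original URL order rather than completion order. A minimal sketch of that pattern, assuming a caller-supplied `fetch(url)` coroutine returning a dict with a `"url"` key (names here are illustrative, not the plugin's API):

```python
import asyncio
from typing import Any, Callable, Coroutine, Dict, List, Optional

async def fetch_fastest(
    urls: List[str],
    fetch: Callable[[str], Coroutine[Any, Any, Dict[str, Any]]],
    timeout: float = 15.0,     # per-URL timeout, analogous to fetch_timeout
    max_results: int = 5,      # keep the fastest N, analogous to fetch_max_results
) -> List[Dict[str, Any]]:
    """Race all URLs; keep the first max_results successes, in original URL order."""

    async def guarded(url: str) -> Optional[Dict[str, Any]]:
        try:
            return await asyncio.wait_for(fetch(url), timeout=timeout)
        except Exception:
            return None  # timeouts and fetch errors simply drop out of the race

    tasks = [asyncio.create_task(guarded(u)) for u in urls]
    collected: Dict[str, Dict[str, Any]] = {}
    for fut in asyncio.as_completed(tasks):
        result = await fut
        if result and result.get("url"):
            collected[result["url"]] = result
        if len(collected) >= max_results:
            break

    for t in tasks:  # cancel any stragglers still running
        if not t.done():
            t.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)

    return [collected[u] for u in urls if u in collected]
```

The released `_run_auto_fetch_with_screenshots` additionally waits for the first URL to finish before cancelling, renders each fetched page to a compressed screenshot, and appends the results to `self.all_web_results`.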