entari-plugin-hyw 3.5.0rc7__py3-none-any.whl → 4.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic.

@@ -12,14 +12,8 @@ from openai import AsyncOpenAI
 from .search import SearchService
 from .image_cache import get_cached_images
 from .prompts import (
-    AGENT_SP,
-    AGENT_SP_INSTRUCT_VISION_ADD,
-    AGENT_SP_TOOLS_STANDARD_ADD,
-    AGENT_SP_TOOLS_AGENT_ADD,
-    AGENT_SP_SEARCH_ADD,
+    SUMMARY_SP,
     INSTRUCT_SP,
-    INSTRUCT_SP_VISION_ADD,
-    VISION_SP,
 )
 
 @asynccontextmanager
@@ -42,6 +36,7 @@ class ProcessingPipeline:
         self.global_id_counter = 0
         # Background tasks for async image search (not blocking agent)
         self._image_search_tasks: List[asyncio.Task] = []
+        self._search_error: Optional[str] = None  # Track critical search errors
 
         self.web_search_tool = {
             "type": "function",
@@ -55,33 +50,6 @@ class ProcessingPipeline:
                 },
             },
         }
-        self.image_search_tool = {
-            "type": "function",
-            "function": {
-                "name": "internal_image_search",
-                "description": "Search for images related to a query.",
-                "parameters": {
-                    "type": "object",
-                    "properties": {"query": {"type": "string"}},
-                    "required": ["query"],
-                },
-            },
-        }
-        self.set_mode_tool = {
-            "type": "function",
-            "function": {
-                "name": "set_mode",
-                "description": "设定后续 Agent 的运行模式: standard | agent",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "mode": {"type": "string", "enum": ["standard", "agent"]},
-                        "reason": {"type": "string"},
-                    },
-                    "required": ["mode"],
-                },
-            },
-        }
         self.crawl_page_tool = {
             "type": "function",
             "function": {
@@ -104,7 +72,7 @@ class ProcessingPipeline:
                 "parameters": {
                     "type": "object",
                     "properties": {
-                        "reason": {"type": "string", "description": "拒绝回答的原因(内部记录,不展示给用户)"},
+                        "reason": {"type": "string", "description": "拒绝回答的原因(展示给用户)"},
                     },
                     "required": [],
                 },
@@ -124,669 +92,305 @@ class ProcessingPipeline:
124
92
  selected_vision_model: str = None,
125
93
  ) -> Dict[str, Any]:
126
94
  """
127
- 1) Vision: summarize images once (no image persistence).
128
- 2) Instruct: run web_search and decide whether to grant Playwright MCP tools.
129
- 3) Agent: normally no tools; if granted, allow Playwright MCP tools (max 6 rounds; step 5 nudge, step 6 forced).
95
+ New Pipeline Flow:
96
+ 1) Instruct: Images go directly here, decides web_search/crawl_page/refuse.
97
+ 2) Auto-Fetch: Automatically fetch first 4 search result pages.
98
+ 3) Screenshot: Render fetched pages as screenshots.
99
+ 4) Summary: Receives user images + page screenshots for final answer.
130
100
  """
131
101
  start_time = time.time()
132
102
  stats = {"start_time": start_time, "tool_calls_count": 0}
133
- # Token usage tracking for billing
134
103
  usage_totals = {"input_tokens": 0, "output_tokens": 0}
135
104
  active_model = model_name or self.config.model_name
136
105
 
137
106
  current_history = conversation_history
138
- final_response_content = ""
139
- structured: Dict[str, Any] = {}
140
-
141
- # Reset search cache and ID counter for this execution
107
+ # Reset globals
142
108
  self.all_web_results = []
143
109
  self.global_id_counter = 0
144
- # Reset refuse_answer flag
145
110
  self._should_refuse = False
146
111
  self._refuse_reason = ""
112
+ self._image_search_tasks = []
147
113
 
148
114
  try:
149
115
  logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
150
-
116
+
151
117
  trace: Dict[str, Any] = {
152
- "vision": None,
153
118
  "instruct": None,
154
- "agent": None,
119
+ "search": None,
120
+ "fetch": None,
121
+ "summary": None,
155
122
  }
156
123
 
157
- # Vision stage
158
- vision_text = ""
159
- vision_start = time.time()
160
- vision_time = 0
161
- vision_cost = 0.0
162
- vision_usage = {}
163
- if images:
164
- vision_model = (
165
- selected_vision_model
166
- or vision_model_name
167
- or getattr(self.config, "vision_model_name", None)
168
- or active_model
169
- )
170
- vision_prompt = VISION_SP.format(user_msgs=user_input or "[图片]")
171
- vision_text, vision_usage = await self._run_vision_stage(
172
- user_input=user_input,
173
- images=images,
174
- model=vision_model,
175
- prompt=vision_prompt,
176
- )
177
- # Add vision usage with vision-specific pricing
178
- usage_totals["input_tokens"] += vision_usage.get("input_tokens", 0)
179
- usage_totals["output_tokens"] += vision_usage.get("output_tokens", 0)
180
-
181
- # Calculate Vision Cost
182
- v_in_price = float(getattr(self.config, "vision_input_price", None) or getattr(self.config, "input_price", 0.0) or 0.0)
183
- v_out_price = float(getattr(self.config, "vision_output_price", None) or getattr(self.config, "output_price", 0.0) or 0.0)
184
- if v_in_price > 0 or v_out_price > 0:
185
- vision_cost = (vision_usage.get("input_tokens", 0) / 1_000_000 * v_in_price) + (vision_usage.get("output_tokens", 0) / 1_000_000 * v_out_price)
186
-
187
- vision_time = time.time() - vision_start
188
-
189
- trace["vision"] = {
190
- "model": vision_model,
191
- "base_url": getattr(self.config, "vision_base_url", None) or self.config.base_url,
192
- "prompt": vision_prompt,
193
- "user_input": user_input or "",
194
- "images_count": len(images or []),
195
- "output": vision_text,
196
- "usage": vision_usage,
197
- "time": vision_time,
198
- "cost": vision_cost
199
- }
200
-
201
- # Instruct + pre-search
124
+ # --- 1. Instruct Stage (with images if provided) ---
202
125
  instruct_start = time.time()
203
126
  instruct_model = getattr(self.config, "instruct_model_name", None) or active_model
204
- logger.info(f"Instruct Stage Config: instruct_model_name={getattr(self.config, 'instruct_model_name', None)}, active_model={active_model}, using: {instruct_model}")
205
127
  instruct_text, search_payloads, instruct_trace, instruct_usage, search_time = await self._run_instruct_stage(
206
128
  user_input=user_input,
207
- vision_text=vision_text,
129
+ images=images, # Pass images directly to instruct
208
130
  model=instruct_model,
209
131
  )
210
- # Instruct time excludes search time (search_time is returned separately)
211
- instruct_time = time.time() - instruct_start - search_time
212
132
 
213
- # Calculate Instruct Cost
133
+ # Check refuse
134
+ if self._should_refuse:
135
+ return {
136
+ "llm_response": "",
137
+ "structured_response": {},
138
+ "stats": stats,
139
+ "model_used": active_model,
140
+ "conversation_history": current_history,
141
+ "refuse_answer": True,
142
+ "refuse_reason": self._refuse_reason
143
+ }
144
+
145
+ # Check for critical search errors
146
+ if self._search_error:
147
+ return {
148
+ "llm_response": "",
149
+ "structured_response": {},
150
+ "stats": stats,
151
+ "model_used": active_model,
152
+ "conversation_history": current_history,
153
+ "refuse_answer": True,
154
+ "refuse_reason": f"搜索服务异常: {self._search_error} 请联系管理员。"
155
+ }
156
+
157
+ usage_totals["input_tokens"] += instruct_usage.get("input_tokens", 0)
158
+ usage_totals["output_tokens"] += instruct_usage.get("output_tokens", 0)
159
+
214
160
  instruct_cost = 0.0
215
161
  i_in_price = float(getattr(self.config, "instruct_input_price", None) or getattr(self.config, "input_price", 0.0) or 0.0)
216
162
  i_out_price = float(getattr(self.config, "instruct_output_price", None) or getattr(self.config, "output_price", 0.0) or 0.0)
217
163
  if i_in_price > 0 or i_out_price > 0:
218
164
  instruct_cost = (instruct_usage.get("input_tokens", 0) / 1_000_000 * i_in_price) + (instruct_usage.get("output_tokens", 0) / 1_000_000 * i_out_price)
219
165
 
220
- # Add instruct usage
221
- usage_totals["input_tokens"] += instruct_usage.get("input_tokens", 0)
222
- usage_totals["output_tokens"] += instruct_usage.get("output_tokens", 0)
223
-
224
- instruct_trace["time"] = instruct_time
225
166
  instruct_trace["cost"] = instruct_cost
226
167
  trace["instruct"] = instruct_trace
227
168
 
228
- # Check if refuse_answer was called - terminate early
229
- if self._should_refuse:
230
- logger.info(f"Pipeline: refuse_answer triggered. Reason: {self._refuse_reason}")
231
- stats["total_time"] = time.time() - start_time
232
- return {
233
- "llm_response": "",
234
- "structured_response": {},
235
- "stats": stats,
236
- "model_used": active_model,
237
- "conversation_history": current_history,
238
- "refuse_answer": True,
239
- "refuse_reason": self._refuse_reason,
240
- "stages_used": [],
241
- }
242
-
243
- # Start agent loop
244
- agent_start_time = time.time()
245
- current_history.append({"role": "user", "content": user_input or "..."})
246
-
247
- mode = instruct_trace.get("mode", self.current_mode).lower()
248
- logger.success(f"Instruct Mode: {mode}")
249
- self.current_mode = mode
169
+ # --- 2. Auto-Fetch Stage (Automatically fetch first 4 search results) ---
170
+ fetch_start = time.time()
171
+ fetch_trace = {}
172
+ page_screenshots: List[str] = [] # Base64 screenshots of fetched pages
250
173
 
251
- # Determine max iterations
252
- max_steps = 10 if mode == "agent" else 1
253
-
254
- step = 0
255
- agent_trace_steps: List[Dict[str, Any]] = []
256
- last_system_prompt = ""
257
-
258
- agent_tools: Optional[List[Dict[str, Any]]] = None
259
- if mode == "agent":
260
- agent_tools = [self.web_search_tool, self.image_search_tool, self.crawl_page_tool]
261
-
262
- # Agent loop
263
- while step < max_steps:
264
- step += 1
265
- logger.info(f"Pipeline: Agent step {step}/{max_steps}")
266
-
267
- if step == 5 and mode == "agent":
268
- current_history.append(
269
- {
270
- "role": "system",
271
- "content": "System: [Next Step Final] Please start consolidating the answer; the next step must be the final response.",
272
- }
273
- )
274
-
275
- tools_desc = ""
276
- if agent_tools:
277
- tools_desc = "\n".join([
278
- "- internal_web_search(query): 触发搜索并缓存结果",
279
- "- crawl_page(url): 使用 Crawl4AI 抓取网页返回 Markdown"
280
- ])
281
-
282
- user_msgs_text = user_input or ""
283
-
284
- search_msgs_text = self._format_search_msgs()
285
- # Image search results are NOT passed to LLM - they're for UI rendering only
174
+ fetch_urls = []
175
+ search_items = [r for r in self.all_web_results if r.get("_type") == "search"]
176
+ if search_items:
177
+ # Group search results by query
178
+ query_groups = {}
179
+ for r in search_items:
180
+ q = r.get("query", "default")
181
+ if q not in query_groups:
182
+ query_groups[q] = []
183
+ query_groups[q].append(r)
286
184
 
287
- has_search_results = any(r.get("_type") == "search" for r in self.all_web_results)
288
- has_image_results = any(r.get("_type") == "image" for r in self.all_web_results) # For UI rendering only
289
-
290
- # Build agent system prompt
291
- mode_desc_text = AGENT_SP_TOOLS_AGENT_ADD.format(tools_desc=tools_desc) if mode == "agent" else AGENT_SP_TOOLS_STANDARD_ADD
292
- system_prompt = AGENT_SP.format(
293
- user_msgs=user_msgs_text,
294
- mode=mode,
295
- mode_desc=mode_desc_text,
296
- language=getattr(self.config, "language", "Simplified Chinese")[:128]
297
- )
185
+ raw_fetch_urls = []
186
+ # If multiple queries, take top 3 from each
187
+ if len(query_groups) > 1:
188
+ logger.info(f"Pipeline: Multiple search queries detected ({len(query_groups)}). Taking top 3 from each.")
189
+ for q, items in query_groups.items():
190
+ for item in items[:3]:
191
+ if item.get("url"):
192
+ raw_fetch_urls.append(item.get("url"))
193
+ else:
194
+ # Single query, take top 8
195
+ raw_fetch_urls = [r.get("url") for r in search_items[:8] if r.get("url")]
298
196
 
299
- # Append vision text if available
300
- if vision_text:
301
- system_prompt += AGENT_SP_INSTRUCT_VISION_ADD.format(vision_msgs=vision_text)
197
+ # Deduplicate while preserving order and filter blocked domains
198
+ final_fetch_urls = []
199
+ blocked_domains = getattr(self.config, "fetch_blocked_domains", ["wikipedia.org", "csdn.net", "sohu.com", "sogou.com"])
200
+ if isinstance(blocked_domains, str):
201
+ blocked_domains = [d.strip() for d in blocked_domains.split(",")]
302
202
 
303
- # Append search results (text and page only, NOT images)
304
- page_msgs_text = self._format_page_msgs()
305
- all_search_parts = []
306
- if has_search_results and search_msgs_text:
307
- all_search_parts.append(search_msgs_text)
308
- if page_msgs_text:
309
- all_search_parts.append(page_msgs_text)
310
- # Images are excluded from LLM prompt - they're for UI rendering only
203
+ for url in raw_fetch_urls:
204
+ if url and url not in final_fetch_urls:
205
+ # Check blocklist
206
+ if any(domain in url.lower() for domain in blocked_domains):
207
+ continue
208
+ final_fetch_urls.append(url)
311
209
 
312
- if all_search_parts:
313
- system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs="\n".join(all_search_parts))
314
-
315
- last_system_prompt = system_prompt
210
+ fetch_urls = final_fetch_urls
316
211
 
317
- messages = [{"role": "system", "content": system_prompt}]
318
- messages.extend(current_history)
319
-
320
- tools_for_step = agent_tools if (agent_tools and step < max_steps) else None
321
-
322
- # Debug logging
323
- if tools_for_step:
324
- logger.info(f"[Agent] Tools provided: {[t['function']['name'] for t in tools_for_step]}")
325
- else:
326
- logger.warning(f"[Agent] NO TOOLS provided for step {step} (agent_tools={agent_tools is not None}, step<max={step < max_steps})")
327
-
328
- step_llm_start = time.time()
329
- response, step_usage = await self._safe_llm_call(
330
- messages=messages,
331
- model=active_model,
332
- tools=tools_for_step,
333
- tool_choice="auto" if tools_for_step else None,
334
- extra_body=self.config.extra_body,
335
- )
336
- step_llm_time = time.time() - step_llm_start
337
-
338
- # Debug: Check response
339
- has_tool_calls = response.tool_calls is not None and len(response.tool_calls) > 0
340
- logger.info(f"[Agent] Response has_tool_calls={has_tool_calls}, has_content={bool(response.content)}")
212
+ # Check if search was performed but no URLs were available for fetching
213
+ has_search_call = False
214
+ if instruct_trace and "tool_calls" in instruct_trace:
215
+ has_search_call = any(tc.get("name") in ["web_search", "internal_web_search"] for tc in instruct_trace["tool_calls"])
216
+
217
+ if has_search_call and not fetch_urls:
218
+ return {
219
+ "llm_response": "",
220
+ "structured_response": {},
221
+ "stats": stats,
222
+ "model_used": active_model,
223
+ "conversation_history": current_history,
224
+ "refuse_answer": True,
225
+ "refuse_reason": "搜索结果为空或全部被过滤,无法生成回答。"
226
+ }
227
+
228
+ if fetch_urls:
229
+ logger.info(f"Pipeline: Auto-fetching up to {len(fetch_urls)} pages (keeping fastest 5): {fetch_urls}")
341
230
 
342
- # Accumulate agent usage
343
- usage_totals["input_tokens"] += step_usage.get("input_tokens", 0)
344
- usage_totals["output_tokens"] += step_usage.get("output_tokens", 0)
345
-
346
- if response.tool_calls and tools_for_step:
347
- tool_calls = response.tool_calls
348
- stats["tool_calls_count"] += len(tool_calls)
349
-
350
- # Use model_dump to preserve provider-specific fields (e.g., Gemini's thought_signature)
351
- assistant_msg = response.model_dump(exclude_unset=True) if hasattr(response, "model_dump") else {
352
- "role": "assistant",
353
- "content": response.content,
354
- "tool_calls": [{"id": tc.id, "type": "function", "function": {"name": tc.function.name, "arguments": tc.function.arguments}} for tc in tool_calls]
355
- }
356
- current_history.append(assistant_msg)
357
-
358
- tasks = [self._safe_route_tool(tc) for tc in tool_calls]
359
- tool_start_time = time.time()
360
- results = await asyncio.gather(*tasks)
361
- tool_exec_time = time.time() - tool_start_time
362
-
363
- step_trace = {
364
- "step": step,
365
- "tool_calls": [self._tool_call_to_trace(tc) for tc in tool_calls],
366
- "tool_results": [],
367
- "tool_time": tool_exec_time,
368
- "llm_time": step_llm_time,
369
- "usage": step_usage,
370
- }
371
- for i, result in enumerate(results):
372
- tc = tool_calls[i]
373
- step_trace["tool_results"].append({"name": tc.function.name, "content": str(result)})
374
- current_history.append(
375
- {
376
- "tool_call_id": tc.id,
377
- "role": "tool",
378
- "name": tc.function.name,
379
- "content": str(result),
380
- }
381
- )
382
- agent_trace_steps.append(step_trace)
383
- continue
384
-
385
- final_response_content = response.content or ""
386
- current_history.append({"role": "assistant", "content": final_response_content})
387
- agent_trace_steps.append({
388
- "step": step,
389
- "final": True,
390
- "output": final_response_content,
391
- "llm_time": step_llm_time,
392
- "usage": step_usage
393
- })
394
- break
231
+ # Execute fetch and get screenshots
232
+ await self._run_auto_fetch_with_screenshots(fetch_urls)
395
233
 
396
- if not final_response_content:
397
- final_response_content = "执行结束,但未生成内容。"
234
+ fetch_trace = {
235
+ "model": "Auto",
236
+ "urls_fetched": fetch_urls,
237
+ "time": time.time() - fetch_start,
238
+ "cost": 0.0,
239
+ }
240
+ trace["fetch"] = fetch_trace
398
241
 
399
- structured = self._parse_tagged_response(final_response_content)
400
- final_content = structured.get("response") or final_response_content
242
+ # Always collect screenshots from ALL page results (search auto-fetch + direct URL crawl)
243
+ fetch_items = [r for r in self.all_web_results if r.get("_type") == "page"]
244
+ for r in fetch_items:
245
+ if r.get("screenshot_b64"):
246
+ page_screenshots.append(r["screenshot_b64"])
247
+
248
+ if fetch_trace:
249
+ fetch_trace["screenshots_count"] = len(page_screenshots)
401
250
 
402
- agent_time = time.time() - agent_start_time
251
+ # --- 3. Summary Stage (with user images + page screenshots only) ---
252
+ summary_start = time.time()
253
+ summary_model = active_model
254
+
255
+ # Combine user images and page screenshots for summary
256
+ all_summary_images: List[str] = []
257
+ if images:
258
+ all_summary_images.extend(images)
259
+ all_summary_images.extend(page_screenshots)
403
260
 
404
- # Calculate Agent Cost
405
- agent_cost = 0.0
406
- a_in_price = float(getattr(self.config, "input_price", 0.0) or 0.0)
407
- a_out_price = float(getattr(self.config, "output_price", 0.0) or 0.0)
261
+ summary_content, summary_usage, summary_trace_info = await self._run_summary_stage(
262
+ user_input=user_input,
263
+ images=all_summary_images if all_summary_images else None,
264
+ has_page_screenshots=bool(page_screenshots),
265
+ model=summary_model
266
+ )
408
267
 
409
- agent_input_tokens = usage_totals["input_tokens"] - vision_usage.get("input_tokens", 0) - instruct_usage.get("input_tokens", 0)
410
- agent_output_tokens = usage_totals["output_tokens"] - vision_usage.get("output_tokens", 0) - instruct_usage.get("output_tokens", 0)
268
+ usage_totals["input_tokens"] += summary_usage.get("input_tokens", 0)
269
+ usage_totals["output_tokens"] += summary_usage.get("output_tokens", 0)
411
270
 
412
- if a_in_price > 0 or a_out_price > 0:
413
- agent_cost = (max(0, agent_input_tokens) / 1_000_000 * a_in_price) + (max(0, agent_output_tokens) / 1_000_000 * a_out_price)
414
-
415
- trace["agent"] = {
416
- "model": active_model,
417
- "base_url": self.config.base_url,
418
- "system_prompt": last_system_prompt,
419
- "steps": agent_trace_steps,
420
- "final_output": final_response_content,
421
- "time": agent_time,
422
- "cost": agent_cost
271
+ summary_cost = 0.0
272
+ s_in_price = float(getattr(self.config, "input_price", 0.0) or 0.0)
273
+ s_out_price = float(getattr(self.config, "output_price", 0.0) or 0.0)
274
+ if s_in_price > 0 or s_out_price > 0:
275
+ summary_cost = (summary_usage.get("input_tokens", 0) / 1_000_000 * s_in_price) + (summary_usage.get("output_tokens", 0) / 1_000_000 * s_out_price)
276
+
277
+ trace["summary"] = {
278
+ "model": summary_model,
279
+ "system_prompt": summary_trace_info.get("prompt", ""),
280
+ "output": summary_content,
281
+ "usage": summary_usage,
282
+ "time": time.time() - summary_start,
283
+ "cost": summary_cost,
284
+ "images_count": len(all_summary_images)
423
285
  }
424
- trace_markdown = self._render_trace_markdown(trace)
425
286
 
287
+ # --- Result Assembly ---
426
288
  stats["total_time"] = time.time() - start_time
427
- stats["steps"] = step
428
-
429
- # Calculate billing info correctly by summing up all actual costs
430
- total_cost_sum = vision_cost + instruct_cost
431
- for s in agent_trace_steps:
432
- s_usage = s.get("usage", {})
433
- if s_usage:
434
- s_in_price = float(getattr(self.config, "input_price", 0.0) or 0.0)
435
- s_out_price = float(getattr(self.config, "output_price", 0.0) or 0.0)
436
- total_cost_sum += (s_usage.get("input_tokens", 0) / 1_000_000 * s_in_price) + (s_usage.get("output_tokens", 0) / 1_000_000 * s_out_price)
437
-
289
+ structured = self._parse_tagged_response(summary_content)
290
+ final_content = structured.get("response") or summary_content
291
+
438
292
  billing_info = {
439
293
  "input_tokens": usage_totals["input_tokens"],
440
294
  "output_tokens": usage_totals["output_tokens"],
441
- "total_cost": total_cost_sum,
295
+ "total_cost": instruct_cost + summary_cost
442
296
  }
443
-
444
- # Build stages_used list for UI display
445
- stages_used = []
446
297
 
447
- def infer_icon(model_name: str, base_url: str) -> str:
448
- model_lower = (model_name or "").lower()
449
- url_lower = (base_url or "").lower()
450
- if "deepseek" in model_lower or "deepseek" in url_lower: return "deepseek"
451
- elif "claude" in model_lower or "anthropic" in url_lower: return "anthropic"
452
- elif "gemini" in model_lower or "google" in url_lower: return "google"
453
- elif "gpt" in model_lower or "openai" in url_lower: return "openai"
454
- elif "qwen" in model_lower: return "qwen"
455
- elif "openrouter" in url_lower: return "openrouter"
456
- return "openai"
457
-
458
- def infer_provider(base_url: str) -> str:
459
- url_lower = (base_url or "").lower()
460
- if "openrouter" in url_lower: return "OpenRouter"
461
- elif "openai" in url_lower: return "OpenAI"
462
- elif "anthropic" in url_lower: return "Anthropic"
463
- elif "google" in url_lower: return "Google"
464
- elif "deepseek" in url_lower: return "DeepSeek"
465
- return ""
298
+ # Build stages_used
299
+ stages_used = []
466
300
 
467
- if trace.get("vision"):
468
- v = trace["vision"]
469
- v_model = v.get("model", "")
470
- v_base_url = v.get("base_url", "") or self.config.base_url
471
- stages_used.append({
472
- "name": "Vision",
473
- "model": v_model,
474
- "icon_config": infer_icon(v_model, v_base_url),
475
- "provider": infer_provider(v_base_url),
476
- "time": v.get("time", 0),
477
- "cost": v.get("cost", 0.0)
301
+ # Get page info
302
+ fetch_items = [r for r in self.all_web_results if r.get("_type") == "page"]
303
+ crawled_pages_ui = []
304
+ for r in fetch_items:
305
+ domain = ""
306
+ try:
307
+ from urllib.parse import urlparse
308
+ domain = urlparse(r.get("url", "")).netloc
309
+ except: pass
310
+ crawled_pages_ui.append({
311
+ "title": r.get("title", ""),
312
+ "url": r.get("url", ""),
313
+ "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
478
314
  })
479
315
 
316
+ # Extract images from pages
317
+ extracted_images = []
318
+ seen_imgs = set()
319
+ junk_keywords = ["icon", "logo", "badge", "avatar", "button", "social", "footer", "header", "banner", "license", "by-nc", "hosted_by", "pixel", "tracker", "ad", "ads", "advert", "promotion", "shop", "store", "group", "join", "qr", "qrcode", "weibo", "weixin", "douyin", "xiaohongshu", "bilibili", "official", "follow", "subscribe", "app"]
320
+
321
+ for r in fetch_items:
322
+ if "images" in r:
323
+ for img_url in r["images"]:
324
+ if img_url not in seen_imgs:
325
+ # Filter junk images
326
+ lower_url = img_url.lower()
327
+ if any(k in lower_url for k in junk_keywords):
328
+ continue
329
+
330
+ extracted_images.append({
331
+ "title": r.get("title", "Image"),
332
+ "url": img_url,
333
+ "thumbnail": img_url,
334
+ "domain": r.get("domain", "")
335
+ })
336
+ seen_imgs.add(img_url)
337
+
338
+ # Instruct Stage (with crawled pages and images)
480
339
  if trace.get("instruct"):
481
340
  i = trace["instruct"]
482
- i_model = i.get("model", "")
483
- i_base_url = i.get("base_url", "") or self.config.base_url
341
+ # Total time = instruct + search + fetch (until summary starts)
342
+ instruct_total_time = (i.get("time", 0) or 0) + search_time
343
+ if trace.get("fetch"):
344
+ instruct_total_time += trace["fetch"].get("time", 0)
345
+
484
346
  stages_used.append({
485
347
  "name": "Instruct",
486
- "model": i_model,
487
- "icon_config": infer_icon(i_model, i_base_url),
488
- "provider": infer_provider(i_base_url),
489
- "time": i.get("time", 0),
490
- "cost": i.get("cost", 0.0)
348
+ "model": i.get("model"),
349
+ "icon_config": "openai",
350
+ "provider": "Instruct",
351
+ "time": instruct_total_time,
352
+ "cost": i.get("cost", 0),
353
+ "has_images": bool(images),
354
+ "crawled_pages": crawled_pages_ui, # Add crawled pages here
355
+ "image_references": extracted_images[:9] # Add images here
491
356
  })
492
-
493
- # Show Search stage if we have ANY search results (text OR image)
494
- if (has_search_results or has_image_results) and search_payloads:
495
- # Collect initial search results for the Search stage card
496
- initial_refs = [
497
- {"title": r.get("title", ""), "url": r.get("url", ""), "domain": r.get("domain", "")}
498
- for r in self.all_web_results if r.get("_type") == "search"
499
- ]
500
- initial_images = [
501
- {"title": r.get("title", ""), "url": r.get("url", ""), "thumbnail": r.get("thumbnail", "")}
502
- for r in self.all_web_results if r.get("_type") == "image"
503
- ]
504
-
357
+
358
+ # Summary Stage
359
+ if trace.get("summary"):
360
+ s = trace["summary"]
505
361
  stages_used.append({
506
- "name": "Search",
507
- "model": getattr(self.config, "search_name", "DuckDuckGo"),
508
- "icon_config": "search",
509
- "provider": getattr(self.config, 'search_provider', 'Crawl4AI'),
510
- "time": search_time,
511
- "cost": 0.0,
512
- "references": initial_refs,
513
- "image_references": initial_images
362
+ "name": "Summary",
363
+ "model": s.get("model"),
364
+ "icon_config": "openai",
365
+ "provider": "Summary",
366
+ "time": s.get("time", 0),
367
+ "cost": s.get("cost", 0),
368
+ "images_count": s.get("images_count", 0)
514
369
  })
515
-
516
- # Add Crawler stage if Instruct used crawl_page
517
- if trace.get("instruct"):
518
- instruct_tool_calls = trace["instruct"].get("tool_calls", [])
519
- crawl_calls = [tc for tc in instruct_tool_calls if tc.get("name") == "crawl_page"]
520
- if crawl_calls:
521
- # Build crawled_pages list for UI
522
- crawled_pages = []
523
- for tc in crawl_calls:
524
- url = tc.get("arguments", {}).get("url", "")
525
- # Try to find cached result
526
- found = next((r for r in self.all_web_results if r.get("url") == url and r.get("_type") == "page"), None)
527
- if found:
528
- try:
529
- from urllib.parse import urlparse
530
- domain = urlparse(url).netloc
531
- except:
532
- domain = ""
533
- crawled_pages.append({
534
- "title": found.get("title", "Page"),
535
- "url": url,
536
- "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
537
- })
538
-
539
- stages_used.append({
540
- "name": "Crawler",
541
- "model": "Crawl4AI",
542
- "icon_config": "search",
543
- "provider": "网页抓取",
544
- "time": search_time, # Use existing search_time which includes fetch time
545
- "cost": 0.0,
546
- "crawled_pages": crawled_pages
547
- })
548
-
549
- # --- Granular Agent Stages (Grouped) ---
550
- if trace.get("agent"):
551
- a = trace["agent"]
552
- a_model = a.get("model", "") or active_model
553
- a_base_url = a.get("base_url", "") or self.config.base_url
554
- steps = a.get("steps", [])
555
- agent_icon = infer_icon(a_model, a_base_url)
556
- agent_provider = infer_provider(a_base_url)
557
-
558
- for s in steps:
559
- if "tool_calls" in s:
560
- # 1. Agent Thought Stage (with LLM time)
561
- # Calculate step cost
562
- step_usage = s.get("usage", {})
563
- step_cost = 0.0
564
- if a_in_price > 0 or a_out_price > 0:
565
- step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
566
-
567
- stages_used.append({
568
- "name": "Agent",
569
- "model": a_model,
570
- "icon_config": agent_icon,
571
- "provider": agent_provider,
572
- "time": s.get("llm_time", 0), "cost": step_cost
573
- })
574
-
575
- # 2. Grouped Tool Stages
576
- # Collect results for grouping
577
- search_group_items = []
578
- crawler_group_items = []
579
-
580
- tcs = s.get("tool_calls", [])
581
- trs = s.get("tool_results", [])
582
-
583
- for idx, tc in enumerate(tcs):
584
- t_name = tc.get("name")
585
- # Try to get result content if available
586
- t_res_content = trs[idx].get("content", "") if idx < len(trs) else ""
587
-
588
- if t_name in ["internal_web_search", "web_search", "internal_image_search"]:
589
- # We don't have per-call metadata easily unless we parse the 'result' string (which is JSON dump now for route_tool)
590
- # But search results are cached in self.all_web_results.
591
- # The 'content' of search tool result is basically "cached_for_prompt".
592
- # So we don't need to put items here, just show "Search" container.
593
- # But wait, if we want to show "what was searched", we can parse args.
594
- args = tc.get("arguments", {})
595
- query = args.get("query", "")
596
- if query:
597
- search_group_items.append({"query": query})
598
-
599
- elif t_name == "crawl_page":
600
- # Get URL from arguments, title from result
601
- args = tc.get("arguments", {})
602
- url = args.get("url", "")
603
- title = "Page"
604
- try:
605
- page_data = json.loads(t_res_content)
606
- if isinstance(page_data, dict):
607
- title = page_data.get("title", "Page")
608
- except:
609
- pass
610
-
611
- if url:
612
- try:
613
- domain = urlparse(url).netloc
614
- except:
615
- domain = ""
616
- crawler_group_items.append({
617
- "title": title,
618
- "url": url,
619
- "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
620
- })
621
-
622
- # Append Grouped Stages
623
- if search_group_items:
624
- stages_used.append({
625
- "name": "Search",
626
- "model": getattr(self.config, "search_name", "DuckDuckGo"),
627
- "icon_config": "search",
628
- "provider": "Agent Search",
629
- "time": s.get("tool_time", 0), "cost": 0,
630
- "queries": search_group_items # Render can use this if needed, or just show generic
631
- })
632
-
633
- if crawler_group_items:
634
- stages_used.append({
635
- "name": "Crawler",
636
- "model": "Crawl4AI",
637
- "icon_config": "browser",
638
- "provider": "Page Fetcher",
639
- "time": s.get("tool_time", 0), "cost": 0,
640
- "crawled_pages": crawler_group_items
641
- })
642
370
 
643
- elif s.get("final"):
644
- # Correctly calculate final step cost
645
- step_usage = s.get("usage", {})
646
- step_cost = 0.0
647
- if a_in_price > 0 or a_out_price > 0:
648
- step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
649
-
650
- stages_used.append({
651
- "name": "Agent",
652
- "model": a_model,
653
- "icon_config": agent_icon,
654
- "provider": agent_provider,
655
- "time": s.get("llm_time", 0),
656
- "cost": step_cost
657
- })
658
-
659
- # Assign total time/cost to last Agent stage
660
- # Sum up total time/cost for UI/stats (implicit via loop above)
661
- # No need to assign everything to last agent anymore as we distribute it.
662
-
663
- # --- Final Filter: Only show cited items in workflow cards ---
664
- cited_urls = {ref['url'] for ref in (structured.get("references", []) +
665
- structured.get("page_references", []) +
666
- structured.get("image_references", []))}
667
-
668
- # Find images already rendered in markdown content (to avoid duplicate display)
669
- markdown_image_urls = set()
670
- md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
671
- for match in md_img_pattern.finditer(final_content):
672
- markdown_image_urls.add(match.group(1))
673
-
674
- for s in stages_used:
675
- if "references" in s and s["references"]:
676
- s["references"] = [r for r in s["references"] if r.get("url") in cited_urls]
677
- # Filter out images already shown in markdown content
678
- # Check both url AND thumbnail since either might be used in markdown
679
- if "image_references" in s and s["image_references"]:
680
- s["image_references"] = [
681
- r for r in s["image_references"]
682
- if r.get("url") not in markdown_image_urls and (r.get("thumbnail") or "") not in markdown_image_urls
683
- ]
684
- if "crawled_pages" in s and s["crawled_pages"]:
685
- s["crawled_pages"] = [r for r in s["crawled_pages"] if r.get("url") in cited_urls]
686
-
687
- # Clean up conversation history: Remove tool calls and results to save tokens and avoid ID conflicts
688
- # Keep only 'user' messages and 'assistant' messages without tool_calls (final answers)
689
- cleaned_history = []
690
- for msg in current_history:
691
- if msg.get("role") == "tool":
692
- continue
693
- if msg.get("role") == "assistant" and msg.get("tool_calls"):
694
- continue
695
- cleaned_history.append(msg)
696
-
697
- # Update the reference (since it might be used by caller)
698
- current_history[:] = cleaned_history
699
-
700
- # --- Apply cached images to reduce render time ---
701
- # Collect all image URLs that need caching (avoid duplicates when thumbnail == url)
702
- all_image_urls = set()
703
- for img_ref in structured.get("image_references", []):
704
- if img_ref.get("thumbnail"):
705
- all_image_urls.add(img_ref["thumbnail"])
706
- if img_ref.get("url"):
707
- all_image_urls.add(img_ref["url"])
708
-
709
- for stage in stages_used:
710
- for img_ref in stage.get("image_references", []):
711
- if img_ref.get("thumbnail"):
712
- all_image_urls.add(img_ref["thumbnail"])
713
- if img_ref.get("url"):
714
- all_image_urls.add(img_ref["url"])
715
-
716
- # Also collect image URLs from markdown content
717
- markdown_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
718
- markdown_urls = markdown_img_pattern.findall(final_content)
719
- all_image_urls.update(markdown_urls)
371
+ # Construct final trace markdown
372
+ trace_markdown = self._render_trace_markdown(trace)
720
373
 
721
- # Get cached versions (waits for pending downloads until agent ends)
722
- if all_image_urls:
723
- try:
724
- cached_map = await get_cached_images(list(all_image_urls))
725
-
726
- # Apply cached URLs to structured response
727
- for img_ref in structured.get("image_references", []):
728
- if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
729
- img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
730
- if img_ref.get("url") and img_ref["url"] in cached_map:
731
- img_ref["url"] = cached_map[img_ref["url"]]
732
-
733
- # Apply cached URLs to stages
734
- for stage in stages_used:
735
- for img_ref in stage.get("image_references", []):
736
- if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
737
- img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
738
- if img_ref.get("url") and img_ref["url"] in cached_map:
739
- img_ref["url"] = cached_map[img_ref["url"]]
740
-
741
- # Replace image URLs in markdown content with cached versions
742
- def replace_markdown_img(match):
743
- full_match = match.group(0)
744
- url = match.group(1)
745
- cached_url = cached_map.get(url)
746
- if cached_url and cached_url != url:
747
- return full_match.replace(url, cached_url)
748
- return full_match
749
-
750
- final_content = markdown_img_pattern.sub(replace_markdown_img, final_content)
751
- structured["response"] = markdown_img_pattern.sub(replace_markdown_img, structured.get("response", ""))
752
-
753
- # Log cache stats
754
- from .image_cache import get_image_cache
755
- cache_stats = get_image_cache().get_stats()
756
- logger.info(f"ImageCache stats: {cache_stats}")
757
-
758
- except Exception as e:
759
- logger.warning(f"Failed to apply image cache: {e}")
374
+ # Update history
375
+ current_history.append({"role": "user", "content": user_input or "..."})
376
+ current_history.append({"role": "assistant", "content": final_content})
760
377
 
761
- # Cancel all background image search/download tasks when agent ends
762
- if self._image_search_tasks:
763
- logger.info(f"Cancelling {len(self._image_search_tasks)} background image search tasks")
764
- for task in self._image_search_tasks:
765
- if not task.done():
766
- task.cancel()
767
- # Wait a bit for tasks to handle cancellation gracefully
768
- try:
769
- await asyncio.gather(*self._image_search_tasks, return_exceptions=True)
770
- except Exception:
771
- pass
772
- self._image_search_tasks.clear()
773
-
774
- # Also cancel any pending image downloads in the cache
775
- from .image_cache import get_image_cache
776
- cache = get_image_cache()
777
- if cache._pending:
778
- logger.info(f"Cancelling {len(cache._pending)} pending image downloads")
779
- for task in cache._pending.values():
780
- if not task.done():
781
- task.cancel()
782
- cache._pending.clear()
378
+ # Schedule async cache task (fire and forget - doesn't block return)
379
+ cache_data = {
380
+ "user_input": user_input,
381
+ "trace": trace,
382
+ "trace_markdown": trace_markdown,
383
+ "page_screenshots": page_screenshots,
384
+ "final_content": final_content,
385
+ "stages_used": stages_used,
386
+ }
387
+ asyncio.create_task(self._cache_run_async(cache_data))
783
388
 
784
389
  return {
785
390
  "llm_response": final_content,
786
391
  "structured_response": structured,
787
392
  "stats": stats,
788
393
  "model_used": active_model,
789
- "vision_model_used": (selected_vision_model or getattr(self.config, "vision_model_name", None)) if images else None,
790
394
  "conversation_history": current_history,
791
395
  "trace_markdown": trace_markdown,
792
396
  "billing_info": billing_info,
@@ -799,18 +403,11 @@ class ProcessingPipeline:
             if hasattr(self, '_image_search_tasks') and self._image_search_tasks:
                 for task in self._image_search_tasks:
                     if not task.done(): task.cancel()
-                # Wait briefly for cleanup
-                await asyncio.wait(self._image_search_tasks, timeout=0.1)
-                self._image_search_tasks.clear()
+                try:
+                    await asyncio.wait(self._image_search_tasks, timeout=0.1)
+                except Exception: pass
+                self._image_search_tasks = []
 
-            from .image_cache import get_image_cache
-            cache = get_image_cache()
-            if cache._pending:
-                pending_tasks = list(cache._pending.values())
-                for task in pending_tasks:
-                    if not task.done(): task.cancel()
-                await asyncio.wait(pending_tasks, timeout=0.1)
-                cache._pending.clear()
             return {
                 "llm_response": f"I encountered a critical error: {e}",
                 "stats": stats,
@@ -968,7 +565,26 @@ class ProcessingPipeline:
 
         if name == "internal_web_search" or name == "web_search":
             query = args.get("query")
-            web = await self.search_service.search(query)
+            try:
+                web = await self.search_service.search(query)
+            except Exception as e:
+                logger.error(f"Failed to execute search: {e}")
+                self._search_error = str(e)
+                raise e
+
+            # Filter blocked domains immediately
+            blocked_domains = getattr(self.config, "fetch_blocked_domains", ["wikipedia.org", "csdn.net", "baidu.com"])
+            if isinstance(blocked_domains, str):
+                blocked_domains = [d.strip() for d in blocked_domains.split(",")]
+
+            # Use list comprehension for filtering
+            original_count = len(web)
+            web = [
+                item for item in web
+                if not any(blocked in item.get("url", "").lower() for blocked in blocked_domains)
+            ]
+            if len(web) < original_count:
+                logger.info(f"Filtered {original_count - len(web)} blocked search results.")
 
             # Cache results and assign global IDs
             for item in web:
@@ -1018,6 +634,13 @@ class ProcessingPipeline:
             # Cache the crawled content with global ID
             self.global_id_counter += 1
 
+            # Generate screenshot for direct URL crawl (so LLM can see it)
+            screenshot_b64 = await self._render_page_screenshot(
+                title=result_dict.get("title", "Page"),
+                url=url,
+                content=result_dict.get("content", "")[:4000]
+            )
+
             cached_item = {
                 "_id": self.global_id_counter,
                 "_type": "page",
@@ -1026,6 +649,7 @@ class ProcessingPipeline:
                 "content": result_dict.get("content", ""),
                 "domain": "",
                 "is_crawled": True,
+                "screenshot_b64": screenshot_b64,  # Add screenshot
             }
             try:
                 from urllib.parse import urlparse
@@ -1091,45 +715,38 @@ class ProcessingPipeline:
 
         return response.choices[0].message, usage
 
-    async def _run_vision_stage(self, user_input: str, images: List[str], model: str, prompt: str) -> Tuple[str, Dict[str, int]]:
-        content_payload: List[Dict[str, Any]] = [{"type": "text", "text": user_input or ""}]
-        for img_b64 in images:
-            url = f"data:image/png;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
-            content_payload.append({"type": "image_url", "image_url": {"url": url}})
 
-        client = self._client_for(
-            api_key=getattr(self.config, "vision_api_key", None),
-            base_url=getattr(self.config, "vision_base_url", None),
-        )
-        response, usage = await self._safe_llm_call(
-            messages=[{"role": "system", "content": prompt}, {"role": "user", "content": content_payload}],
-            model=model,
-            client=client,
-            extra_body=getattr(self.config, "vision_extra_body", None),
-        )
-        return (response.content or "").strip(), usage
 
     async def _run_instruct_stage(
-        self, user_input: str, vision_text: str, model: str
+        self, user_input: str, images: List[str] = None, model: str = None
     ) -> Tuple[str, List[str], Dict[str, Any], Dict[str, int], float]:
-        """Returns (instruct_text, search_payloads, trace_dict, usage_dict, search_time)."""
-        # Instruct has access to: web_search, image_search, set_mode, crawl_page, refuse_answer
-        tools = [self.web_search_tool, self.image_search_tool, self.set_mode_tool, self.crawl_page_tool, self.refuse_answer_tool]
-        tools_desc = "- internal_web_search: 搜索文本\n- internal_image_search: 搜索图片\n- crawl_page: 获取网页内容\n- set_mode: 设定standard/agent模式\n- refuse_answer: 拒绝回答(敏感/违规内容)"
+        """Returns (instruct_text, search_payloads, trace_dict, usage_dict, search_time).
+
+        Images are now passed directly here (merged vision stage).
+        """
+        # Instruct has access to: web_search, crawl_page, refuse_answer
+        tools = [self.web_search_tool, self.crawl_page_tool, self.refuse_answer_tool]
+        tools_desc = "- internal_web_search: 搜索文本\n- crawl_page: 获取网页内容\n- refuse_answer: 拒绝回答(敏感/违规内容)"
 
         prompt = INSTRUCT_SP.format(user_msgs=user_input or "", tools_desc=tools_desc)
-
-        if vision_text:
-            prompt = f"{prompt}\\n\\n{INSTRUCT_SP_VISION_ADD.format(vision_msgs=vision_text)}"
 
         client = self._client_for(
             api_key=getattr(self.config, "instruct_api_key", None),
             base_url=getattr(self.config, "instruct_base_url", None),
         )
 
+        # Build user content - multimodal if images provided
+        if images:
+            user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_input or "..."}]
+            for img_b64 in images:
+                url = f"data:image/png;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+                user_content.append({"type": "image_url", "image_url": {"url": url}})
+        else:
+            user_content = user_input or "..."
+
         history: List[Dict[str, Any]] = [
             {"role": "system", "content": prompt},
-            {"role": "user", "content": user_input or "..."},
+            {"role": "user", "content": user_content},
         ]
 
         response, usage = await self._safe_llm_call(
@@ -1147,15 +764,14 @@ class ProcessingPipeline:
             "base_url": getattr(self.config, "instruct_base_url", None) or self.config.base_url,
             "prompt": prompt,
             "user_input": user_input or "",
-            "vision_add": vision_text or "",
+            "has_images": bool(images),
+            "images_count": len(images) if images else 0,
             "tool_calls": [],
             "tool_results": [],
             "output": "",
         }
 
         search_time = 0.0
-        mode = "standard"
-        mode_reason = ""
 
         if response.tool_calls:
             plan_dict = response.model_dump() if hasattr(response, "model_dump") else response
@@ -1177,27 +793,262 @@ class ProcessingPipeline:
 
                 if tc.function.name in ["web_search", "internal_web_search"]:
                     search_payloads.append(str(result))
-                elif tc.function.name == "set_mode":
-                    try:
-                        args = json.loads(html.unescape(tc.function.arguments))
-                    except Exception:
-                        args = {}
-                    mode = args.get("mode", mode)
-                    mode_reason = args.get("reason", "")
-
-            instruct_trace["mode"] = mode
-            if mode_reason:
-                instruct_trace["mode_reason"] = mode_reason
-
+
             instruct_trace["output"] = ""
             instruct_trace["usage"] = usage
             return "", search_payloads, instruct_trace, usage, search_time
 
-        instruct_trace["mode"] = mode
         instruct_trace["output"] = (response.content or "").strip()
         instruct_trace["usage"] = usage
         return "", search_payloads, instruct_trace, usage, 0.0
 
+    async def _run_auto_fetch_with_screenshots(self, urls: List[str]):
+        """
+        Automatically fetch URLs and generate screenshots of their content.
+        Stops after getting the first 5 successful results (fastest wins).
+        Screenshots are stored as base64 in the cached items.
+        """
+        if not urls:
+            return
+
+        # Get config
+        fetch_timeout = float(getattr(self.config, "fetch_timeout", 15.0))
+        max_results = int(getattr(self.config, "fetch_max_results", 5))
+
+        async def _fetch_and_screenshot(url: str):
+            try:
+                # Fetch page content
+                result_dict = await self.search_service.fetch_page(url)
+
+                self.global_id_counter += 1
+
+                # Generate screenshot from page content
+                screenshot_b64 = await self._render_page_screenshot(
+                    title=result_dict.get("title", "Page"),
+                    url=url,
+                    content=result_dict.get("content", "")[:4000]  # Limit content for screenshot
+                )
+
+                cached_item = {
+                    "_id": self.global_id_counter,
+                    "_type": "page",
+                    "title": result_dict.get("title", "Page"),
+                    "url": result_dict.get("url", url),
+                    "content": result_dict.get("content", ""),
+                    "images": result_dict.get("images", []),
+                    "domain": "",
+                    "is_crawled": True,
+                    "screenshot_b64": screenshot_b64,
+                }
+                try:
+                    from urllib.parse import urlparse
+                    cached_item["domain"] = urlparse(url).netloc
+                except:
+                    pass
+
+                return cached_item
+            except Exception as e:
+                logger.error(f"Failed to fetch/screenshot {url}: {e}")
+                return None
+
+        async def _fetch_with_timeout(url: str):
+            """Wrapper to apply timeout to each fetch operation."""
+            try:
+                return await asyncio.wait_for(_fetch_and_screenshot(url), timeout=fetch_timeout)
+            except asyncio.TimeoutError:
+                logger.warning(f"Fetch timeout ({fetch_timeout}s) exceeded for: {url}")
+                return None
+
+        # Create tasks for all URLs (track url -> task mapping)
+        url_to_task = {url: asyncio.create_task(_fetch_with_timeout(url)) for url in urls}
+        tasks = list(url_to_task.values())
+        first_url = urls[0] if urls else None
+        first_task = url_to_task.get(first_url) if first_url else None
+
+        # Collect first N successful results (fastest wins)
+        collected_results = {}  # url -> result
+        successful_count = 0
+        for coro in asyncio.as_completed(tasks):
+            try:
+                result = await coro
+                if result:
+                    # Find which URL this result belongs to
+                    result_url = result.get("url", "")
+                    collected_results[result_url] = result
+                    successful_count += 1
+                    # Only break if we have enough AND first URL is done (or failed)
+                    first_done = first_url in collected_results or (first_task and first_task.done())
+                    if successful_count >= max_results and first_done:
+                        logger.info(f"Got {max_results} successful results, cancelling remaining tasks")
+                        break
+            except Exception as e:
+                logger.warning(f"Fetch task failed: {e}")
+
+        # Ensure first URL task completes (if not already) before cancelling others
+        if first_task and not first_task.done():
+            logger.info("Waiting for first URL to complete...")
+            try:
+                result = await first_task
+                if result:
+                    collected_results[result.get("url", first_url)] = result
+            except Exception as e:
+                logger.warning(f"First URL fetch failed: {e}")
+
+        # Cancel remaining tasks
+        for task in tasks:
+            if not task.done():
+                task.cancel()
+
+        # Wait briefly for cancellation to propagate
+        if any(not t.done() for t in tasks):
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Add results in original URL order (not fetch speed order)
+        for url in urls:
+            if url in collected_results:
+                self.all_web_results.append(collected_results[url])
+
+    async def _render_page_screenshot(self, title: str, url: str, content: str) -> Optional[str]:
+        """
+        Render page content as a simple HTML and take a screenshot.
+        Returns base64 encoded image or None on failure.
+        Images are compressed to reduce LLM payload size.
+        """
+        import base64
+        import tempfile
+
+        try:
+            # Try to use the content renderer if available
+            from .render_vue import ContentRenderer
+
+            # Create a simple markdown representation for screenshot
+            markdown = f"> 来源: {url}\n\n# {title}\n\n{content}"  # Limit content
+
+            # Use temp file for screenshot
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
+                tmp_path = tmp.name
+
+            # Get or create renderer (reuse if possible)
+            if not hasattr(self, '_screenshot_renderer'):
+                self._screenshot_renderer = ContentRenderer(auto_start=True)
+                await self._screenshot_renderer.start(timeout=10000)
+
+            # Await the async render method
+            await self._screenshot_renderer.render(
+                markdown,
+                tmp_path,
+                stats={"total_time": 0},
+                references=[{"title": title, "url": url, "domain": ""}],
+            )
+
+            # Compress image to reduce LLM payload size (~350KB target)
+            img_bytes = await self._compress_image(tmp_path, max_width=600, quality=70)
+
+            # Cleanup
+            import os
+            os.unlink(tmp_path)
+
+            return base64.b64encode(img_bytes).decode("utf-8")
+
+        except Exception as e:
+            logger.warning(f"Failed to render page screenshot: {e}")
+            return None
+
+    async def _compress_image(self, image_path: str, max_width: int = 400, quality: int = 50) -> bytes:
+        """Compress image to reduce size for LLM payload."""
+        from io import BytesIO
+
+        try:
+            from PIL import Image
+
+            def _compress():
+                with Image.open(image_path) as img:
+                    # Calculate new height maintaining aspect ratio
+                    if img.width > max_width:
+                        ratio = max_width / img.width
+                        new_height = int(img.height * ratio)
+                        img = img.resize((max_width, new_height), Image.Resampling.LANCZOS)
+
+                    # Convert to RGB if necessary
+                    if img.mode in ('RGBA', 'P'):
+                        img = img.convert('RGB')
+
+                    # Save to buffer with compression
+                    buffer = BytesIO()
+                    img.save(buffer, format='JPEG', quality=quality, optimize=True)
+                    return buffer.getvalue()
+
+            return await asyncio.to_thread(_compress)
+
+        except ImportError:
+            # PIL not available, return original
+            logger.warning("PIL not available for image compression, using original")
+            with open(image_path, 'rb') as f:
+                return f.read()
+
+    async def _run_summary_stage(
+        self, user_input: str, images: List[str] = None,
+        has_page_screenshots: bool = False, model: str = None
+    ) -> Tuple[str, Dict[str, int], Dict[str, Any]]:
+        """
+        Generate final summary using page screenshots only.
+        Returns (content, usage, trace_info).
+        """
+
+        # Build system prompt
+        try:
+            language_conf = getattr(self.config, "language", "Simplified Chinese")
+            system_prompt = SUMMARY_SP.format(language=language_conf)
+        except Exception:
+            system_prompt = SUMMARY_SP
+
+
+
+        # Build user content - multimodal if images provided
+        if images:
+            user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_input or "..."}]
+            for img_b64 in images:
+                url = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+                user_content.append({"type": "image_url", "image_url": {"url": url}})
+        else:
+            user_content = user_input or "..."
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_content}
+        ]
+
+        client = self._client_for(
+            api_key=getattr(self.config, "summary_api_key", None),
+            base_url=getattr(self.config, "summary_base_url", None)
+        )
+
+        response, usage = await self._safe_llm_call(
+            messages=messages,
+            model=model,
+            client=client,
+            extra_body=getattr(self.config, "summary_extra_body", None)
+        )
+
+        return (response.content or "").strip(), usage, {"prompt": system_prompt}
+
+    def _format_fetch_msgs(self) -> str:
+        """Format crawled page content for Summary prompt."""
+        if not self.all_web_results:
+            return ""
+
+        lines = []
+        for res in self.all_web_results:
+            if res.get("_type") != "page": continue
+            idx = res.get("_id")
+            title = (res.get("title", "") or "").strip()
+            url = res.get("url", "")
+            content = (res.get("content", "") or "").strip()
+            # Truncate content if too long? For now keep it full or rely on model context
+            lines.append(f"Title: {title}\nURL: {url}\nContent:\n{content}\n")
+
+        return "\n".join(lines)
+
     def _format_search_msgs(self) -> str:
         """Format search snippets only (not crawled pages)."""
         if not self.all_web_results:
@@ -1264,23 +1115,13 @@ class ProcessingPipeline:
         parts: List[str] = []
         parts.append("# Pipeline Trace\n")
 
-        if trace.get("vision"):
-            v = trace["vision"]
-            parts.append("## Vision\n")
-            parts.append(f"- model: `{v.get('model')}`")
-            parts.append(f"- base_url: `{v.get('base_url')}`")
-            parts.append(f"- images_count: `{v.get('images_count')}`\n")
-            parts.append("### Prompt\n")
-            parts.append(fence("text", v.get("prompt", "")))
-            parts.append("\n### Output\n")
-            parts.append(fence("text", v.get("output", "")))
-            parts.append("")
-
         if trace.get("instruct"):
             t = trace["instruct"]
             parts.append("## Instruct\n")
             parts.append(f"- model: `{t.get('model')}`")
-            parts.append(f"- base_url: `{t.get('base_url')}`\n")
+            parts.append(f"- base_url: `{t.get('base_url')}`")
+            parts.append(f"- has_images: `{t.get('has_images', False)}`")
+            parts.append(f"- images_count: `{t.get('images_count', 0)}`\n")
             parts.append("### Prompt\n")
             parts.append(fence("text", t.get("prompt", "")))
             if t.get("tool_calls"):
@@ -1293,20 +1134,79 @@ class ProcessingPipeline:
             parts.append(fence("text", t.get("output", "")))
             parts.append("")
 
-        if trace.get("agent"):
-            a = trace["agent"]
-            parts.append("## Agent\n")
-            parts.append(f"- model: `{a.get('model')}`")
-            parts.append(f"- base_url: `{a.get('base_url')}`\n")
+        if trace.get("fetch"):
+            f = trace["fetch"]
+            parts.append("## Auto-Fetch\n")
+            parts.append(f"- urls_fetched: `{f.get('urls_fetched', [])}`")
+            parts.append(f"- screenshots_count: `{f.get('screenshots_count', 0)}`\n")
+            parts.append("")
+
+        if trace.get("summary"):
+            s = trace["summary"]
+            parts.append("## Summary\n")
+            parts.append(f"- model: `{s.get('model')}`\n")
             parts.append("### System Prompt\n")
-            parts.append(fence("text", a.get("system_prompt", "")))
-            parts.append("\n### Steps\n")
-            parts.append(fence("json", json.dumps(a.get("steps", []), ensure_ascii=False, indent=2)))
-            parts.append("\n### Final Output\n")
-            parts.append(fence("text", a.get("final_output", "")))
+            parts.append(fence("text", s.get("system_prompt", "")))
+            parts.append("\n### Output\n")
+            parts.append(fence("text", s.get("output", "")))
+            parts.append("")
 
         return "\n".join(parts).strip() + "\n"
 
+    async def _cache_run_async(self, cache_data: Dict[str, Any]):
+        """
+        Async background task to cache run data (trace, screenshots) to a folder.
+        Saves to data/conversations/{timestamp}_{query}/
+        This runs after the response is sent, so it doesn't block the main pipeline.
+        """
+        import base64
+        from datetime import datetime
+        from pathlib import Path
+
+        try:
+            # Create cache directory: data/conversations/{timestamp}_{query}/
+            cache_base = Path(getattr(self.config, "conversations_dir", "data/conversations"))
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            user_input_short = (cache_data.get("user_input", "query") or "query")[:20]
+            # Clean filename
+            user_input_short = "".join(c if c.isalnum() or c in "._-" else "_" for c in user_input_short)
+            cache_dir = cache_base / f"{timestamp}_{user_input_short}"
+            cache_dir.mkdir(parents=True, exist_ok=True)
+
+            # Save conversation markdown (includes trace and response)
+            conversation_md = f"""# {cache_data.get("user_input", "Query")}
+
+## Response
+
+{cache_data.get("final_content", "")}
+
+---
+
+## Trace
+
+{cache_data.get("trace_markdown", "")}
+"""
+            conv_path = cache_dir / "conversation.md"
+            await asyncio.to_thread(
+                conv_path.write_text,
+                conversation_md,
+                encoding="utf-8"
+            )
+
+            # Save page screenshots
+            screenshots = cache_data.get("page_screenshots", [])
+            for i, screenshot_b64 in enumerate(screenshots):
+                if screenshot_b64:
+                    screenshot_path = cache_dir / f"page_{i+1}.jpg"
+                    img_bytes = base64.b64decode(screenshot_b64)
+                    await asyncio.to_thread(screenshot_path.write_bytes, img_bytes)
+
+            logger.debug(f"Conversation cached to: {cache_dir}")
+
+        except Exception as e:
+            # Don't fail silently but also don't crash the pipeline
+            logger.warning(f"Failed to cache conversation: {e}")
+
     async def close(self):
         try:
             await self.search_service.close()
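
The auto-fetch stage added in 4.0.0rc2 races all candidate URLs with a per-URL timeout and keeps only the fastest successful fetches, then reports them in the original URL order rather than completion order. A minimal sketch of that pattern, assuming a caller-supplied `fetch(url)` coroutine returning a dict with a `"url"` key (names here are illustrative, not the plugin's API):

```python
import asyncio
from typing import Any, Callable, Coroutine, Dict, List, Optional

async def fetch_fastest(
    urls: List[str],
    fetch: Callable[[str], Coroutine[Any, Any, Dict[str, Any]]],
    timeout: float = 15.0,     # per-URL timeout, analogous to fetch_timeout
    max_results: int = 5,      # keep the fastest N, analogous to fetch_max_results
) -> List[Dict[str, Any]]:
    """Race all URLs; keep the first max_results successes, in original URL order."""

    async def guarded(url: str) -> Optional[Dict[str, Any]]:
        try:
            return await asyncio.wait_for(fetch(url), timeout=timeout)
        except Exception:
            return None  # timeouts and fetch errors simply drop out of the race

    tasks = [asyncio.create_task(guarded(u)) for u in urls]
    collected: Dict[str, Dict[str, Any]] = {}
    for fut in asyncio.as_completed(tasks):
        result = await fut
        if result and result.get("url"):
            collected[result["url"]] = result
        if len(collected) >= max_results:
            break

    for t in tasks:  # cancel any stragglers still running
        if not t.done():
            t.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)

    return [collected[u] for u in urls if u in collected]
```

The released `_run_auto_fetch_with_screenshots` additionally waits for the first URL to finish before cancelling, renders each fetched page to a compressed screenshot, and appends the results to `self.all_web_results`.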