entari-plugin-hyw 4.0.0rc6-py3-none-any.whl → 4.0.0rc7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of entari-plugin-hyw might be problematic.

@@ -75,7 +75,7 @@ class HistoryManager:
             self._context_history[context_id] = []
         self._context_history[context_id].append(key)
 
-    def save_to_disk(self, key: str, save_root: str = "data/conversations", image_path: Optional[str] = None, web_results: Optional[List[Dict]] = None):
+    def save_to_disk(self, key: str, save_root: str = "data/conversations", image_path: Optional[str] = None, web_results: Optional[List[Dict]] = None, vision_trace: Optional[Dict] = None, instruct_traces: Optional[List[Dict]] = None):
         """Save conversation history to specific folder structure"""
         import os
         import time

@@ -198,51 +198,41 @@ class HistoryManager:
             except Exception as e:
                 print(f"Failed to copy output image: {e}")
 
-            # 4. Save Full Log (Readme style)
-            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            model_name = meta.get("model", "unknown")
-            code = self._key_to_code.get(key, "N/A")
-
-            md_content = f"# Conversation Log: {folder_name}\n\n"
-            md_content += f"- **Time**: {timestamp}\n"
-            md_content += f"- **Code**: {code}\n"
-            md_content += f"- **Model**: {model_name}\n\n"
-
-            md_content += "## History\n\n"
-
-            for msg in self._history[key]:
-                role = msg.get("role", "unknown").upper()
-                content = msg.get("content", "")
-
-                md_content += f"### {role}\n\n"
+            # 4. Save Vision Log (if vision stage was used)
+            if vision_trace and not vision_trace.get("skipped"):
+                vision_md = "# Vision Stage Log\n\n"
+                vision_md += f"- **Model**: {vision_trace.get('model', 'unknown')}\n"
+                vision_md += f"- **Time**: {vision_trace.get('time', 0):.2f}s\n"
+                vision_md += f"- **Images Count**: {vision_trace.get('images_count', 0)}\n"
+                vision_md += f"- **Input Tokens**: {vision_trace.get('usage', {}).get('input_tokens', 0)}\n"
+                vision_md += f"- **Output Tokens**: {vision_trace.get('usage', {}).get('output_tokens', 0)}\n\n"
+                vision_md += "## Vision Description Output\n\n"
+                vision_md += f"```\n{vision_trace.get('output', '')}\n```\n"
 
-                tool_calls = msg.get("tool_calls")
-                if tool_calls:
-                    try:
-                        tc_str = json.dumps(tool_calls, ensure_ascii=False, indent=2)
-                    except:
-                        tc_str = str(tool_calls)
-                    md_content += f"**Tool Calls**:\n```json\n{tc_str}\n```\n\n"
-
-                if role == "TOOL":
-                    try:
-                        # Try parsing as JSON first
-                        if isinstance(content, str):
-                            parsed = json.loads(content)
-                            pretty = json.dumps(parsed, ensure_ascii=False, indent=2)
-                            md_content += f"**Output**:\n```json\n{pretty}\n```\n\n"
-                        else:
-                            md_content += f"**Output**:\n```text\n{content}\n```\n\n"
-                    except:
-                        md_content += f"**Output**:\n```text\n{content}\n```\n\n"
-                else:
-                    if content:
-                        md_content += f"{content}\n\n"
-
-                md_content += "---\n\n"
+                with open(os.path.join(folder_path, "vision_log.md"), "w", encoding="utf-8") as f:
+                    f.write(vision_md)
 
-            with open(os.path.join(folder_path, "full_log.md"), "w", encoding="utf-8") as f:
-                f.write(md_content)
+            # 5. Save Instruct Log (all instruct rounds)
+            if instruct_traces:
+                instruct_md = "# Instruct Stage Log\n\n"
+                for i, trace in enumerate(instruct_traces):
+                    stage_name = trace.get("stage_name", f"Round {i+1}")
+                    instruct_md += f"## {stage_name}\n\n"
+                    instruct_md += f"- **Model**: {trace.get('model', 'unknown')}\n"
+                    instruct_md += f"- **Time**: {trace.get('time', 0):.2f}s\n"
+                    instruct_md += f"- **Tool Calls**: {trace.get('tool_calls', 0)}\n"
+                    instruct_md += f"- **Input Tokens**: {trace.get('usage', {}).get('input_tokens', 0)}\n"
+                    instruct_md += f"- **Output Tokens**: {trace.get('usage', {}).get('output_tokens', 0)}\n\n"
+
+                    output = trace.get("output", "")
+                    if output:
+                        instruct_md += "### Reasoning Output\n\n"
+                        instruct_md += f"```\n{output}\n```\n\n"
+
+                    instruct_md += "---\n\n"
+
+                with open(os.path.join(folder_path, "instruct_log.md"), "w", encoding="utf-8") as f:
+                    f.write(instruct_md)
 
         except Exception as e:
            print(f"Failed to save conversation: {e}")

@@ -16,6 +16,7 @@ from .stage_base import StageContext
 from .stage_instruct import InstructStage
 from .stage_instruct_deepsearch import InstructDeepsearchStage
 from .stage_summary import SummaryStage
+from .stage_vision import VisionStage
 from .search import SearchService
 
 

@@ -36,9 +37,15 @@ class ModularPipeline:
         self.client = AsyncOpenAI(base_url=config.base_url, api_key=config.api_key)
 
         # Initialize stages
-        self.instruct_stage = InstructStage(config, self.search_service, self.client)
+        self.instruct_stage = InstructStage(config, self.search_service, self.client, send_func=send_func)
         self.instruct_deepsearch_stage = InstructDeepsearchStage(config, self.search_service, self.client)
         self.summary_stage = SummaryStage(config, self.search_service, self.client)
+        self.vision_stage = VisionStage(config, self.search_service, self.client)
+
+    def _has_vision_model(self) -> bool:
+        """Check if a vision model is configured."""
+        vision_cfg = self.config.get_model_config("vision")
+        return bool(vision_cfg.get("model_name"))
 
     async def execute(
         self,

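`_has_vision_model` treats a missing or empty `model_name` as "vision disabled". Judging by the fields read elsewhere in this diff (`model_name`, `model_provider`, `api_key`, `base_url`, `extra_body`, `input_price`, `output_price`), `get_model_config("vision")` presumably resolves to a dict roughly like the following; every value here is a made-up placeholder, not a default from the package:

```python
# Hypothetical return of config.get_model_config("vision"). Field names are
# taken from this diff; the values are examples only.
vision_cfg = {
    "model_name": "your-vision-model",  # falsy -> VisionStage is skipped
    "model_provider": "YourProvider",
    "api_key": "sk-...",
    "base_url": "https://api.example.com/v1",
    "extra_body": None,   # forwarded verbatim to chat.completions.create
    "input_price": 0.5,   # per 1M tokens, used for the cost line in stage stats
    "output_price": 1.5,
}
```
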
@@ -54,6 +61,9 @@
         stats = {"start_time": start_time}
         usage_totals = {"input_tokens": 0, "output_tokens": 0}
         active_model = model_name or self.config.model_name
+        if not active_model:
+            # Fallback to instruct model for logging/context
+            active_model = self.config.get_model_config("instruct").get("model_name")
 
         context = StageContext(
             user_input=user_input,

@@ -79,6 +89,24 @@
         try:
             logger.info(f"Pipeline: Processing '{user_input[:30]}...'")
 
+            # === Stage 0: Vision (if images and vision model configured) ===
+            if images and self._has_vision_model():
+                logger.info("Pipeline: Stage 0 - Vision (generating image description)")
+                vision_result = await self.vision_stage.execute(context, images)
+
+                if vision_result.success and vision_result.data.get("description"):
+                    context.vision_description = vision_result.data["description"]
+                    logger.info(f"Pipeline: Vision description generated ({len(context.vision_description)} chars)")
+
+                    # Add vision trace
+                    trace["vision"] = vision_result.trace
+                    usage_totals["input_tokens"] += vision_result.usage.get("input_tokens", 0)
+                    usage_totals["output_tokens"] += vision_result.usage.get("output_tokens", 0)
+
+                    # Clear images since we have the description now
+                    # (don't pass raw images to later stages when using vision model)
+                    images = []
+
             # === Stage 1: Instruct (Initial Discovery) ===
             logger.info("Pipeline: Stage 1 - Instruct")
             instruct_result = await self.instruct_stage.execute(context)

@@ -115,20 +143,73 @@
             else:
                 logger.info("Pipeline: Mode is 'fast', skipping deepsearch stage")
 
-            # === Stage 3: Summary ===
-            # Collect page screenshots if image mode (already rendered in InstructStage)
-            all_images = list(images) if images else []
+            # === Parallel Execution: Summary Generation + Image Prefetching ===
+            # We run image prefetching concurrently with Summary generation to save time.
+
+            # 1. Prepare candidates for prefetch (all images in search results)
+            all_candidate_urls = set()
+            for r in context.web_results:
+                # Add images from search results/pages
+                if r.get("images"):
+                    for img in r["images"]:
+                        if img and isinstance(img, str) and img.startswith("http"):
+                            all_candidate_urls.add(img)
+
+            prefetch_list = list(all_candidate_urls)
+            logger.info(f"Pipeline: Starting parallel execution (Summary + Prefetch {len(prefetch_list)} images)")
+
+            # 2. Define parallel tasks with timing
+            async def timed_summary():
+                t0 = time.time()
+                # Collect page screenshots if image mode
+                summary_input_images = list(images) if images else []
+                if context.image_input_supported:
+                    # Collect pre-rendered screenshots from web_results
+                    for r in context.web_results:
+                        if r.get("_type") == "page" and r.get("screenshot_b64"):
+                            summary_input_images.append(r["screenshot_b64"])
+
+                res = await self.summary_stage.execute(
+                    context,
+                    images=summary_input_images if summary_input_images else None
+                )
+                duration = time.time() - t0
+                return res, duration
+
+            async def timed_prefetch():
+                t0 = time.time()
+                if not prefetch_list:
+                    return {}, 0.0
+                try:
+                    from .image_cache import get_image_cache
+                    cache = get_image_cache()
+                    # Start prefetch (non-blocking kickoff)
+                    cache.start_prefetch(prefetch_list)
+                    # Wait for results (blocking until done)
+                    res = await cache.get_all_cached(prefetch_list)
+                    duration = time.time() - t0
+                    return res, duration
+                except Exception as e:
+                    logger.warning(f"Pipeline: Prefetch failed: {e}")
+                    return {}, time.time() - t0
+
+            # 3. Execute concurrently
+            summary_task = asyncio.create_task(timed_summary())
+            prefetch_task = asyncio.create_task(timed_prefetch())
+
+            # Wait for both to complete
+            await asyncio.wait([summary_task, prefetch_task])
+
+            # 4. Process results and log timing
+            summary_result, summary_time = await summary_task
+            cached_map, prefetch_time = await prefetch_task
 
-            if context.image_input_supported:
-                # Collect pre-rendered screenshots from web_results
-                for r in context.web_results:
-                    if r.get("_type") == "page" and r.get("screenshot_b64"):
-                        all_images.append(r["screenshot_b64"])
+            time_diff = abs(summary_time - prefetch_time)
+            if summary_time > prefetch_time:
+                logger.info(f"Pipeline: Image Prefetch finished first ({prefetch_time:.2f}s). Summary took {summary_time:.2f}s. (Waited {time_diff:.2f}s for Summary)")
+            else:
+                logger.info(f"Pipeline: Summary finished first ({summary_time:.2f}s). Image Prefetch took {prefetch_time:.2f}s. (Waited {time_diff:.2f}s for Prefetch)")
 
-            summary_result = await self.summary_stage.execute(
-                context,
-                images=all_images if all_images else None
-            )
             trace["summary"] = summary_result.trace
             usage_totals["input_tokens"] += summary_result.usage.get("input_tokens", 0)
             usage_totals["output_tokens"] += summary_result.usage.get("output_tokens", 0)

@@ -139,40 +220,30 @@
             stats["total_time"] = time.time() - start_time
             structured = self._parse_response(summary_content, context)
 
-            # === Image Caching (Prefetch images for UI) ===
-            try:
-                from .image_cache import get_image_cache
-                cache = get_image_cache()
-
-                # 1. Collect all image URLs from structured response
-                all_image_urls = []
-                for ref in structured.get("references", []):
-                    if ref.get("images"):
-                        all_image_urls.extend([img for img in ref["images"] if img and img.startswith("http")])
-
-                if all_image_urls:
-                    # 2. Prefetch (wait for them as we are about to render)
-                    cached_map = await cache.get_all_cached(all_image_urls)
-
-                    # 3. Update structured response with cached (base64) URLs
+            # === Apply Cached Images ===
+            # Update structured response using the map from parallel prefetch
+            if cached_map:
+                try:
+                    total_replaced = 0
                     for ref in structured.get("references", []):
                         if ref.get("images"):
-                            # Keep cached images, but preserve original URLs as fallback
                             new_images = []
                             for img in ref["images"]:
-                                # 1. Already Base64 (from Search Injection) -> Keep it
+                                # 1. Already Base64 -> Keep it
                                 if img.startswith("data:"):
                                     new_images.append(img)
                                     continue
-
-                                # 2. Cached successfully -> Keep it
+
+                                # 2. Check cache
                                 cached_val = cached_map.get(img)
                                 if cached_val and cached_val.startswith("data:"):
                                     new_images.append(cached_val)
-                                # 3. Else -> DROP IT (User request: "Delete Fallback, must download in advance")
+                                    total_replaced += 1
+                                # 3. Else -> DROP IT (as per policy)
                             ref["images"] = new_images
-            except Exception as e:
-                logger.warning(f"Pipeline: Image caching failed: {e}")
+                    logger.debug(f"Pipeline: Replaced {total_replaced} images with cached versions")
+                except Exception as e:
+                    logger.warning(f"Pipeline: Applying cached images failed: {e}")
 
             # Debug: Log image counts
             total_ref_images = sum(len(ref.get("images", []) or []) for ref in structured.get("references", []))

@@ -197,6 +268,8 @@
                 },
                 "stages_used": stages_used,
                 "web_results": context.web_results,
+                "vision_trace": trace.get("vision"),
+                "instruct_traces": trace.get("instruct_rounds", []),
             }
 
         except Exception as e:

@@ -314,6 +387,27 @@
             "references": search_refs,
             "description": f"Found {len(search_refs)} results."
         })
+
+        # 2. Vision Stage (if used)
+        if trace.get("vision"):
+            v = trace["vision"]
+            if not v.get("skipped"):
+                usage = v.get("usage", {})
+                vision_cfg = self.config.get_model_config("vision")
+                input_price = vision_cfg.get("input_price") or 0
+                output_price = vision_cfg.get("output_price") or 0
+                cost = (usage.get("input_tokens", 0) * input_price + usage.get("output_tokens", 0) * output_price) / 1_000_000
+
+                stages.append({
+                    "name": "Vision",
+                    "model": v.get("model"),
+                    "icon_config": "google",
+                    "provider": "Vision",
+                    "time": v.get("time", 0),
+                    "description": f"Analyzed {v.get('images_count', 0)} image(s).",
+                    "usage": usage,
+                    "cost": cost
+                })
 
         # 2. Instruct Rounds
         for i, t in enumerate(trace.get("instruct_rounds", [])):

@@ -10,6 +10,7 @@ from .browser.service import get_screenshot_service
 from .browser.engines.bing import BingEngine
 from .browser.engines.duckduckgo import DuckDuckGoEngine
 from .browser.engines.google import GoogleEngine
+from .browser.engines.default import DefaultEngine
 
 class SearchService:
     def __init__(self, config: Any):

@@ -21,8 +22,11 @@
         # Domain blocking
         self._blocked_domains = getattr(config, "blocked_domains", []) or []
 
-        # Select Engine
-        self._engine_name = getattr(config, "search_engine", "bing").lower()
+        # Select Engine - DefaultEngine when not specified
+        self._engine_name = getattr(config, "search_engine", None)
+        if self._engine_name:
+            self._engine_name = self._engine_name.lower()
+
         if self._engine_name == "bing":
             self._engine = BingEngine()
         elif self._engine_name == "google":

@@ -30,8 +34,9 @@
         elif self._engine_name == "duckduckgo":
             self._engine = DuckDuckGoEngine()
         else:
-            # Default fallback
-            self._engine = BingEngine()
+            # Default: use browser address bar search (Google-based)
+            self._engine = DefaultEngine()
+            self._engine_name = "default"
 
         logger.info(f"SearchService initialized with engine: {self._engine_name}")
 

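`search_engine` is now effectively optional: `"bing"`, `"google"`, and `"duckduckgo"` still select their dedicated engines, while anything else, including leaving the option unset, falls through to `DefaultEngine`. A condensed, runnable restatement of the selection rule:

```python
from typing import Optional

def pick_engine(search_engine: Optional[str]) -> str:
    """Condensed restatement of the selection logic in this hunk."""
    name = search_engine.lower() if search_engine else None
    if name in ("bing", "google", "duckduckgo"):
        return name
    return "default"  # browser address-bar search, new in this release

assert pick_engine(None) == "default"  # rc6 fell back to BingEngine here
assert pick_engine("Bing") == "bing"
```
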
@@ -39,7 +44,8 @@
         return self._engine.build_url(query, self._default_limit)
 
     async def search_batch(self, queries: List[str]) -> List[List[Dict[str, Any]]]:
-        """Execute multiple searches concurrently."""
+        """Execute multiple searches concurrently using standard URL navigation."""
+        logger.info(f"SearchService: Batch searching {len(queries)} queries in parallel...")
         tasks = [self.search(q) for q in queries]
         return await asyncio.gather(*tasks)
 

@@ -58,17 +64,36 @@
             final_query = f"{query} {exclusions}"
 
         url = self._build_search_url(final_query)
-        logger.info(f"Search: '{query}' -> {url}")
-
+
         results = []
         try:
-            # Fetch - Search parsing doesn't need screenshot, only HTML
-            page_data = await self.fetch_page_raw(url, include_screenshot=False)
+            # Check if this is an address bar search (DefaultEngine)
+            if url.startswith("__ADDRESS_BAR_SEARCH__:"):
+                # Extract query from marker
+                search_query = url.replace("__ADDRESS_BAR_SEARCH__:", "")
+                logger.info(f"Search: '{query}' -> [Address Bar Search]")
+
+                # Use address bar input method
+                service = get_screenshot_service(headless=self._headless)
+                page_data = await service.search_via_address_bar(search_query)
+            else:
+                logger.info(f"Search: '{query}' -> {url}")
+                # Standard URL navigation
+                page_data = await self.fetch_page_raw(url, include_screenshot=False)
+
             content = page_data.get("html", "") or page_data.get("content", "")
+
+            # Debug: Log content length
+            logger.debug(f"Search: Raw content length = {len(content)} chars")
+            if len(content) < 500:
+                logger.warning(f"Search: Content too short, may be empty/blocked. First 500 chars: {content[:500]}")
 
             # Parse Results (skip raw page - only return parsed results)
             if content and not content.startswith("Error"):
                 parsed = self._engine.parse(content)
+
+                # Debug: Log parse result
+                logger.info(f"Search: Engine {self._engine_name} parsed {len(parsed)} results from {len(content)} chars")
 
                 # JAVASCRIPT IMAGE INJECTION
                 # Inject base64 images from JS extraction if available

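`DefaultEngine` itself is not part of this diff, but the `__ADDRESS_BAR_SEARCH__:` branch implies that its `build_url` returns a marker-prefixed query rather than a navigable URL, which `search` then routes to the screenshot service's `search_via_address_bar`. A speculative sketch of that contract; the real implementation lives in `browser/engines/default.py` and may differ, and `parse` is omitted because its HTML handling is not visible here:

```python
# Speculative sketch of the marker protocol only — not the actual engine.
class DefaultEngine:
    def build_url(self, query: str, limit: int) -> str:
        # Tell SearchService to type the query into the browser's address
        # bar instead of navigating to an engine-specific results URL.
        return f"__ADDRESS_BAR_SEARCH__:{query}"
```
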
@@ -84,6 +109,17 @@
                             parsed[i]["images"].insert(0, b64_src)
 
                 logger.info(f"Search parsed {len(parsed)} results for '{query}' using {self._engine_name}")
+
+                # ALWAYS add raw search page as hidden item for debug saving
+                # (even when 0 results, so we can debug the parser)
+                results.append({
+                    "title": f"[DEBUG] Raw Search: {query}",
+                    "url": url,
+                    "content": content[:50000],  # Limit to 50KB
+                    "_type": "search_raw_page",
+                    "_hidden": True,  # Don't show to LLM
+                })
+
                 results.extend(parsed)
             else:
                 logger.warning(f"Search failed/empty for '{query}': {content[:100]}")

@@ -39,6 +39,9 @@ class StageContext:
     # Model capabilities
     image_input_supported: bool = True
 
+    # Vision description (from VisionStage)
+    vision_description: str = ""
+
     def next_id(self) -> int:
         """Get next global ID."""
         self.global_id_counter += 1

@@ -8,7 +8,7 @@ Analyze user query and execute initial searches.
 import json
 import time
 import asyncio
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Callable, Awaitable
 from loguru import logger
 from openai import AsyncOpenAI
 

@@ -26,14 +26,15 @@ class InstructStage(BaseStage):
     def name(self) -> str:
         return "Instruct"
 
-    def __init__(self, config: Any, search_service: Any, client: AsyncOpenAI):
+    def __init__(self, config: Any, search_service: Any, client: AsyncOpenAI, send_func: Optional[Callable[[str], Awaitable[None]]] = None):
         super().__init__(config, search_service, client)
+        self.send_func = send_func
 
         self.refuse_answer_tool = get_refuse_answer_tool()
         self.web_search_tool = get_web_search_tool()
         self.crawl_page_tool = get_crawl_page_tool()
         self.set_mode_tool = get_set_mode_tool()
-
+
     async def execute(self, context: StageContext) -> StageResult:
         start_time = time.time()
 

@@ -113,6 +114,7 @@
         model = model_cfg.get("model_name") or self.config.model_name
 
         try:
+            logger.info(f"Instruct: Sending LLM request to {model}...")
             response = await client.chat.completions.create(
                 model=model,
                 messages=messages,

@@ -186,6 +188,14 @@
                 if mode in ("fast", "deepsearch"):
                     context.selected_mode = mode
                     logger.info(f"Instruct: Mode set to '{mode}'")
+
+                    # Notify immediately if deepsearch
+                    if mode == "deepsearch" and self.send_func:
+                        try:
+                            await self.send_func("🔍 正在进行深度研究,可能需要一些时间,请耐心等待...")
+                        except Exception as e:
+                            logger.warning(f"Instruct: Failed to send notification: {e}")
+
                 results_for_context.append({
                     "id": tc_id, "name": name, "content": f"Mode set to: {mode}"
                 })

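`send_func` is any `Callable[[str], Awaitable[None]]`, which keeps the stage transport-agnostic. A conforming callback might look like the following; the delivery mechanism is a stand-in, and only the signature comes from the diff:

```python
from typing import Awaitable, Callable

async def notify_user(text: str) -> None:
    # Stand-in transport: a real deployment would forward `text` to the
    # chat platform session the plugin is currently serving.
    print(text)

send_func: Callable[[str], Awaitable[None]] = notify_user
# config, search_service and client assumed bound as in ModularPipeline:
stage = InstructStage(config, search_service, client, send_func=notify_user)
```
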
@@ -47,6 +47,11 @@
         # Build Context Message
         context_message = f"## Web Search & Page Content\n\n```context\n{full_context}\n```"
 
+        # Add vision description if present (from VisionStage)
+        if context.vision_description:
+            vision_context = f"## 用户图片描述\n\n{context.vision_description}"
+            context_message = f"{vision_context}\n\n{context_message}"
+
         # Build user content
         user_text = context.user_input or "..."
         if images:

@@ -104,6 +109,7 @@
             "provider": model_cfg.get("model_provider") or "Unknown",
             "usage": usage,
             "system_prompt": system_prompt,
+            "context_message": context_message,  # Includes vision description + search results
             "output": content,
             "time": time.time() - start_time,
             "images_count": len(images) if images else 0,

@@ -0,0 +1,113 @@
+"""
+Vision Stage
+
+Generates image description using a vision-capable model.
+The description is then passed as context to subsequent stages.
+"""
+
+import time
+from typing import Any, Dict, List, Optional
+
+from loguru import logger
+from openai import AsyncOpenAI
+
+from .stage_base import BaseStage, StageContext, StageResult
+from .definitions import VISION_DESCRIPTION_SP
+
+
+class VisionStage(BaseStage):
+    """
+    Vision Stage: Generate image description.
+
+    Takes user images and text, calls a vision model to produce
+    a detailed description of the image content.
+    """
+
+    @property
+    def name(self) -> str:
+        return "Vision"
+
+    async def execute(
+        self,
+        context: StageContext,
+        images: List[str] = None
+    ) -> StageResult:
+        """Generate image description."""
+        start_time = time.time()
+
+        if not images:
+            return StageResult(
+                success=True,
+                data={"description": ""},
+                trace={"skipped": True, "reason": "No images provided"}
+            )
+
+        # Get model config for vision stage
+        model_cfg = self.config.get_model_config("vision")
+        model = model_cfg.get("model_name")
+
+        if not model:
+            logger.warning("VisionStage: No vision model configured, skipping")
+            return StageResult(
+                success=True,
+                data={"description": ""},
+                trace={"skipped": True, "reason": "No vision model configured"}
+            )
+
+        client = self._client_for(
+            api_key=model_cfg.get("api_key"),
+            base_url=model_cfg.get("base_url")
+        )
+
+        # Build user content with images
+        user_text = context.user_input or "请描述这张图片"
+        user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_text}]
+
+        for img_b64 in images:
+            url = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+            user_content.append({"type": "image_url", "image_url": {"url": url}})
+
+        messages = [
+            {"role": "system", "content": VISION_DESCRIPTION_SP},
+            {"role": "user", "content": user_content}
+        ]
+
+        try:
+            logger.info(f"VisionStage: Calling model '{model}' with {len(images)} image(s)")
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0.3,  # Lower temperature for factual description
+                extra_body=model_cfg.get("extra_body"),
+            )
+        except Exception as e:
+            logger.error(f"VisionStage LLM error: {e}")
+            return StageResult(
+                success=False,
+                error=str(e),
+                data={"description": ""},
+                trace={"error": str(e)}
+            )
+
+        usage = {"input_tokens": 0, "output_tokens": 0}
+        if hasattr(response, "usage") and response.usage:
+            usage["input_tokens"] = getattr(response.usage, "prompt_tokens", 0) or 0
+            usage["output_tokens"] = getattr(response.usage, "completion_tokens", 0) or 0
+
+        description = (response.choices[0].message.content or "").strip()
+
+        logger.info(f"VisionStage: Generated description ({len(description)} chars)")
+
+        return StageResult(
+            success=True,
+            data={"description": description},
+            usage=usage,
+            trace={
+                "model": model,
+                "provider": model_cfg.get("model_provider") or "Unknown",
+                "usage": usage,
+                "output": description,
+                "time": time.time() - start_time,
+                "images_count": len(images),
+            }
+        )

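Driving the new stage standalone would look roughly like this; `config`, `search_service`, `client`, and `jpeg_b64` are assumed to be bound as in `ModularPipeline`, and the `StageContext` construction is abbreviated (a sketch, not a test from the package):

```python
import asyncio

async def main():
    stage = VisionStage(config, search_service, client)
    ctx = StageContext(user_input="What is in this picture?")
    result = await stage.execute(ctx, images=[jpeg_b64])  # raw base64 or data: URL

    if result.success and not result.trace.get("skipped"):
        print(result.data["description"])
        print(result.trace["usage"])  # {"input_tokens": ..., "output_tokens": ...}

asyncio.run(main())
```
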
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: entari_plugin_hyw
-Version: 4.0.0rc6
+Version: 4.0.0rc7
 Summary: Use large language models to interpret chat messages
 Author-email: kumoSleeping <zjr2992@outlook.com>
 License: MIT