entari-plugin-hyw 3.5.0rc1__py3-none-any.whl → 3.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +77 -82
- entari_plugin_hyw/assets/card-dist/index.html +360 -99
- entari_plugin_hyw/card-ui/src/App.vue +246 -52
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +122 -67
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +46 -26
- entari_plugin_hyw/card-ui/src/test_regex.js +103 -0
- entari_plugin_hyw/card-ui/src/types.ts +1 -0
- entari_plugin_hyw/{core/history.py → history.py} +25 -1
- entari_plugin_hyw/image_cache.py +283 -0
- entari_plugin_hyw/{core/pipeline.py → pipeline.py} +102 -27
- entari_plugin_hyw/{utils/prompts.py → prompts.py} +7 -24
- entari_plugin_hyw/render_vue.py +314 -0
- entari_plugin_hyw/{utils/search.py → search.py} +227 -10
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/RECORD +18 -29
- entari_plugin_hyw/core/__init__.py +0 -0
- entari_plugin_hyw/core/config.py +0 -35
- entari_plugin_hyw/core/hyw.py +0 -48
- entari_plugin_hyw/core/render_vue.py +0 -255
- entari_plugin_hyw/test_output/render_0.jpg +0 -0
- entari_plugin_hyw/test_output/render_1.jpg +0 -0
- entari_plugin_hyw/test_output/render_2.jpg +0 -0
- entari_plugin_hyw/test_output/render_3.jpg +0 -0
- entari_plugin_hyw/test_output/render_4.jpg +0 -0
- entari_plugin_hyw/tests/ui_test_output.jpg +0 -0
- entari_plugin_hyw/tests/verify_ui.py +0 -139
- entari_plugin_hyw/utils/__init__.py +0 -2
- entari_plugin_hyw/utils/browser.py +0 -40
- entari_plugin_hyw/utils/playwright_tool.py +0 -36
- /entari_plugin_hyw/{utils/misc.py → misc.py} +0 -0
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/image_cache.py (new file)

@@ -0,0 +1,283 @@
+"""
+Image Caching Module for Pre-downloading Images
+
+This module provides async image pre-download functionality to reduce render time.
+Images are downloaded in the background when search results are obtained,
+and cached as base64 data URLs for instant use during rendering.
+"""
+
+import asyncio
+import base64
+import hashlib
+from typing import Dict, List, Optional, Any
+from loguru import logger
+import httpx
+
+
+
+class ImageCache:
+    """
+    Async image cache that pre-downloads images as base64.
+
+    Usage:
+        cache = ImageCache()
+
+        # Start pre-downloading images (non-blocking)
+        cache.start_prefetch(image_urls)
+
+        # Later, get cached image (blocking if not ready)
+        cached_url = await cache.get_cached(url)  # Returns data:image/... or original URL
+    """
+
+    def __init__(
+        self,
+        max_size_kb: int = 500,   # Max image size to cache (KB)
+        timeout: float = 5.0,     # Download timeout per image
+        max_concurrent: int = 6,  # Max concurrent downloads
+    ):
+        self.max_size_bytes = max_size_kb * 1024
+        self.timeout = timeout
+        self.max_concurrent = max_concurrent
+
+        # Cache storage: url -> base64_data_url or None (if failed)
+        self._cache: Dict[str, Optional[str]] = {}
+        # Pending downloads: url -> asyncio.Task
+        self._pending: Dict[str, asyncio.Task] = {}
+        # Semaphore for concurrent downloads
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        # Lock for cache access
+        self._lock = asyncio.Lock()
+
+    def start_prefetch(self, urls: List[str]) -> None:
+        """
+        Start pre-downloading images in the background (non-blocking).
+
+        Args:
+            urls: List of image URLs to prefetch
+        """
+        if not httpx:
+            logger.warning("ImageCache: httpx not installed, prefetch disabled")
+            return
+
+        for url in urls:
+            if not url or not url.startswith("http"):
+                continue
+            if url in self._cache or url in self._pending:
+                continue
+
+            # Create background task
+            task = asyncio.create_task(self._download_image(url))
+            self._pending[url] = task
+
+    async def _download_image(self, url: str) -> Optional[str]:
+        """
+        Download a single image and convert to base64.
+
+        Returns:
+            Base64 data URL or None if failed/too large
+        """
+        async with self._semaphore:
+            try:
+                async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
+                    resp = await client.get(url, headers={
+                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+                    })
+                    resp.raise_for_status()
+
+                    # Check content length
+                    content_length = resp.headers.get("content-length")
+                    if content_length and int(content_length) > self.max_size_bytes:
+                        logger.debug(f"ImageCache: Skipping {url} (too large: {content_length} bytes)")
+                        async with self._lock:
+                            self._cache[url] = None
+                            self._pending.pop(url, None)
+                        return None
+
+                    # Read content
+                    content = resp.content
+                    if len(content) > self.max_size_bytes:
+                        logger.debug(f"ImageCache: Skipping {url} (content too large: {len(content)} bytes)")
+                        async with self._lock:
+                            self._cache[url] = None
+                            self._pending.pop(url, None)
+                        return None
+
+                    # Determine MIME type
+                    content_type = resp.headers.get("content-type", "").lower()
+                    if "jpeg" in content_type or "jpg" in content_type:
+                        mime = "image/jpeg"
+                    elif "png" in content_type:
+                        mime = "image/png"
+                    elif "gif" in content_type:
+                        mime = "image/gif"
+                    elif "webp" in content_type:
+                        mime = "image/webp"
+                    elif "svg" in content_type:
+                        mime = "image/svg+xml"
+                    else:
+                        # Try to infer from URL
+                        url_lower = url.lower()
+                        if ".jpg" in url_lower or ".jpeg" in url_lower:
+                            mime = "image/jpeg"
+                        elif ".png" in url_lower:
+                            mime = "image/png"
+                        elif ".gif" in url_lower:
+                            mime = "image/gif"
+                        elif ".webp" in url_lower:
+                            mime = "image/webp"
+                        elif ".svg" in url_lower:
+                            mime = "image/svg+xml"
+                        else:
+                            mime = "image/jpeg"  # Default fallback
+
+                    # Encode to base64
+                    b64 = base64.b64encode(content).decode("utf-8")
+                    data_url = f"data:{mime};base64,{b64}"
+
+                    async with self._lock:
+                        self._cache[url] = data_url
+                        self._pending.pop(url, None)
+
+                    logger.debug(f"ImageCache: Cached {url} ({len(content)} bytes)")
+                    return data_url
+
+            except asyncio.TimeoutError:
+                logger.debug(f"ImageCache: Timeout downloading {url}")
+            except Exception as e:
+                logger.debug(f"ImageCache: Failed to download {url}: {e}")
+
+            async with self._lock:
+                self._cache[url] = None
+                self._pending.pop(url, None)
+            return None
+
+    async def get_cached(self, url: str, wait: bool = True, wait_timeout: float = 3.0) -> str:
+        """
+        Get cached image data URL, or original URL if not cached.
+
+        Args:
+            url: Original image URL
+            wait: If True, wait for pending download to complete
+            wait_timeout: Max time to wait for pending download
+
+        Returns:
+            Cached data URL or original URL
+        """
+        if not url:
+            return url
+
+        # Check if already cached
+        async with self._lock:
+            if url in self._cache:
+                cached = self._cache[url]
+                return cached if cached else url  # Return original if cached as None (failed)
+
+            pending_task = self._pending.get(url)
+
+        # Wait for pending download if requested
+        if pending_task and wait:
+            try:
+                await asyncio.wait_for(asyncio.shield(pending_task), timeout=wait_timeout)
+                async with self._lock:
+                    cached = self._cache.get(url)
+                return cached if cached else url
+            except asyncio.TimeoutError:
+                logger.debug(f"ImageCache: Timeout waiting for {url}")
+                return url
+            except Exception:
+                return url
+
+        return url
+
+    async def get_all_cached(self, urls: List[str], wait_timeout: float = 3.0) -> Dict[str, str]:
+        """
+        Get cached URLs for multiple images.
+
+        Args:
+            urls: List of original URLs
+            wait_timeout: Max time to wait for all pending downloads
+
+        Returns:
+            Dict mapping original URL to cached data URL (or original if not cached)
+        """
+        result = {}
+
+        # Wait for all pending downloads first
+        pending_tasks = []
+        async with self._lock:
+            for url in urls:
+                if url in self._pending:
+                    pending_tasks.append(self._pending[url])
+
+        if pending_tasks:
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*pending_tasks, return_exceptions=True),
+                    timeout=wait_timeout
+                )
+            except asyncio.TimeoutError:
+                logger.debug(f"ImageCache: Timeout waiting for batch download")
+
+        # Collect results
+        for url in urls:
+            async with self._lock:
+                cached = self._cache.get(url)
+                result[url] = cached if cached else url
+
+        return result
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get cache statistics."""
+        cached_count = sum(1 for v in self._cache.values() if v is not None)
+        failed_count = sum(1 for v in self._cache.values() if v is None)
+        return {
+            "cached": cached_count,
+            "failed": failed_count,
+            "pending": len(self._pending),
+            "total": len(self._cache) + len(self._pending),
+        }
+
+    def clear(self) -> None:
+        """Clear all cached data."""
+        self._cache.clear()
+        for task in self._pending.values():
+            task.cancel()
+        self._pending.clear()
+
+
+# Global cache instance for reuse across requests
+_global_cache: Optional[ImageCache] = None
+
+
+def get_image_cache() -> ImageCache:
+    """Get or create the global image cache instance."""
+    global _global_cache
+    if _global_cache is None:
+        _global_cache = ImageCache()
+    return _global_cache
+
+
+async def prefetch_images(urls: List[str]) -> None:
+    """
+    Convenience function to start prefetching images.
+
+    Args:
+        urls: List of image URLs to prefetch
+    """
+    cache = get_image_cache()
+    cache.start_prefetch(urls)
+
+
+async def get_cached_images(urls: List[str], wait_timeout: float = 3.0) -> Dict[str, str]:
+    """
+    Convenience function to get cached images.
+
+    Args:
+        urls: List of original URLs
+        wait_timeout: Max time to wait
+
+    Returns:
+        Dict mapping original URL to cached data URL
+    """
+    cache = get_image_cache()
+    return await cache.get_all_cached(urls, wait_timeout=wait_timeout)
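
The module exposes prefetch_images() and get_cached_images() as its public entry points. A minimal usage sketch, assuming a caller that receives image URLs from a search step and resolves them just before rendering; the handle_query function and its call sites are illustrative, not part of the package:

```python
import asyncio
from typing import Dict, List

# Import path matches the module added in this release.
from entari_plugin_hyw.image_cache import prefetch_images, get_cached_images


async def handle_query(image_urls: List[str]) -> Dict[str, str]:
    """Illustrative caller: prefetch early, resolve just before rendering."""
    # Kick off background downloads as soon as search results are available
    # (start_prefetch only schedules tasks, so this returns immediately).
    await prefetch_images(image_urls)

    # ... LLM / agent steps would run here while downloads proceed ...
    await asyncio.sleep(0)  # placeholder for the rest of the pipeline

    # Resolve each URL to a base64 data URL if its download finished within
    # the timeout; otherwise the original URL is returned unchanged.
    return await get_cached_images(image_urls, wait_timeout=3.0)
```
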
entari_plugin_hyw/{core/pipeline.py → pipeline.py}

@@ -1,6 +1,7 @@
 import asyncio
 import html
 import json
+import re
 import time
 from contextlib import asynccontextmanager
 from typing import Any, Dict, List, Optional, Tuple
@@ -8,16 +9,14 @@ from typing import Any, Dict, List, Optional, Tuple
 from loguru import logger
 from openai import AsyncOpenAI
 
-from .
-from
-from
+from .search import SearchService
+from .image_cache import get_cached_images
+from .prompts import (
     AGENT_SP,
     AGENT_SP_INSTRUCT_VISION_ADD,
     AGENT_SP_TOOLS_STANDARD_ADD,
     AGENT_SP_TOOLS_AGENT_ADD,
     AGENT_SP_SEARCH_ADD,
-    AGENT_SP_PAGE_ADD,
-    AGENT_SP_IMAGE_SEARCH_ADD,
     INSTRUCT_SP,
     INSTRUCT_SP_VISION_ADD,
     VISION_SP,
@@ -33,7 +32,7 @@ class ProcessingPipeline:
     Core pipeline (vision -> instruct/search -> agent).
     """
 
-    def __init__(self, config:
+    def __init__(self, config: Any):
         self.config = config
         self.search_service = SearchService(config)
         self.client = AsyncOpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
@@ -120,12 +119,9 @@ class ProcessingPipeline:
         final_response_content = ""
         structured: Dict[str, Any] = {}
 
-        # Reset search cache and ID
+        # Reset search cache and ID counter for this execution
         self.all_web_results = []
         self.global_id_counter = 0
-        self.search_id_counter = 0
-        self.page_id_counter = 0
-        self.image_id_counter = 0
 
         try:
             logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
@@ -189,7 +185,8 @@ class ProcessingPipeline:
             vision_text=vision_text,
             model=instruct_model,
         )
-
+        # Instruct time excludes search time (search_time is returned separately)
+        instruct_time = time.time() - instruct_start - search_time
 
         # Calculate Instruct Cost
         instruct_cost = 0.0
@@ -266,17 +263,18 @@ class ProcessingPipeline:
         if vision_text:
             system_prompt += AGENT_SP_INSTRUCT_VISION_ADD.format(vision_msgs=vision_text)
 
-        # Append search results
-        if has_search_results and search_msgs_text:
-            system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs=search_msgs_text)
-
-        # Append crawled page content
+        # Append all search results (text, page, image) in one block
         page_msgs_text = self._format_page_msgs()
+        all_search_parts = []
+        if has_search_results and search_msgs_text:
+            all_search_parts.append(search_msgs_text)
         if page_msgs_text:
-
-
+            all_search_parts.append(page_msgs_text)
         if has_image_results and image_msgs_text:
-
+            all_search_parts.append(image_msgs_text)
+
+        if all_search_parts:
+            system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs="\n".join(all_search_parts))
 
         last_system_prompt = system_prompt
@@ -332,6 +330,7 @@ class ProcessingPipeline:
                 "tool_results": [],
                 "tool_time": tool_exec_time,
                 "llm_time": step_llm_time,
+                "usage": step_usage,
             }
             for i, result in enumerate(results):
                 tc = tool_calls[i]
@@ -516,12 +515,18 @@ class ProcessingPipeline:
         for s in steps:
             if "tool_calls" in s:
                 # 1. Agent Thought Stage (with LLM time)
+                # Calculate step cost
+                step_usage = s.get("usage", {})
+                step_cost = 0.0
+                if a_in_price > 0 or a_out_price > 0:
+                    step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
+
                 stages_used.append({
                     "name": "Agent",
                     "model": a_model,
                     "icon_config": agent_icon,
                     "provider": agent_provider,
-                    "time": s.get("llm_time", 0), "cost":
+                    "time": s.get("llm_time", 0), "cost": step_cost
                 })
 
                 # 2. Grouped Tool Stages
@@ -602,21 +607,30 @@ class ProcessingPipeline:
             })
 
         # Assign total time/cost to last Agent stage
-
-
-            last_agent["time"] = a.get("time", 0)
-            last_agent["cost"] = a.get("cost", 0.0)
+        # Sum up total time/cost for UI/stats (implicit via loop above)
+        # No need to assign everything to last agent anymore as we distribute it.
 
         # --- Final Filter: Only show cited items in workflow cards ---
         cited_urls = {ref['url'] for ref in (structured.get("references", []) +
                                              structured.get("page_references", []) +
                                              structured.get("image_references", []))}
 
+        # Find images already rendered in markdown content (to avoid duplicate display)
+        markdown_image_urls = set()
+        md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
+        for match in md_img_pattern.finditer(final_content):
+            markdown_image_urls.add(match.group(1))
+
         for s in stages_used:
             if "references" in s and s["references"]:
                 s["references"] = [r for r in s["references"] if r.get("url") in cited_urls]
-            #
-            #
+            # Filter out images already shown in markdown content
+            # Check both url AND thumbnail since either might be used in markdown
+            if "image_references" in s and s["image_references"]:
+                s["image_references"] = [
+                    r for r in s["image_references"]
+                    if r.get("url") not in markdown_image_urls and (r.get("thumbnail") or "") not in markdown_image_urls
+                ]
             if "crawled_pages" in s and s["crawled_pages"]:
                 s["crawled_pages"] = [r for r in s["crawled_pages"] if r.get("url") in cited_urls]
@@ -633,6 +647,67 @@ class ProcessingPipeline:
         # Update the reference (since it might be used by caller)
         current_history[:] = cleaned_history
 
+        # --- Apply cached images to reduce render time ---
+        # Collect all image URLs that need caching (avoid duplicates when thumbnail == url)
+        all_image_urls = set()
+        for img_ref in structured.get("image_references", []):
+            if img_ref.get("thumbnail"):
+                all_image_urls.add(img_ref["thumbnail"])
+            if img_ref.get("url"):
+                all_image_urls.add(img_ref["url"])
+
+        for stage in stages_used:
+            for img_ref in stage.get("image_references", []):
+                if img_ref.get("thumbnail"):
+                    all_image_urls.add(img_ref["thumbnail"])
+                if img_ref.get("url"):
+                    all_image_urls.add(img_ref["url"])
+
+        # Also collect image URLs from markdown content
+        markdown_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
+        markdown_urls = markdown_img_pattern.findall(final_content)
+        all_image_urls.update(markdown_urls)
+
+        # Get cached versions (waits for pending downloads, with timeout)
+        if all_image_urls:
+            try:
+                cached_map = await get_cached_images(list(all_image_urls), wait_timeout=3.0)
+
+                # Apply cached URLs to structured response
+                for img_ref in structured.get("image_references", []):
+                    if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
+                        img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
+                    if img_ref.get("url") and img_ref["url"] in cached_map:
+                        img_ref["url"] = cached_map[img_ref["url"]]
+
+                # Apply cached URLs to stages
+                for stage in stages_used:
+                    for img_ref in stage.get("image_references", []):
+                        if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
+                            img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
+                        if img_ref.get("url") and img_ref["url"] in cached_map:
+                            img_ref["url"] = cached_map[img_ref["url"]]
+
+                # Replace image URLs in markdown content with cached versions
+                def replace_markdown_img(match):
+                    full_match = match.group(0)
+                    url = match.group(1)
+                    cached_url = cached_map.get(url)
+                    if cached_url and cached_url != url:
+                        return full_match.replace(url, cached_url)
+                    return full_match
+
+                final_content = markdown_img_pattern.sub(replace_markdown_img, final_content)
+                structured["response"] = markdown_img_pattern.sub(replace_markdown_img, structured.get("response", ""))
+
+                # Log cache stats
+                from .image_cache import get_image_cache
+                cache_stats = get_image_cache().get_stats()
+                logger.info(f"ImageCache stats: {cache_stats}")
+
+            except Exception as e:
+                logger.warning(f"Failed to apply image cache: {e}")
+
         return {
             "llm_response": final_content,
             "structured_response": structured,
@@ -1128,4 +1203,4 @@ class ProcessingPipeline:
         except Exception:
             pass
         # Do NOT close shared crawler here, as pipeline instances are now per-request.
-        # Shared crawler lifecycle is managed
+        # Shared crawler lifecycle is managed globally.
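
The markdown rewrite above is the core of the pipeline change: once get_cached_images() has resolved URLs, every ![alt](http...) image whose URL maps to a base64 data URL is swapped in place before rendering. A standalone sketch of that substitution, using the same regex and replacement strategy as the diff; the sample markdown and cached_map contents are made up:

```python
import re

# Same pattern as in pipeline.py: captures the URL inside a markdown image.
md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')

final_content = "Intro text ![cat](https://example.com/cat.jpg) more text"
cached_map = {"https://example.com/cat.jpg": "data:image/jpeg;base64,/9j..."}  # made-up value


def replace_markdown_img(match: re.Match) -> str:
    full_match = match.group(0)
    url = match.group(1)
    cached_url = cached_map.get(url)
    if cached_url and cached_url != url:
        # Swap the remote URL for the pre-downloaded data URL.
        return full_match.replace(url, cached_url)
    return full_match


print(md_img_pattern.sub(replace_markdown_img, final_content))
# -> Intro text ![cat](data:image/jpeg;base64,/9j...) more text
```
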
entari_plugin_hyw/{utils/prompts.py → prompts.py}

@@ -34,7 +34,7 @@ INSTRUCT_SP = """# 你是一个专业的指导专家.
 {tools_desc}
 
 ## 你的回复
-调用工具后无需回复额外文本节省token.
+调用工具后无需回复额外文本节省 token.
 
 ## 用户消息
 ```
@@ -61,10 +61,11 @@ AGENT_SP = """# 你是一个 Agent 总控专家, 你需要理解用户意图,
 - 正文格式:
   - 先给出一个 `# `大标题约 8-10 个字, 不要有多余废话, 不要直接回答用户的提问.
   - 然后紧接着给出一个 <summary>...</summary>, 除了给出一个约 100 字的纯文本简介, 介绍本次输出的长文的清晰、重点概括.
-  -
-
-
-
+  - 随后开始详细二级标题 + markdown 正文, 语言描绘格式丰富多样, 简洁准确可信.
+  - 请不要给出过长的代码、表格列数等, 请控制字数在 600 字内, 只讲重点和准确的数据.
+  - 不支持渲染: 链接, 图片链接, mermaid
+  - 支持渲染: 公式, 代码高亮, 只在需要的时候给出.
+  - 图片链接、链接框架会自动渲染出, 你无需显式给出.
 - 引用:
   > 重要: 所有正文内容必须基于实际信息, 保证百分百真实度
   - 信息来源已按获取顺序编号为 [1], [2], [3]...
@@ -98,24 +99,6 @@ AGENT_SP_INSTRUCT_VISION_ADD = """
 """
 
 AGENT_SP_SEARCH_ADD = """
-##
-```text
+## 联网信息
 {search_msgs}
-```
-"""
-
-AGENT_SP_PAGE_ADD = """
-## 页面内容专家消息
-```text
-{page_msgs}
-```
-- 引用页面内容时, 必须使用 `page:id` 格式
-"""
-
-AGENT_SP_IMAGE_SEARCH_ADD = """
-## 图像搜索专家消息
-```text
-{image_search_msgs}
-```
-- 每进行一次 internal_image_search, 挑选 1 张图像插入正文
 """