entari-plugin-hyw 3.3.5__py3-none-any.whl → 3.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +14 -351
- entari_plugin_hyw/assets/libs/tailwind.css +1 -1
- entari_plugin_hyw/assets/tailwind.input.css +1 -1
- entari_plugin_hyw/assets/template.j2 +113 -20
- entari_plugin_hyw/core/config.py +1 -0
- entari_plugin_hyw/core/pipeline.py +131 -103
- entari_plugin_hyw/core/render.py +65 -41
- entari_plugin_hyw/utils/prompts.py +26 -16
- entari_plugin_hyw/utils/search.py +233 -3
- entari_plugin_hyw-3.3.7.dist-info/METADATA +142 -0
- {entari_plugin_hyw-3.3.5.dist-info → entari_plugin_hyw-3.3.7.dist-info}/RECORD +13 -14
- entari_plugin_hyw/core/render.py.bak +0 -926
- entari_plugin_hyw-3.3.5.dist-info/METADATA +0 -142
- {entari_plugin_hyw-3.3.5.dist-info → entari_plugin_hyw-3.3.7.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.3.5.dist-info → entari_plugin_hyw-3.3.7.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/assets/template.j2
CHANGED

@@ -16,6 +16,40 @@
     <script>{{ katex_auto_render_js | safe }}</script>
     <!-- @formatter:on -->
 
+    <style>
+        /* Fallback style for broken images in markdown content */
+        .img-error-fallback {
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            gap: 8px;
+            width: 100%;
+            aspect-ratio: 16 / 9;
+            margin-bottom: 8px;
+            background: linear-gradient(135deg, #d3e4fd 0%, #b7d3fe 50%, #8bb9fc 100%);
+            border-radius: 12px;
+            color: white;
+            font-size: 14px;
+            font-weight: 500;
+            box-shadow: 0 4px 12px rgba(59, 130, 246, 0.25);
+        }
+        .img-error-fallback i {
+            font-size: 20px;
+        }
+        /* Dynamic image sizing based on aspect ratio */
+        #markdown-content img {
+            border-radius: 8px;
+            margin-bottom: 8px;
+        }
+        #markdown-content img.img-horizontal {
+            width: 100%;
+            height: auto;
+        }
+        #markdown-content img.img-vertical {
+            width: 60%;
+            height: auto;
+        }
+    </style>
 </head>
 
 <body class="bg-[#f2f2f2] p-0 box-border m-0 font-sans text-gray-800">
@@ -135,7 +169,7 @@
     {{ list_card(stage.icon_html, title_html, subtitle_html=stats_html, is_compact=True, icon_box_class=icon_box_class) }}
 
     {# Nested Children (Indent & Connect) #}
-    {% if stage.references or stage.flow_steps or stage.crawled_pages %}
+    {% if stage.references or stage.image_references or stage.flow_steps or stage.crawled_pages %}
     <div class="ml-4 pl-4 border-l-2 border-gray-200 mt-2 flex flex-col gap-2">
 
         {# References #}
@@ -158,18 +192,23 @@
         {% endfor %}
         {% endif %}
 
-        {#
-        {% if stage.
-
-
-        {% set
+        {# Image References #}
+        {% if stage.image_references %}
+        <div class="text-[12px] uppercase font-bold text-blue-600 tracking-wider mb-1 mt-2">Images</div>
+        {% for img in stage.image_references %}
+        {% set favicon_url = "https://www.google.com/s2/favicons?domain=" + img.domain + "&sz=32" %}
 
-        {% set
-        {
-        {%
+        {% set img_icon %}
+        <img src="{{ favicon_url }}" class="w-3.5 h-3.5 rounded-sm opacity-80">
+        {% endset %}
 
-        {
-
+        {% set img_icon_box = "bg-white rounded border border-gray-100 w-6 h-6 shrink-0" %}
+
+        {% set title_html = '<div class="text-[13px] font-medium text-gray-900 truncate">' + img.title + '</div>' %}
+        {% set subtitle_html = '<div class="text-[12px] text-gray-500 truncate">' + img.domain + '</div>' %}
+
+        {{ list_card(img_icon, title_html, subtitle_html=subtitle_html, link_url=img.url, is_compact=True, icon_box_class=img_icon_box) }}
+        {% endfor %}
         {% endif %}
 
         {# Crawled Pages #}
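For reference, each entry the new Images block consumes is a small dict; the keys (title, url, domain, plus thumbnail for image results) match the entry dict that _parse_tagged_response assembles in the pipeline diff further down. A sketch with illustrative values only:

# Illustrative values only; keys mirror the entry dict built by
# _parse_tagged_response for stage.image_references.
image_reference = {
    "title": "Example diagram",                # card title text
    "url": "https://example.com/post",         # link target of the card
    "domain": "example.com",                   # feeds the favicon lookup
    "thumbnail": "https://example.com/t.png",  # only set for image entries
}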
@@ -272,23 +311,46 @@
                 const fragment = document.createDocumentFragment();
                 let lastIndex = 0;
                 const text = textNode.nodeValue;
-
+                // Regex to capture:
+                // 1. Optional brackets/parens: [(
+                // 2. Type: search/page
+                // 3. IDs: 1 or 1,2,3
+                // 4. Closing: )]
+                const regex = /[\[\(]?(search|page):\s*([\d,\s]+)[\]\)]?/gi;
                 let match;
 
                 while ((match = regex.exec(text)) !== null) {
+                    // Validate match: simple check to ensure it contains digits
+                    if (!/\d/.test(match[2])) continue;
+
                     fragment.appendChild(document.createTextNode(text.substring(lastIndex, match.index)));
 
+                    const fullMatch = match[0];
                     const type = match[1].toLowerCase();
-                    const
+                    const idString = match[2];
 
-
-                    const
-                    const colorClass = isPage
-                        ? "text-orange-600 bg-orange-50 border-orange-200"
-                        : "text-blue-600 bg-blue-50 border-blue-200";
+                    // Parse IDs (split by comma or space)
+                    const ids = idString.split(/[,\s]+/).filter(s => s.trim().length > 0);
 
-
-
+                    // Check for standard format (allow plain or [brackets])
+                    // Standard: search:1, [search:1], page:1, [page:1]
+                    // Non-standard: (page:1), page:1,2, (page:1,2)
+                    const isStandard = /^[\[]?(search|page):\d+[\]]?$/i.test(fullMatch);
+
+                    if (!isStandard) {
+                        console.warn(`[Template] Detected non-standard citation format: "${fullMatch}". Rendered as: ${type}:${ids.join(',')}`);
+                    }
+
+                    ids.forEach(id => {
+                        const span = document.createElement("span");
+                        const isPage = type === "page";
+                        const colorClass = isPage
+                            ? "text-orange-600 bg-orange-50 border-orange-200"
+                            : "text-blue-600 bg-blue-50 border-blue-200";
+
+                        span.innerHTML = `<span class="inline-flex items-center justify-center min-w-[14px] h-4 px-0.5 text-[9px] font-bold ${colorClass} border rounded align-top -top-0.5 relative mx-0.5 cursor-default" title="${type}:${id}">${id}</span>`;
+                        fragment.appendChild(span.firstElementChild);
+                    });
 
                     lastIndex = regex.lastIndex;
                 }
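The new regex is deliberately tolerant: it accepts an optional bracket or parenthesis, a search/page type, and one or more comma- or space-separated IDs, and a second strict pattern then decides whether to log a warning. A rough Python rehearsal of the same two patterns, for illustration only (the shipped logic runs in the template's JavaScript):

import re

# Tolerant matcher, mirroring the template's JS regex: optional bracket or
# paren, a type, then one or more comma/space-separated IDs.
CITATION = re.compile(r'[\[\(]?(search|page):\s*([\d,\s]+)[\]\)]?', re.IGNORECASE)
# Strict form: a single ID, optionally in square brackets.
STANDARD = re.compile(r'^\[?(search|page):\d+\]?$', re.IGNORECASE)

for sample in ["[search:1]", "(page:1,2)", "page: 3 4"]:
    m = CITATION.search(sample)
    ids = [s for s in re.split(r'[,\s]+', m.group(2)) if s]
    print(sample, "->", m.group(1).lower(), ids,
          "standard" if STANDARD.match(m.group(0)) else "non-standard")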
@@ -301,6 +363,37 @@
             }
 
             processCitations(contentDiv);
+
+            // Handle broken images in markdown content
+            const contentImages = contentDiv.querySelectorAll('img');
+            contentImages.forEach(img => {
+                // Apply sizing class based on aspect ratio
+                const applySizeClass = function() {
+                    if (this.naturalWidth >= this.naturalHeight) {
+                        this.classList.add('img-horizontal');
+                    } else {
+                        this.classList.add('img-vertical');
+                    }
+                };
+
+                img.onerror = function() {
+                    const fallback = document.createElement('span');
+                    fallback.className = 'img-error-fallback';
+                    fallback.innerHTML = `<span style="font-size: 18px;">(。•́︿•̀。)</span><span>渲染失败</span>`;
+                    this.parentNode.replaceChild(fallback, this);
+                };
+
+                // Check if image already loaded
+                if (img.complete) {
+                    if (img.naturalHeight === 0) {
+                        img.onerror();
+                    } else {
+                        applySizeClass.call(img);
+                    }
+                } else {
+                    img.onload = applySizeClass;
+                }
+            });
         });
     </script>
 </body>
entari_plugin_hyw/core/pipeline.py
CHANGED

@@ -39,6 +39,10 @@ class ProcessingPipeline:
         self.client = AsyncOpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
         self.all_web_results = []  # Cache for search results
         self.current_mode = "standard"  # standard | agent
+        # Independent ID counters for each type
+        self.search_id_counter = 0
+        self.page_id_counter = 0
+        self.image_id_counter = 0
 
         self.web_search_tool = {
             "type": "function",
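Giving each result type its own counter means search, page, and image results are numbered independently, so search:1 and page:1 can coexist; the old scheme derived a single running maximum across the shared cache. A toy sketch of the numbering (names mirror the pipeline fields, values are illustrative):

# Toy illustration of the per-type ID scheme.
counters = {"search": 0, "page": 0, "image": 0}
all_web_results = []

def cache(kind, **fields):
    counters[kind] += 1
    all_web_results.append({"_id": counters[kind], "_type": kind, **fields})

cache("search", title="first hit")
cache("search", title="second hit")
cache("page", url="https://example.com")  # page IDs restart at 1

print([(r["_type"], r["_id"]) for r in all_web_results])
# [('search', 1), ('search', 2), ('page', 1)]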
@@ -118,8 +122,11 @@ class ProcessingPipeline:
         final_response_content = ""
         structured: Dict[str, Any] = {}
 
-        # Reset search cache for this execution
+        # Reset search cache and ID counters for this execution
         self.all_web_results = []
+        self.search_id_counter = 0
+        self.page_id_counter = 0
+        self.image_id_counter = 0
 
         try:
             logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
@@ -244,8 +251,8 @@ class ProcessingPipeline:
         search_msgs_text = self._format_search_msgs()
         image_msgs_text = self._format_image_search_msgs()
 
-        has_search_results = any(
-        has_image_results = any(r.get("
+        has_search_results = any(r.get("_type") == "search" for r in self.all_web_results)
+        has_image_results = any(r.get("_type") == "image" for r in self.all_web_results)
 
         # Build agent system prompt
         agent_prompt_tpl = getattr(self.config, "agent_system_prompt", None) or AGENT_SP
@@ -462,7 +469,7 @@ class ProcessingPipeline:
         for tc in crawl_calls:
             url = tc.get("arguments", {}).get("url", "")
             # Try to find cached result
-            found = next((r for r in self.all_web_results if r.get("url") == url and r.get("
+            found = next((r for r in self.all_web_results if r.get("url") == url and r.get("_type") == "page"), None)
             if found:
                 try:
                     from urllib.parse import urlparse
@@ -588,6 +595,19 @@ class ProcessingPipeline:
             last_agent["time"] = a.get("time", 0)
             last_agent["cost"] = a.get("cost", 0.0)
 
+        # Clean up conversation history: Remove tool calls and results to save tokens and avoid ID conflicts
+        # Keep only 'user' messages and 'assistant' messages without tool_calls (final answers)
+        cleaned_history = []
+        for msg in current_history:
+            if msg.get("role") == "tool":
+                continue
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                continue
+            cleaned_history.append(msg)
+
+        # Update the reference (since it might be used by caller)
+        current_history[:] = cleaned_history
+
         return {
             "llm_response": final_content,
             "structured_response": structured,
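The slice assignment is the load-bearing detail: current_history[:] = cleaned_history mutates the existing list object, so any caller holding a reference sees the pruned history, while plain rebinding would not. A small stand-alone demonstration with toy messages:

# Toy messages: tool results and tool-call turns are dropped, as in the diff.
history = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "tool_calls": [{"id": "t1"}]},
    {"role": "tool", "content": "result"},
    {"role": "assistant", "content": "final answer"},
]
caller_view = history  # caller keeps a reference to the same list

cleaned = [m for m in history
           if m.get("role") != "tool"
           and not (m.get("role") == "assistant" and m.get("tool_calls"))]
history[:] = cleaned   # in place: caller_view is pruned too

print(len(caller_view))  # 2 -> only the user turn and the final answer remain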
@@ -609,8 +629,8 @@ class ProcessingPipeline:
         }
 
     def _parse_tagged_response(self, text: str) -> Dict[str, Any]:
-        """Parse response for references and page references."""
-        parsed = {"response": "", "references": [], "page_references": [], "flow_steps": []}
+        """Parse response for references and page references reordered by appearance."""
+        parsed = {"response": "", "references": [], "page_references": [], "image_references": [], "flow_steps": []}
         if not text:
             return parsed
 
@@ -620,7 +640,6 @@ class ProcessingPipeline:
 
         # 1. Try to unwrap JSON if the model acted like a ReAct agent
         try:
-            # Check if it looks like JSON first to avoid performance hit
             if remaining_text.strip().startswith("{") and "action" in remaining_text:
                 data = json.loads(remaining_text)
                 if isinstance(data, dict) and "action_input" in data:
@@ -628,86 +647,104 @@ class ProcessingPipeline:
         except Exception:
             pass
 
-
-
+        # 2. Extract references from text first (Order by appearance)
+        # Pattern matches [search:123], [page:123], [image:123]
+        pattern = re.compile(r'\[(search|page|image):(\d+)\]', re.IGNORECASE)
+
+        matches = list(pattern.finditer(remaining_text))
+
+        search_map = {}  # old_id_str -> new_id (int)
+        page_map = {}
+        image_map = {}
+
+        def process_ref(tag_type, old_id):
+            # Find in all_web_results
+            result_item = next((r for r in self.all_web_results if r.get("_id") == old_id and r.get("_type") == tag_type), None)
+
+            if not result_item:
+                return
+
+            entry = {
+                "title": result_item.get("title", ""),
+                "url": result_item.get("url", ""),
+                "domain": result_item.get("domain", "")
+            }
+            if tag_type == "image":
+                entry["thumbnail"] = result_item.get("thumbnail", "")
+
+            # Add to respective list and map
+            # Check maps to avoid duplicates
+            if tag_type == "search":
+                if str(old_id) not in search_map:
+                    parsed["references"].append(entry)
+                    search_map[str(old_id)] = len(parsed["references"])
+            elif tag_type == "page":
+                if str(old_id) not in page_map:
+                    parsed["page_references"].append(entry)
+                    page_map[str(old_id)] = len(parsed["page_references"])
+            elif tag_type == "image":
+                if str(old_id) not in image_map:
+                    parsed["image_references"].append(entry)
+                    image_map[str(old_id)] = len(parsed["image_references"])
+
+        # Pass 1: Text Body
+        for m in matches:
+            try:
+                process_ref(m.group(1).lower(), int(m.group(2)))
+            except ValueError:
+                continue
 
-        #
+        # 3. Pass 2: References Block (Capture items missed in text)
         ref_block_match = re.search(r'```references\s*(.*?)\s*```', remaining_text, re.DOTALL | re.IGNORECASE)
         if ref_block_match:
             ref_content = ref_block_match.group(1).strip()
+            remaining_text = remaining_text.replace(ref_block_match.group(0), "").strip()
+
             for line in ref_content.split("\n"):
                 line = line.strip()
                 if not line: continue
+                # Match [id] [type]
+                # e.g. [1] [image] ... or [image:1] ...
 
-                #
-
-                id_match = re.match(r"^\[(\d+)\]", line)
-                type_match = re.search(r"\[(search|page)\]", line, re.IGNORECASE)
-                link_match = re.search(r"\[([^\[\]]+)\]\(([^)]+)\)", line)
-
-                idx = None
+                # Check for [id] [type] format
+                id_match = re.match(r"^\[(\d+)\]\s*\[(search|page|image)\]", line, re.IGNORECASE)
                 if id_match:
                     try:
-
+                        process_ref(id_match.group(2).lower(), int(id_match.group(1)))
                     except ValueError:
                         pass
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                if idx is not None:
-                    id_map[str(idx)] = len(parsed["references"])
-
-            remaining_text = remaining_text.replace(ref_block_match.group(0), "").strip()
+                else:
+                    # Check for [type:id] format in list
+                    alt_match = re.match(r"^\[(search|page|image):(\d+)\]", line, re.IGNORECASE)
+                    if alt_match:
+                        try:
+                            process_ref(alt_match.group(1).lower(), int(alt_match.group(2)))
+                        except ValueError:
+                            pass
+
+        # 4. Replace tags in text with new sequential IDs
+
+        # 4. Replace tags in text with new sequential IDs
+        def replace_tag(match):
+            tag_type = match.group(1).lower()
+            old_id = match.group(2)
+
+            new_id = None
+            if tag_type == "search":
+                new_id = search_map.get(old_id)
+            elif tag_type == "page":
+                new_id = page_map.get(old_id)
+            elif tag_type == "image":
+                new_id = image_map.get(old_id)
+
+            if new_id is not None:
+                if tag_type == "image":
+                    return ""
+                return f"[{tag_type}:{new_id}]"
+
+            return match.group(0)
 
-
-        if id_map:
-            def replace_search_citation(match):
-                old_id = match.group(1) or match.group(2)
-                if old_id in id_map:
-                    return f"`search:{id_map[old_id]}`"
-                return match.group(0)
-
-            remaining_text = re.sub(r'\[(\d+)\]', replace_search_citation, remaining_text)
-            remaining_text = re.sub(r'(?<!`)search:(\d+)(?!`)', replace_search_citation, remaining_text)
-            remaining_text = re.sub(r'`search:(\d+)`', replace_search_citation, remaining_text)
-
-            # Replace page:id citations
-            if page_id_map:
-                def replace_page_citation(match):
-                    old_id = match.group(1)
-                    if old_id in page_id_map:
-                        return f"`page:{page_id_map[old_id]}`"
-                    return match.group(0)
-
-                remaining_text = re.sub(r'(?<!`)page:(\d+)(?!`)', replace_page_citation, remaining_text)
-                remaining_text = re.sub(r'`page:(\d+)`', replace_page_citation, remaining_text)
+        remaining_text = pattern.sub(replace_tag, remaining_text)
 
         parsed["response"] = remaining_text.strip()
         return parsed
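Net effect of the rewrite: pass 1 collects [type:id] tags in order of appearance, pass 2 sweeps a trailing ```references fenced block for anything the body missed, and a final substitution renumbers tags to sequential per-type IDs (inline image tags are blanked, since images render as cards instead). A condensed sketch of just the renumbering idea, not the full method:

import re

# Condensed illustration: renumber [search:N]/[page:N] tags per type,
# in order of first appearance, as the reworked parser does.
pattern = re.compile(r'\[(search|page|image):(\d+)\]', re.IGNORECASE)
maps = {"search": {}, "page": {}, "image": {}}

text = "Intro [search:7] then [page:12] and again [search:7]."

for m in pattern.finditer(text):
    ids = maps[m.group(1).lower()]
    ids.setdefault(m.group(2), len(ids) + 1)  # first appearance wins

def renumber(m):
    kind = m.group(1).lower()
    new_id = maps[kind].get(m.group(2))
    if new_id is None:
        return m.group(0)          # unknown reference: leave untouched
    return "" if kind == "image" else f"[{kind}:{new_id}]"

print(pattern.sub(renumber, text))
# Intro [search:1] then [page:1] and again [search:1].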
@@ -730,12 +767,11 @@ class ProcessingPipeline:
             query = args.get("query")
             web = await self.search_service.search(query)
 
-            # Cache results and assign IDs
-            current_max_id = max([item.get("_id", 0) for item in self.all_web_results], default=0)
-
+            # Cache results and assign search-specific IDs
             for item in web:
-
-                item["_id"] =
+                self.search_id_counter += 1
+                item["_id"] = self.search_id_counter
+                item["_type"] = "search"
                 item["query"] = query
                 self.all_web_results.append(item)
 
@@ -745,10 +781,11 @@ class ProcessingPipeline:
             query = args.get("query")
             images = await self.search_service.image_search(query)
 
-
+            # Cache results and assign image-specific IDs
             for item in images:
-
-                item["_id"] =
+                self.image_id_counter += 1
+                item["_id"] = self.image_id_counter
+                item["_type"] = "image"
                 item["query"] = query
                 item["is_image"] = True
                 self.all_web_results.append(item)
@@ -761,15 +798,15 @@ class ProcessingPipeline:
             # Returns Dict: {content, title, url}
             result_dict = await self.search_service.fetch_page(url)
 
-            # Cache the crawled content
-
-            current_max_id += 1
+            # Cache the crawled content with page-specific ID
+            self.page_id_counter += 1
 
             cached_item = {
-                "_id":
+                "_id": self.page_id_counter,
+                "_type": "page",
                 "title": result_dict.get("title", "Page"),
                 "url": result_dict.get("url", url),
-                "content": result_dict.get("content", "")
+                "content": result_dict.get("content", ""),
                 "domain": "",
                 "is_crawled": True,
             }
@@ -940,18 +977,13 @@ class ProcessingPipeline:
         if not self.all_web_results:
             return ""
 
-        def clip(s: str, n: int) -> str:
-            s = (s or "").strip()
-            return s if len(s) <= n else s[: n - 1] + "…"
-
         lines = []
         for res in self.all_web_results:
-            if res.get("
-            if res.get("is_crawled"): continue  # Skip crawled pages (handled separately)
+            if res.get("_type") != "search": continue  # Only search results
             idx = res.get("_id")
-            title =
+            title = (res.get("title", "") or "").strip()
             url = res.get("url", "")
-            content =
+            content = (res.get("content", "") or "").strip()
             lines.append(f"[{idx}] Title: {title}\nURL: {url}\nSnippet: {content}\n")
 
         return "\n".join(lines)
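The [N] indices in this formatted context are the per-type _id values, which is what the model's [search:N] citations point back to. Roughly, for a toy cache:

# Toy cache entries shaped like the pipeline's all_web_results items.
all_web_results = [
    {"_id": 1, "_type": "search", "title": "Doc A", "url": "https://a.test", "content": "snippet A"},
    {"_id": 1, "_type": "page", "title": "Page B", "url": "https://b.test", "content": "full text"},
]

lines = []
for res in all_web_results:
    if res.get("_type") != "search":
        continue  # only search results, as in _format_search_msgs
    lines.append(f"[{res['_id']}] Title: {res['title']}\nURL: {res['url']}\nSnippet: {res['content']}\n")

print("\n".join(lines))
# [1] Title: Doc A
# URL: https://a.test
# Snippet: snippet A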
@@ -961,17 +993,13 @@ class ProcessingPipeline:
         if not self.all_web_results:
             return ""
 
-        def clip(s: str, n: int) -> str:
-            s = (s or "").strip()
-            return s if len(s) <= n else s[: n - 1] + "…"
-
         lines = []
         for res in self.all_web_results:
-            if
+            if res.get("_type") != "page": continue  # Only page results
             idx = res.get("_id")
-            title =
+            title = (res.get("title", "") or "").strip()
             url = res.get("url", "")
-            content =
+            content = (res.get("content", "") or "").strip()
             lines.append(f"[{idx}] Title: {title}\nURL: {url}\nContent: {content}\n")
 
         return "\n".join(lines)
@@ -982,7 +1010,7 @@ class ProcessingPipeline:
 
         lines = []
         for res in self.all_web_results:
-            if
+            if res.get("_type") != "image": continue  # Only image results
             idx = res.get("_id")
             title = res.get("title", "")
             url = res.get("image", "") or res.get("url", "")