entari-plugin-hyw 4.0.0rc4__py3-none-any.whl → 4.0.0rc6__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

Potentially problematic release.

Files changed (30)
  1. entari_plugin_hyw/__init__.py +216 -75
  2. entari_plugin_hyw/assets/card-dist/index.html +70 -79
  3. entari_plugin_hyw/browser/__init__.py +10 -0
  4. entari_plugin_hyw/browser/engines/base.py +13 -0
  5. entari_plugin_hyw/browser/engines/bing.py +95 -0
  6. entari_plugin_hyw/browser/engines/duckduckgo.py +137 -0
  7. entari_plugin_hyw/browser/engines/google.py +155 -0
  8. entari_plugin_hyw/browser/landing.html +172 -0
  9. entari_plugin_hyw/browser/manager.py +153 -0
  10. entari_plugin_hyw/browser/service.py +304 -0
  11. entari_plugin_hyw/card-ui/src/App.vue +526 -182
  12. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +7 -11
  13. entari_plugin_hyw/card-ui/src/components/StageCard.vue +33 -30
  14. entari_plugin_hyw/card-ui/src/types.ts +9 -0
  15. entari_plugin_hyw/definitions.py +155 -0
  16. entari_plugin_hyw/history.py +111 -33
  17. entari_plugin_hyw/misc.py +34 -0
  18. entari_plugin_hyw/modular_pipeline.py +384 -0
  19. entari_plugin_hyw/render_vue.py +326 -239
  20. entari_plugin_hyw/search.py +95 -708
  21. entari_plugin_hyw/stage_base.py +92 -0
  22. entari_plugin_hyw/stage_instruct.py +345 -0
  23. entari_plugin_hyw/stage_instruct_deepsearch.py +104 -0
  24. entari_plugin_hyw/stage_summary.py +164 -0
  25. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/METADATA +4 -4
  26. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/RECORD +28 -16
  27. entari_plugin_hyw/pipeline.py +0 -1219
  28. entari_plugin_hyw/prompts.py +0 -47
  29. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/WHEEL +0 -0
  30. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/browser/__init__.py
@@ -0,0 +1,10 @@
+from .manager import get_shared_browser_manager, close_shared_browser
+from .service import get_screenshot_service, close_screenshot_service, prestart_browser
+
+__all__ = [
+    "get_shared_browser_manager",
+    "close_shared_browser",
+    "get_screenshot_service",
+    "close_screenshot_service",
+    "prestart_browser",
+]
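These re-exports make `entari_plugin_hyw.browser` the public entry point, so callers never import the `manager` or `service` modules directly. A minimal usage, using only names the package actually exports:

```python
# Import the browser helpers through the package root rather than the
# private manager/service modules; these names come from __all__ above.
from entari_plugin_hyw.browser import prestart_browser, get_screenshot_service
```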
entari_plugin_hyw/browser/engines/base.py
@@ -0,0 +1,13 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any
+
+class SearchEngine(ABC):
+    @abstractmethod
+    def build_url(self, query: str, limit: int = 10) -> str:
+        """Build the search URL for the given query."""
+        pass
+
+    @abstractmethod
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        """Parse the raw HTML/Markdown content into a list of results."""
+        pass
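The ABC is the seam between URL construction and result parsing; fetching the page happens elsewhere (in `browser/service.py`, whose body is not part of this diff). A rough sketch of how a caller might drive any engine, where `fetch_page` is an assumed stand-in for the plugin's real fetch:

```python
# Hedged sketch: fetch_page is a placeholder callable (str -> str), not a
# real API from this package; the engine classes themselves are real.
from typing import Any, Callable, Dict, List

from entari_plugin_hyw.browser.engines.base import SearchEngine

def run_search(engine: SearchEngine, fetch_page: Callable[[str], str],
               query: str) -> List[Dict[str, Any]]:
    url = engine.build_url(query, limit=10)  # engine-specific query URL
    page = fetch_page(url)                   # raw HTML/Markdown of the results page
    return engine.parse(page)                # engine-specific extraction
```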
entari_plugin_hyw/browser/engines/bing.py
@@ -0,0 +1,95 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+class BingEngine(SearchEngine):
+    """
+    Search engine implementation for Bing.
+    """
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        encoded_query = urllib.parse.quote(query)
+        base = "https://www.bing.com/search"
+        return f"{base}?form=&q={encoded_query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        # Split on the b_algo class to isolate individual results
+        chunks = content.split('class="b_algo"')
+
+        # Helper to decode Bing redirect URLs roughly
+        def decode_bing_url(u):
+            if "bing.com/ck/a?" not in u: return u
+            try:
+                # URL is usually like ...&u=a1<base64>&...
+                # We look for &u=...
+                import base64
+                match = re.search(r'[?&]u=a1([^&]+)', u)
+                if match:
+                    # Bing uses URL-safe base64 with an 'a1' prefix;
+                    # the prefix is already stripped by the regex group
+                    b64 = match.group(1)
+                    # restore stripped '=' padding
+                    b64 += '=' * (-len(b64) % 4)
+                    # base64url -> standard base64
+                    b64 = b64.replace('-', '+').replace('_', '/')
+                    decoded = base64.b64decode(b64).decode('utf-8')
+                    return decoded
+            except Exception:
+                pass
+            return u
+
+        seen_urls = set()
+
+        for chunk in chunks[1:]:
+            # Extract title and snippet from within the chunk
+            # Title: <h2><a href="...">...</a></h2>
+            link_match = re.search(r'<h2[^>]*>.*?<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', chunk, re.IGNORECASE | re.DOTALL)
+            if not link_match:
+                # Fallback: plain <a> tag
+                link_match = re.search(r'<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', chunk, re.IGNORECASE | re.DOTALL)
+
+            if link_match:
+                raw_url = link_match.group(1)
+                title_html = link_match.group(2)
+                title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+                url = decode_bing_url(raw_url)
+
+                if url in seen_urls: continue
+                seen_urls.add(url)
+
+                # Snippet: class="b_caption" ... <p> ... </p> or just div text
+                snippet = ""
+                caption_match = re.search(r'class="b_caption"[^>]*>(.*?)</div>', chunk, re.IGNORECASE | re.DOTALL)
+                if caption_match:
+                    snippet_html = caption_match.group(1)
+                    snippet = re.sub(r'<[^>]+>', ' ', snippet_html).strip()
+                else:
+                    # Fallback: take the text right after the link
+                    start = link_match.end()
+                    snippet = re.sub(r'<[^>]+>', ' ', chunk[start:start+600]).strip()
+
+                snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+                # Image extraction (basic)
+                images = []
+                img_matches = re.findall(r'<img[^>]+src=["\'](http[^"\']+)["\']', chunk)
+                for img_url in img_matches:
+                    if not any(x in img_url for x in ['favicon', 'icon', 'tracking', 'pixel']):
+                        images.append(img_url)
+
+                if url and title:
+                    results.append({
+                        "title": title,
+                        "url": url,
+                        "domain": urllib.parse.urlparse(url).hostname or "",
+                        "content": snippet[:5000],
+                        "images": images[:3]
+                    })
+
+        logger.info(f"BingEngine parsed {len(results)} results.")
+        return results
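Bing wraps organic result links in `bing.com/ck/a?` redirects whose `u` parameter is `a1` followed by a URL-safe base64 payload; `decode_bing_url` reverses that. A standalone round-trip check of the same steps, using a synthetic redirect URL rather than a live capture:

```python
# Re-create the a1+base64url scheme that decode_bing_url assumes, end to end.
import base64
import re

target = "https://example.com/some/page?x=1"
b64 = base64.urlsafe_b64encode(target.encode()).decode().rstrip("=")
redirect = f"https://www.bing.com/ck/a?!&&p=abc&u=a1{b64}&ntb=1"

match = re.search(r"[?&]u=a1([^&]+)", redirect)
payload = match.group(1)
payload += "=" * (-len(payload) % 4)                   # restore stripped padding
payload = payload.replace("-", "+").replace("_", "/")  # base64url -> standard base64
assert base64.b64decode(payload).decode("utf-8") == target
```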
entari_plugin_hyw/browser/engines/duckduckgo.py
@@ -0,0 +1,137 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+class DuckDuckGoEngine(SearchEngine):
+    """
+    Parser for DuckDuckGo Lite results.
+    Handles both Markdown (from Crawl4AI) and HTML (fallback).
+    """
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        encoded_query = urllib.parse.quote(query)
+        # Default endpoint (not configurable per instance; effectively what we support as a "searxng"-style backend)
+        base = "https://lite.duckduckgo.com/lite/"
+        return f"{base}?q={encoded_query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        # Prioritize HTML parsing if content looks like HTML
+        if "<html" in content.lower() or "<!doctype" in content.lower() or "<div" in content.lower():
+            results = self._parse_html(content)
+            if results:
+                return results
+
+        # Fallback to Markdown
+        return self._parse_markdown(content)
+
+    def _parse_html(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        seen_urls = set()
+
+        # Simple regex for DDG Lite / SearXNG HTML structure
+        link_regex = re.compile(r'<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
+
+        pos = 0
+        while True:
+            match = link_regex.search(content, pos)
+            if not match:
+                break
+
+            href = match.group(1)
+            title_html = match.group(2)
+
+            # Clean title
+            title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+            pos = match.end()
+
+            # Filter junk
+            if "search" in href and "q=" in href: continue
+            if "google.com" in href or "bing.com" in href: continue
+            if href in seen_urls: continue
+
+            # Look ahead for snippet
+            snippet_chunk = content[pos:pos+1000]
+            snippet_match = re.search(r'(.*?)<a', snippet_chunk, re.DOTALL | re.IGNORECASE)
+            raw_snippet = snippet_match.group(1) if snippet_match else snippet_chunk
+
+            # Clean HTML tags from snippet
+            snippet = re.sub(r'<[^>]+>', ' ', raw_snippet)
+            snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+            # Effectively no truncation, just a very generous safety limit
+            snippet = snippet[:5000]
+
+            # Valid result check
+            if title and len(title) > 2 and snippet:
+                # Extract images from the result block (rough heuristic)
+                images = []
+                img_matches = re.findall(r'<img[^>]+src=["\'](http[^"\']+)["\']', snippet_match.group(0) if snippet_match else snippet_chunk)
+                for img_url in img_matches:
+                    if not any(x in img_url for x in ['favicon', 'icon', 'tracking', 'pixel']):
+                        images.append(img_url)
+
+                results.append({
+                    "title": title,
+                    "url": href,
+                    "domain": urllib.parse.urlparse(href).hostname or "",
+                    "content": snippet,
+                    "images": images[:3]  # Limit per result
+                })
+                seen_urls.add(href)
+
+        logger.info(f"DuckDuckGo Parser(HTML) found {len(results)} results.")
+        return results
+
+    def _parse_markdown(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        seen_urls = set()
+
+        # Link regex: [Title](URL)
+        link_regex = re.compile(r'\[(.*?)\]\((https?://.*?)\)')
+
+        lines = content.split('\n')
+        current_result = None
+
+        for line in lines:
+            line = line.strip()
+            if not line: continue
+
+            # Check for link
+            match = link_regex.search(line)
+            if match:
+                # Save previous result
+                if current_result:
+                    results.append(current_result)
+
+                title, href = match.groups()
+
+                # Filter junk (reset current_result so the saved result is not appended twice)
+                if "search" in href and "q=" in href: current_result = None; continue
+                if "google.com" in href or "bing.com" in href: current_result = None; continue
+                if href in seen_urls:
+                    current_result = None
+                    continue
+
+                seen_urls.add(href)
+
+                current_result = {
+                    "title": title,
+                    "url": href,
+                    "domain": urllib.parse.urlparse(href).hostname or "",
+                    "content": ""
+                }
+            elif current_result:
+                # Append snippet
+                if len(current_result["content"]) < 5000:
+                    current_result["content"] += " " + line
+
+        # Append last
+        if current_result:
+            results.append(current_result)
+
+        logger.info(f"DuckDuckGo Parser(Markdown) found {len(results)} results.")
+        return results
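When the fetched content carries no HTML markers, `parse` falls straight through to the Markdown path, which treats each `[Title](URL)` line as a new result and folds the following lines into its snippet. A small fabricated example of that flow:

```python
# Fabricated Crawl4AI-style Markdown; no "<html"/"<div" markers, so
# parse() dispatches to _parse_markdown.
from entari_plugin_hyw.browser.engines.duckduckgo import DuckDuckGoEngine

md = """
[Example Domain](https://example.com/article)
A short snippet describing the page.
[Another Hit](https://example.org/post)
More snippet text here.
"""

for r in DuckDuckGoEngine().parse(md):
    print(r["domain"], "|", r["title"], "|", r["content"].strip())
# example.com | Example Domain | A short snippet describing the page.
# example.org | Another Hit | More snippet text here.
```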
entari_plugin_hyw/browser/engines/google.py
@@ -0,0 +1,155 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+
+class GoogleEngine(SearchEngine):
+    """
+    Search engine implementation for Google.
+    Parses Google Search HTML results.
+    """
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        encoded_query = urllib.parse.quote(query)
+        return f"https://www.google.com/search?q={encoded_query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        seen_urls = set()
+
+        # Google search results are in blocks with class="MjjYud" or similar containers
+        # Split by result blocks first for more accurate extraction
+
+        # Method 1: Split by common result block classes
+        block_patterns = [
+            r'<div class="MjjYud"[^>]*>',
+            r'<div class="tF2Cxc"[^>]*>',
+            r'<div class="g Ww4FFb"[^>]*>',
+        ]
+
+        blocks = [content]
+        for bp in block_patterns:
+            new_blocks = []
+            for block in blocks:
+                parts = re.split(bp, block)
+                new_blocks.extend(parts)
+            blocks = new_blocks
+
+        for block in blocks:
+            if len(block) < 100:
+                continue
+
+            # Find URL in this block - prefer links with h3 nearby
+            url_match = re.search(r'<a[^>]+href="(https?://(?!www\.google\.|google\.|webcache\.googleusercontent\.)[^"]+)"[^>]*>', block)
+            if not url_match:
+                continue
+
+            url = url_match.group(1)
+            if url in seen_urls or self._should_skip_url(url):
+                continue
+
+            # Find h3 title in this block
+            h3_match = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.IGNORECASE | re.DOTALL)
+            if not h3_match:
+                continue
+
+            title = re.sub(r'<[^>]+>', '', h3_match.group(1)).strip()
+            if not title or len(title) < 2:
+                continue
+
+            seen_urls.add(url)
+
+            # Extract snippet from VwiC3b class (Google's snippet container)
+            snippet = ""
+            snippet_match = re.search(r'<div[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>(.*?)</div>', block, re.IGNORECASE | re.DOTALL)
+            if snippet_match:
+                snippet = re.sub(r'<[^>]+>', ' ', snippet_match.group(1)).strip()
+                snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+            # Fallback: look for any text after h3
+            if not snippet:
+                # Try other common snippet patterns
+                alt_patterns = [
+                    r'<span[^>]*class="[^"]*aCOpRe[^"]*"[^>]*>(.*?)</span>',
+                    r'<div[^>]*data-snc[^>]*>(.*?)</div>',
+                ]
+                for ap in alt_patterns:
+                    am = re.search(ap, block, re.IGNORECASE | re.DOTALL)
+                    if am:
+                        snippet = re.sub(r'<[^>]+>', ' ', am.group(1)).strip()
+                        snippet = re.sub(r'\s+', ' ', snippet).strip()
+                        break
+
+            # Extract images from this block
+            images = []
+            # Pattern 1: Regular img src (excluding data: and tracking pixels)
+            # Note: gstatic.com/images/branding is logo, but encrypted-tbn*.gstatic.com are thumbnails
+            img_matches = re.findall(r'<img[^>]+src="(https?://[^"]+)"', block)
+            for img_url in img_matches:
+                # Decode HTML entities
+                img_url = img_url.replace('&amp;', '&')
+                # Skip tracking/icon/small images (but allow encrypted-tbn which are valid thumbnails)
+                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo', 'gstatic.com/images/branding', '1x1', 'transparent', 'gstatic.com/images/icons']):
+                    continue
+                if img_url not in images:
+                    images.append(img_url)
+
+            # Pattern 2: data-src (lazy loaded images)
+            data_src_matches = re.findall(r'data-src="(https?://[^"]+)"', block)
+            for img_url in data_src_matches:
+                img_url = img_url.replace('&amp;', '&')
+                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo']):
+                    continue
+                if img_url not in images:
+                    images.append(img_url)
+
+            results.append({
+                "title": title,
+                "url": url,
+                "domain": urllib.parse.urlparse(url).hostname or "",
+                "content": snippet[:1000],
+                "images": images[:3]  # Limit to 3 images per result
+            })
+
+            if len(results) >= 15:
+                break
+
+        total_images = sum(len(r.get("images", [])) for r in results)
+        logger.info(f"GoogleEngine parsed {len(results)} results with {total_images} images total.")
+        return results
+
+    def _should_skip_url(self, url: str) -> bool:
+        """Check if URL should be skipped."""
+        skip_patterns = [
+            "google.com",
+            "googleusercontent.com",
+            "gstatic.com",
+            "youtube.com/watch",  # Keep channel/playlist but skip individual videos
+            "maps.google",
+            "translate.google",
+            "accounts.google",
+            "support.google",
+            "policies.google",
+            "schema.org",
+            "javascript:",
+            "data:",
+            "#",
+        ]
+
+        for pattern in skip_patterns:
+            if pattern in url.lower():
+                return True
+
+        # Skip very short URLs (likely invalid)
+        if len(url) < 20:
+            return True
+
+        # Skip URLs that are just root domains without path
+        parsed = urllib.parse.urlparse(url)
+        if not parsed.path or parsed.path == "/":
+            return True
+
+        return False
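`_should_skip_url` errs toward precision: beyond Google's own properties it also rejects bare domain roots and very short URLs, keeping only links that point at an actual path. A few spot checks with made-up inputs:

```python
# Illustrative inputs only; the filter itself is the real method above.
from entari_plugin_hyw.browser.engines.google import GoogleEngine

engine = GoogleEngine()
assert engine._should_skip_url("https://translate.google.com/?sl=en")     # Google property
assert engine._should_skip_url("https://example.com/")                    # bare root domain
assert engine._should_skip_url("http://a.io/x")                           # shorter than 20 chars
assert not engine._should_skip_url("https://example.com/blog/2024/post")  # kept: real article path
```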
entari_plugin_hyw/browser/landing.html
@@ -0,0 +1,172 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+
+<head>
+    <meta charset="UTF-8">
+    <title>entari-plugin-hyw Browser</title>
+    <style>
+        :root {
+            --theme-color: #ef4444;
+            --text-primary: #2c2c2e;
+            --text-body: #3a3a3c;
+            --text-muted: #86868b;
+            --bg-border: #f2f2f2;
+        }
+
+        body {
+            background-color: var(--bg-border);
+            color: var(--text-primary);
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+            margin: 0;
+            padding: 0;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            height: 100vh;
+            overflow: hidden;
+        }
+
+        #main-container {
+            width: 560px;
+            background: white;
+            padding: 40px;
+            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
+            position: relative;
+            transform: scale(1.1);
+            /* Slightly larger for visibility */
+        }
+
+        .corner-badge {
+            position: absolute;
+            top: -10px;
+            left: -10px;
+            height: 28px;
+            padding: 0 12px;
+            background-color: var(--theme-color);
+            color: white;
+            display: flex;
+            align-items: center;
+            font-size: 12px;
+            font-weight: 800;
+            text-transform: uppercase;
+            letter-spacing: 0.5px;
+            box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.15);
+            z-index: 10;
+        }
+
+        h1 {
+            font-size: 32px;
+            font-weight: 900;
+            text-transform: uppercase;
+            letter-spacing: -1px;
+            margin: 0 0 24px 0;
+            line-height: 1.1;
+        }
+
+        .brand-text {
+            color: var(--theme-color);
+        }
+
+        .section-title {
+            font-size: 17px;
+            font-weight: 700;
+            text-transform: uppercase;
+            letter-spacing: -0.5px;
+            margin-bottom: 12px;
+            color: var(--text-primary);
+        }
+
+        .content-box {
+            padding-top: 24px;
+            border-top: 1px solid #eee;
+            margin-top: 32px;
+        }
+
+        .summary {
+            font-size: 15px;
+            line-height: 1.6;
+            color: var(--text-body);
+            text-align: justify;
+        }
+
+        .summary-en {
+            margin-top: 16px;
+            color: var(--text-muted);
+            font-size: 13px;
+            font-style: italic;
+            line-height: 1.5;
+        }
+
+        b {
+            color: var(--theme-color);
+            font-weight: 700;
+        }
+
+        /* Subtle loading animation to show it's "live" */
+        .progress-bar {
+            height: 3px;
+            background: #f3f3f3;
+            width: 100%;
+            margin-top: 32px;
+            position: relative;
+            overflow: hidden;
+        }
+
+        .progress-fill {
+            position: absolute;
+            height: 100%;
+            background: var(--theme-color);
+            width: 30%;
+            left: -30%;
+            animation: moveProgress 2s infinite ease-in-out;
+        }
+
+        @keyframes moveProgress {
+            0% {
+                left: -30%;
+                width: 20%;
+            }
+
+            50% {
+                width: 50%;
+            }
+
+            100% {
+                left: 100%;
+                width: 20%;
+            }
+        }
+    </style>
+</head>
+
+<body>
+    <div id="main-container">
+        <!-- Corner Badge Style -->
+        <div class="corner-badge">
+            Status: Ready
+        </div>
+
+        <h1>entari-plugin-<span class="brand-text">hyw</span></h1>
+
+        <div class="content-box">
+            <div class="section-title">Browser Service</div>
+            <div class="summary">
+                这是一个受 <b>entari-plugin-hyw</b> 插件控制的自动化浏览器实例。<br>
+                它负责网络搜索、内容爬取以及卡片 UI 的实时渲染。<br>
+                请勿关闭此窗口,以确保插件功能正常运行。
+            </div>
+
+            <div class="summary-en">
+                This is an automated browser instance controlled by the <b>entari-plugin-hyw</b> plugin.
+                It handles web searches, content crawling, and real-time Card UI rendering.
+                Please do not close this window to ensure the plugin functions correctly.
+            </div>
+        </div>

+        <div class="progress-bar">
+            <div class="progress-fill"></div>
+        </div>
+    </div>
+</body>
+
+</html>