PyPI - entari-plugin-hyw - Versions diffs - 4.0.0rc7__py3-none-any.whl → 4.0.0rc8__py3-none-any.whl - Mend - Supply Chain Defender

entari-plugin-hyw 4.0.0rc7-py3-none-any.whl → 4.0.0rc8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of entari-plugin-hyw might be problematic. Click here for more details.

Files changed (114)

{entari_plugin_hyw/browser → hyw_core/browser_control}/service.py RENAMED

@@ -22,65 +22,22 @@ class ScreenshotService:
         self.headless = headless
         self._manager = None
         self._executor = ThreadPoolExecutor(max_workers=10)
-        self._search_tab_pool = []  # List of Tab objects
-        self._pool_lock = threading.Lock()
         if auto_start:
             self._ensure_ready()
-    def prepare_search_tabs_background(self, count: int, url: str = "https://www.google.com") -> None:
-        """
-        Pre-launch tabs for search (BACKGROUND - fire and forget).
-        Tabs are created in background thread, may not be ready immediately.
-        """
-        self._executor.submit(self._prepare_search_tabs_sync, count, url)
-    def _prepare_search_tabs_sync(self, count: int, url: str = "https://www.google.com"):
-        """Sync implementation of tab preparation - creates tabs in PARALLEL."""
+    def _get_tab(self, url: str) -> Any:
+        """Create a new tab and navigate to URL."""
+        self._ensure_ready()
+        return self._manager.new_tab(url)
+    def _release_tab(self, tab: Any):
+        """Close tab after use."""
+        if not tab: return
         try:
-            self._ensure_ready()
-            page = self._manager.page
-            if not page: return
-            with self._pool_lock:
-                current_count = len(self._search_tab_pool)
-                needed = count - current_count
-            if needed <= 0:
-                return
-            logger.info(f"ScreenshotService: Pre-launching {needed} search tabs for {url} (parallel)...")
-            # Create tabs in parallel using threads
-            created_tabs = [None] * needed
-            def create_single_tab(index):
-                try:
-                    tab = page.new_tab(url)
-                    created_tabs[index] = tab
-                    logger.debug(f"ScreenshotService: Tab {index} ready")
-                except Exception as e:
-                    logger.error(f"ScreenshotService: Failed to create tab {index}: {e}")
-            threads = []
-            for i in range(needed):
-                t = threading.Thread(target=create_single_tab, args=(i,))
-                t.start()
-                threads.append(t)
-            # Wait for all threads to complete
-            for t in threads:
-                t.join()
-            # Add successfully created tabs to pool
-            with self._pool_lock:
-                for tab in created_tabs:
-                    if tab:
-                        self._search_tab_pool.append(tab)
-                logger.info(f"ScreenshotService: Tab pool ready ({len(self._search_tab_pool)} tabs)")
-        except Exception as e:
-            logger.error(f"ScreenshotService: Failed to prepare tabs: {e}")
+            tab.close()
+        except:
+            pass
     async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
         """
@@ -104,17 +61,13 @@ class ScreenshotService:
         for i in range(len(queries)):
             tab = None
-            # Try to get from pool first
-            with self._pool_lock:
-                if self._search_tab_pool:
-                    tab = self._search_tab_pool.pop(0)
-                    logger.debug(f"ScreenshotService: Got tab {i} from pool")
+            # Try to get from pool first (using shared logic now)
+            try:
+                tab = self._get_tab(target_url)
+            except Exception as e:
+                logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")
-            if not tab:
-                # Create new
-                self._ensure_ready()
-                tab = self._manager.page.new_tab(target_url)
-                logger.debug(f"ScreenshotService: Created tab {i} for {target_url}")
             tabs.append(tab)
@@ -155,7 +108,8 @@ class ScreenshotService:
                 logger.debug(f"Search[{index}]: Waiting for search results...")
                 tab.wait.doc_loaded(timeout=10)
-                time.sleep(0.5)
+                # Reduced settle wait for extraction
+                time.sleep(0.1)
                 logger.debug(f"Search[{index}]: Extracting content...")
                 html = tab.html
@@ -178,8 +132,7 @@ class ScreenshotService:
                 logger.error(f"ScreenshotService: Search error for '{query}': {e}")
                 results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
             finally:
-                try: tab.close()
-                except: pass
+                self._release_tab(tab)
         threads = []
         for i, (tab, query) in enumerate(zip(tabs, queries)):
@@ -248,7 +201,7 @@ class ScreenshotService:
             # Small delay for address bar to focus
             import time as _time
-            _time.sleep(0.1)
+            _time.sleep(0.05)
             # Type the query
             tab.actions.type(query)
@@ -259,8 +212,8 @@ class ScreenshotService:
             # Wait for page to load
             try:
                 tab.wait.doc_loaded(timeout=timeout)
-                # Additional wait for search results
-                _time.sleep(1)
+                # Reduced wait for initial results
+                _time.sleep(0.2)
             except:
                 pass
@@ -292,6 +245,89 @@ class ScreenshotService:
                 try: tab.close()
                 except: pass
+    def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
+        """
+        Scroll down gradually to trigger lazy loading.
+        Args:
+            delay: Max wait time per scroll step (seconds) if images aren't loading.
+        """
+        import time
+        start = time.time()
+        current_pos = 0
+        try:
+            while time.time() - start < timeout:
+                # Scroll down
+                current_pos += step
+                tab.run_js(f"window.scrollTo(0, {current_pos});")
+                # Active Wait: Check if images in viewport are loaded
+                # Poll every 100ms, up to 'delay' seconds
+                wait_start = time.time()
+                while time.time() - wait_start < delay:
+                    all_loaded = tab.run_js("""
+                        return (async () => {
+                            const imgs = Array.from(document.querySelectorAll('img'));
+                            const viewportHeight = window.innerHeight;
+                            // 1. Identify images currently in viewport
+                            const visibleImgs = imgs.filter(img => {
+                                const rect = img.getBoundingClientRect();
+                                return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
+                            });
+                            if (visibleImgs.length === 0) return true;
+                            // 2. Check loading status using decode() AND heuristic for placeholders
+                            // Some sites load a tiny blurred placeholder first.
+                            const checks = visibleImgs.map(img => {
+                                // HEURISTIC: content is likely not ready if:
+                                // - img has 'data-src' but src is different (or src is empty)
+                                // - img has 'loading="lazy"' and is not complete
+                                // - naturalWidth is very small (placeholder) compared to display width
+                                const isPlaceholder = (
+                                    (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
+                                    (img.naturalWidth < 50 && img.clientWidth > 100)
+                                );
+                                if (isPlaceholder) {
+                                    // If it looks like a placeholder, we return false (not loaded)
+                                    // unless it stays like this for too long (handled by outer timeout)
+                                    return Promise.resolve(false);
+                                }
+                                if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);
+                                return img.decode().then(() => true).catch(() => false);
+                            });
+                            // Race against a small timeout to avoid hanging on one broken image
+                            const allDecoded = Promise.all(checks);
+                            const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));
+                            // If any check returned false (meaning placeholder or not decoded), result is false
+                            return Promise.race([allDecoded, timeout]).then(results => {
+                                if (!Array.isArray(results)) return results === true;
+                                return results.every(res => res === true);
+                            });
+                        })();
+                    """)
+                    if all_loaded:
+                        break
+                    time.sleep(0.1)
+                # Check if reached bottom
+                height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
+                if current_pos >= height:
+                    break
+            # Ensure final layout settle
+            time.sleep(0.2)
+        except Exception as e:
+            logger.warning(f"ScreenshotService: Scroll failed: {e}")
     def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
         """Synchronous fetch logic."""
         if not url:
@@ -304,27 +340,48 @@ class ScreenshotService:
             if not page:
                 return {"content": "Error: Browser not available", "title": "Error", "url": url}
-            # New Tab with URL directly
-            tab = page.new_tab(url)
+            # Get from pool
+            tab = self._get_tab(url)
             # Wait logic - optimized for search pages
             is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
             if is_search_page:
-                # Optimized waiting for search engine results
-                try:
-                    # Google uses #search or #rso
-                    # DuckDuckGo uses #react-layout
-                    # Bing uses #b_results
-                    if 'google' in url.lower():
-                        # Wait for results container (fastest possible return)
-                        tab.ele('#search', timeout=timeout)
-                    elif 'bing' in url.lower():
-                        tab.ele('#b_results', timeout=timeout)
-                    else:
-                        # Generic search fallback
-                        tab.wait.doc_loaded(timeout=timeout)
-                except:
-                    pass
+                # Optimized waiting: Rapidly poll for ACTUAL results > 0
+                start_time = time.time()
+                # Special fast-path for DDG Lite (HTML only, no JS rendering needed)
+                if 'lite.duckduckgo' in url:
+                    # just wait for body, it's static HTML
+                     try:
+                         tab.wait.doc_loaded(timeout=timeout)
+                     except: pass
+                     # Sleep tiny bit to ensure render
+                     time.sleep(0.5)
+                else:
+                    while time.time() - start_time < timeout:
+                        found_results = False
+                        try:
+                            if 'google' in url.lower():
+                                # Check if we have any result items (.g, .MjjYud) or the main container (#search)
+                                # Using checks with minimal timeout to allow fast looping
+                                if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
+                                    found_results = True
+                            elif 'bing' in url.lower():
+                                if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
+                                    found_results = True
+                            elif 'duckduckgo' in url.lower():
+                                if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
+                                    found_results = True
+                            else:
+                                # Generic fallback: wait for body to be populated
+                                if tab.ele('body', timeout=0.1):
+                                    found_results = True
+                        except:
+                            pass
+                        if found_results:
+                            break
+                        time.sleep(0.05)  # Faster polling (50ms) as requested
             else:
                 # 1. Wait for document to settle (Fast Dynamic Wait)
                 try:
@@ -451,8 +508,7 @@ class ScreenshotService:
             return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
         finally:
             if tab:
-                try: tab.close()
-                except: pass
+               self._release_tab(tab)
     async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
         """Fetch multiple pages concurrently."""
@@ -461,6 +517,13 @@ class ScreenshotService:
         tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
         return await asyncio.gather(*tasks, return_exceptions=True)
+    async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
+        """Take screenshots of multiple URLs concurrently."""
+        if not urls: return []
+        logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
+        tasks = [self.screenshot_url(url, timeout=timeout, full_page=full_page) for url in urls]
+        return await asyncio.gather(*tasks, return_exceptions=True)
     async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
         """Screenshot URL (Async wrapper for sync)."""
         loop = asyncio.get_running_loop()
@@ -481,35 +544,147 @@ class ScreenshotService:
             tab = page.new_tab(url)
             try:
-                if wait_load:
-                    tab.wait.load_complete(timeout=timeout)
-                else:
-                    tab.wait.doc_loaded(timeout=timeout)
-            except: pass
+                # Wait for full page load (including JS execution)
+                tab.wait.load_complete(timeout=timeout)
+                # Wait for actual content to appear (for CDN verification pages)
+                # Smart Wait Logic (Final Robust):
+                # 1. FORCED WAIT: 1.5s to allow initial redirects/rendering to start.
+                # 2. Browser ReadyState Complete
+                # 3. Height Stable for 2.0 seconds (20 checks)
+                # 4. Text > 100 chars (Crucial: Distinguishes stable content from stable spinners)
+                # 5. No Blacklist phrases
+                time.sleep(1.5)  # user request: force wait 1.5s before detection
+                last_h = 0
+                stable_count = 0
+                for i in range(200):  # Max 200 iterations (~20s)
+                    try:
+                        state = tab.run_js('''
+                            return {
+                                ready: document.readyState === 'complete',
+                                title: document.title,
+                                height: Math.max(
+                                    document.body.scrollHeight || 0,
+                                    document.documentElement.scrollHeight || 0
+                                ),
+                                text: document.body.innerText.substring(0, 1000) || "",
+                                html: document.body.innerHTML.substring(0, 500) // Debug intro
+                            };
+                        ''') or {'ready': False, 'title': "", 'height': 0, 'text': ""}
+                        is_ready = state.get('ready', False)
+                        title = state.get('title', "").lower()
+                        current_h = int(state.get('height', 0))
+                        text_content = state.get('text', "")
+                        text_len = len(text_content)
+                        text_lower = text_content.lower()
+                        # Blacklist check
+                        is_verification = "checking your browser" in text_lower or \
+                                        "just a moment" in text_lower or \
+                                        "please wait" in text_lower or \
+                                        "security check" in title or \
+                                        "just a moment" in title
+                        # Stability check
+                        if current_h == last_h:
+                            stable_count += 1
+                        else:
+                            stable_count = 0
+                        # Conditions
+                        has_content = text_len > 100 # At least 100 real chars
+                        is_stable = stable_count >= 20  # Always require 2s stability
+                        # Pass if all conditions met
+                        if is_ready and not is_verification and has_content and is_stable:
+                            break
+                        last_h = current_h
+                        # Wait timing
+                        try: tab.wait.eles_loaded(timeout=0.1)
+                        except: pass
+                    except Exception:
+                        stable_count = 0
+                        try: time.sleep(0.1)
+                        except: pass
+                        continue
+                # DEBUG: Save HTML to inspect what happened (in data dir)
+                try:
+                    import os
+                    log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
+                    with open(log_path, "w", encoding="utf-8") as f:
+                        f.write(f"<!-- URL: {url} -->\n")
+                        f.write(tab.html)
+                except: pass
+                # Use faster scroll step (800) to ensure lazy loaded images appear
+                self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
+            except:
+                pass
-            # Wait for main element
-            if tab.ele("#main-container"):
-                pass
+            # Refine calculation: Set viewport width to 1024
+            capture_width = 1024
+            # Calculate actual content height after lazy loading
+            try:
+                # Use a robust height calculation
+                content_height = tab.run_js('''
+                    return Math.max(
+                        document.body.scrollHeight || 0,
+                        document.documentElement.scrollHeight || 0,
+                        document.body.offsetHeight || 0,
+                        document.documentElement.offsetHeight || 0,
+                        document.documentElement.clientHeight || 0
+                    );
+                ''')
+                # Add a small buffer and cap at 15000px to prevent memory issues
+                h = min(int(content_height) + 50, 15000)
+            except:
+                h = 1000  # Fallback
+            # Set viewport to full content size for single-shot capture
+            try:
+                tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                            width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
+            except:
+                pass
             # Scrollbar Hiding
             from .manager import SharedBrowserManager
             SharedBrowserManager.hide_scrollbars(tab)
-            tab.run_js("""
-                const style = document.createElement('style');
-                style.textContent = `
-                    ::-webkit-scrollbar { display: none !important; }
-                    html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
-                `;
-                document.head.appendChild(style);
-                document.documentElement.style.overflow = 'hidden';
-                document.body.style.overflow = 'hidden';
-            """)
-            ele = tab.ele("#main-container")
-            if ele:
-                return ele.get_screenshot(as_base64='jpg', quality=quality)
-            else:
-                return tab.get_screenshot(as_base64='jpg', full_page=full_page, quality=quality)
+            # Scroll back to top before screenshot
+            tab.run_js("window.scrollTo(0, 0);")
+            # Content is already loaded by _scroll_to_bottom which waits for images
+            # Just recalculate final height (content may have grown during scrolling)
+            try:
+                final_height = tab.run_js('''
+                    return Math.max(
+                        document.body.scrollHeight || 0,
+                        document.documentElement.scrollHeight || 0,
+                        document.body.offsetHeight || 0,
+                        document.documentElement.offsetHeight || 0
+                    );
+                ''')
+                final_h = min(int(final_height) + 50, 15000)
+                if final_h != h:
+                    tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                                width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
+            except:
+                pass
+            # Use full_page=False because we manually set the viewport to the full height
+            # This avoids stitching artifacts and blank spaces
+            return tab.get_screenshot(as_base64='jpg', full_page=False)
         except Exception as e:
             logger.error(f"ScreenshotService: Screenshot URL failed: {e}")

hyw_core/config.py ADDED

@@ -0,0 +1,154 @@
+"""
+hyw_core.config - Configuration Management
+Provides standalone configuration for hyw-core with optional passthrough from parent packages.
+"""
+from dataclasses import dataclass, field
+from typing import Dict, List, Any, Optional
+@dataclass
+class ModelConfig:
+    """Configuration for a specific model."""
+    model_name: Optional[str] = None
+    api_key: Optional[str] = None
+    base_url: Optional[str] = None
+    extra_body: Optional[Dict[str, Any]] = None
+    model_provider: Optional[str] = None
+    input_price: Optional[float] = None
+    output_price: Optional[float] = None
+    image_input: bool = True
+@dataclass
+class HywCoreConfig:
+    """
+    Core configuration for hyw-core.
+    Can be used standalone or with passthrough from parent packages.
+    Usage:
+        # Standalone from YAML
+        config = HywCoreConfig.from_yaml("config.yaml")
+        # Passthrough from parent
+        config = HywCoreConfig.from_dict({
+            "model_name": parent_config.model_name,
+            "api_key": parent_config.api_key,
+            ...
+        })
+    """
+    # LLM Configuration
+    models: List[Dict[str, Any]] = field(default_factory=list)
+    model_name: str = ""
+    api_key: str = ""
+    base_url: str = ""
+    temperature: float = 0.4
+    # Stage-specific model overrides
+    instruct_model: Optional[str] = None
+    instruct_api_key: Optional[str] = None
+    instruct_base_url: Optional[str] = None
+    instruct_extra_body: Optional[Dict[str, Any]] = None
+    summary_model: Optional[str] = None
+    summary_api_key: Optional[str] = None
+    summary_base_url: Optional[str] = None
+    summary_extra_body: Optional[Dict[str, Any]] = None
+    # Search Configuration
+    search_engine: str = "duckduckgo"
+    search_limit: int = 10
+    blocked_domains: List[str] = field(default_factory=list)
+    # Browser Configuration
+    headless: bool = True
+    fetch_timeout: float = 20.0
+    # Output Configuration
+    language: str = "Simplified Chinese"
+    theme_color: str = "#ef4444"
+    # Pricing (for cost estimation)
+    input_price: float = 0.0
+    output_price: float = 0.0
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "HywCoreConfig":
+        """
+        Create config from dictionary.
+        Used for passthrough from parent packages.
+        Filters out unknown fields to allow flexible passthrough.
+        """
+        import dataclasses
+        field_names = {f.name for f in dataclasses.fields(cls)}
+        filtered_data = {k: v for k, v in data.items() if k in field_names}
+        return cls(**filtered_data)
+    @classmethod
+    def from_yaml(cls, path: str) -> "HywCoreConfig":
+        """
+        Load config from YAML file.
+        Used for standalone usage.
+        """
+        import yaml
+        with open(path, 'r', encoding='utf-8') as f:
+            data = yaml.safe_load(f) or {}
+        return cls.from_dict(data)
+    def get_model_config(self, stage: str) -> ModelConfig:
+        """
+        Get resolved model config for a stage.
+        Args:
+            stage: "instruct", "qa", or "main" (summary)
+        Returns:
+            ModelConfig with resolved settings
+        """
+        # Determine primary and secondary stage config keys
+        if stage == "instruct":
+            primary_prefix = "instruct_"
+            secondary_prefix = None
+        elif stage == "qa":
+            primary_prefix = "qa_"
+            secondary_prefix = "instruct_"
+        else:  # "main" / summary
+            primary_prefix = "summary_"
+            secondary_prefix = None
+        def resolve(field_name: str, is_essential: bool = True):
+            """Resolve a field with fallback: Primary -> Secondary -> Root."""
+            # Try primary
+            if primary_prefix:
+                val = getattr(self, f"{primary_prefix}{field_name}", None)
+                if val:
+                    return val
+            # Try secondary
+            if secondary_prefix:
+                val = getattr(self, f"{secondary_prefix}{field_name}", None)
+                if val:
+                    return val
+            # Fallback to root
+            return getattr(self, field_name, None)
+        return ModelConfig(
+            model_name=resolve("model") or resolve("model_name") or self.model_name,
+            api_key=resolve("api_key") or self.api_key,
+            base_url=resolve("base_url") or self.base_url,
+            extra_body=resolve("extra_body"),
+            model_provider=resolve("model_provider"),
+            input_price=resolve("input_price") or self.input_price,
+            output_price=resolve("output_price") or self.output_price,
+        )
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert config to dictionary."""
+        import dataclasses
+        return dataclasses.asdict(self)