PyPI - entari-plugin-hyw - Versions diffs - 4.0.0rc9__py3-none-any.whl → 4.0.0rc11__py3-none-any.whl - Mend

entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of entari-plugin-hyw might be problematic. Click here for more details.

Files changed (17) hide show

entari_plugin_hyw/__init__.py +76 -39
{entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/METADATA +1 -1
{entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/RECORD +16 -14
hyw_core/browser_control/__init__.py +1 -3
hyw_core/browser_control/assets/card-dist/index.html +48 -32
hyw_core/browser_control/engines/__init__.py +0 -2
hyw_core/browser_control/manager.py +18 -5
hyw_core/browser_control/renderer.py +36 -6
hyw_core/browser_control/service.py +245 -142
hyw_core/core.py +0 -7
hyw_core/crawling/__init__.py +18 -0
hyw_core/crawling/completeness.py +437 -0
hyw_core/crawling/models.py +88 -0
hyw_core/search.py +4 -6
hyw_core/browser_control/engines/google.py +0 -155
{entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/WHEEL +0 -0
{entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/top_level.txt +0 -0

hyw_core/browser_control/engines/__init__.py CHANGED Viewed

@@ -5,13 +5,11 @@ Provides search engine adapters for different search providers.
 """
 from .base import SearchEngine
-from .google import GoogleEngine
 from .duckduckgo import DuckDuckGoEngine
 from .default import DefaultEngine
 __all__ = [
     "SearchEngine",
-    "GoogleEngine",
     "DuckDuckGoEngine",
     "DefaultEngine",
 ]

hyw_core/browser_control/manager.py CHANGED Viewed

@@ -124,15 +124,28 @@ class SharedBrowserManager:
     @staticmethod
     def hide_scrollbars(page: ChromiumPage):
         """
-        Robustly hide scrollbars using CDP commands.
-        This eliminates the reserved space/gutter that standard CSS might miss.
+        Robustly hide scrollbars using CDP commands AND CSS injection.
+        This provides double protection against scrollbar gutters.
         """
         try:
-            # Emulation.setScrollbarsHidden is a CDP command
+            # 1. CDP Command
             page.run_cdp('Emulation.setScrollbarsHidden', hidden=True)
-            logger.debug("SharedBrowserManager: CDP scrollbars hidden.")
+            # 2. CSS Injection (Standard + Webkit)
+            css = """
+                ::-webkit-scrollbar { display: none !important; width: 0 !important; height: 0 !important; }
+                * { -ms-overflow-style: none !important; scrollbar-width: none !important; }
+            """
+            # Inject into current page
+            page.run_js(f"""
+                const style = document.createElement('style');
+                style.textContent = `{css}`;
+                document.head.appendChild(style);
+            """)
+            logger.debug("SharedBrowserManager: Scrollbars hidden via CDP + CSS.")
         except Exception as e:
-            logger.warning(f"SharedBrowserManager: Failed to hide scrollbars via CDP: {e}")
+            logger.warning(f"SharedBrowserManager: Failed to hide scrollbars: {e}")
 # Module-level singleton accessor

hyw_core/browser_control/renderer.py CHANGED Viewed

@@ -55,16 +55,43 @@ class ContentRenderer:
         loop = asyncio.get_running_loop()
         return await loop.run_in_executor(self._executor, self._prepare_tab_sync)
-    def _wait_for_render_finished(self, tab, timeout: float = 3.0):
+    def _wait_for_render_finished(self, tab, timeout: float = 12.0, context: str = ""):
         """Wait for window.RENDER_FINISHED to be true in the tab."""
         import time as pytime
         start = pytime.time()
+        # Check initial state
+        initial_state = tab.run_js("return window.RENDER_FINISHED")
+        logger.debug(f"ContentRenderer[{context}]: Starting wait, initial RENDER_FINISHED={initial_state}")
+        # If already true, it's stale from previous render - need to wait for JS to reset it
+        if initial_state:
+            logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED was true, waiting for reset...")
+            # Wait for JS to reset it to false (updateRenderData sets it to false)
+            reset_start = pytime.time()
+            while pytime.time() - reset_start < 1.0:  # 1s max to wait for reset
+                is_reset = tab.run_js("return window.RENDER_FINISHED")
+                if not is_reset:
+                    logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED reset to false")
+                    break
+                pytime.sleep(0.05)
+            else:
+                logger.warning(f"ContentRenderer[{context}]: RENDER_FINISHED not reset, force resetting via JS")
+                tab.run_js("window.RENDER_FINISHED = false")
+        # Now wait for it to become true
+        poll_count = 0
         while pytime.time() - start < timeout:
             is_finished = tab.run_js("return window.RENDER_FINISHED")
+            poll_count += 1
             if is_finished:
+                elapsed = pytime.time() - start
+                logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED=true after {elapsed:.2f}s ({poll_count} polls)")
                 return True
-            pytime.sleep(0.05) # Fast polling
-        logger.warning(f"ContentRenderer: Wait for RENDER_FINISHED timed out after {timeout}s")
+            pytime.sleep(0.1)  # Poll every 100ms
+        elapsed = pytime.time() - start
+        logger.warning(f"ContentRenderer[{context}]: Wait for RENDER_FINISHED timed out after {elapsed:.2f}s ({poll_count} polls)")
         return False
     def _prepare_tab_sync(self) -> str:
@@ -89,8 +116,9 @@ class ContentRenderer:
                 "theme_color": "#ef4444",
             }
+            logger.debug(f"ContentRenderer: Calling warmup updateRenderData for tab {tab_id}")
             tab.run_js(f"window.updateRenderData({json.dumps(warmup_data)})")
-            self._wait_for_render_finished(tab, timeout=5.0)
+            self._wait_for_render_finished(tab, timeout=12.0, context=f"warmup:{tab_id}")
             # Wait for main-container after warmup (Vue needs to render it)
             tab.ele('#main-container', timeout=3)
@@ -203,7 +231,7 @@ class ContentRenderer:
             # 1. Update Data & Wait for Finished flag
             tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
-            self._wait_for_render_finished(tab)
+            self._wait_for_render_finished(tab, context=f"batch:{tab_id}")
             # 2. Dynamic Resize
             # Get actual content height to prevent clipping
@@ -330,10 +358,12 @@ class ContentRenderer:
                 "theme_color": theme_color,
             }
+            actual_tab_id = getattr(tab, 'tab_id', 'unknown')
+            logger.info(f"ContentRenderer: Calling updateRenderData for tab {actual_tab_id}, markdown length={len(markdown_content)}")
             tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
             # Wait for event-driven finish
-            self._wait_for_render_finished(tab, timeout=5.0)
+            self._wait_for_render_finished(tab, timeout=12.0, context=f"render:{actual_tab_id}")
             # Dynamic Resize
             scroll_height = tab.run_js('return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);')

hyw_core/browser_control/service.py CHANGED Viewed

@@ -13,6 +13,10 @@ from typing import Optional, Dict, Any, List
 from loguru import logger
 import trafilatura
+# Import intelligent completeness checker
+from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
+from ..crawling.models import CrawlConfig
 class ScreenshotService:
     """
     Browser Service using DrissionPage.
@@ -281,14 +285,27 @@ class ScreenshotService:
                             // 2. Check loading status using decode() AND heuristic for placeholders
                             // Some sites load a tiny blurred placeholder first.
                             const checks = visibleImgs.map(img => {
-                                // HEURISTIC: content is likely not ready if:
-                                // - img has 'data-src' but src is different (or src is empty)
-                                // - img has 'loading="lazy"' and is not complete
-                                // - naturalWidth is very small (placeholder) compared to display width
+                                // Enhanced placeholder detection (matching completeness.py)
+                                const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
+                                                img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
+                                const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
+                                const loadingAttr = img.getAttribute('loading') || '';
+                                const src = img.src || '';
                                 const isPlaceholder = (
-                                    (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
-                                    (img.naturalWidth < 50 && img.clientWidth > 100)
+                                    // data-src not yet loaded
+                                    (dataSrc && img.src !== dataSrc) ||
+                                    // Natural size much smaller than display (blurred placeholder)
+                                    (img.naturalWidth < 50 && img.clientWidth > 100) ||
+                                    (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
+                                    // Lazy-loading class indicators
+                                    className.includes('lazy') ||
+                                    className.includes('lazyload') ||
+                                    className.includes('lozad') ||
+                                    // CSS blur filter applied
+                                    (window.getComputedStyle(img).filter || '').includes('blur') ||
+                                    // loading="lazy" + not complete
+                                    (loadingAttr === 'lazy' && !img.complete)
                                 );
                                 if (isPlaceholder) {
@@ -537,81 +554,71 @@ class ScreenshotService:
         """Synchronous screenshot."""
         if not url: return None
         tab = None
+        capture_width = 1024  # Standard capture width
         try:
             self._ensure_ready()
             page = self._manager.page
             if not page: return None
-            tab = page.new_tab(url)
+            # Create blank tab first
+            tab = page.new_tab()
-            # Start monitoring network traffic
+            # Set viewport BEFORE navigation so page renders at target width from the start
+            # This eliminates the need for post-load resize and reflow
             try:
-                # Listen for data packets (XHR/Fetch/POST)
-                # Targets: xhr, fetch. POST usually falls under these or Document.
-                tab.listen.start(targets=True) # Listen to everything for now to be safe
-            except Exception as e:
-                logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
+                tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                            width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
+            except:
+                pass
+            # Now navigate to the URL - page will render at target width
+            tab.get(url)
+            # Network monitoring removed - not needed for simplified wait logic
+            # Initialize crawl config for completeness checking (defined outside try for scope)
+            crawl_config = CrawlConfig(
+                scan_full_page=True,
+                scroll_step=800,
+                scroll_delay=0.5,
+                scroll_timeout=20.0,  # Increased for lazy-loading pages
+                image_load_timeout=3.0,  # Image loading timeout: 3 seconds
+                image_stability_checks=3,
+            )
+            # === Start scrolling immediately to trigger lazy loading ===
+            # Scroll first, then wait for DOM - this allows lazy loading to start in parallel
+            logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
+            trigger_lazy_load(tab, crawl_config)
+            # Now wait for DOM to be ready (after scroll has triggered lazy loading)
             try:
                 # Wait for full page load (including JS execution)
-                tab.wait.load_complete(timeout=timeout)
+                try:
+                    tab.wait.doc_loaded(timeout=timeout)
+                except:
+                    pass
                 # Wait for actual content to appear (for CDN verification pages)
-                # Smart Wait Logic (Final Robust):
-                # 1. Network Idle: Wait for silence in XHR/POST
-                # 2. Stability: Wait for Height/Text/DOM stability
+                # Simplified wait logic for screenshot: just check basic readiness
-                time.sleep(1.5)  # user request: force wait 1.5s before detection
-                last_h = 0
-                last_text_len = 0
-                last_html_len = 0
-                stable_count = 0
-                for i in range(200):  # Max 200 iterations (~20s)
+                for i in range(20):  # Max 20 iterations (~1s) - much faster
                     try:
-                        # 1. Check Network Activity
-                        has_recent_network = False
-                        try:
-                            # Iterate over any captured packets since last check
-                            for packet in tab.listen.steps(timeout=0.01):
-                                # Check if it's a significant request (POST or XHR/Fetch)
-                                method = packet.method.upper()
-                                r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
-                                # Interested in: POST requests OR any XHR/Fetch response
-                                if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
-                                    # Ignore some common noise? (Optional: analytics, tracking)
-                                    # For now, simplistic approach: any API traffic resets stability
-                                    has_recent_network = True
-                                    # logger.debug(f"Network Activity: {method} {packet.url[:50]}")
-                                    break
-                        except:
-                            pass
-                        # 2. Check DOM State
                         state = tab.run_js('''
                             return {
                                 ready: document.readyState === 'complete',
                                 title: document.title,
-                                height: Math.max(
-                                    document.body.scrollHeight || 0,
-                                    document.documentElement.scrollHeight || 0
-                                ),
-                                text: document.body.innerText.substring(0, 1000) || "",
-                                html_len: document.body.innerHTML.length || 0
+                                text: document.body.innerText.substring(0, 500) || ""
                             };
-                        ''') or {'ready': False, 'title': "", 'height': 0, 'text': "", 'html_len': 0}
+                        ''') or {'ready': False, 'title': "", 'text': ""}
                         is_ready = state.get('ready', False)
                         title = state.get('title', "").lower()
-                        current_h = int(state.get('height', 0))
-                        text_content = state.get('text', "")
-                        text_len = len(text_content)
-                        html_len = int(state.get('html_len', 0))
-                        text_lower = text_content.lower()
+                        text_lower = state.get('text', "").lower()
+                        text_len = len(text_lower)
-                        # Blacklist check (Loading indicators)
+                        # Check for verification pages
                         is_verification = "checking your browser" in text_lower or \
                                         "just a moment" in text_lower or \
                                         "please wait" in text_lower or \
@@ -619,54 +626,19 @@ class ScreenshotService:
                                         "just a moment" in title or \
                                         "loading..." in title
-                        # Stability check (Multi-metric + Network)
-                        # We require STABILITY across Height, Text Length, DOM Size AND Network Silence
-                        is_height_stable = current_h == last_h
-                        is_text_stable = abs(text_len - last_text_len) < 5  # Allow minor fluctuations
-                        is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
-                        if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
-                            stable_count += 1
-                        else:
-                            # Reset if ANY metric changed or NETWORK active
-                            stable_count = 0
-                            # if has_recent_network: logger.debug("Stability reset: Network Activity")
+                        # Basic content check
+                        has_content = text_len > 100
-                        # Conditions
-                        has_content = text_len > 100 # At least 100 real chars
-                        # Dynamic Stability Requirement:
-                        # If page looks like it's loading (small content), require longer stability
-                        if has_recent_network:
-                             # If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
-                             required_stability = max(10, 40 if text_len < 500 else 25)
-                        else:
-                             required_stability = 40 if text_len < 500 else 25  # 4.0s or 2.5s
-                        is_stable = stable_count >= required_stability
-                        # Pass if all conditions met
-                        if is_ready and not is_verification and has_content and is_stable:
+                        # Pass if ready, not verification, and has content
+                        if is_ready and not is_verification and has_content:
+                            logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
                             break
-                        last_h = current_h
-                        last_text_len = text_len
-                        last_html_len = html_len
-                        # Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
-                        try: time.sleep(0.05)
-                        except: pass
+                        time.sleep(0.05)
                     except Exception:
-                        stable_count = 0
-                        try: time.sleep(0.1)
-                        except: pass
+                        time.sleep(0.05)
                         continue
-                # Cleanup listener
-                try: tab.listen.stop()
-                except: pass
                 # DEBUG: Save HTML to inspect what happened (in data dir)
                 try:
                     import os
@@ -675,50 +647,179 @@ class ScreenshotService:
                         f.write(f"<!-- URL: {url} -->\n")
                         f.write(tab.html)
                 except: pass
-                # Use faster scroll step (800) to ensure lazy loaded images appear
-                self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
-            except:
-                pass
-            # Refine calculation: Set viewport width to 1024
-            capture_width = 1024
-            # Calculate actual content height after lazy loading
-            try:
-                # Use a robust height calculation
-                content_height = tab.run_js('''
-                    return Math.max(
-                        document.body.scrollHeight || 0,
-                        document.documentElement.scrollHeight || 0,
-                        document.body.offsetHeight || 0,
-                        document.documentElement.offsetHeight || 0,
-                        document.documentElement.clientHeight || 0
-                    );
-                ''')
-                # Add a small buffer and cap at 15000px to prevent memory issues
-                h = min(int(content_height) + 50, 15000)
-            except:
-                h = 1000  # Fallback
+            except Exception as e:
+                logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
-            # Set viewport to full content size for single-shot capture
-            try:
-                tab.run_cdp('Emulation.setDeviceMetricsOverride',
-                            width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
-            except:
-                pass
-            # Scrollbar Hiding
+            # Scrollbar Hiding first (before any height calculation)
             from .manager import SharedBrowserManager
             SharedBrowserManager.hide_scrollbars(tab)
-            # Scroll back to top before screenshot
+            # Scroll back to top
             tab.run_js("window.scrollTo(0, 0);")
-            # Content is already loaded by _scroll_to_bottom which waits for images
-            # Just recalculate final height (content may have grown during scrolling)
+            # Image loading monitoring with time tracking - DISABLED
+            # No longer waiting for images to load
+            # Initialize image tracking JavaScript
+            # image_tracking_js = """
+            # (() => {
+            #     if (!window._imageLoadTracker) {
+            #         window._imageLoadTracker = {
+            #             startTime: Date.now(),
+            #             images: new Map()
+            #         };
+            #
+            #         const imgs = Array.from(document.querySelectorAll('img'));
+            #         const minSize = 50;
+            #
+            #         imgs.forEach((img, idx) => {
+            #             if (img.clientWidth < minSize && img.clientHeight < minSize) return;
+            #
+            #             const src = img.src || img.getAttribute('data-src') || '';
+            #             const key = `${idx}_${src.substring(0, 100)}`;
+            #
+            #             if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
+            #                 // Already loaded
+            #                 window._imageLoadTracker.images.set(key, {
+            #                     src: src.substring(0, 150),
+            #                     status: 'loaded',
+            #                     loadTime: 0,  // Already loaded before tracking
+            #                     naturalSize: [img.naturalWidth, img.naturalHeight],
+            #                     displaySize: [img.clientWidth, img.clientHeight]
+            #                 });
+            #             } else {
+            #                 // Track loading
+            #                 window._imageLoadTracker.images.set(key, {
+            #                     src: src.substring(0, 150),
+            #                     status: 'pending',
+            #                     startTime: Date.now(),
+            #                     naturalSize: [img.naturalWidth, img.naturalHeight],
+            #                     displaySize: [img.clientWidth, img.clientHeight]
+            #                 });
+            #
+            #                 // Add load event listener
+            #                 img.addEventListener('load', () => {
+            #                     const entry = window._imageLoadTracker.images.get(key);
+            #                     if (entry && entry.status === 'pending') {
+            #                         entry.status = 'loaded';
+            #                         entry.loadTime = Date.now() - entry.startTime;
+            #                     }
+            #                 });
+            #
+            #                 img.addEventListener('error', () => {
+            #                     const entry = window._imageLoadTracker.images.get(key);
+            #                     if (entry && entry.status === 'pending') {
+            #                         entry.status = 'failed';
+            #                         entry.loadTime = Date.now() - entry.startTime;
+            #                     }
+            #                 });
+            #             }
+            #         });
+            #     }
+            #
+            #     // Return current status
+            #     const results = [];
+            #     window._imageLoadTracker.images.forEach((value, key) => {
+            #         const entry = {
+            #             src: value.src,
+            #             status: value.status,
+            #             loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
+            #             naturalSize: value.naturalSize,
+            #             displaySize: value.displaySize
+            #         };
+            #         results.push(entry);
+            #     });
+            #
+            #     return {
+            #         total: results.length,
+            #         loaded: results.filter(r => r.status === 'loaded').length,
+            #         pending: results.filter(r => r.status === 'pending').length,
+            #         failed: results.filter(r => r.status === 'failed').length,
+            #         details: results
+            #     };
+            # })()
+            # """
+            #
+            # # Initialize tracking
+            # tab.run_js(image_tracking_js)
+            #
+            # # Monitor image loading with dynamic stop logic
+            # check_interval = 0.2  # Check every 200ms
+            # image_timeout = 3.0  # Image loading timeout: 3 seconds
+            # monitoring_start = time.time()
+            # loaded_times = []  # Track load times of completed images
+            #
+            # logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
+            #
+            # while True:
+            #     elapsed = time.time() - monitoring_start
+            #
+            #     # Check timeout first
+            #     if elapsed >= image_timeout:
+            #         logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
+            #         break
+            #
+            #     # Get current image status
+            #     status = tab.run_js(image_tracking_js, as_expr=True) or {
+            #         'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
+            #     }
+            #
+            #     # Log each image's status and load time
+            #     for img_detail in status.get('details', []):
+            #         src_short = img_detail.get('src', '')[:80]
+            #         status_str = img_detail.get('status', 'unknown')
+            #         load_time = img_detail.get('loadTime', 0)
+            #         logger.info(
+            #             f"ScreenshotService: Image [{status_str}] "
+            #             f"loadTime={load_time:.0f}ms "
+            #             f"src={src_short}"
+            #         )
+            #
+            #     # Collect load times of completed images
+            #     loaded_times = [
+            #         img.get('loadTime', 0)
+            #         for img in status.get('details', [])
+            #         if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
+            #     ]
+            #
+            #     pending_count = status.get('pending', 0)
+            #     loaded_count = status.get('loaded', 0)
+            #
+            #     # Check stop conditions
+            #     if pending_count == 0:
+            #         logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
+            #         break
+            #
+            #     # Check dynamic stop condition (if we have loaded images to calculate average)
+            #     if loaded_times:
+            #         avg_load_time = sum(loaded_times) / len(loaded_times)
+            #         max_wait_time = avg_load_time * 2
+            #
+            #         # Check if any pending image has exceeded max wait time
+            #         pending_images = [
+            #             img for img in status.get('details', [])
+            #             if img.get('status') == 'pending'
+            #         ]
+            #
+            #         should_stop = False
+            #         for pending_img in pending_images:
+            #             wait_time = pending_img.get('loadTime', 0)
+            #             if wait_time >= max_wait_time:
+            #                 should_stop = True
+            #                 logger.info(
+            #                     f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
+            #                     f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
+            #                 )
+            #                 break
+            #
+            #         if should_stop:
+            #             break
+            #
+            #     # Wait before next check
+            #     time.sleep(check_interval)
+            # Now calculate final height ONCE after all content loaded
+            # CompletenessChecker already verified height stability
             try:
                 final_height = tab.run_js('''
                     return Math.max(
@@ -728,13 +829,15 @@ class ScreenshotService:
                         document.documentElement.offsetHeight || 0
                     );
                 ''')
-                final_h = min(int(final_height) + 50, 15000)
-                if final_h != h:
-                    tab.run_cdp('Emulation.setDeviceMetricsOverride',
-                                width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
+                h = min(int(final_height) + 50, 15000)
+                tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                            width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
             except:
                 pass
+            # Final scroll to top
+            tab.run_js("window.scrollTo(0, 0);")
             # Use full_page=False because we manually set the viewport to the full height
             # This avoids stitching artifacts and blank spaces
             return tab.get_screenshot(as_base64='jpg', full_page=False)

hyw_core/core.py CHANGED Viewed

@@ -10,7 +10,6 @@ from dataclasses import dataclass, field
 from typing import Dict, List, Any, Optional, Callable, Awaitable
 from loguru import logger
-from openai import AsyncOpenAI
 from .config import HywCoreConfig, ModelConfig
 from .pipeline import ModularPipeline
@@ -95,12 +94,6 @@ class HywCore:
         self.config = config
         self._send_func = send_func
-        # Create OpenAI client
-        self._client = AsyncOpenAI(
-            api_key=config.api_key,
-            base_url=config.base_url if config.base_url else None
-        )
         # Create search service
         self._search_service = SearchService(config)

hyw_core/crawling/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""
+hyw_core.crawling - Intelligent Web Crawling Module
+Provides Crawl4AI-inspired adaptive crawling with:
+- Page completeness guarantees (image loading verification)
+- Content quality scoring
+- Adaptive stop logic
+"""
+from .models import CrawlConfig, PageResult, CompletenessResult
+from .completeness import CompletenessChecker
+__all__ = [
+    "CrawlConfig",
+    "PageResult",
+    "CompletenessResult",
+    "CompletenessChecker",
+]

entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc11__py3-none-any.whl

Potentially problematic release.

entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc11__py3-none-any.whl