entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

@@ -5,13 +5,11 @@ Provides search engine adapters for different search providers.
5
5
  """
6
6
 
7
7
  from .base import SearchEngine
8
- from .google import GoogleEngine
9
8
  from .duckduckgo import DuckDuckGoEngine
10
9
  from .default import DefaultEngine
11
10
 
12
11
  __all__ = [
13
12
  "SearchEngine",
14
- "GoogleEngine",
15
13
  "DuckDuckGoEngine",
16
14
  "DefaultEngine",
17
15
  ]
@@ -124,15 +124,28 @@ class SharedBrowserManager:
124
124
  @staticmethod
125
125
  def hide_scrollbars(page: ChromiumPage):
126
126
  """
127
- Robustly hide scrollbars using CDP commands.
128
- This eliminates the reserved space/gutter that standard CSS might miss.
127
+ Robustly hide scrollbars using CDP commands AND CSS injection.
128
+ This provides double protection against scrollbar gutters.
129
129
  """
130
130
  try:
131
- # Emulation.setScrollbarsHidden is a CDP command
131
+ # 1. CDP Command
132
132
  page.run_cdp('Emulation.setScrollbarsHidden', hidden=True)
133
- logger.debug("SharedBrowserManager: CDP scrollbars hidden.")
133
+
134
+ # 2. CSS Injection (Standard + Webkit)
135
+ css = """
136
+ ::-webkit-scrollbar { display: none !important; width: 0 !important; height: 0 !important; }
137
+ * { -ms-overflow-style: none !important; scrollbar-width: none !important; }
138
+ """
139
+ # Inject into current page
140
+ page.run_js(f"""
141
+ const style = document.createElement('style');
142
+ style.textContent = `{css}`;
143
+ document.head.appendChild(style);
144
+ """)
145
+
146
+ logger.debug("SharedBrowserManager: Scrollbars hidden via CDP + CSS.")
134
147
  except Exception as e:
135
- logger.warning(f"SharedBrowserManager: Failed to hide scrollbars via CDP: {e}")
148
+ logger.warning(f"SharedBrowserManager: Failed to hide scrollbars: {e}")
136
149
 
137
150
 
138
151
  # Module-level singleton accessor
@@ -55,16 +55,43 @@ class ContentRenderer:
55
55
  loop = asyncio.get_running_loop()
56
56
  return await loop.run_in_executor(self._executor, self._prepare_tab_sync)
57
57
 
58
- def _wait_for_render_finished(self, tab, timeout: float = 3.0):
58
+ def _wait_for_render_finished(self, tab, timeout: float = 12.0, context: str = ""):
59
59
  """Wait for window.RENDER_FINISHED to be true in the tab."""
60
60
  import time as pytime
61
61
  start = pytime.time()
62
+
63
+ # Check initial state
64
+ initial_state = tab.run_js("return window.RENDER_FINISHED")
65
+ logger.debug(f"ContentRenderer[{context}]: Starting wait, initial RENDER_FINISHED={initial_state}")
66
+
67
+ # If already true, it's stale from previous render - need to wait for JS to reset it
68
+ if initial_state:
69
+ logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED was true, waiting for reset...")
70
+ # Wait for JS to reset it to false (updateRenderData sets it to false)
71
+ reset_start = pytime.time()
72
+ while pytime.time() - reset_start < 1.0: # 1s max to wait for reset
73
+ is_reset = tab.run_js("return window.RENDER_FINISHED")
74
+ if not is_reset:
75
+ logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED reset to false")
76
+ break
77
+ pytime.sleep(0.05)
78
+ else:
79
+ logger.warning(f"ContentRenderer[{context}]: RENDER_FINISHED not reset, force resetting via JS")
80
+ tab.run_js("window.RENDER_FINISHED = false")
81
+
82
+ # Now wait for it to become true
83
+ poll_count = 0
62
84
  while pytime.time() - start < timeout:
63
85
  is_finished = tab.run_js("return window.RENDER_FINISHED")
86
+ poll_count += 1
64
87
  if is_finished:
88
+ elapsed = pytime.time() - start
89
+ logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED=true after {elapsed:.2f}s ({poll_count} polls)")
65
90
  return True
66
- pytime.sleep(0.05) # Fast polling
67
- logger.warning(f"ContentRenderer: Wait for RENDER_FINISHED timed out after {timeout}s")
91
+ pytime.sleep(0.1) # Poll every 100ms
92
+
93
+ elapsed = pytime.time() - start
94
+ logger.warning(f"ContentRenderer[{context}]: Wait for RENDER_FINISHED timed out after {elapsed:.2f}s ({poll_count} polls)")
68
95
  return False
69
96
 
70
97
  def _prepare_tab_sync(self) -> str:
@@ -89,8 +116,9 @@ class ContentRenderer:
89
116
  "theme_color": "#ef4444",
90
117
  }
91
118
 
119
+ logger.debug(f"ContentRenderer: Calling warmup updateRenderData for tab {tab_id}")
92
120
  tab.run_js(f"window.updateRenderData({json.dumps(warmup_data)})")
93
- self._wait_for_render_finished(tab, timeout=5.0)
121
+ self._wait_for_render_finished(tab, timeout=12.0, context=f"warmup:{tab_id}")
94
122
 
95
123
  # Wait for main-container after warmup (Vue needs to render it)
96
124
  tab.ele('#main-container', timeout=3)
@@ -203,7 +231,7 @@ class ContentRenderer:
203
231
 
204
232
  # 1. Update Data & Wait for Finished flag
205
233
  tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
206
- self._wait_for_render_finished(tab)
234
+ self._wait_for_render_finished(tab, context=f"batch:{tab_id}")
207
235
 
208
236
  # 2. Dynamic Resize
209
237
  # Get actual content height to prevent clipping
@@ -330,10 +358,12 @@ class ContentRenderer:
330
358
  "theme_color": theme_color,
331
359
  }
332
360
 
361
+ actual_tab_id = getattr(tab, 'tab_id', 'unknown')
362
+ logger.info(f"ContentRenderer: Calling updateRenderData for tab {actual_tab_id}, markdown length={len(markdown_content)}")
333
363
  tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
334
364
 
335
365
  # Wait for event-driven finish
336
- self._wait_for_render_finished(tab, timeout=5.0)
366
+ self._wait_for_render_finished(tab, timeout=12.0, context=f"render:{actual_tab_id}")
337
367
 
338
368
  # Dynamic Resize
339
369
  scroll_height = tab.run_js('return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);')
@@ -13,6 +13,10 @@ from typing import Optional, Dict, Any, List
13
13
  from loguru import logger
14
14
  import trafilatura
15
15
 
16
+ # Import intelligent completeness checker
17
+ from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
18
+ from ..crawling.models import CrawlConfig
19
+
16
20
  class ScreenshotService:
17
21
  """
18
22
  Browser Service using DrissionPage.
@@ -281,14 +285,27 @@ class ScreenshotService:
281
285
  // 2. Check loading status using decode() AND heuristic for placeholders
282
286
  // Some sites load a tiny blurred placeholder first.
283
287
  const checks = visibleImgs.map(img => {
284
- // HEURISTIC: content is likely not ready if:
285
- // - img has 'data-src' but src is different (or src is empty)
286
- // - img has 'loading="lazy"' and is not complete
287
- // - naturalWidth is very small (placeholder) compared to display width
288
+ // Enhanced placeholder detection (matching completeness.py)
289
+ const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
290
+ img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
291
+ const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
292
+ const loadingAttr = img.getAttribute('loading') || '';
293
+ const src = img.src || '';
288
294
 
289
295
  const isPlaceholder = (
290
- (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
291
- (img.naturalWidth < 50 && img.clientWidth > 100)
296
+ // data-src not yet loaded
297
+ (dataSrc && img.src !== dataSrc) ||
298
+ // Natural size much smaller than display (blurred placeholder)
299
+ (img.naturalWidth < 50 && img.clientWidth > 100) ||
300
+ (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
301
+ // Lazy-loading class indicators
302
+ className.includes('lazy') ||
303
+ className.includes('lazyload') ||
304
+ className.includes('lozad') ||
305
+ // CSS blur filter applied
306
+ (window.getComputedStyle(img).filter || '').includes('blur') ||
307
+ // loading="lazy" + not complete
308
+ (loadingAttr === 'lazy' && !img.complete)
292
309
  );
293
310
 
294
311
  if (isPlaceholder) {
@@ -537,81 +554,71 @@ class ScreenshotService:
537
554
  """Synchronous screenshot."""
538
555
  if not url: return None
539
556
  tab = None
557
+ capture_width = 1024 # Standard capture width
558
+
540
559
  try:
541
560
  self._ensure_ready()
542
561
  page = self._manager.page
543
562
  if not page: return None
544
563
 
545
- tab = page.new_tab(url)
564
+ # Create blank tab first
565
+ tab = page.new_tab()
546
566
 
547
- # Start monitoring network traffic
567
+ # Set viewport BEFORE navigation so page renders at target width from the start
568
+ # This eliminates the need for post-load resize and reflow
548
569
  try:
549
- # Listen for data packets (XHR/Fetch/POST)
550
- # Targets: xhr, fetch. POST usually falls under these or Document.
551
- tab.listen.start(targets=True) # Listen to everything for now to be safe
552
- except Exception as e:
553
- logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
554
-
570
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
571
+ width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
572
+ except:
573
+ pass
574
+
575
+ # Now navigate to the URL - page will render at target width
576
+ tab.get(url)
577
+
578
+ # Network monitoring removed - not needed for simplified wait logic
579
+
580
+ # Initialize crawl config for completeness checking (defined outside try for scope)
581
+ crawl_config = CrawlConfig(
582
+ scan_full_page=True,
583
+ scroll_step=800,
584
+ scroll_delay=0.5,
585
+ scroll_timeout=20.0, # Increased for lazy-loading pages
586
+ image_load_timeout=3.0, # Image loading timeout: 3 seconds
587
+ image_stability_checks=3,
588
+ )
589
+
590
+ # === Start scrolling immediately to trigger lazy loading ===
591
+ # Scroll first, then wait for DOM - this allows lazy loading to start in parallel
592
+ logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
593
+ trigger_lazy_load(tab, crawl_config)
594
+
595
+ # Now wait for DOM to be ready (after scroll has triggered lazy loading)
555
596
  try:
556
597
  # Wait for full page load (including JS execution)
557
- tab.wait.load_complete(timeout=timeout)
598
+ try:
599
+ tab.wait.doc_loaded(timeout=timeout)
600
+ except:
601
+ pass
558
602
 
559
603
  # Wait for actual content to appear (for CDN verification pages)
560
- # Smart Wait Logic (Final Robust):
561
- # 1. Network Idle: Wait for silence in XHR/POST
562
- # 2. Stability: Wait for Height/Text/DOM stability
604
+ # Simplified wait logic for screenshot: just check basic readiness
563
605
 
564
- time.sleep(1.5) # user request: force wait 1.5s before detection
565
-
566
- last_h = 0
567
- last_text_len = 0
568
- last_html_len = 0
569
- stable_count = 0
570
-
571
- for i in range(200): # Max 200 iterations (~20s)
606
+ for i in range(20): # Max 20 iterations (~1s) - much faster
572
607
  try:
573
- # 1. Check Network Activity
574
- has_recent_network = False
575
- try:
576
- # Iterate over any captured packets since last check
577
- for packet in tab.listen.steps(timeout=0.01):
578
- # Check if it's a significant request (POST or XHR/Fetch)
579
- method = packet.method.upper()
580
- r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
581
-
582
- # Interested in: POST requests OR any XHR/Fetch response
583
- if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
584
- # Ignore some common noise? (Optional: analytics, tracking)
585
- # For now, simplistic approach: any API traffic resets stability
586
- has_recent_network = True
587
- # logger.debug(f"Network Activity: {method} {packet.url[:50]}")
588
- break
589
- except:
590
- pass
591
-
592
- # 2. Check DOM State
593
608
  state = tab.run_js('''
594
609
  return {
595
610
  ready: document.readyState === 'complete',
596
611
  title: document.title,
597
- height: Math.max(
598
- document.body.scrollHeight || 0,
599
- document.documentElement.scrollHeight || 0
600
- ),
601
- text: document.body.innerText.substring(0, 1000) || "",
602
- html_len: document.body.innerHTML.length || 0
612
+ text: document.body.innerText.substring(0, 500) || ""
603
613
  };
604
- ''') or {'ready': False, 'title': "", 'height': 0, 'text': "", 'html_len': 0}
614
+ ''') or {'ready': False, 'title': "", 'text': ""}
605
615
 
606
616
  is_ready = state.get('ready', False)
607
617
  title = state.get('title', "").lower()
608
- current_h = int(state.get('height', 0))
609
- text_content = state.get('text', "")
610
- text_len = len(text_content)
611
- html_len = int(state.get('html_len', 0))
612
- text_lower = text_content.lower()
618
+ text_lower = state.get('text', "").lower()
619
+ text_len = len(text_lower)
613
620
 
614
- # Blacklist check (Loading indicators)
621
+ # Check for verification pages
615
622
  is_verification = "checking your browser" in text_lower or \
616
623
  "just a moment" in text_lower or \
617
624
  "please wait" in text_lower or \
@@ -619,54 +626,19 @@ class ScreenshotService:
619
626
  "just a moment" in title or \
620
627
  "loading..." in title
621
628
 
622
- # Stability check (Multi-metric + Network)
623
- # We require STABILITY across Height, Text Length, DOM Size AND Network Silence
624
- is_height_stable = current_h == last_h
625
- is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
626
- is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
627
-
628
- if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
629
- stable_count += 1
630
- else:
631
- # Reset if ANY metric changed or NETWORK active
632
- stable_count = 0
633
- # if has_recent_network: logger.debug("Stability reset: Network Activity")
629
+ # Basic content check
630
+ has_content = text_len > 100
634
631
 
635
- # Conditions
636
- has_content = text_len > 100 # At least 100 real chars
637
-
638
- # Dynamic Stability Requirement:
639
- # If page looks like it's loading (small content), require longer stability
640
- if has_recent_network:
641
- # If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
642
- required_stability = max(10, 40 if text_len < 500 else 25)
643
- else:
644
- required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
645
-
646
- is_stable = stable_count >= required_stability
647
-
648
- # Pass if all conditions met
649
- if is_ready and not is_verification and has_content and is_stable:
632
+ # Pass if ready, not verification, and has content
633
+ if is_ready and not is_verification and has_content:
634
+ logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
650
635
  break
651
-
652
- last_h = current_h
653
- last_text_len = text_len
654
- last_html_len = html_len
655
-
656
- # Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
657
- try: time.sleep(0.05)
658
- except: pass
659
636
 
637
+ time.sleep(0.05)
660
638
  except Exception:
661
- stable_count = 0
662
- try: time.sleep(0.1)
663
- except: pass
639
+ time.sleep(0.05)
664
640
  continue
665
641
 
666
- # Cleanup listener
667
- try: tab.listen.stop()
668
- except: pass
669
-
670
642
  # DEBUG: Save HTML to inspect what happened (in data dir)
671
643
  try:
672
644
  import os
@@ -675,50 +647,179 @@ class ScreenshotService:
675
647
  f.write(f"<!-- URL: {url} -->\n")
676
648
  f.write(tab.html)
677
649
  except: pass
678
-
679
- # Use faster scroll step (800) to ensure lazy loaded images appear
680
- self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
681
-
682
- except:
683
- pass
684
-
685
- # Refine calculation: Set viewport width to 1024
686
650
 
687
- capture_width = 1024
688
-
689
- # Calculate actual content height after lazy loading
690
- try:
691
- # Use a robust height calculation
692
- content_height = tab.run_js('''
693
- return Math.max(
694
- document.body.scrollHeight || 0,
695
- document.documentElement.scrollHeight || 0,
696
- document.body.offsetHeight || 0,
697
- document.documentElement.offsetHeight || 0,
698
- document.documentElement.clientHeight || 0
699
- );
700
- ''')
701
- # Add a small buffer and cap at 15000px to prevent memory issues
702
- h = min(int(content_height) + 50, 15000)
703
- except:
704
- h = 1000 # Fallback
651
+ except Exception as e:
652
+ logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
705
653
 
706
- # Set viewport to full content size for single-shot capture
707
- try:
708
- tab.run_cdp('Emulation.setDeviceMetricsOverride',
709
- width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
710
- except:
711
- pass
712
-
713
- # Scrollbar Hiding
654
+ # Scrollbar Hiding first (before any height calculation)
714
655
  from .manager import SharedBrowserManager
715
656
  SharedBrowserManager.hide_scrollbars(tab)
716
657
 
717
- # Scroll back to top before screenshot
658
+ # Scroll back to top
718
659
  tab.run_js("window.scrollTo(0, 0);")
719
660
 
720
- # Content is already loaded by _scroll_to_bottom which waits for images
721
- # Just recalculate final height (content may have grown during scrolling)
661
+ # Image loading monitoring with time tracking - DISABLED
662
+ # No longer waiting for images to load
663
+ # Initialize image tracking JavaScript
664
+ # image_tracking_js = """
665
+ # (() => {
666
+ # if (!window._imageLoadTracker) {
667
+ # window._imageLoadTracker = {
668
+ # startTime: Date.now(),
669
+ # images: new Map()
670
+ # };
671
+ #
672
+ # const imgs = Array.from(document.querySelectorAll('img'));
673
+ # const minSize = 50;
674
+ #
675
+ # imgs.forEach((img, idx) => {
676
+ # if (img.clientWidth < minSize && img.clientHeight < minSize) return;
677
+ #
678
+ # const src = img.src || img.getAttribute('data-src') || '';
679
+ # const key = `${idx}_${src.substring(0, 100)}`;
680
+ #
681
+ # if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
682
+ # // Already loaded
683
+ # window._imageLoadTracker.images.set(key, {
684
+ # src: src.substring(0, 150),
685
+ # status: 'loaded',
686
+ # loadTime: 0, // Already loaded before tracking
687
+ # naturalSize: [img.naturalWidth, img.naturalHeight],
688
+ # displaySize: [img.clientWidth, img.clientHeight]
689
+ # });
690
+ # } else {
691
+ # // Track loading
692
+ # window._imageLoadTracker.images.set(key, {
693
+ # src: src.substring(0, 150),
694
+ # status: 'pending',
695
+ # startTime: Date.now(),
696
+ # naturalSize: [img.naturalWidth, img.naturalHeight],
697
+ # displaySize: [img.clientWidth, img.clientHeight]
698
+ # });
699
+ #
700
+ # // Add load event listener
701
+ # img.addEventListener('load', () => {
702
+ # const entry = window._imageLoadTracker.images.get(key);
703
+ # if (entry && entry.status === 'pending') {
704
+ # entry.status = 'loaded';
705
+ # entry.loadTime = Date.now() - entry.startTime;
706
+ # }
707
+ # });
708
+ #
709
+ # img.addEventListener('error', () => {
710
+ # const entry = window._imageLoadTracker.images.get(key);
711
+ # if (entry && entry.status === 'pending') {
712
+ # entry.status = 'failed';
713
+ # entry.loadTime = Date.now() - entry.startTime;
714
+ # }
715
+ # });
716
+ # }
717
+ # });
718
+ # }
719
+ #
720
+ # // Return current status
721
+ # const results = [];
722
+ # window._imageLoadTracker.images.forEach((value, key) => {
723
+ # const entry = {
724
+ # src: value.src,
725
+ # status: value.status,
726
+ # loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
727
+ # naturalSize: value.naturalSize,
728
+ # displaySize: value.displaySize
729
+ # };
730
+ # results.push(entry);
731
+ # });
732
+ #
733
+ # return {
734
+ # total: results.length,
735
+ # loaded: results.filter(r => r.status === 'loaded').length,
736
+ # pending: results.filter(r => r.status === 'pending').length,
737
+ # failed: results.filter(r => r.status === 'failed').length,
738
+ # details: results
739
+ # };
740
+ # })()
741
+ # """
742
+ #
743
+ # # Initialize tracking
744
+ # tab.run_js(image_tracking_js)
745
+ #
746
+ # # Monitor image loading with dynamic stop logic
747
+ # check_interval = 0.2 # Check every 200ms
748
+ # image_timeout = 3.0 # Image loading timeout: 3 seconds
749
+ # monitoring_start = time.time()
750
+ # loaded_times = [] # Track load times of completed images
751
+ #
752
+ # logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
753
+ #
754
+ # while True:
755
+ # elapsed = time.time() - monitoring_start
756
+ #
757
+ # # Check timeout first
758
+ # if elapsed >= image_timeout:
759
+ # logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
760
+ # break
761
+ #
762
+ # # Get current image status
763
+ # status = tab.run_js(image_tracking_js, as_expr=True) or {
764
+ # 'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
765
+ # }
766
+ #
767
+ # # Log each image's status and load time
768
+ # for img_detail in status.get('details', []):
769
+ # src_short = img_detail.get('src', '')[:80]
770
+ # status_str = img_detail.get('status', 'unknown')
771
+ # load_time = img_detail.get('loadTime', 0)
772
+ # logger.info(
773
+ # f"ScreenshotService: Image [{status_str}] "
774
+ # f"loadTime={load_time:.0f}ms "
775
+ # f"src={src_short}"
776
+ # )
777
+ #
778
+ # # Collect load times of completed images
779
+ # loaded_times = [
780
+ # img.get('loadTime', 0)
781
+ # for img in status.get('details', [])
782
+ # if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
783
+ # ]
784
+ #
785
+ # pending_count = status.get('pending', 0)
786
+ # loaded_count = status.get('loaded', 0)
787
+ #
788
+ # # Check stop conditions
789
+ # if pending_count == 0:
790
+ # logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
791
+ # break
792
+ #
793
+ # # Check dynamic stop condition (if we have loaded images to calculate average)
794
+ # if loaded_times:
795
+ # avg_load_time = sum(loaded_times) / len(loaded_times)
796
+ # max_wait_time = avg_load_time * 2
797
+ #
798
+ # # Check if any pending image has exceeded max wait time
799
+ # pending_images = [
800
+ # img for img in status.get('details', [])
801
+ # if img.get('status') == 'pending'
802
+ # ]
803
+ #
804
+ # should_stop = False
805
+ # for pending_img in pending_images:
806
+ # wait_time = pending_img.get('loadTime', 0)
807
+ # if wait_time >= max_wait_time:
808
+ # should_stop = True
809
+ # logger.info(
810
+ # f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
811
+ # f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
812
+ # )
813
+ # break
814
+ #
815
+ # if should_stop:
816
+ # break
817
+ #
818
+ # # Wait before next check
819
+ # time.sleep(check_interval)
820
+
821
+ # Now calculate final height ONCE after all content loaded
822
+ # CompletenessChecker already verified height stability
722
823
  try:
723
824
  final_height = tab.run_js('''
724
825
  return Math.max(
@@ -728,13 +829,15 @@ class ScreenshotService:
728
829
  document.documentElement.offsetHeight || 0
729
830
  );
730
831
  ''')
731
- final_h = min(int(final_height) + 50, 15000)
732
- if final_h != h:
733
- tab.run_cdp('Emulation.setDeviceMetricsOverride',
734
- width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
832
+ h = min(int(final_height) + 50, 15000)
833
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
834
+ width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
735
835
  except:
736
836
  pass
737
837
 
838
+ # Final scroll to top
839
+ tab.run_js("window.scrollTo(0, 0);")
840
+
738
841
  # Use full_page=False because we manually set the viewport to the full height
739
842
  # This avoids stitching artifacts and blank spaces
740
843
  return tab.get_screenshot(as_base64='jpg', full_page=False)
hyw_core/core.py CHANGED
@@ -10,7 +10,6 @@ from dataclasses import dataclass, field
10
10
  from typing import Dict, List, Any, Optional, Callable, Awaitable
11
11
 
12
12
  from loguru import logger
13
- from openai import AsyncOpenAI
14
13
 
15
14
  from .config import HywCoreConfig, ModelConfig
16
15
  from .pipeline import ModularPipeline
@@ -95,12 +94,6 @@ class HywCore:
95
94
  self.config = config
96
95
  self._send_func = send_func
97
96
 
98
- # Create OpenAI client
99
- self._client = AsyncOpenAI(
100
- api_key=config.api_key,
101
- base_url=config.base_url if config.base_url else None
102
- )
103
-
104
97
  # Create search service
105
98
  self._search_service = SearchService(config)
106
99
 
@@ -0,0 +1,18 @@
1
+ """
2
+ hyw_core.crawling - Intelligent Web Crawling Module
3
+
4
+ Provides Crawl4AI-inspired adaptive crawling with:
5
+ - Page completeness guarantees (image loading verification)
6
+ - Content quality scoring
7
+ - Adaptive stop logic
8
+ """
9
+
10
+ from .models import CrawlConfig, PageResult, CompletenessResult
11
+ from .completeness import CompletenessChecker
12
+
13
+ __all__ = [
14
+ "CrawlConfig",
15
+ "PageResult",
16
+ "CompletenessResult",
17
+ "CompletenessChecker",
18
+ ]