entari-plugin-hyw 4.0.0rc8__py3-none-any.whl → 4.0.0rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

@@ -5,13 +5,11 @@ Provides search engine adapters for different search providers.
5
5
  """
6
6
 
7
7
  from .base import SearchEngine
8
- from .google import GoogleEngine
9
8
  from .duckduckgo import DuckDuckGoEngine
10
9
  from .default import DefaultEngine
11
10
 
12
11
  __all__ = [
13
12
  "SearchEngine",
14
- "GoogleEngine",
15
13
  "DuckDuckGoEngine",
16
14
  "DefaultEngine",
17
15
  ]
@@ -124,15 +124,28 @@ class SharedBrowserManager:
124
124
  @staticmethod
125
125
  def hide_scrollbars(page: ChromiumPage):
126
126
  """
127
- Robustly hide scrollbars using CDP commands.
128
- This eliminates the reserved space/gutter that standard CSS might miss.
127
+ Robustly hide scrollbars using CDP commands AND CSS injection.
128
+ This provides double protection against scrollbar gutters.
129
129
  """
130
130
  try:
131
- # Emulation.setScrollbarsHidden is a CDP command
131
+ # 1. CDP Command
132
132
  page.run_cdp('Emulation.setScrollbarsHidden', hidden=True)
133
- logger.debug("SharedBrowserManager: CDP scrollbars hidden.")
133
+
134
+ # 2. CSS Injection (Standard + Webkit)
135
+ css = """
136
+ ::-webkit-scrollbar { display: none !important; width: 0 !important; height: 0 !important; }
137
+ * { -ms-overflow-style: none !important; scrollbar-width: none !important; }
138
+ """
139
+ # Inject into current page
140
+ page.run_js(f"""
141
+ const style = document.createElement('style');
142
+ style.textContent = `{css}`;
143
+ document.head.appendChild(style);
144
+ """)
145
+
146
+ logger.debug("SharedBrowserManager: Scrollbars hidden via CDP + CSS.")
134
147
  except Exception as e:
135
- logger.warning(f"SharedBrowserManager: Failed to hide scrollbars via CDP: {e}")
148
+ logger.warning(f"SharedBrowserManager: Failed to hide scrollbars: {e}")
136
149
 
137
150
 
138
151
  # Module-level singleton accessor
@@ -55,16 +55,43 @@ class ContentRenderer:
55
55
  loop = asyncio.get_running_loop()
56
56
  return await loop.run_in_executor(self._executor, self._prepare_tab_sync)
57
57
 
58
def _wait_for_render_finished(self, tab, timeout: float = 12.0, context: str = "") -> bool:
    """Wait for window.RENDER_FINISHED to be true in the tab.

    If the flag is already truthy on entry it is treated as stale (left
    over from a previous render): the method waits up to one second for
    the page to reset it, and forces it to ``false`` if that does not
    happen. It then polls until the flag becomes truthy or ``timeout``
    seconds have elapsed since entry. The reset phase counts against the
    same budget.

    Args:
        tab: Tab object exposing ``run_js``.
        timeout: Overall budget in seconds, measured from method entry.
        context: Short label included in log messages.

    Returns:
        True if the flag was observed truthy, False on timeout.
    """
    import time as pytime

    poll_interval = 0.1   # main polling period (100ms)
    reset_grace = 1.0     # max time to wait for the page to clear a stale flag
    reset_interval = 0.05

    # Monotonic clock: elapsed-time measurement must not be affected by
    # system clock adjustments.
    start = pytime.monotonic()
    deadline = start + timeout

    # Check initial state
    initial_state = tab.run_js("return window.RENDER_FINISHED")
    logger.debug(f"ContentRenderer[{context}]: Starting wait, initial RENDER_FINISHED={initial_state}")

    # If already true, it's assumed stale from a previous render.
    # NOTE(review): if the new render can complete before this first check,
    # the forced reset below would discard a genuine completion and the
    # wait would run to the timeout - confirm against the page-side JS.
    if initial_state:
        logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED was true, waiting for reset...")
        reset_deadline = pytime.monotonic() + reset_grace
        while pytime.monotonic() < reset_deadline:
            if not tab.run_js("return window.RENDER_FINISHED"):
                logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED reset to false")
                break
            pytime.sleep(reset_interval)
        else:
            logger.warning(f"ContentRenderer[{context}]: RENDER_FINISHED not reset, force resetting via JS")
            tab.run_js("window.RENDER_FINISHED = false")

    # Now wait for it to become true. The flag is always polled at least
    # once, even when the reset phase used up the whole budget.
    poll_count = 0
    while True:
        is_finished = tab.run_js("return window.RENDER_FINISHED")
        poll_count += 1
        if is_finished:
            elapsed = pytime.monotonic() - start
            logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED=true after {elapsed:.2f}s ({poll_count} polls)")
            return True

        remaining = deadline - pytime.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        pytime.sleep(min(poll_interval, remaining))

    elapsed = pytime.monotonic() - start
    logger.warning(f"ContentRenderer[{context}]: Wait for RENDER_FINISHED timed out after {elapsed:.2f}s ({poll_count} polls)")
    return False
69
96
 
70
97
  def _prepare_tab_sync(self) -> str:
@@ -89,8 +116,9 @@ class ContentRenderer:
89
116
  "theme_color": "#ef4444",
90
117
  }
91
118
 
119
+ logger.debug(f"ContentRenderer: Calling warmup updateRenderData for tab {tab_id}")
92
120
  tab.run_js(f"window.updateRenderData({json.dumps(warmup_data)})")
93
- self._wait_for_render_finished(tab, timeout=5.0)
121
+ self._wait_for_render_finished(tab, timeout=12.0, context=f"warmup:{tab_id}")
94
122
 
95
123
  # Wait for main-container after warmup (Vue needs to render it)
96
124
  tab.ele('#main-container', timeout=3)
@@ -203,7 +231,7 @@ class ContentRenderer:
203
231
 
204
232
  # 1. Update Data & Wait for Finished flag
205
233
  tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
206
- self._wait_for_render_finished(tab)
234
+ self._wait_for_render_finished(tab, context=f"batch:{tab_id}")
207
235
 
208
236
  # 2. Dynamic Resize
209
237
  # Get actual content height to prevent clipping
@@ -330,10 +358,12 @@ class ContentRenderer:
330
358
  "theme_color": theme_color,
331
359
  }
332
360
 
361
+ actual_tab_id = getattr(tab, 'tab_id', 'unknown')
362
+ logger.info(f"ContentRenderer: Calling updateRenderData for tab {actual_tab_id}, markdown length={len(markdown_content)}")
333
363
  tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
334
364
 
335
365
  # Wait for event-driven finish
336
- self._wait_for_render_finished(tab, timeout=5.0)
366
+ self._wait_for_render_finished(tab, timeout=12.0, context=f"render:{actual_tab_id}")
337
367
 
338
368
  # Dynamic Resize
339
369
  scroll_height = tab.run_js('return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);')
@@ -13,6 +13,10 @@ from typing import Optional, Dict, Any, List
13
13
  from loguru import logger
14
14
  import trafilatura
15
15
 
16
+ # Import intelligent completeness checker
17
+ from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
18
+ from ..crawling.models import CrawlConfig
19
+
16
20
  class ScreenshotService:
17
21
  """
18
22
  Browser Service using DrissionPage.
@@ -537,31 +541,83 @@ class ScreenshotService:
537
541
  """Synchronous screenshot."""
538
542
  if not url: return None
539
543
  tab = None
544
+ capture_width = 1024 # Standard capture width
545
+
540
546
  try:
541
547
  self._ensure_ready()
542
548
  page = self._manager.page
543
549
  if not page: return None
544
550
 
545
- tab = page.new_tab(url)
551
+ # Create blank tab first
552
+ tab = page.new_tab()
553
+
554
+ # Set viewport BEFORE navigation so page renders at target width from the start
555
+ # This eliminates the need for post-load resize and reflow
556
+ try:
557
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
558
+ width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
559
+ except:
560
+ pass
561
+
562
+ # Now navigate to the URL - page will render at target width
563
+ tab.get(url)
564
+
565
+ # Start monitoring network traffic
566
+ try:
567
+ # Listen for data packets (XHR/Fetch/POST)
568
+ # Targets: xhr, fetch. POST usually falls under these or Document.
569
+ tab.listen.start(targets=True) # Listen to everything for now to be safe
570
+ except Exception as e:
571
+ logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
572
+
573
+ # Initialize crawl config for completeness checking (defined outside try for scope)
574
+ crawl_config = CrawlConfig(
575
+ scan_full_page=True,
576
+ scroll_step=800,
577
+ scroll_delay=0.5,
578
+ scroll_timeout=min(timeout, 10),
579
+ image_load_timeout=8.0,
580
+ image_stability_checks=3,
581
+ )
582
+
546
583
  try:
547
584
  # Wait for full page load (including JS execution)
548
585
  tab.wait.load_complete(timeout=timeout)
549
586
 
550
587
  # Wait for actual content to appear (for CDN verification pages)
551
588
  # Smart Wait Logic (Final Robust):
552
- # 1. FORCED WAIT: 1.5s to allow initial redirects/rendering to start.
553
- # 2. Browser ReadyState Complete
554
- # 3. Height Stable for 2.0 seconds (20 checks)
555
- # 4. Text > 100 chars (Crucial: Distinguishes stable content from stable spinners)
556
- # 5. No Blacklist phrases
589
+ # 1. Network Idle: Wait for silence in XHR/POST
590
+ # 2. Stability: Wait for Height/Text/DOM stability
557
591
 
558
592
  time.sleep(1.5) # user request: force wait 1.5s before detection
559
593
 
560
594
  last_h = 0
595
+ last_text_len = 0
596
+ last_html_len = 0
561
597
  stable_count = 0
562
598
 
563
599
  for i in range(200): # Max 200 iterations (~20s)
564
600
  try:
601
+ # 1. Check Network Activity
602
+ has_recent_network = False
603
+ try:
604
+ # Iterate over any captured packets since last check
605
+ for packet in tab.listen.steps(timeout=0.01):
606
+ # Check if it's a significant request (POST or XHR/Fetch)
607
+ method = packet.method.upper()
608
+ r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
609
+
610
+ # Interested in: POST requests OR any XHR/Fetch response
611
+ if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
612
+ # Ignore some common noise? (Optional: analytics, tracking)
613
+ # For now, simplistic approach: any API traffic resets stability
614
+ has_recent_network = True
615
+ # logger.debug(f"Network Activity: {method} {packet.url[:50]}")
616
+ break
617
+ except:
618
+ pass
619
+
620
+ # 2. Check DOM State
565
621
  state = tab.run_js('''
566
622
  return {
567
623
  ready: document.readyState === 'complete',
@@ -571,42 +627,62 @@ class ScreenshotService:
571
627
  document.documentElement.scrollHeight || 0
572
628
  ),
573
629
  text: document.body.innerText.substring(0, 1000) || "",
574
- html: document.body.innerHTML.substring(0, 500) // Debug intro
630
+ html_len: document.body.innerHTML.length || 0
575
631
  };
576
- ''') or {'ready': False, 'title': "", 'height': 0, 'text': ""}
632
+ ''') or {'ready': False, 'title': "", 'height': 0, 'text': "", 'html_len': 0}
577
633
 
578
634
  is_ready = state.get('ready', False)
579
635
  title = state.get('title', "").lower()
580
636
  current_h = int(state.get('height', 0))
581
637
  text_content = state.get('text', "")
582
638
  text_len = len(text_content)
639
+ html_len = int(state.get('html_len', 0))
583
640
  text_lower = text_content.lower()
584
641
 
585
- # Blacklist check
642
+ # Blacklist check (Loading indicators)
586
643
  is_verification = "checking your browser" in text_lower or \
587
644
  "just a moment" in text_lower or \
588
645
  "please wait" in text_lower or \
589
646
  "security check" in title or \
590
- "just a moment" in title
647
+ "just a moment" in title or \
648
+ "loading..." in title
591
649
 
592
- # Stability check
593
- if current_h == last_h:
650
+ # Stability check (Multi-metric + Network)
651
+ # We require STABILITY across Height, Text Length, DOM Size AND Network Silence
652
+ is_height_stable = current_h == last_h
653
+ is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
654
+ is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
655
+
656
+ if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
594
657
  stable_count += 1
595
658
  else:
659
+ # Reset if ANY metric changed or NETWORK active
596
660
  stable_count = 0
661
+ # if has_recent_network: logger.debug("Stability reset: Network Activity")
597
662
 
598
663
  # Conditions
599
664
  has_content = text_len > 100 # At least 100 real chars
600
- is_stable = stable_count >= 20 # Always require 2s stability
665
+
666
+ # Dynamic Stability Requirement:
667
+ # If page looks like it's loading (small content), require longer stability
668
+ if has_recent_network:
669
+ # If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
670
+ required_stability = max(10, 40 if text_len < 500 else 25)
671
+ else:
672
+ required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
673
+
674
+ is_stable = stable_count >= required_stability
601
675
 
602
676
  # Pass if all conditions met
603
677
  if is_ready and not is_verification and has_content and is_stable:
604
678
  break
605
679
 
606
680
  last_h = current_h
681
+ last_text_len = text_len
682
+ last_html_len = html_len
607
683
 
608
- # Wait timing
609
- try: tab.wait.eles_loaded(timeout=0.1)
684
+ # Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
685
+ try: time.sleep(0.05)
610
686
  except: pass
611
687
 
612
688
  except Exception:
@@ -615,6 +691,10 @@ class ScreenshotService:
615
691
  except: pass
616
692
  continue
617
693
 
694
+ # Cleanup listener
695
+ try: tab.listen.stop()
696
+ except: pass
697
+
618
698
  # DEBUG: Save HTML to inspect what happened (in data dir)
619
699
  try:
620
700
  import os
@@ -624,48 +704,33 @@ class ScreenshotService:
624
704
  f.write(tab.html)
625
705
  except: pass
626
706
 
627
- # Use faster scroll step (800) to ensure lazy loaded images appear
628
- self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
707
+ # Use crawling module for lazy loading trigger (config defined above)
708
+ trigger_lazy_load(tab, crawl_config)
629
709
 
630
710
  except:
631
711
  pass
632
712
 
633
- # Refine calculation: Set viewport width to 1024
634
- capture_width = 1024
635
-
636
- # Calculate actual content height after lazy loading
637
- try:
638
- # Use a robust height calculation
639
- content_height = tab.run_js('''
640
- return Math.max(
641
- document.body.scrollHeight || 0,
642
- document.documentElement.scrollHeight || 0,
643
- document.body.offsetHeight || 0,
644
- document.documentElement.offsetHeight || 0,
645
- document.documentElement.clientHeight || 0
646
- );
647
- ''')
648
- # Add a small buffer and cap at 15000px to prevent memory issues
649
- h = min(int(content_height) + 50, 15000)
650
- except:
651
- h = 1000 # Fallback
652
-
653
- # Set viewport to full content size for single-shot capture
654
- try:
655
- tab.run_cdp('Emulation.setDeviceMetricsOverride',
656
- width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
657
- except:
658
- pass
659
-
660
- # Scrollbar Hiding
713
+ # Scrollbar Hiding first (before any height calculation)
661
714
  from .manager import SharedBrowserManager
662
715
  SharedBrowserManager.hide_scrollbars(tab)
663
716
 
664
- # Scroll back to top before screenshot
717
+ # Scroll back to top
665
718
  tab.run_js("window.scrollTo(0, 0);")
666
719
 
667
- # Content is already loaded by _scroll_to_bottom which waits for images
668
- # Just recalculate final height (content may have grown during scrolling)
720
+ # Use CompletenessChecker to verify all images are loaded
721
+ checker = CompletenessChecker(crawl_config)
722
+ completeness = checker.wait_for_complete(tab, timeout=crawl_config.image_load_timeout)
723
+
724
+ logger.info(
725
+ f"ScreenshotService: Image completeness: "
726
+ f"{completeness.loaded_images}/{completeness.total_images} loaded, "
727
+ f"{completeness.failed_images} pending, "
728
+ f"{completeness.placeholder_images} placeholders, "
729
+ f"complete={completeness.is_complete}"
730
+ )
731
+
732
+ # Now calculate final height ONCE after all content loaded
733
+ # CompletenessChecker already verified height stability
669
734
  try:
670
735
  final_height = tab.run_js('''
671
736
  return Math.max(
@@ -675,13 +740,15 @@ class ScreenshotService:
675
740
  document.documentElement.offsetHeight || 0
676
741
  );
677
742
  ''')
678
- final_h = min(int(final_height) + 50, 15000)
679
- if final_h != h:
680
- tab.run_cdp('Emulation.setDeviceMetricsOverride',
681
- width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
743
+ h = min(int(final_height) + 50, 15000)
744
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
745
+ width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
682
746
  except:
683
747
  pass
684
748
 
749
+ # Final scroll to top
750
+ tab.run_js("window.scrollTo(0, 0);")
751
+
685
752
  # Use full_page=False because we manually set the viewport to the full height
686
753
  # This avoids stitching artifacts and blank spaces
687
754
  return tab.get_screenshot(as_base64='jpg', full_page=False)
@@ -0,0 +1,18 @@
1
+ """
2
+ hyw_core.crawling - Intelligent Web Crawling Module
3
+
4
+ Provides Crawl4AI-inspired adaptive crawling with:
5
+ - Page completeness guarantees (image loading verification)
6
+ - Content quality scoring
7
+ - Adaptive stop logic
8
+ """
9
+
10
+ from .models import CrawlConfig, PageResult, CompletenessResult
11
+ from .completeness import CompletenessChecker
12
+
13
+ __all__ = [
14
+ "CrawlConfig",
15
+ "PageResult",
16
+ "CompletenessResult",
17
+ "CompletenessChecker",
18
+ ]