entari-plugin-hyw 4.0.0rc10__py3-none-any.whl → 4.0.0rc12__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry where they were published. It is provided for informational purposes only.
@@ -285,14 +285,27 @@ class ScreenshotService:
285
285
  // 2. Check loading status using decode() AND heuristic for placeholders
286
286
  // Some sites load a tiny blurred placeholder first.
287
287
  const checks = visibleImgs.map(img => {
288
- // HEURISTIC: content is likely not ready if:
289
- // - img has 'data-src' but src is different (or src is empty)
290
- // - img has 'loading="lazy"' and is not complete
291
- // - naturalWidth is very small (placeholder) compared to display width
288
+ // Enhanced placeholder detection (matching completeness.py)
289
+ const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
290
+ img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
291
+ const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
292
+ const loadingAttr = img.getAttribute('loading') || '';
293
+ const src = img.src || '';
292
294
 
293
295
  const isPlaceholder = (
294
- (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
295
- (img.naturalWidth < 50 && img.clientWidth > 100)
296
+ // data-src not yet loaded
297
+ (dataSrc && img.src !== dataSrc) ||
298
+ // Natural size much smaller than display (blurred placeholder)
299
+ (img.naturalWidth < 50 && img.clientWidth > 100) ||
300
+ (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
301
+ // Lazy-loading class indicators
302
+ className.includes('lazy') ||
303
+ className.includes('lazyload') ||
304
+ className.includes('lozad') ||
305
+ // CSS blur filter applied
306
+ (window.getComputedStyle(img).filter || '').includes('blur') ||
307
+ // loading="lazy" + not complete
308
+ (loadingAttr === 'lazy' && !img.complete)
296
309
  );
297
310
 
298
311
  if (isPlaceholder) {
@@ -529,19 +542,54 @@ class ScreenshotService:
529
542
  return await asyncio.gather(*tasks, return_exceptions=True)
530
543
 
531
544
  async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
532
- """Screenshot URL (Async wrapper for sync)."""
545
+ """Screenshot URL (Async wrapper for sync). Returns base64 string only."""
533
546
  loop = asyncio.get_running_loop()
534
- return await loop.run_in_executor(
547
+ result = await loop.run_in_executor(
548
+ self._executor,
549
+ self._screenshot_sync,
550
+ url, wait_load, timeout, full_page, quality, False # extract_content=False
551
+ )
552
+ # Backward compatible: return just the screenshot for old callers
553
+ if isinstance(result, dict):
554
+ return result.get("screenshot_b64")
555
+ return result
556
+
557
+ async def screenshot_with_content(self, url: str, timeout: float = 15.0, max_content_length: int = 8000) -> Dict[str, Any]:
558
+ """
559
+ Screenshot URL and extract page content.
560
+
561
+ Returns:
562
+ Dict with:
563
+ - screenshot_b64: base64 encoded screenshot
564
+ - content: trafilatura extracted text (truncated to max_content_length)
565
+ - title: page title
566
+ - url: final URL
567
+ """
568
+ loop = asyncio.get_running_loop()
569
+ result = await loop.run_in_executor(
535
570
  self._executor,
536
571
  self._screenshot_sync,
537
- url, wait_load, timeout, full_page, quality
572
+ url, True, timeout, False, 65, True # quality=65 for balance, extract_content=True
538
573
  )
574
+
575
+ if not isinstance(result, dict):
576
+ return {"screenshot_b64": result, "content": "", "title": "", "url": url}
577
+
578
+ # Truncate content if needed
579
+ content = result.get("content", "") or ""
580
+ if len(content) > max_content_length:
581
+ content = content[:max_content_length] + "\n\n[内容已截断...]"
582
+ result["content"] = content
583
+
584
+ return result
539
585
 
540
- def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int) -> Optional[str]:
541
- """Synchronous screenshot."""
542
- if not url: return None
586
+
587
+ def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int, extract_content: bool = False) -> Any:
588
+ """Synchronous screenshot. If extract_content=True, returns Dict else str."""
589
+ if not url:
590
+ return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
543
591
  tab = None
544
- capture_width = 1024 # Standard capture width
592
+ capture_width = 1440 # Higher resolution for readability
545
593
 
546
594
  try:
547
595
  self._ensure_ready()
@@ -562,84 +610,50 @@ class ScreenshotService:
562
610
  # Now navigate to the URL - page will render at target width
563
611
  tab.get(url)
564
612
 
565
- # Start monitoring network traffic
566
- try:
567
- # Listen for data packets (XHR/Fetch/POST)
568
- # Targets: xhr, fetch. POST usually falls under these or Document.
569
- tab.listen.start(targets=True) # Listen to everything for now to be safe
570
- except Exception as e:
571
- logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
613
+ # Network monitoring removed - not needed for simplified wait logic
572
614
 
573
615
  # Initialize crawl config for completeness checking (defined outside try for scope)
574
616
  crawl_config = CrawlConfig(
575
617
  scan_full_page=True,
576
618
  scroll_step=800,
577
619
  scroll_delay=0.5,
578
- scroll_timeout=min(timeout, 10),
579
- image_load_timeout=8.0,
620
+ scroll_timeout=20.0, # Increased for lazy-loading pages
621
+ image_load_timeout=3.0, # Image loading timeout: 3 seconds
580
622
  image_stability_checks=3,
581
623
  )
582
624
 
625
+ # === Start scrolling immediately to trigger lazy loading ===
626
+ # Scroll first, then wait for DOM - this allows lazy loading to start in parallel
627
+ logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
628
+ trigger_lazy_load(tab, crawl_config)
629
+
630
+ # Now wait for DOM to be ready (after scroll has triggered lazy loading)
583
631
  try:
584
632
  # Wait for full page load (including JS execution)
585
- tab.wait.load_complete(timeout=timeout)
633
+ try:
634
+ tab.wait.doc_loaded(timeout=timeout)
635
+ except:
636
+ pass
586
637
 
587
638
  # Wait for actual content to appear (for CDN verification pages)
588
- # Smart Wait Logic (Final Robust):
589
- # 1. Network Idle: Wait for silence in XHR/POST
590
- # 2. Stability: Wait for Height/Text/DOM stability
591
-
592
- time.sleep(1.5) # user request: force wait 1.5s before detection
639
+ # Simplified wait logic for screenshot: just check basic readiness
593
640
 
594
- last_h = 0
595
- last_text_len = 0
596
- last_html_len = 0
597
- stable_count = 0
598
-
599
- for i in range(200): # Max 200 iterations (~20s)
641
+ for i in range(20): # Max 20 iterations (~1s) - much faster
600
642
  try:
601
- # 1. Check Network Activity
602
- has_recent_network = False
603
- try:
604
- # Iterate over any captured packets since last check
605
- for packet in tab.listen.steps(timeout=0.01):
606
- # Check if it's a significant request (POST or XHR/Fetch)
607
- method = packet.method.upper()
608
- r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
609
-
610
- # Interested in: POST requests OR any XHR/Fetch response
611
- if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
612
- # Ignore some common noise? (Optional: analytics, tracking)
613
- # For now, simplistic approach: any API traffic resets stability
614
- has_recent_network = True
615
- # logger.debug(f"Network Activity: {method} {packet.url[:50]}")
616
- break
617
- except:
618
- pass
619
-
620
- # 2. Check DOM State
621
643
  state = tab.run_js('''
622
644
  return {
623
645
  ready: document.readyState === 'complete',
624
646
  title: document.title,
625
- height: Math.max(
626
- document.body.scrollHeight || 0,
627
- document.documentElement.scrollHeight || 0
628
- ),
629
- text: document.body.innerText.substring(0, 1000) || "",
630
- html_len: document.body.innerHTML.length || 0
647
+ text: document.body.innerText.substring(0, 500) || ""
631
648
  };
632
- ''') or {'ready': False, 'title': "", 'height': 0, 'text': "", 'html_len': 0}
649
+ ''') or {'ready': False, 'title': "", 'text': ""}
633
650
 
634
651
  is_ready = state.get('ready', False)
635
652
  title = state.get('title', "").lower()
636
- current_h = int(state.get('height', 0))
637
- text_content = state.get('text', "")
638
- text_len = len(text_content)
639
- html_len = int(state.get('html_len', 0))
640
- text_lower = text_content.lower()
653
+ text_lower = state.get('text', "").lower()
654
+ text_len = len(text_lower)
641
655
 
642
- # Blacklist check (Loading indicators)
656
+ # Check for verification pages
643
657
  is_verification = "checking your browser" in text_lower or \
644
658
  "just a moment" in text_lower or \
645
659
  "please wait" in text_lower or \
@@ -647,54 +661,19 @@ class ScreenshotService:
647
661
  "just a moment" in title or \
648
662
  "loading..." in title
649
663
 
650
- # Stability check (Multi-metric + Network)
651
- # We require STABILITY across Height, Text Length, DOM Size AND Network Silence
652
- is_height_stable = current_h == last_h
653
- is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
654
- is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
655
-
656
- if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
657
- stable_count += 1
658
- else:
659
- # Reset if ANY metric changed or NETWORK active
660
- stable_count = 0
661
- # if has_recent_network: logger.debug("Stability reset: Network Activity")
662
-
663
- # Conditions
664
- has_content = text_len > 100 # At least 100 real chars
665
-
666
- # Dynamic Stability Requirement:
667
- # If page looks like it's loading (small content), require longer stability
668
- if has_recent_network:
669
- # If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
670
- required_stability = max(10, 40 if text_len < 500 else 25)
671
- else:
672
- required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
673
-
674
- is_stable = stable_count >= required_stability
664
+ # Basic content check
665
+ has_content = text_len > 100
675
666
 
676
- # Pass if all conditions met
677
- if is_ready and not is_verification and has_content and is_stable:
667
+ # Pass if ready, not verification, and has content
668
+ if is_ready and not is_verification and has_content:
669
+ logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
678
670
  break
679
-
680
- last_h = current_h
681
- last_text_len = text_len
682
- last_html_len = html_len
683
-
684
- # Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
685
- try: time.sleep(0.05)
686
- except: pass
687
671
 
672
+ time.sleep(0.05)
688
673
  except Exception:
689
- stable_count = 0
690
- try: time.sleep(0.1)
691
- except: pass
674
+ time.sleep(0.05)
692
675
  continue
693
676
 
694
- # Cleanup listener
695
- try: tab.listen.stop()
696
- except: pass
697
-
698
677
  # DEBUG: Save HTML to inspect what happened (in data dir)
699
678
  try:
700
679
  import os
@@ -703,12 +682,9 @@ class ScreenshotService:
703
682
  f.write(f"<!-- URL: {url} -->\n")
704
683
  f.write(tab.html)
705
684
  except: pass
706
-
707
- # Use crawling module for lazy loading trigger (config defined above)
708
- trigger_lazy_load(tab, crawl_config)
709
-
710
- except:
711
- pass
685
+
686
+ except Exception as e:
687
+ logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
712
688
 
713
689
  # Scrollbar Hiding first (before any height calculation)
714
690
  from .manager import SharedBrowserManager
@@ -717,17 +693,165 @@ class ScreenshotService:
717
693
  # Scroll back to top
718
694
  tab.run_js("window.scrollTo(0, 0);")
719
695
 
720
- # Use CompletenessChecker to verify all images are loaded
721
- checker = CompletenessChecker(crawl_config)
722
- completeness = checker.wait_for_complete(tab, timeout=crawl_config.image_load_timeout)
723
-
724
- logger.info(
725
- f"ScreenshotService: Image completeness: "
726
- f"{completeness.loaded_images}/{completeness.total_images} loaded, "
727
- f"{completeness.failed_images} pending, "
728
- f"{completeness.placeholder_images} placeholders, "
729
- f"complete={completeness.is_complete}"
730
- )
696
+ # Image loading monitoring with time tracking - DISABLED
697
+ # No longer waiting for images to load
698
+ # Initialize image tracking JavaScript
699
+ # image_tracking_js = """
700
+ # (() => {
701
+ # if (!window._imageLoadTracker) {
702
+ # window._imageLoadTracker = {
703
+ # startTime: Date.now(),
704
+ # images: new Map()
705
+ # };
706
+ #
707
+ # const imgs = Array.from(document.querySelectorAll('img'));
708
+ # const minSize = 50;
709
+ #
710
+ # imgs.forEach((img, idx) => {
711
+ # if (img.clientWidth < minSize && img.clientHeight < minSize) return;
712
+ #
713
+ # const src = img.src || img.getAttribute('data-src') || '';
714
+ # const key = `${idx}_${src.substring(0, 100)}`;
715
+ #
716
+ # if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
717
+ # // Already loaded
718
+ # window._imageLoadTracker.images.set(key, {
719
+ # src: src.substring(0, 150),
720
+ # status: 'loaded',
721
+ # loadTime: 0, // Already loaded before tracking
722
+ # naturalSize: [img.naturalWidth, img.naturalHeight],
723
+ # displaySize: [img.clientWidth, img.clientHeight]
724
+ # });
725
+ # } else {
726
+ # // Track loading
727
+ # window._imageLoadTracker.images.set(key, {
728
+ # src: src.substring(0, 150),
729
+ # status: 'pending',
730
+ # startTime: Date.now(),
731
+ # naturalSize: [img.naturalWidth, img.naturalHeight],
732
+ # displaySize: [img.clientWidth, img.clientHeight]
733
+ # });
734
+ #
735
+ # // Add load event listener
736
+ # img.addEventListener('load', () => {
737
+ # const entry = window._imageLoadTracker.images.get(key);
738
+ # if (entry && entry.status === 'pending') {
739
+ # entry.status = 'loaded';
740
+ # entry.loadTime = Date.now() - entry.startTime;
741
+ # }
742
+ # });
743
+ #
744
+ # img.addEventListener('error', () => {
745
+ # const entry = window._imageLoadTracker.images.get(key);
746
+ # if (entry && entry.status === 'pending') {
747
+ # entry.status = 'failed';
748
+ # entry.loadTime = Date.now() - entry.startTime;
749
+ # }
750
+ # });
751
+ # }
752
+ # });
753
+ # }
754
+ #
755
+ # // Return current status
756
+ # const results = [];
757
+ # window._imageLoadTracker.images.forEach((value, key) => {
758
+ # const entry = {
759
+ # src: value.src,
760
+ # status: value.status,
761
+ # loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
762
+ # naturalSize: value.naturalSize,
763
+ # displaySize: value.displaySize
764
+ # };
765
+ # results.push(entry);
766
+ # });
767
+ #
768
+ # return {
769
+ # total: results.length,
770
+ # loaded: results.filter(r => r.status === 'loaded').length,
771
+ # pending: results.filter(r => r.status === 'pending').length,
772
+ # failed: results.filter(r => r.status === 'failed').length,
773
+ # details: results
774
+ # };
775
+ # })()
776
+ # """
777
+ #
778
+ # # Initialize tracking
779
+ # tab.run_js(image_tracking_js)
780
+ #
781
+ # # Monitor image loading with dynamic stop logic
782
+ # check_interval = 0.2 # Check every 200ms
783
+ # image_timeout = 3.0 # Image loading timeout: 3 seconds
784
+ # monitoring_start = time.time()
785
+ # loaded_times = [] # Track load times of completed images
786
+ #
787
+ # logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
788
+ #
789
+ # while True:
790
+ # elapsed = time.time() - monitoring_start
791
+ #
792
+ # # Check timeout first
793
+ # if elapsed >= image_timeout:
794
+ # logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
795
+ # break
796
+ #
797
+ # # Get current image status
798
+ # status = tab.run_js(image_tracking_js, as_expr=True) or {
799
+ # 'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
800
+ # }
801
+ #
802
+ # # Log each image's status and load time
803
+ # for img_detail in status.get('details', []):
804
+ # src_short = img_detail.get('src', '')[:80]
805
+ # status_str = img_detail.get('status', 'unknown')
806
+ # load_time = img_detail.get('loadTime', 0)
807
+ # logger.info(
808
+ # f"ScreenshotService: Image [{status_str}] "
809
+ # f"loadTime={load_time:.0f}ms "
810
+ # f"src={src_short}"
811
+ # )
812
+ #
813
+ # # Collect load times of completed images
814
+ # loaded_times = [
815
+ # img.get('loadTime', 0)
816
+ # for img in status.get('details', [])
817
+ # if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
818
+ # ]
819
+ #
820
+ # pending_count = status.get('pending', 0)
821
+ # loaded_count = status.get('loaded', 0)
822
+ #
823
+ # # Check stop conditions
824
+ # if pending_count == 0:
825
+ # logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
826
+ # break
827
+ #
828
+ # # Check dynamic stop condition (if we have loaded images to calculate average)
829
+ # if loaded_times:
830
+ # avg_load_time = sum(loaded_times) / len(loaded_times)
831
+ # max_wait_time = avg_load_time * 2
832
+ #
833
+ # # Check if any pending image has exceeded max wait time
834
+ # pending_images = [
835
+ # img for img in status.get('details', [])
836
+ # if img.get('status') == 'pending'
837
+ # ]
838
+ #
839
+ # should_stop = False
840
+ # for pending_img in pending_images:
841
+ # wait_time = pending_img.get('loadTime', 0)
842
+ # if wait_time >= max_wait_time:
843
+ # should_stop = True
844
+ # logger.info(
845
+ # f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
846
+ # f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
847
+ # )
848
+ # break
849
+ #
850
+ # if should_stop:
851
+ # break
852
+ #
853
+ # # Wait before next check
854
+ # time.sleep(check_interval)
731
855
 
732
856
  # Now calculate final height ONCE after all content loaded
733
857
  # CompletenessChecker already verified height stability
@@ -749,13 +873,42 @@ class ScreenshotService:
749
873
  # Final scroll to top
750
874
  tab.run_js("window.scrollTo(0, 0);")
751
875
 
752
- # Use full_page=False because we manually set the viewport to the full height
753
- # This avoids stitching artifacts and blank spaces
754
- return tab.get_screenshot(as_base64='jpg', full_page=False)
876
+ # Capture screenshot
877
+ screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
878
+
879
+ # Extract content if requested
880
+ if extract_content:
881
+ try:
882
+ html = tab.html
883
+ title = tab.title
884
+ final_url = tab.url
885
+
886
+ # Minimal trafilatura settings to reduce token consumption
887
+ content = trafilatura.extract(
888
+ html,
889
+ include_links=False, # No links to reduce tokens
890
+ include_images=False, # No image descriptions
891
+ include_comments=False, # No comments
892
+ include_tables=False, # No tables (can be verbose)
893
+ favor_precision=True, # Favor precision over recall
894
+ output_format="txt" # Plain text (no markdown formatting)
895
+ ) or ""
896
+
897
+ return {
898
+ "screenshot_b64": screenshot_b64,
899
+ "content": content,
900
+ "title": title,
901
+ "url": final_url
902
+ }
903
+ except Exception as e:
904
+ logger.warning(f"ScreenshotService: Content extraction failed: {e}")
905
+ return {"screenshot_b64": screenshot_b64, "content": "", "title": "", "url": url}
906
+
907
+ return screenshot_b64
755
908
 
756
909
  except Exception as e:
757
910
  logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
758
- return None
911
+ return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
759
912
  finally:
760
913
  if tab:
761
914
  try: tab.close()