entari-plugin-hyw 4.0.0rc10__py3-none-any.whl → 4.0.0rc12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- entari_plugin_hyw/__init__.py +191 -140
- entari_plugin_hyw/filters.py +83 -0
- entari_plugin_hyw/misc.py +42 -0
- {entari_plugin_hyw-4.0.0rc10.dist-info → entari_plugin_hyw-4.0.0rc12.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-4.0.0rc10.dist-info → entari_plugin_hyw-4.0.0rc12.dist-info}/RECORD +14 -12
- hyw_core/agent.py +648 -0
- hyw_core/browser_control/service.py +283 -130
- hyw_core/core.py +148 -8
- hyw_core/crawling/completeness.py +99 -10
- hyw_core/definitions.py +70 -52
- hyw_core/search.py +10 -0
- hyw_core/stages/summary.py +1 -3
- {entari_plugin_hyw-4.0.0rc10.dist-info → entari_plugin_hyw-4.0.0rc12.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-4.0.0rc10.dist-info → entari_plugin_hyw-4.0.0rc12.dist-info}/top_level.txt +0 -0
|
@@ -285,14 +285,27 @@ class ScreenshotService:
|
|
|
285
285
|
// 2. Check loading status using decode() AND heuristic for placeholders
|
|
286
286
|
// Some sites load a tiny blurred placeholder first.
|
|
287
287
|
const checks = visibleImgs.map(img => {
|
|
288
|
-
//
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
288
|
+
// Enhanced placeholder detection (matching completeness.py)
|
|
289
|
+
const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
|
|
290
|
+
img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
|
|
291
|
+
const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
|
|
292
|
+
const loadingAttr = img.getAttribute('loading') || '';
|
|
293
|
+
const src = img.src || '';
|
|
292
294
|
|
|
293
295
|
const isPlaceholder = (
|
|
294
|
-
|
|
295
|
-
(
|
|
296
|
+
// data-src not yet loaded
|
|
297
|
+
(dataSrc && img.src !== dataSrc) ||
|
|
298
|
+
// Natural size much smaller than display (blurred placeholder)
|
|
299
|
+
(img.naturalWidth < 50 && img.clientWidth > 100) ||
|
|
300
|
+
(img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
|
|
301
|
+
// Lazy-loading class indicators
|
|
302
|
+
className.includes('lazy') ||
|
|
303
|
+
className.includes('lazyload') ||
|
|
304
|
+
className.includes('lozad') ||
|
|
305
|
+
// CSS blur filter applied
|
|
306
|
+
(window.getComputedStyle(img).filter || '').includes('blur') ||
|
|
307
|
+
// loading="lazy" + not complete
|
|
308
|
+
(loadingAttr === 'lazy' && !img.complete)
|
|
296
309
|
);
|
|
297
310
|
|
|
298
311
|
if (isPlaceholder) {
|
|
@@ -529,19 +542,54 @@ class ScreenshotService:
|
|
|
529
542
|
return await asyncio.gather(*tasks, return_exceptions=True)
|
|
530
543
|
|
|
531
544
|
async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
|
|
532
|
-
"""Screenshot URL (Async wrapper for sync)."""
|
|
545
|
+
"""Screenshot URL (Async wrapper for sync). Returns base64 string only."""
|
|
533
546
|
loop = asyncio.get_running_loop()
|
|
534
|
-
|
|
547
|
+
result = await loop.run_in_executor(
|
|
548
|
+
self._executor,
|
|
549
|
+
self._screenshot_sync,
|
|
550
|
+
url, wait_load, timeout, full_page, quality, False # extract_content=False
|
|
551
|
+
)
|
|
552
|
+
# Backward compatible: return just the screenshot for old callers
|
|
553
|
+
if isinstance(result, dict):
|
|
554
|
+
return result.get("screenshot_b64")
|
|
555
|
+
return result
|
|
556
|
+
|
|
557
|
+
async def screenshot_with_content(self, url: str, timeout: float = 15.0, max_content_length: int = 8000) -> Dict[str, Any]:
|
|
558
|
+
"""
|
|
559
|
+
Screenshot URL and extract page content.
|
|
560
|
+
|
|
561
|
+
Returns:
|
|
562
|
+
Dict with:
|
|
563
|
+
- screenshot_b64: base64 encoded screenshot
|
|
564
|
+
- content: trafilatura extracted text (truncated to max_content_length)
|
|
565
|
+
- title: page title
|
|
566
|
+
- url: final URL
|
|
567
|
+
"""
|
|
568
|
+
loop = asyncio.get_running_loop()
|
|
569
|
+
result = await loop.run_in_executor(
|
|
535
570
|
self._executor,
|
|
536
571
|
self._screenshot_sync,
|
|
537
|
-
url,
|
|
572
|
+
url, True, timeout, False, 65, True # quality=65 for balance, extract_content=True
|
|
538
573
|
)
|
|
574
|
+
|
|
575
|
+
if not isinstance(result, dict):
|
|
576
|
+
return {"screenshot_b64": result, "content": "", "title": "", "url": url}
|
|
577
|
+
|
|
578
|
+
# Truncate content if needed
|
|
579
|
+
content = result.get("content", "") or ""
|
|
580
|
+
if len(content) > max_content_length:
|
|
581
|
+
content = content[:max_content_length] + "\n\n[内容已截断...]"
|
|
582
|
+
result["content"] = content
|
|
583
|
+
|
|
584
|
+
return result
|
|
539
585
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
586
|
+
|
|
587
|
+
def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int, extract_content: bool = False) -> Any:
|
|
588
|
+
"""Synchronous screenshot. If extract_content=True, returns Dict else str."""
|
|
589
|
+
if not url:
|
|
590
|
+
return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
|
|
543
591
|
tab = None
|
|
544
|
-
capture_width =
|
|
592
|
+
capture_width = 1440 # Higher resolution for readability
|
|
545
593
|
|
|
546
594
|
try:
|
|
547
595
|
self._ensure_ready()
|
|
@@ -562,84 +610,50 @@ class ScreenshotService:
|
|
|
562
610
|
# Now navigate to the URL - page will render at target width
|
|
563
611
|
tab.get(url)
|
|
564
612
|
|
|
565
|
-
#
|
|
566
|
-
try:
|
|
567
|
-
# Listen for data packets (XHR/Fetch/POST)
|
|
568
|
-
# Targets: xhr, fetch. POST usually falls under these or Document.
|
|
569
|
-
tab.listen.start(targets=True) # Listen to everything for now to be safe
|
|
570
|
-
except Exception as e:
|
|
571
|
-
logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
|
|
613
|
+
# Network monitoring removed - not needed for simplified wait logic
|
|
572
614
|
|
|
573
615
|
# Initialize crawl config for completeness checking (defined outside try for scope)
|
|
574
616
|
crawl_config = CrawlConfig(
|
|
575
617
|
scan_full_page=True,
|
|
576
618
|
scroll_step=800,
|
|
577
619
|
scroll_delay=0.5,
|
|
578
|
-
scroll_timeout=
|
|
579
|
-
image_load_timeout=
|
|
620
|
+
scroll_timeout=20.0, # Increased for lazy-loading pages
|
|
621
|
+
image_load_timeout=3.0, # Image loading timeout: 3 seconds
|
|
580
622
|
image_stability_checks=3,
|
|
581
623
|
)
|
|
582
624
|
|
|
625
|
+
# === Start scrolling immediately to trigger lazy loading ===
|
|
626
|
+
# Scroll first, then wait for DOM - this allows lazy loading to start in parallel
|
|
627
|
+
logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
|
|
628
|
+
trigger_lazy_load(tab, crawl_config)
|
|
629
|
+
|
|
630
|
+
# Now wait for DOM to be ready (after scroll has triggered lazy loading)
|
|
583
631
|
try:
|
|
584
632
|
# Wait for full page load (including JS execution)
|
|
585
|
-
|
|
633
|
+
try:
|
|
634
|
+
tab.wait.doc_loaded(timeout=timeout)
|
|
635
|
+
except:
|
|
636
|
+
pass
|
|
586
637
|
|
|
587
638
|
# Wait for actual content to appear (for CDN verification pages)
|
|
588
|
-
#
|
|
589
|
-
# 1. Network Idle: Wait for silence in XHR/POST
|
|
590
|
-
# 2. Stability: Wait for Height/Text/DOM stability
|
|
591
|
-
|
|
592
|
-
time.sleep(1.5) # user request: force wait 1.5s before detection
|
|
639
|
+
# Simplified wait logic for screenshot: just check basic readiness
|
|
593
640
|
|
|
594
|
-
|
|
595
|
-
last_text_len = 0
|
|
596
|
-
last_html_len = 0
|
|
597
|
-
stable_count = 0
|
|
598
|
-
|
|
599
|
-
for i in range(200): # Max 200 iterations (~20s)
|
|
641
|
+
for i in range(20): # Max 20 iterations (~1s) - much faster
|
|
600
642
|
try:
|
|
601
|
-
# 1. Check Network Activity
|
|
602
|
-
has_recent_network = False
|
|
603
|
-
try:
|
|
604
|
-
# Iterate over any captured packets since last check
|
|
605
|
-
for packet in tab.listen.steps(timeout=0.01):
|
|
606
|
-
# Check if it's a significant request (POST or XHR/Fetch)
|
|
607
|
-
method = packet.method.upper()
|
|
608
|
-
r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
|
|
609
|
-
|
|
610
|
-
# Interested in: POST requests OR any XHR/Fetch response
|
|
611
|
-
if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
|
|
612
|
-
# Ignore some common noise? (Optional: analytics, tracking)
|
|
613
|
-
# For now, simplistic approach: any API traffic resets stability
|
|
614
|
-
has_recent_network = True
|
|
615
|
-
# logger.debug(f"Network Activity: {method} {packet.url[:50]}")
|
|
616
|
-
break
|
|
617
|
-
except:
|
|
618
|
-
pass
|
|
619
|
-
|
|
620
|
-
# 2. Check DOM State
|
|
621
643
|
state = tab.run_js('''
|
|
622
644
|
return {
|
|
623
645
|
ready: document.readyState === 'complete',
|
|
624
646
|
title: document.title,
|
|
625
|
-
|
|
626
|
-
document.body.scrollHeight || 0,
|
|
627
|
-
document.documentElement.scrollHeight || 0
|
|
628
|
-
),
|
|
629
|
-
text: document.body.innerText.substring(0, 1000) || "",
|
|
630
|
-
html_len: document.body.innerHTML.length || 0
|
|
647
|
+
text: document.body.innerText.substring(0, 500) || ""
|
|
631
648
|
};
|
|
632
|
-
''') or {'ready': False, 'title': "", '
|
|
649
|
+
''') or {'ready': False, 'title': "", 'text': ""}
|
|
633
650
|
|
|
634
651
|
is_ready = state.get('ready', False)
|
|
635
652
|
title = state.get('title', "").lower()
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
text_len = len(text_content)
|
|
639
|
-
html_len = int(state.get('html_len', 0))
|
|
640
|
-
text_lower = text_content.lower()
|
|
653
|
+
text_lower = state.get('text', "").lower()
|
|
654
|
+
text_len = len(text_lower)
|
|
641
655
|
|
|
642
|
-
#
|
|
656
|
+
# Check for verification pages
|
|
643
657
|
is_verification = "checking your browser" in text_lower or \
|
|
644
658
|
"just a moment" in text_lower or \
|
|
645
659
|
"please wait" in text_lower or \
|
|
@@ -647,54 +661,19 @@ class ScreenshotService:
|
|
|
647
661
|
"just a moment" in title or \
|
|
648
662
|
"loading..." in title
|
|
649
663
|
|
|
650
|
-
#
|
|
651
|
-
|
|
652
|
-
is_height_stable = current_h == last_h
|
|
653
|
-
is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
|
|
654
|
-
is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
|
|
655
|
-
|
|
656
|
-
if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
|
|
657
|
-
stable_count += 1
|
|
658
|
-
else:
|
|
659
|
-
# Reset if ANY metric changed or NETWORK active
|
|
660
|
-
stable_count = 0
|
|
661
|
-
# if has_recent_network: logger.debug("Stability reset: Network Activity")
|
|
662
|
-
|
|
663
|
-
# Conditions
|
|
664
|
-
has_content = text_len > 100 # At least 100 real chars
|
|
665
|
-
|
|
666
|
-
# Dynamic Stability Requirement:
|
|
667
|
-
# If page looks like it's loading (small content), require longer stability
|
|
668
|
-
if has_recent_network:
|
|
669
|
-
# If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
|
|
670
|
-
required_stability = max(10, 40 if text_len < 500 else 25)
|
|
671
|
-
else:
|
|
672
|
-
required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
|
|
673
|
-
|
|
674
|
-
is_stable = stable_count >= required_stability
|
|
664
|
+
# Basic content check
|
|
665
|
+
has_content = text_len > 100
|
|
675
666
|
|
|
676
|
-
# Pass if
|
|
677
|
-
if is_ready and not is_verification and has_content
|
|
667
|
+
# Pass if ready, not verification, and has content
|
|
668
|
+
if is_ready and not is_verification and has_content:
|
|
669
|
+
logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
|
|
678
670
|
break
|
|
679
|
-
|
|
680
|
-
last_h = current_h
|
|
681
|
-
last_text_len = text_len
|
|
682
|
-
last_html_len = html_len
|
|
683
|
-
|
|
684
|
-
# Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
|
|
685
|
-
try: time.sleep(0.05)
|
|
686
|
-
except: pass
|
|
687
671
|
|
|
672
|
+
time.sleep(0.05)
|
|
688
673
|
except Exception:
|
|
689
|
-
|
|
690
|
-
try: time.sleep(0.1)
|
|
691
|
-
except: pass
|
|
674
|
+
time.sleep(0.05)
|
|
692
675
|
continue
|
|
693
676
|
|
|
694
|
-
# Cleanup listener
|
|
695
|
-
try: tab.listen.stop()
|
|
696
|
-
except: pass
|
|
697
|
-
|
|
698
677
|
# DEBUG: Save HTML to inspect what happened (in data dir)
|
|
699
678
|
try:
|
|
700
679
|
import os
|
|
@@ -703,12 +682,9 @@ class ScreenshotService:
|
|
|
703
682
|
f.write(f"<!-- URL: {url} -->\n")
|
|
704
683
|
f.write(tab.html)
|
|
705
684
|
except: pass
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
except:
|
|
711
|
-
pass
|
|
685
|
+
|
|
686
|
+
except Exception as e:
|
|
687
|
+
logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
|
|
712
688
|
|
|
713
689
|
# Scrollbar Hiding first (before any height calculation)
|
|
714
690
|
from .manager import SharedBrowserManager
|
|
@@ -717,17 +693,165 @@ class ScreenshotService:
|
|
|
717
693
|
# Scroll back to top
|
|
718
694
|
tab.run_js("window.scrollTo(0, 0);")
|
|
719
695
|
|
|
720
|
-
#
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
696
|
+
# Image loading monitoring with time tracking - DISABLED
|
|
697
|
+
# No longer waiting for images to load
|
|
698
|
+
# Initialize image tracking JavaScript
|
|
699
|
+
# image_tracking_js = """
|
|
700
|
+
# (() => {
|
|
701
|
+
# if (!window._imageLoadTracker) {
|
|
702
|
+
# window._imageLoadTracker = {
|
|
703
|
+
# startTime: Date.now(),
|
|
704
|
+
# images: new Map()
|
|
705
|
+
# };
|
|
706
|
+
#
|
|
707
|
+
# const imgs = Array.from(document.querySelectorAll('img'));
|
|
708
|
+
# const minSize = 50;
|
|
709
|
+
#
|
|
710
|
+
# imgs.forEach((img, idx) => {
|
|
711
|
+
# if (img.clientWidth < minSize && img.clientHeight < minSize) return;
|
|
712
|
+
#
|
|
713
|
+
# const src = img.src || img.getAttribute('data-src') || '';
|
|
714
|
+
# const key = `${idx}_${src.substring(0, 100)}`;
|
|
715
|
+
#
|
|
716
|
+
# if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
|
|
717
|
+
# // Already loaded
|
|
718
|
+
# window._imageLoadTracker.images.set(key, {
|
|
719
|
+
# src: src.substring(0, 150),
|
|
720
|
+
# status: 'loaded',
|
|
721
|
+
# loadTime: 0, // Already loaded before tracking
|
|
722
|
+
# naturalSize: [img.naturalWidth, img.naturalHeight],
|
|
723
|
+
# displaySize: [img.clientWidth, img.clientHeight]
|
|
724
|
+
# });
|
|
725
|
+
# } else {
|
|
726
|
+
# // Track loading
|
|
727
|
+
# window._imageLoadTracker.images.set(key, {
|
|
728
|
+
# src: src.substring(0, 150),
|
|
729
|
+
# status: 'pending',
|
|
730
|
+
# startTime: Date.now(),
|
|
731
|
+
# naturalSize: [img.naturalWidth, img.naturalHeight],
|
|
732
|
+
# displaySize: [img.clientWidth, img.clientHeight]
|
|
733
|
+
# });
|
|
734
|
+
#
|
|
735
|
+
# // Add load event listener
|
|
736
|
+
# img.addEventListener('load', () => {
|
|
737
|
+
# const entry = window._imageLoadTracker.images.get(key);
|
|
738
|
+
# if (entry && entry.status === 'pending') {
|
|
739
|
+
# entry.status = 'loaded';
|
|
740
|
+
# entry.loadTime = Date.now() - entry.startTime;
|
|
741
|
+
# }
|
|
742
|
+
# });
|
|
743
|
+
#
|
|
744
|
+
# img.addEventListener('error', () => {
|
|
745
|
+
# const entry = window._imageLoadTracker.images.get(key);
|
|
746
|
+
# if (entry && entry.status === 'pending') {
|
|
747
|
+
# entry.status = 'failed';
|
|
748
|
+
# entry.loadTime = Date.now() - entry.startTime;
|
|
749
|
+
# }
|
|
750
|
+
# });
|
|
751
|
+
# }
|
|
752
|
+
# });
|
|
753
|
+
# }
|
|
754
|
+
#
|
|
755
|
+
# // Return current status
|
|
756
|
+
# const results = [];
|
|
757
|
+
# window._imageLoadTracker.images.forEach((value, key) => {
|
|
758
|
+
# const entry = {
|
|
759
|
+
# src: value.src,
|
|
760
|
+
# status: value.status,
|
|
761
|
+
# loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
|
|
762
|
+
# naturalSize: value.naturalSize,
|
|
763
|
+
# displaySize: value.displaySize
|
|
764
|
+
# };
|
|
765
|
+
# results.push(entry);
|
|
766
|
+
# });
|
|
767
|
+
#
|
|
768
|
+
# return {
|
|
769
|
+
# total: results.length,
|
|
770
|
+
# loaded: results.filter(r => r.status === 'loaded').length,
|
|
771
|
+
# pending: results.filter(r => r.status === 'pending').length,
|
|
772
|
+
# failed: results.filter(r => r.status === 'failed').length,
|
|
773
|
+
# details: results
|
|
774
|
+
# };
|
|
775
|
+
# })()
|
|
776
|
+
# """
|
|
777
|
+
#
|
|
778
|
+
# # Initialize tracking
|
|
779
|
+
# tab.run_js(image_tracking_js)
|
|
780
|
+
#
|
|
781
|
+
# # Monitor image loading with dynamic stop logic
|
|
782
|
+
# check_interval = 0.2 # Check every 200ms
|
|
783
|
+
# image_timeout = 3.0 # Image loading timeout: 3 seconds
|
|
784
|
+
# monitoring_start = time.time()
|
|
785
|
+
# loaded_times = [] # Track load times of completed images
|
|
786
|
+
#
|
|
787
|
+
# logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
|
|
788
|
+
#
|
|
789
|
+
# while True:
|
|
790
|
+
# elapsed = time.time() - monitoring_start
|
|
791
|
+
#
|
|
792
|
+
# # Check timeout first
|
|
793
|
+
# if elapsed >= image_timeout:
|
|
794
|
+
# logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
|
|
795
|
+
# break
|
|
796
|
+
#
|
|
797
|
+
# # Get current image status
|
|
798
|
+
# status = tab.run_js(image_tracking_js, as_expr=True) or {
|
|
799
|
+
# 'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
|
|
800
|
+
# }
|
|
801
|
+
#
|
|
802
|
+
# # Log each image's status and load time
|
|
803
|
+
# for img_detail in status.get('details', []):
|
|
804
|
+
# src_short = img_detail.get('src', '')[:80]
|
|
805
|
+
# status_str = img_detail.get('status', 'unknown')
|
|
806
|
+
# load_time = img_detail.get('loadTime', 0)
|
|
807
|
+
# logger.info(
|
|
808
|
+
# f"ScreenshotService: Image [{status_str}] "
|
|
809
|
+
# f"loadTime={load_time:.0f}ms "
|
|
810
|
+
# f"src={src_short}"
|
|
811
|
+
# )
|
|
812
|
+
#
|
|
813
|
+
# # Collect load times of completed images
|
|
814
|
+
# loaded_times = [
|
|
815
|
+
# img.get('loadTime', 0)
|
|
816
|
+
# for img in status.get('details', [])
|
|
817
|
+
# if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
|
|
818
|
+
# ]
|
|
819
|
+
#
|
|
820
|
+
# pending_count = status.get('pending', 0)
|
|
821
|
+
# loaded_count = status.get('loaded', 0)
|
|
822
|
+
#
|
|
823
|
+
# # Check stop conditions
|
|
824
|
+
# if pending_count == 0:
|
|
825
|
+
# logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
|
|
826
|
+
# break
|
|
827
|
+
#
|
|
828
|
+
# # Check dynamic stop condition (if we have loaded images to calculate average)
|
|
829
|
+
# if loaded_times:
|
|
830
|
+
# avg_load_time = sum(loaded_times) / len(loaded_times)
|
|
831
|
+
# max_wait_time = avg_load_time * 2
|
|
832
|
+
#
|
|
833
|
+
# # Check if any pending image has exceeded max wait time
|
|
834
|
+
# pending_images = [
|
|
835
|
+
# img for img in status.get('details', [])
|
|
836
|
+
# if img.get('status') == 'pending'
|
|
837
|
+
# ]
|
|
838
|
+
#
|
|
839
|
+
# should_stop = False
|
|
840
|
+
# for pending_img in pending_images:
|
|
841
|
+
# wait_time = pending_img.get('loadTime', 0)
|
|
842
|
+
# if wait_time >= max_wait_time:
|
|
843
|
+
# should_stop = True
|
|
844
|
+
# logger.info(
|
|
845
|
+
# f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
|
|
846
|
+
# f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
|
|
847
|
+
# )
|
|
848
|
+
# break
|
|
849
|
+
#
|
|
850
|
+
# if should_stop:
|
|
851
|
+
# break
|
|
852
|
+
#
|
|
853
|
+
# # Wait before next check
|
|
854
|
+
# time.sleep(check_interval)
|
|
731
855
|
|
|
732
856
|
# Now calculate final height ONCE after all content loaded
|
|
733
857
|
# CompletenessChecker already verified height stability
|
|
@@ -749,13 +873,42 @@ class ScreenshotService:
|
|
|
749
873
|
# Final scroll to top
|
|
750
874
|
tab.run_js("window.scrollTo(0, 0);")
|
|
751
875
|
|
|
752
|
-
#
|
|
753
|
-
|
|
754
|
-
|
|
876
|
+
# Capture screenshot
|
|
877
|
+
screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
|
|
878
|
+
|
|
879
|
+
# Extract content if requested
|
|
880
|
+
if extract_content:
|
|
881
|
+
try:
|
|
882
|
+
html = tab.html
|
|
883
|
+
title = tab.title
|
|
884
|
+
final_url = tab.url
|
|
885
|
+
|
|
886
|
+
# Minimal trafilatura settings to reduce token consumption
|
|
887
|
+
content = trafilatura.extract(
|
|
888
|
+
html,
|
|
889
|
+
include_links=False, # No links to reduce tokens
|
|
890
|
+
include_images=False, # No image descriptions
|
|
891
|
+
include_comments=False, # No comments
|
|
892
|
+
include_tables=False, # No tables (can be verbose)
|
|
893
|
+
favor_precision=True, # Favor precision over recall
|
|
894
|
+
output_format="txt" # Plain text (no markdown formatting)
|
|
895
|
+
) or ""
|
|
896
|
+
|
|
897
|
+
return {
|
|
898
|
+
"screenshot_b64": screenshot_b64,
|
|
899
|
+
"content": content,
|
|
900
|
+
"title": title,
|
|
901
|
+
"url": final_url
|
|
902
|
+
}
|
|
903
|
+
except Exception as e:
|
|
904
|
+
logger.warning(f"ScreenshotService: Content extraction failed: {e}")
|
|
905
|
+
return {"screenshot_b64": screenshot_b64, "content": "", "title": "", "url": url}
|
|
906
|
+
|
|
907
|
+
return screenshot_b64
|
|
755
908
|
|
|
756
909
|
except Exception as e:
|
|
757
910
|
logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
|
|
758
|
-
return None
|
|
911
|
+
return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
|
|
759
912
|
finally:
|
|
760
913
|
if tab:
|
|
761
914
|
try: tab.close()
|