entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +62 -34
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/RECORD +15 -13
- hyw_core/browser_control/__init__.py +1 -3
- hyw_core/browser_control/assets/card-dist/index.html +48 -32
- hyw_core/browser_control/engines/__init__.py +0 -2
- hyw_core/browser_control/manager.py +18 -5
- hyw_core/browser_control/renderer.py +36 -6
- hyw_core/browser_control/service.py +53 -39
- hyw_core/crawling/__init__.py +18 -0
- hyw_core/crawling/completeness.py +348 -0
- hyw_core/crawling/models.py +88 -0
- hyw_core/search.py +4 -6
- hyw_core/browser_control/engines/google.py +0 -155
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc10.dist-info}/top_level.txt +0 -0
--- a/hyw_core/browser_control/engines/__init__.py
+++ b/hyw_core/browser_control/engines/__init__.py
@@ -5,13 +5,11 @@ Provides search engine adapters for different search providers.
 """

 from .base import SearchEngine
-from .google import GoogleEngine
 from .duckduckgo import DuckDuckGoEngine
 from .default import DefaultEngine

 __all__ = [
     "SearchEngine",
-    "GoogleEngine",
     "DuckDuckGoEngine",
     "DefaultEngine",
 ]
--- a/hyw_core/browser_control/manager.py
+++ b/hyw_core/browser_control/manager.py
@@ -124,15 +124,28 @@ class SharedBrowserManager:
     @staticmethod
     def hide_scrollbars(page: ChromiumPage):
         """
-        Robustly hide scrollbars using CDP commands.
-        This
+        Robustly hide scrollbars using CDP commands AND CSS injection.
+        This provides double protection against scrollbar gutters.
         """
         try:
-            #
+            # 1. CDP Command
             page.run_cdp('Emulation.setScrollbarsHidden', hidden=True)
-
+
+            # 2. CSS Injection (Standard + Webkit)
+            css = """
+                ::-webkit-scrollbar { display: none !important; width: 0 !important; height: 0 !important; }
+                * { -ms-overflow-style: none !important; scrollbar-width: none !important; }
+            """
+            # Inject into current page
+            page.run_js(f"""
+                const style = document.createElement('style');
+                style.textContent = `{css}`;
+                document.head.appendChild(style);
+            """)
+
+            logger.debug("SharedBrowserManager: Scrollbars hidden via CDP + CSS.")
         except Exception as e:
-            logger.warning(f"SharedBrowserManager: Failed to hide scrollbars
+            logger.warning(f"SharedBrowserManager: Failed to hide scrollbars: {e}")


 # Module-level singleton accessor
--- a/hyw_core/browser_control/renderer.py
+++ b/hyw_core/browser_control/renderer.py
@@ -55,16 +55,43 @@ class ContentRenderer:
         loop = asyncio.get_running_loop()
         return await loop.run_in_executor(self._executor, self._prepare_tab_sync)

-    def _wait_for_render_finished(self, tab, timeout: float =
+    def _wait_for_render_finished(self, tab, timeout: float = 12.0, context: str = ""):
         """Wait for window.RENDER_FINISHED to be true in the tab."""
         import time as pytime
         start = pytime.time()
+
+        # Check initial state
+        initial_state = tab.run_js("return window.RENDER_FINISHED")
+        logger.debug(f"ContentRenderer[{context}]: Starting wait, initial RENDER_FINISHED={initial_state}")
+
+        # If already true, it's stale from previous render - need to wait for JS to reset it
+        if initial_state:
+            logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED was true, waiting for reset...")
+            # Wait for JS to reset it to false (updateRenderData sets it to false)
+            reset_start = pytime.time()
+            while pytime.time() - reset_start < 1.0:  # 1s max to wait for reset
+                is_reset = tab.run_js("return window.RENDER_FINISHED")
+                if not is_reset:
+                    logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED reset to false")
+                    break
+                pytime.sleep(0.05)
+            else:
+                logger.warning(f"ContentRenderer[{context}]: RENDER_FINISHED not reset, force resetting via JS")
+                tab.run_js("window.RENDER_FINISHED = false")
+
+        # Now wait for it to become true
+        poll_count = 0
         while pytime.time() - start < timeout:
             is_finished = tab.run_js("return window.RENDER_FINISHED")
+            poll_count += 1
             if is_finished:
+                elapsed = pytime.time() - start
+                logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED=true after {elapsed:.2f}s ({poll_count} polls)")
                 return True
-            pytime.sleep(0.
-
+            pytime.sleep(0.1)  # Poll every 100ms
+
+        elapsed = pytime.time() - start
+        logger.warning(f"ContentRenderer[{context}]: Wait for RENDER_FINISHED timed out after {elapsed:.2f}s ({poll_count} polls)")
         return False

     def _prepare_tab_sync(self) -> str:
@@ -89,8 +116,9 @@ class ContentRenderer:
             "theme_color": "#ef4444",
         }

+        logger.debug(f"ContentRenderer: Calling warmup updateRenderData for tab {tab_id}")
         tab.run_js(f"window.updateRenderData({json.dumps(warmup_data)})")
-        self._wait_for_render_finished(tab, timeout=
+        self._wait_for_render_finished(tab, timeout=12.0, context=f"warmup:{tab_id}")

         # Wait for main-container after warmup (Vue needs to render it)
         tab.ele('#main-container', timeout=3)
@@ -203,7 +231,7 @@ class ContentRenderer:

         # 1. Update Data & Wait for Finished flag
         tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
-        self._wait_for_render_finished(tab)
+        self._wait_for_render_finished(tab, context=f"batch:{tab_id}")

         # 2. Dynamic Resize
         # Get actual content height to prevent clipping
@@ -330,10 +358,12 @@ class ContentRenderer:
             "theme_color": theme_color,
         }

+        actual_tab_id = getattr(tab, 'tab_id', 'unknown')
+        logger.info(f"ContentRenderer: Calling updateRenderData for tab {actual_tab_id}, markdown length={len(markdown_content)}")
         tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")

         # Wait for event-driven finish
-        self._wait_for_render_finished(tab, timeout=
+        self._wait_for_render_finished(tab, timeout=12.0, context=f"render:{actual_tab_id}")

         # Dynamic Resize
         scroll_height = tab.run_js('return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);')
--- a/hyw_core/browser_control/service.py
+++ b/hyw_core/browser_control/service.py
@@ -13,6 +13,10 @@ from typing import Optional, Dict, Any, List
 from loguru import logger
 import trafilatura

+# Import intelligent completeness checker
+from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
+from ..crawling.models import CrawlConfig
+
 class ScreenshotService:
     """
     Browser Service using DrissionPage.
@@ -537,12 +541,26 @@ class ScreenshotService:
         """Synchronous screenshot."""
         if not url: return None
         tab = None
+        capture_width = 1024  # Standard capture width
+
         try:
             self._ensure_ready()
             page = self._manager.page
             if not page: return None

-            tab
+            # Create blank tab first
+            tab = page.new_tab()
+
+            # Set viewport BEFORE navigation so page renders at target width from the start
+            # This eliminates the need for post-load resize and reflow
+            try:
+                tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                            width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
+            except:
+                pass
+
+            # Now navigate to the URL - page will render at target width
+            tab.get(url)

             # Start monitoring network traffic
             try:
@@ -552,6 +570,16 @@ class ScreenshotService:
             except Exception as e:
                 logger.warning(f"ScreenshotService: Failed to start network listener: {e}")

+            # Initialize crawl config for completeness checking (defined outside try for scope)
+            crawl_config = CrawlConfig(
+                scan_full_page=True,
+                scroll_step=800,
+                scroll_delay=0.5,
+                scroll_timeout=min(timeout, 10),
+                image_load_timeout=8.0,
+                image_stability_checks=3,
+            )
+
             try:
                 # Wait for full page load (including JS execution)
                 tab.wait.load_complete(timeout=timeout)
@@ -676,49 +704,33 @@ class ScreenshotService:
                         f.write(tab.html)
                 except: pass

-                # Use
-
+                # Use crawling module for lazy loading trigger (config defined above)
+                trigger_lazy_load(tab, crawl_config)

             except:
                 pass

-            #
-
-            capture_width = 1024
-
-            # Calculate actual content height after lazy loading
-            try:
-                # Use a robust height calculation
-                content_height = tab.run_js('''
-                    return Math.max(
-                        document.body.scrollHeight || 0,
-                        document.documentElement.scrollHeight || 0,
-                        document.body.offsetHeight || 0,
-                        document.documentElement.offsetHeight || 0,
-                        document.documentElement.clientHeight || 0
-                    );
-                ''')
-                # Add a small buffer and cap at 15000px to prevent memory issues
-                h = min(int(content_height) + 50, 15000)
-            except:
-                h = 1000  # Fallback
-
-            # Set viewport to full content size for single-shot capture
-            try:
-                tab.run_cdp('Emulation.setDeviceMetricsOverride',
-                    width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
-            except:
-                pass
-
-            # Scrollbar Hiding
+            # Scrollbar Hiding first (before any height calculation)
             from .manager import SharedBrowserManager
             SharedBrowserManager.hide_scrollbars(tab)

-            # Scroll back to top
+            # Scroll back to top
             tab.run_js("window.scrollTo(0, 0);")

-            #
-
+            # Use CompletenessChecker to verify all images are loaded
+            checker = CompletenessChecker(crawl_config)
+            completeness = checker.wait_for_complete(tab, timeout=crawl_config.image_load_timeout)
+
+            logger.info(
+                f"ScreenshotService: Image completeness: "
+                f"{completeness.loaded_images}/{completeness.total_images} loaded, "
+                f"{completeness.failed_images} pending, "
+                f"{completeness.placeholder_images} placeholders, "
+                f"complete={completeness.is_complete}"
+            )
+
+            # Now calculate final height ONCE after all content loaded
+            # CompletenessChecker already verified height stability
             try:
                 final_height = tab.run_js('''
                     return Math.max(
@@ -728,13 +740,15 @@
                         document.documentElement.offsetHeight || 0
                     );
                 ''')
-
-
-
-                    width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
+                h = min(int(final_height) + 50, 15000)
+                tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                            width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
             except:
                 pass

+            # Final scroll to top
+            tab.run_js("window.scrollTo(0, 0);")
+
             # Use full_page=False because we manually set the viewport to the full height
             # This avoids stitching artifacts and blank spaces
             return tab.get_screenshot(as_base64='jpg', full_page=False)
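Taken together, the service.py hunks reorder the synchronous screenshot path: the viewport is sized before navigation, lazy-load scrolling and scrollbar hiding happen before any height measurement, and the content height is measured exactly once after the completeness check. Below is a condensed sketch of the new sequence under the assumptions in the hunks above; it is not the actual method, and the per-step try/except blocks, HTML dump, and network listener are omitted.

# Hypothetical condensed flow; names and values follow the hunks above.
tab = page.new_tab()
tab.run_cdp('Emulation.setDeviceMetricsOverride',
            width=1024, height=900, deviceScaleFactor=1, mobile=False)   # size before navigation
tab.get(url)
tab.wait.load_complete(timeout=timeout)

crawl_config = CrawlConfig(scan_full_page=True, scroll_step=800, scroll_delay=0.5,
                           scroll_timeout=min(timeout, 10), image_load_timeout=8.0,
                           image_stability_checks=3)
trigger_lazy_load(tab, crawl_config)                   # scroll so lazy images start loading
SharedBrowserManager.hide_scrollbars(tab)              # CDP + CSS, before measuring height
tab.run_js("window.scrollTo(0, 0);")

completeness = CompletenessChecker(crawl_config).wait_for_complete(
    tab, timeout=crawl_config.image_load_timeout)      # block until images settle or timeout

final_height = tab.run_js('return document.documentElement.scrollHeight;')  # simplified; real code uses Math.max(...)
tab.run_cdp('Emulation.setDeviceMetricsOverride',
            width=1024, height=min(int(final_height) + 50, 15000),
            deviceScaleFactor=1, mobile=False)          # single resize to full content height
tab.run_js("window.scrollTo(0, 0);")
image = tab.get_screenshot(as_base64='jpg', full_page=False)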
--- /dev/null
+++ b/hyw_core/crawling/__init__.py
@@ -0,0 +1,18 @@
+"""
+hyw_core.crawling - Intelligent Web Crawling Module
+
+Provides Crawl4AI-inspired adaptive crawling with:
+- Page completeness guarantees (image loading verification)
+- Content quality scoring
+- Adaptive stop logic
+"""
+
+from .models import CrawlConfig, PageResult, CompletenessResult
+from .completeness import CompletenessChecker
+
+__all__ = [
+    "CrawlConfig",
+    "PageResult",
+    "CompletenessResult",
+    "CompletenessChecker",
+]
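The package docstring above lists what the module is meant to provide; a minimal usage sketch of the exported API follows, mirroring how service.py wires it together (it assumes `tab` is a DrissionPage tab object).

from hyw_core.crawling import CrawlConfig, CompletenessChecker
from hyw_core.crawling.completeness import trigger_lazy_load

config = CrawlConfig(scan_full_page=True, image_load_timeout=8.0)
trigger_lazy_load(tab, config)                         # scroll the page so lazy images start loading
result = CompletenessChecker(config).wait_for_complete(tab)
if not result.is_complete:
    # timed out: the result still reports loaded / pending / placeholder counts
    print(result.loaded_images, result.failed_images, result.placeholder_images)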
--- /dev/null
+++ b/hyw_core/crawling/completeness.py
@@ -0,0 +1,348 @@
+"""
+Page Completeness Checker
+
+Multi-signal page readiness detection with focus on image loading guarantees.
+Implements Crawl4AI's wait_for_images and networkidle concepts.
+"""
+
+import time
+from typing import Any, Optional, Dict
+from dataclasses import dataclass
+from loguru import logger
+
+from .models import CrawlConfig, CompletenessResult
+
+
+# CSP-compliant JavaScript for image loading verification
+# Based on Crawl4AI's wait_for_images logic
+IMAGE_CHECK_JS = """
+(() => {
+    const results = {
+        total: 0,
+        loaded: 0,
+        failed: 0,
+        placeholders: 0,
+        details: []
+    };
+
+    const imgs = Array.from(document.querySelectorAll('img'));
+    const viewportHeight = window.innerHeight;
+    const minSize = %d; // Injected from Python
+
+    for (const img of imgs) {
+        // Skip tiny images (icons, tracking pixels)
+        if (img.clientWidth < minSize && img.clientHeight < minSize) {
+            continue;
+        }
+
+        // Skip images outside viewport (unless scan_full_page)
+        const rect = img.getBoundingClientRect();
+        const inViewport = rect.top < viewportHeight * 2 && rect.bottom > -viewportHeight;
+
+        if (!inViewport && !%s) { // scan_full_page injected
+            continue;
+        }
+
+        results.total++;
+
+        const src = img.src || img.getAttribute('data-src') || '';
+        const isPlaceholder = (
+            (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
+            (img.naturalWidth < 50 && img.clientWidth > 100) ||
+            src.includes('placeholder') ||
+            src.includes('loading') ||
+            src.startsWith('data:image/svg+xml') ||
+            (img.naturalWidth === 1 && img.naturalHeight === 1)
+        );
+
+        if (isPlaceholder) {
+            results.placeholders++;
+            results.details.push({
+                src: src.substring(0, 100),
+                status: 'placeholder',
+                natural: [img.naturalWidth, img.naturalHeight],
+                display: [img.clientWidth, img.clientHeight]
+            });
+            continue;
+        }
+
+        // Check if fully loaded
+        if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
+            results.loaded++;
+            results.details.push({
+                src: src.substring(0, 100),
+                status: 'loaded',
+                natural: [img.naturalWidth, img.naturalHeight]
+            });
+        } else {
+            results.failed++;
+            results.details.push({
+                src: src.substring(0, 100),
+                status: 'pending',
+                complete: img.complete,
+                natural: [img.naturalWidth, img.naturalHeight]
+            });
+        }
+    }
+
+    return results;
+})()
+"""
+
+# JavaScript for height stability check
+HEIGHT_CHECK_JS = """
+(() => {
+    return {
+        height: Math.max(
+            document.body.scrollHeight || 0,
+            document.documentElement.scrollHeight || 0,
+            document.body.offsetHeight || 0,
+            document.documentElement.offsetHeight || 0
+        ),
+        ready: document.readyState === 'complete'
+    };
+})()
+"""
+
+
+class CompletenessChecker:
+    """
+    Multi-signal page completeness verification.
+
+    Signals checked:
+    1. Image Loading (naturalWidth > 0, not placeholder)
+    2. Height Stability (no layout shifts)
+    3. DOM Ready State
+
+    Usage:
+        checker = CompletenessChecker(CrawlConfig())
+        result = checker.check(tab)
+
+        # Or wait until complete
+        result = checker.wait_for_complete(tab)
+    """
+
+    def __init__(self, config: Optional[CrawlConfig] = None):
+        self._config = config or CrawlConfig()
+
+    def check(self, tab: Any) -> CompletenessResult:
+        """
+        Perform a single completeness check.
+
+        Args:
+            tab: DrissionPage tab object
+
+        Returns:
+            CompletenessResult with all signals
+        """
+        start = time.time()
+
+        # Check images
+        img_result = self._check_images(tab)
+
+        # Check height
+        height_result = self._check_height(tab)
+
+        # Determine overall completeness
+        # Complete if: all real images loaded AND height is stable
+        total_pending = img_result.get('failed', 0) + img_result.get('placeholders', 0)
+        all_images_loaded = total_pending == 0 or img_result.get('total', 0) == 0
+
+        return CompletenessResult(
+            is_complete=all_images_loaded and height_result.get('ready', False),
+            total_images=img_result.get('total', 0),
+            loaded_images=img_result.get('loaded', 0),
+            failed_images=img_result.get('failed', 0),
+            placeholder_images=img_result.get('placeholders', 0),
+            height=height_result.get('height', 0),
+            height_stable=True,  # Single check can't determine stability
+            network_idle=True,  # TODO: network monitoring
+            check_duration=time.time() - start
+        )
+
+    def wait_for_complete(
+        self,
+        tab: Any,
+        timeout: Optional[float] = None
+    ) -> CompletenessResult:
+        """
+        Wait for page to be fully loaded with image guarantees.
+
+        Uses multi-signal approach:
+        1. Poll image loading status
+        2. Track height stability
+        3. Respect timeout
+
+        Args:
+            tab: DrissionPage tab object
+            timeout: Max wait time (default from config)
+
+        Returns:
+            CompletenessResult (may not be complete if timeout)
+        """
+        timeout = timeout or self._config.image_load_timeout
+        start = time.time()
+
+        height_history = []
+        stable_count = 0
+        last_result = None
+
+        logger.debug(f"CompletenessChecker: Starting wait (timeout={timeout}s)")
+
+        while time.time() - start < timeout:
+            result = self.check(tab)
+            last_result = result
+
+            # Track height stability
+            height_history.append(result.height)
+            if len(height_history) > self._config.height_stability_checks:
+                height_history.pop(0)
+
+            # Check if height is stable
+            height_stable = False
+            if len(height_history) >= self._config.height_stability_checks:
+                max_h = max(height_history)
+                min_h = min(height_history)
+                height_stable = (max_h - min_h) <= self._config.height_stability_threshold
+
+            # Log progress
+            elapsed = time.time() - start
+            logger.debug(
+                f"CompletenessChecker: [{elapsed:.1f}s] "
+                f"images={result.loaded_images}/{result.total_images} "
+                f"pending={result.failed_images} "
+                f"placeholders={result.placeholder_images} "
+                f"height={result.height} stable={height_stable}"
+            )
+
+            # Check if all images loaded AND height stable
+            all_loaded = (
+                result.failed_images == 0 and
+                result.placeholder_images == 0
+            )
+
+            if all_loaded and height_stable:
+                stable_count += 1
+                if stable_count >= self._config.image_stability_checks:
+                    logger.info(
+                        f"CompletenessChecker: Page complete! "
+                        f"{result.loaded_images} images loaded, "
+                        f"height={result.height}, "
+                        f"took {elapsed:.2f}s"
+                    )
+                    return CompletenessResult(
+                        is_complete=True,
+                        total_images=result.total_images,
+                        loaded_images=result.loaded_images,
+                        failed_images=0,
+                        placeholder_images=0,
+                        height=result.height,
+                        height_stable=True,
+                        network_idle=True,
+                        check_duration=elapsed
+                    )
+            else:
+                stable_count = 0
+
+            # Wait before next check
+            time.sleep(self._config.image_check_interval)
+
+        # Timeout reached
+        elapsed = time.time() - start
+        logger.warning(
+            f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
+            f"images={last_result.loaded_images}/{last_result.total_images} "
+            f"pending={last_result.failed_images}"
+        )
+
+        # Return last result with is_complete=False
+        if last_result:
+            return CompletenessResult(
+                is_complete=False,
+                total_images=last_result.total_images,
+                loaded_images=last_result.loaded_images,
+                failed_images=last_result.failed_images,
+                placeholder_images=last_result.placeholder_images,
+                height=last_result.height,
+                height_stable=len(set(height_history)) == 1,
+                network_idle=True,
+                check_duration=elapsed
+            )
+
+        return CompletenessResult(
+            is_complete=False,
+            total_images=0, loaded_images=0, failed_images=0, placeholder_images=0,
+            height=0, height_stable=False, network_idle=False,
+            check_duration=elapsed
+        )
+
+    def _check_images(self, tab: Any) -> Dict[str, Any]:
+        """Run image check JavaScript and return results."""
+        try:
+            js = IMAGE_CHECK_JS % (
+                self._config.min_image_size,
+                'true' if self._config.scan_full_page else 'false'
+            )
+            result = tab.run_js(js, as_expr=True)
+            return result or {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
+        except Exception as e:
+            logger.warning(f"CompletenessChecker: Image check failed: {e}")
+            return {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
+
+    def _check_height(self, tab: Any) -> Dict[str, Any]:
+        """Run height check JavaScript and return results."""
+        try:
+            result = tab.run_js(HEIGHT_CHECK_JS, as_expr=True)
+            return result or {'height': 0, 'ready': False}
+        except Exception as e:
+            logger.warning(f"CompletenessChecker: Height check failed: {e}")
+            return {'height': 0, 'ready': False}
+
+
+def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
+    """
+    Scroll through page to trigger lazy-loaded images.
+
+    Implements Crawl4AI's scan_full_page behavior.
+
+    Args:
+        tab: DrissionPage tab object
+        config: Crawl configuration
+    """
+    config = config or CrawlConfig()
+    if not config.scan_full_page:
+        return
+
+    start = time.time()
+    current_pos = 0
+
+    logger.debug("CompletenessChecker: Starting lazy load trigger scroll")
+
+    try:
+        while time.time() - start < config.scroll_timeout:
+            # Scroll down
+            current_pos += config.scroll_step
+            tab.run_js(f"window.scrollTo(0, {current_pos});")
+
+            # Wait for lazy load triggers
+            time.sleep(config.scroll_delay)
+
+            # Check if reached bottom
+            height = tab.run_js("""
+                return Math.max(
+                    document.body.scrollHeight || 0,
+                    document.documentElement.scrollHeight || 0
+                );
+            """, as_expr=True) or 0
+
+            if current_pos >= height:
+                break
+
+        # Scroll back to top
+        tab.run_js("window.scrollTo(0, 0);")
+
+        elapsed = time.time() - start
+        logger.debug(f"CompletenessChecker: Lazy load scroll complete in {elapsed:.1f}s")
+
+    except Exception as e:
+        logger.warning(f"CompletenessChecker: Lazy load scroll failed: {e}")
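hyw_core/crawling/models.py (+88 lines) is listed in the file set but its contents are not shown in this diff. The sketch below is a hypothetical reconstruction inferred only from how the names are used in completeness.py and service.py above; the field defaults are guesses, and PageResult's fields do not appear anywhere in the shown hunks.

# Hypothetical sketch of hyw_core/crawling/models.py (not shown in the diff).
from dataclasses import dataclass, field
from typing import Any, Dict, Optional


@dataclass
class CrawlConfig:
    # Fields referenced by completeness.py / service.py; defaults are guesses.
    scan_full_page: bool = True
    scroll_step: int = 800
    scroll_delay: float = 0.5
    scroll_timeout: float = 10.0
    image_load_timeout: float = 8.0
    image_check_interval: float = 0.5
    image_stability_checks: int = 3
    min_image_size: int = 50
    height_stability_checks: int = 3
    height_stability_threshold: int = 10


@dataclass
class CompletenessResult:
    # Field names taken from the constructor calls in completeness.py.
    is_complete: bool
    total_images: int
    loaded_images: int
    failed_images: int
    placeholder_images: int
    height: int
    height_stable: bool
    network_idle: bool
    check_duration: float


@dataclass
class PageResult:
    # Exported by crawling/__init__.py but unused in the shown hunks; fields unknown.
    url: Optional[str] = None
    extra: Dict[str, Any] = field(default_factory=dict)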