entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

@@ -5,13 +5,11 @@ Provides search engine adapters for different search providers.
5
5
  """
6
6
 
7
7
  from .base import SearchEngine
8
- from .google import GoogleEngine
9
8
  from .duckduckgo import DuckDuckGoEngine
10
9
  from .default import DefaultEngine
11
10
 
12
11
  __all__ = [
13
12
  "SearchEngine",
14
- "GoogleEngine",
15
13
  "DuckDuckGoEngine",
16
14
  "DefaultEngine",
17
15
  ]
@@ -124,15 +124,28 @@ class SharedBrowserManager:
124
124
  @staticmethod
125
125
  def hide_scrollbars(page: ChromiumPage):
126
126
  """
127
- Robustly hide scrollbars using CDP commands.
128
- This eliminates the reserved space/gutter that standard CSS might miss.
127
+ Robustly hide scrollbars using CDP commands AND CSS injection.
128
+ This provides double protection against scrollbar gutters.
129
129
  """
130
130
  try:
131
- # Emulation.setScrollbarsHidden is a CDP command
131
+ # 1. CDP Command
132
132
  page.run_cdp('Emulation.setScrollbarsHidden', hidden=True)
133
- logger.debug("SharedBrowserManager: CDP scrollbars hidden.")
133
+
134
+ # 2. CSS Injection (Standard + Webkit)
135
+ css = """
136
+ ::-webkit-scrollbar { display: none !important; width: 0 !important; height: 0 !important; }
137
+ * { -ms-overflow-style: none !important; scrollbar-width: none !important; }
138
+ """
139
+ # Inject into current page
140
+ page.run_js(f"""
141
+ const style = document.createElement('style');
142
+ style.textContent = `{css}`;
143
+ document.head.appendChild(style);
144
+ """)
145
+
146
+ logger.debug("SharedBrowserManager: Scrollbars hidden via CDP + CSS.")
134
147
  except Exception as e:
135
- logger.warning(f"SharedBrowserManager: Failed to hide scrollbars via CDP: {e}")
148
+ logger.warning(f"SharedBrowserManager: Failed to hide scrollbars: {e}")
136
149
 
137
150
 
138
151
  # Module-level singleton accessor
@@ -55,16 +55,43 @@ class ContentRenderer:
55
55
  loop = asyncio.get_running_loop()
56
56
  return await loop.run_in_executor(self._executor, self._prepare_tab_sync)
57
57
 
58
def _wait_for_render_finished(self, tab, timeout: float = 12.0, context: str = ""):
    """Wait for window.RENDER_FINISHED to be true in the tab.

    If the flag is already true when the wait starts it is treated as stale:
    the method waits up to 1 second for the page to reset it to false and,
    failing that, resets it from Python before polling.

    Args:
        tab: Tab object exposing ``run_js``.
        timeout: Overall budget in seconds, measured from entry (the reset
            phase above counts against it).
        context: Label inserted into log messages to identify the caller.

    Returns:
        True if the flag became true within ``timeout``, False otherwise.
    """
    import time as pytime

    # Monotonic clock: elapsed-time measurements must not be affected by
    # system clock adjustments.
    start = pytime.monotonic()

    # Check initial state
    initial_state = tab.run_js("return window.RENDER_FINISHED")
    logger.debug(f"ContentRenderer[{context}]: Starting wait, initial RENDER_FINISHED={initial_state}")

    # If already true, it's stale from previous render - need to wait for JS to reset it
    if initial_state:
        logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED was true, waiting for reset...")
        # Wait for JS to reset it to false (updateRenderData sets it to false)
        reset_start = pytime.monotonic()
        while pytime.monotonic() - reset_start < 1.0:  # 1s max to wait for reset
            is_reset = tab.run_js("return window.RENDER_FINISHED")
            if not is_reset:
                logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED reset to false")
                break
            pytime.sleep(0.05)
        else:
            # Loop ran out without seeing a reset: clear the flag ourselves.
            logger.warning(f"ContentRenderer[{context}]: RENDER_FINISHED not reset, force resetting via JS")
            tab.run_js("window.RENDER_FINISHED = false")

    # Now wait for it to become true
    poll_count = 0
    while pytime.monotonic() - start < timeout:
        is_finished = tab.run_js("return window.RENDER_FINISHED")
        poll_count += 1
        if is_finished:
            elapsed = pytime.monotonic() - start
            logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED=true after {elapsed:.2f}s ({poll_count} polls)")
            return True
        pytime.sleep(0.1)  # Poll every 100ms

    elapsed = pytime.monotonic() - start
    logger.warning(f"ContentRenderer[{context}]: Wait for RENDER_FINISHED timed out after {elapsed:.2f}s ({poll_count} polls)")
    return False
69
96
 
70
97
  def _prepare_tab_sync(self) -> str:
@@ -89,8 +116,9 @@ class ContentRenderer:
89
116
  "theme_color": "#ef4444",
90
117
  }
91
118
 
119
+ logger.debug(f"ContentRenderer: Calling warmup updateRenderData for tab {tab_id}")
92
120
  tab.run_js(f"window.updateRenderData({json.dumps(warmup_data)})")
93
- self._wait_for_render_finished(tab, timeout=5.0)
121
+ self._wait_for_render_finished(tab, timeout=12.0, context=f"warmup:{tab_id}")
94
122
 
95
123
  # Wait for main-container after warmup (Vue needs to render it)
96
124
  tab.ele('#main-container', timeout=3)
@@ -203,7 +231,7 @@ class ContentRenderer:
203
231
 
204
232
  # 1. Update Data & Wait for Finished flag
205
233
  tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
206
- self._wait_for_render_finished(tab)
234
+ self._wait_for_render_finished(tab, context=f"batch:{tab_id}")
207
235
 
208
236
  # 2. Dynamic Resize
209
237
  # Get actual content height to prevent clipping
@@ -330,10 +358,12 @@ class ContentRenderer:
330
358
  "theme_color": theme_color,
331
359
  }
332
360
 
361
+ actual_tab_id = getattr(tab, 'tab_id', 'unknown')
362
+ logger.info(f"ContentRenderer: Calling updateRenderData for tab {actual_tab_id}, markdown length={len(markdown_content)}")
333
363
  tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
334
364
 
335
365
  # Wait for event-driven finish
336
- self._wait_for_render_finished(tab, timeout=5.0)
366
+ self._wait_for_render_finished(tab, timeout=12.0, context=f"render:{actual_tab_id}")
337
367
 
338
368
  # Dynamic Resize
339
369
  scroll_height = tab.run_js('return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);')
@@ -13,6 +13,10 @@ from typing import Optional, Dict, Any, List
13
13
  from loguru import logger
14
14
  import trafilatura
15
15
 
16
+ # Import intelligent completeness checker
17
+ from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
18
+ from ..crawling.models import CrawlConfig
19
+
16
20
  class ScreenshotService:
17
21
  """
18
22
  Browser Service using DrissionPage.
@@ -537,12 +541,26 @@ class ScreenshotService:
537
541
  """Synchronous screenshot."""
538
542
  if not url: return None
539
543
  tab = None
544
+ capture_width = 1024 # Standard capture width
545
+
540
546
  try:
541
547
  self._ensure_ready()
542
548
  page = self._manager.page
543
549
  if not page: return None
544
550
 
545
- tab = page.new_tab(url)
551
+ # Create blank tab first
552
+ tab = page.new_tab()
553
+
554
+ # Set viewport BEFORE navigation so page renders at target width from the start
555
+ # This eliminates the need for post-load resize and reflow
556
+ try:
557
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
558
+ width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
559
+ except:
560
+ pass
561
+
562
+ # Now navigate to the URL - page will render at target width
563
+ tab.get(url)
546
564
 
547
565
  # Start monitoring network traffic
548
566
  try:
@@ -552,6 +570,16 @@ class ScreenshotService:
552
570
  except Exception as e:
553
571
  logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
554
572
 
573
+ # Initialize crawl config for completeness checking (defined outside try for scope)
574
+ crawl_config = CrawlConfig(
575
+ scan_full_page=True,
576
+ scroll_step=800,
577
+ scroll_delay=0.5,
578
+ scroll_timeout=min(timeout, 10),
579
+ image_load_timeout=8.0,
580
+ image_stability_checks=3,
581
+ )
582
+
555
583
  try:
556
584
  # Wait for full page load (including JS execution)
557
585
  tab.wait.load_complete(timeout=timeout)
@@ -676,49 +704,33 @@ class ScreenshotService:
676
704
  f.write(tab.html)
677
705
  except: pass
678
706
 
679
- # Use faster scroll step (800) to ensure lazy loaded images appear
680
- self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
707
+ # Use crawling module for lazy loading trigger (config defined above)
708
+ trigger_lazy_load(tab, crawl_config)
681
709
 
682
710
  except:
683
711
  pass
684
712
 
685
- # Refine calculation: Set viewport width to 1024
686
-
687
- capture_width = 1024
688
-
689
- # Calculate actual content height after lazy loading
690
- try:
691
- # Use a robust height calculation
692
- content_height = tab.run_js('''
693
- return Math.max(
694
- document.body.scrollHeight || 0,
695
- document.documentElement.scrollHeight || 0,
696
- document.body.offsetHeight || 0,
697
- document.documentElement.offsetHeight || 0,
698
- document.documentElement.clientHeight || 0
699
- );
700
- ''')
701
- # Add a small buffer and cap at 15000px to prevent memory issues
702
- h = min(int(content_height) + 50, 15000)
703
- except:
704
- h = 1000 # Fallback
705
-
706
- # Set viewport to full content size for single-shot capture
707
- try:
708
- tab.run_cdp('Emulation.setDeviceMetricsOverride',
709
- width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
710
- except:
711
- pass
712
-
713
- # Scrollbar Hiding
713
+ # Scrollbar Hiding first (before any height calculation)
714
714
  from .manager import SharedBrowserManager
715
715
  SharedBrowserManager.hide_scrollbars(tab)
716
716
 
717
- # Scroll back to top before screenshot
717
+ # Scroll back to top
718
718
  tab.run_js("window.scrollTo(0, 0);")
719
719
 
720
- # Content is already loaded by _scroll_to_bottom which waits for images
721
- # Just recalculate final height (content may have grown during scrolling)
720
+ # Use CompletenessChecker to verify all images are loaded
721
+ checker = CompletenessChecker(crawl_config)
722
+ completeness = checker.wait_for_complete(tab, timeout=crawl_config.image_load_timeout)
723
+
724
+ logger.info(
725
+ f"ScreenshotService: Image completeness: "
726
+ f"{completeness.loaded_images}/{completeness.total_images} loaded, "
727
+ f"{completeness.failed_images} pending, "
728
+ f"{completeness.placeholder_images} placeholders, "
729
+ f"complete={completeness.is_complete}"
730
+ )
731
+
732
+ # Now calculate final height ONCE after all content loaded
733
+ # CompletenessChecker already verified height stability
722
734
  try:
723
735
  final_height = tab.run_js('''
724
736
  return Math.max(
@@ -728,13 +740,15 @@ class ScreenshotService:
728
740
  document.documentElement.offsetHeight || 0
729
741
  );
730
742
  ''')
731
- final_h = min(int(final_height) + 50, 15000)
732
- if final_h != h:
733
- tab.run_cdp('Emulation.setDeviceMetricsOverride',
734
- width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
743
+ h = min(int(final_height) + 50, 15000)
744
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
745
+ width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
735
746
  except:
736
747
  pass
737
748
 
749
+ # Final scroll to top
750
+ tab.run_js("window.scrollTo(0, 0);")
751
+
738
752
  # Use full_page=False because we manually set the viewport to the full height
739
753
  # This avoids stitching artifacts and blank spaces
740
754
  return tab.get_screenshot(as_base64='jpg', full_page=False)
@@ -0,0 +1,18 @@
1
+ """
2
+ hyw_core.crawling - Intelligent Web Crawling Module
3
+
4
+ Provides Crawl4AI-inspired adaptive crawling with:
5
+ - Page completeness guarantees (image loading verification)
6
+ - Content quality scoring
7
+ - Adaptive stop logic
8
+ """
9
+
10
+ from .models import CrawlConfig, PageResult, CompletenessResult
11
+ from .completeness import CompletenessChecker
12
+
13
+ __all__ = [
14
+ "CrawlConfig",
15
+ "PageResult",
16
+ "CompletenessResult",
17
+ "CompletenessChecker",
18
+ ]
@@ -0,0 +1,348 @@
1
+ """
2
+ Page Completeness Checker
3
+
4
+ Multi-signal page readiness detection with focus on image loading guarantees.
5
+ Implements Crawl4AI's wait_for_images and networkidle concepts.
6
+ """
7
+
8
+ import time
9
+ from typing import Any, Optional, Dict
10
+ from dataclasses import dataclass
11
+ from loguru import logger
12
+
13
+ from .models import CrawlConfig, CompletenessResult
14
+
15
+
16
# CSP-compliant JavaScript for image loading verification
# Based on Crawl4AI's wait_for_images logic
#
# The script is a %-format template with two placeholders, filled in order:
#   %d - minimum image dimension in pixels (smaller images are ignored)
#   %s - JavaScript boolean literal ('true'/'false') for scan_full_page
# It must not contain any other percent sign.
IMAGE_CHECK_JS = """
(() => {
    const results = {
        total: 0,
        loaded: 0,
        failed: 0,
        placeholders: 0,
        details: []
    };

    const imgs = Array.from(document.querySelectorAll('img'));
    const viewportHeight = window.innerHeight;
    const minSize = %d;  // Injected from Python

    for (const img of imgs) {
        // Skip tiny images (icons, tracking pixels)
        if (img.clientWidth < minSize && img.clientHeight < minSize) {
            continue;
        }

        // Skip images outside viewport (unless scan_full_page)
        const rect = img.getBoundingClientRect();
        const inViewport = rect.top < viewportHeight * 2 && rect.bottom > -viewportHeight;

        if (!inViewport && !%s) {  // scan_full_page injected
            continue;
        }

        results.total++;

        // img.src is always an absolute URL, so resolve data-src the same
        // way before comparing; otherwise a relative or protocol-relative
        // data-src never matches and the image looks like a placeholder
        // forever.
        const dataSrc = img.getAttribute('data-src');
        let lazyTarget = '';
        if (dataSrc) {
            try {
                lazyTarget = new URL(dataSrc, document.baseURI).href;
            } catch (e) {
                lazyTarget = dataSrc;
            }
        }

        const src = img.src || dataSrc || '';
        const isPlaceholder = (
            (lazyTarget !== '' && img.src !== lazyTarget) ||
            (img.naturalWidth < 50 && img.clientWidth > 100) ||
            src.includes('placeholder') ||
            src.includes('loading') ||
            src.startsWith('data:image/svg+xml') ||
            (img.naturalWidth === 1 && img.naturalHeight === 1)
        );

        if (isPlaceholder) {
            results.placeholders++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'placeholder',
                natural: [img.naturalWidth, img.naturalHeight],
                display: [img.clientWidth, img.clientHeight]
            });
            continue;
        }

        // Check if fully loaded
        if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
            results.loaded++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'loaded',
                natural: [img.naturalWidth, img.naturalHeight]
            });
        } else {
            results.failed++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'pending',
                complete: img.complete,
                natural: [img.naturalWidth, img.naturalHeight]
            });
        }
    }

    return results;
})()
"""

# JavaScript for height stability check.
# Evaluates to an object with the document height (largest of the body and
# documentElement scroll/offset heights) and whether readyState is 'complete'.
HEIGHT_CHECK_JS = """
(() => {
    return {
        height: Math.max(
            document.body.scrollHeight || 0,
            document.documentElement.scrollHeight || 0,
            document.body.offsetHeight || 0,
            document.documentElement.offsetHeight || 0
        ),
        ready: document.readyState === 'complete'
    };
})()
"""
106
+
107
+
108
class CompletenessChecker:
    """
    Multi-signal page completeness verification.

    Signals checked:
    1. Image Loading (naturalWidth > 0, not placeholder)
    2. Height Stability (no layout shifts)
    3. DOM Ready State

    Usage:
        checker = CompletenessChecker(CrawlConfig())
        result = checker.check(tab)

        # Or wait until complete
        result = checker.wait_for_complete(tab)
    """

    def __init__(self, config: Optional[CrawlConfig] = None):
        """Store the crawl configuration, defaulting to ``CrawlConfig()``."""
        self._config = config or CrawlConfig()

    def check(self, tab: Any) -> CompletenessResult:
        """
        Perform a single completeness check.

        Args:
            tab: DrissionPage tab object

        Returns:
            CompletenessResult with all signals. ``height_stable`` and
            ``network_idle`` are always reported as True here because a
            single sample cannot measure either.
        """
        start = time.monotonic()

        # Check images
        img_result = self._check_images(tab)

        # Check height
        height_result = self._check_height(tab)

        # Determine overall completeness
        # Complete if: no pending/placeholder images AND document is ready
        total_pending = img_result.get('failed', 0) + img_result.get('placeholders', 0)
        all_images_loaded = total_pending == 0 or img_result.get('total', 0) == 0

        return CompletenessResult(
            is_complete=all_images_loaded and height_result.get('ready', False),
            total_images=img_result.get('total', 0),
            loaded_images=img_result.get('loaded', 0),
            failed_images=img_result.get('failed', 0),
            placeholder_images=img_result.get('placeholders', 0),
            height=height_result.get('height', 0),
            height_stable=True,  # Single check can't determine stability
            network_idle=True,  # TODO: network monitoring
            check_duration=time.monotonic() - start
        )

    def wait_for_complete(
        self,
        tab: Any,
        timeout: Optional[float] = None
    ) -> CompletenessResult:
        """
        Wait for page to be fully loaded with image guarantees.

        Uses multi-signal approach:
        1. Poll image loading status
        2. Track height stability
        3. Respect timeout

        The page counts as complete once "no pending or placeholder images
        and stable height" has held for ``image_stability_checks``
        consecutive polls.

        Args:
            tab: DrissionPage tab object
            timeout: Max wait time in seconds; a falsy value (None or 0)
                selects ``image_load_timeout`` from the config.

        Returns:
            CompletenessResult (``is_complete`` is False if the timeout was
            reached; all counters are zero if no check could run at all).
        """
        timeout = timeout or self._config.image_load_timeout
        start = time.monotonic()

        height_history = []
        stable_count = 0
        last_result = None

        logger.debug(f"CompletenessChecker: Starting wait (timeout={timeout}s)")

        while time.monotonic() - start < timeout:
            result = self.check(tab)
            last_result = result

            # Track height stability over a sliding window of recent samples
            height_history.append(result.height)
            if len(height_history) > self._config.height_stability_checks:
                height_history.pop(0)

            height_stable = self._is_height_stable(height_history)

            # Log progress
            elapsed = time.monotonic() - start
            logger.debug(
                f"CompletenessChecker: [{elapsed:.1f}s] "
                f"images={result.loaded_images}/{result.total_images} "
                f"pending={result.failed_images} "
                f"placeholders={result.placeholder_images} "
                f"height={result.height} stable={height_stable}"
            )

            # Check if all images loaded AND height stable
            all_loaded = (
                result.failed_images == 0 and
                result.placeholder_images == 0
            )

            if all_loaded and height_stable:
                stable_count += 1
                if stable_count >= self._config.image_stability_checks:
                    logger.info(
                        f"CompletenessChecker: Page complete! "
                        f"{result.loaded_images} images loaded, "
                        f"height={result.height}, "
                        f"took {elapsed:.2f}s"
                    )
                    return CompletenessResult(
                        is_complete=True,
                        total_images=result.total_images,
                        loaded_images=result.loaded_images,
                        failed_images=0,
                        placeholder_images=0,
                        height=result.height,
                        height_stable=True,
                        network_idle=True,
                        check_duration=elapsed
                    )
            else:
                # Any regression restarts the consecutive-success counter
                stable_count = 0

            # Wait before next check
            time.sleep(self._config.image_check_interval)

        # Timeout reached
        elapsed = time.monotonic() - start

        if last_result is None:
            # The loop never ran (non-positive effective timeout), so there
            # is no sample to report; do not dereference last_result.
            logger.warning(
                f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
                f"No completeness check was performed"
            )
            return CompletenessResult(
                is_complete=False,
                total_images=0, loaded_images=0, failed_images=0, placeholder_images=0,
                height=0, height_stable=False, network_idle=False,
                check_duration=elapsed
            )

        logger.warning(
            f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
            f"images={last_result.loaded_images}/{last_result.total_images} "
            f"pending={last_result.failed_images}"
        )

        # Return last result with is_complete=False
        return CompletenessResult(
            is_complete=False,
            total_images=last_result.total_images,
            loaded_images=last_result.loaded_images,
            failed_images=last_result.failed_images,
            placeholder_images=last_result.placeholder_images,
            height=last_result.height,
            # Same stability definition as used while polling
            height_stable=self._is_height_stable(height_history),
            network_idle=True,
            check_duration=elapsed
        )

    def _is_height_stable(self, height_history) -> bool:
        """Return True when a full window of samples varies within the threshold."""
        if not height_history:
            return False
        if len(height_history) < self._config.height_stability_checks:
            return False
        spread = max(height_history) - min(height_history)
        return spread <= self._config.height_stability_threshold

    def _check_images(self, tab: Any) -> Dict[str, Any]:
        """Run image check JavaScript and return results.

        Returns zeroed counters if the script yields nothing or raises.
        """
        try:
            js = IMAGE_CHECK_JS % (
                self._config.min_image_size,
                'true' if self._config.scan_full_page else 'false'
            )
            result = tab.run_js(js, as_expr=True)
            return result or {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
        except Exception as e:
            logger.warning(f"CompletenessChecker: Image check failed: {e}")
            return {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}

    def _check_height(self, tab: Any) -> Dict[str, Any]:
        """Run height check JavaScript and return results.

        Returns ``{'height': 0, 'ready': False}`` if the script yields
        nothing or raises.
        """
        try:
            result = tab.run_js(HEIGHT_CHECK_JS, as_expr=True)
            return result or {'height': 0, 'ready': False}
        except Exception as e:
            logger.warning(f"CompletenessChecker: Height check failed: {e}")
            return {'height': 0, 'ready': False}
300
+
301
+
302
def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
    """
    Scroll through page to trigger lazy-loaded images.

    Implements Crawl4AI's scan_full_page behavior: scrolls down in steps of
    ``config.scroll_step`` pixels, pausing ``config.scroll_delay`` seconds
    after each step, until the scroll position passes the document height
    or ``config.scroll_timeout`` seconds have elapsed, then scrolls back to
    the top. Does nothing when ``config.scan_full_page`` is false. Errors
    are logged and never raised.

    Args:
        tab: DrissionPage tab object
        config: Crawl configuration (defaults to ``CrawlConfig()``)
    """
    config = config or CrawlConfig()
    if not config.scan_full_page:
        return

    # Evaluated with as_expr=True, so this must be a bare expression: a
    # top-level `return` statement is a syntax error in expression context.
    height_expr = (
        "Math.max("
        "document.body.scrollHeight || 0, "
        "document.documentElement.scrollHeight || 0"
        ")"
    )

    start = time.time()
    current_pos = 0

    logger.debug("CompletenessChecker: Starting lazy load trigger scroll")

    try:
        while time.time() - start < config.scroll_timeout:
            # Scroll down
            current_pos += config.scroll_step
            tab.run_js(f"window.scrollTo(0, {current_pos});")

            # Wait for lazy load triggers
            time.sleep(config.scroll_delay)

            # Check if reached bottom (height is re-read every step because
            # lazy-loaded content can make the page grow)
            height = tab.run_js(height_expr, as_expr=True) or 0

            if current_pos >= height:
                break

        # Scroll back to top
        tab.run_js("window.scrollTo(0, 0);")

        elapsed = time.time() - start
        logger.debug(f"CompletenessChecker: Lazy load scroll complete in {elapsed:.1f}s")

    except Exception as e:
        logger.warning(f"CompletenessChecker: Lazy load scroll failed: {e}")