entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

@@ -0,0 +1,437 @@
1
+ """
2
+ Page Completeness Checker
3
+
4
+ Multi-signal page readiness detection with focus on image loading guarantees.
5
+ Implements Crawl4AI's wait_for_images and networkidle concepts.
6
+ """
7
+
8
+ import time
9
+ from typing import Any, Optional, Dict
10
+ from dataclasses import dataclass
11
+ from loguru import logger
12
+
13
+ from .models import CrawlConfig, CompletenessResult
14
+
15
+
16
# CSP-compliant JavaScript for image loading verification.
# Based on Crawl4AI's wait_for_images logic.
#
# This is a %-format template, not ready-to-run JavaScript. It contains exactly
# two placeholders, filled positionally:
#   1. %d - minimum image size in pixels; an image is skipped only when BOTH
#           its clientWidth and clientHeight are below this value
#   2. %s - the JavaScript literal `true` or `false`; when `true`, images
#           outside the extended viewport window are counted as well
# Any literal percent sign added to the script later must be written as `%%`.
#
# The script evaluates to {total, loaded, failed, placeholders, details}.
# Every counted image is classified as exactly one of:
#   - placeholder: matches any of the lazy-load / blur / keyword heuristics
#   - loaded:      img.complete with non-zero natural width and height
#   - failed:      anything else (its `details` entry has status 'pending')
#
# NOTE(review): heuristic 1 compares `img.src` with the raw data-* attribute
# value. `img.src` is presumably the browser-resolved absolute URL, so a
# relative data-src would never compare equal - confirm against real pages.
IMAGE_CHECK_JS = """
(() => {
    const results = {
        total: 0,
        loaded: 0,
        failed: 0,
        placeholders: 0,
        details: []
    };

    const imgs = Array.from(document.querySelectorAll('img'));
    const viewportHeight = window.innerHeight;
    const minSize = %d; // Injected from Python

    for (const img of imgs) {
        // Skip tiny images (icons, tracking pixels)
        if (img.clientWidth < minSize && img.clientHeight < minSize) {
            continue;
        }

        // Skip images outside viewport (unless scan_full_page)
        const rect = img.getBoundingClientRect();
        const inViewport = rect.top < viewportHeight * 2 && rect.bottom > -viewportHeight;

        if (!inViewport && !%s) { // scan_full_page injected
            continue;
        }

        results.total++;

        const src = img.src || img.getAttribute('data-src') || '';
        const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
            img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
        const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
        const loadingAttr = img.getAttribute('loading') || '';

        // Enhanced placeholder detection for blurred preview images (like mcmod.cn)
        const isPlaceholder = (
            // 1. data-src exists but not yet loaded into src
            (dataSrc && img.src !== dataSrc) ||
            // 2. Natural size much smaller than display size (blurred placeholder)
            (img.naturalWidth < 50 && img.clientWidth > 100) ||
            (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
            // 3. Common placeholder keywords in src
            src.includes('placeholder') ||
            src.includes('loading') ||
            src.includes('blank') ||
            // 4. SVG placeholder or 1x1 tracking pixel
            src.startsWith('data:image/svg+xml') ||
            (img.naturalWidth === 1 && img.naturalHeight === 1) ||
            // 5. Lazy-loading class indicators (common patterns)
            className.includes('lazy') ||
            className.includes('lazyload') ||
            className.includes('lozad') ||
            className.includes('b-lazy') ||
            // 6. Blur indicators (common for LQIP - Low Quality Image Placeholder)
            className.includes('blur') ||
            src.includes('blur') ||
            src.includes('thumb') ||
            src.includes('thumbnail') ||
            // 7. loading="lazy" + not complete (browser native lazy loading)
            (loadingAttr === 'lazy' && !img.complete) ||
            // 8. CSS blur filter applied (visual blurring)
            (window.getComputedStyle(img).filter || '').includes('blur')
        );

        if (isPlaceholder) {
            results.placeholders++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'placeholder',
                natural: [img.naturalWidth, img.naturalHeight],
                display: [img.clientWidth, img.clientHeight]
            });
            continue;
        }

        // Check if fully loaded
        if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
            results.loaded++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'loaded',
                natural: [img.naturalWidth, img.naturalHeight]
            });
        } else {
            results.failed++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'pending',
                complete: img.complete,
                natural: [img.naturalWidth, img.naturalHeight]
            });
        }
    }

    return results;
})()
"""
117
+
118
# JavaScript for height stability check.
#
# Evaluates to {height, ready}:
#   - height: the largest of the body / documentElement scrollHeight and
#             offsetHeight values (each falling back to 0 when falsy)
#   - ready:  true only when document.readyState is 'complete'
HEIGHT_CHECK_JS = """
(() => {
    return {
        height: Math.max(
            document.body.scrollHeight || 0,
            document.documentElement.scrollHeight || 0,
            document.body.offsetHeight || 0,
            document.documentElement.offsetHeight || 0
        ),
        ready: document.readyState === 'complete'
    };
})()
"""
132
+
133
+
134
class CompletenessChecker:
    """
    Multi-signal page completeness verification.

    Signals checked:
    1. Image Loading (naturalWidth > 0, not placeholder)
    2. Height Stability (no layout shifts)
    3. DOM Ready State

    Usage:
        checker = CompletenessChecker(CrawlConfig())
        result = checker.check(tab)

        # Or wait until complete
        result = checker.wait_for_complete(tab)
    """

    def __init__(self, config: Optional[CrawlConfig] = None):
        """
        Create a checker.

        Args:
            config: Crawl configuration; a default CrawlConfig is used when
                omitted (or falsy).
        """
        self._config = config or CrawlConfig()

    def check(self, tab: Any) -> CompletenessResult:
        """
        Perform a single completeness check.

        Args:
            tab: DrissionPage tab object

        Returns:
            CompletenessResult with all signals. `is_complete` is True when no
            counted image is pending or a placeholder AND the document ready
            state is 'complete'. `height_stable` and `network_idle` are always
            reported as True here because a single sample cannot measure them.
        """
        start = time.time()

        # Check images
        img_result = self._check_images(tab)

        # Check height
        height_result = self._check_height(tab)

        # Determine overall completeness
        # Complete if: all real images loaded AND the document is ready
        total_pending = img_result.get('failed', 0) + img_result.get('placeholders', 0)
        all_images_loaded = total_pending == 0 or img_result.get('total', 0) == 0

        return CompletenessResult(
            is_complete=all_images_loaded and height_result.get('ready', False),
            total_images=img_result.get('total', 0),
            loaded_images=img_result.get('loaded', 0),
            failed_images=img_result.get('failed', 0),
            placeholder_images=img_result.get('placeholders', 0),
            height=height_result.get('height', 0),
            height_stable=True,  # Single check can't determine stability
            network_idle=True,  # TODO: network monitoring
            check_duration=time.time() - start
        )

    def wait_for_complete(
        self,
        tab: Any,
        timeout: Optional[float] = None
    ) -> CompletenessResult:
        """
        Wait for page to be fully loaded with image guarantees.

        Uses multi-signal approach:
        1. Poll image loading status
        2. Track height stability
        3. Respect timeout

        The page counts as complete once "no pending/placeholder images AND
        stable height" has been observed on `image_stability_checks`
        consecutive polls.

        Args:
            tab: DrissionPage tab object
            timeout: Max wait time (default from config; a falsy value such
                as 0 also selects the config default)

        Returns:
            CompletenessResult (may not be complete if timeout). If the
            effective timeout is not positive, no poll runs and an all-zero,
            incomplete result is returned.
        """
        timeout = timeout or self._config.image_load_timeout
        start = time.time()

        height_history = []
        stable_count = 0
        last_result = None

        logger.debug(f"CompletenessChecker: Starting wait (timeout={timeout}s)")

        while time.time() - start < timeout:
            result = self.check(tab)
            last_result = result

            # Track height stability over a sliding window of recent samples
            height_history.append(result.height)
            if len(height_history) > self._config.height_stability_checks:
                height_history.pop(0)

            # Height is stable only once the window is full and its spread
            # stays within the configured pixel threshold
            height_stable = False
            if len(height_history) >= self._config.height_stability_checks:
                max_h = max(height_history)
                min_h = min(height_history)
                height_stable = (max_h - min_h) <= self._config.height_stability_threshold

            # Log progress
            elapsed = time.time() - start
            logger.debug(
                f"CompletenessChecker: [{elapsed:.1f}s] "
                f"images={result.loaded_images}/{result.total_images} "
                f"pending={result.failed_images} "
                f"placeholders={result.placeholder_images} "
                f"height={result.height} stable={height_stable}"
            )

            # Check if all images loaded AND height stable
            all_loaded = (
                result.failed_images == 0 and
                result.placeholder_images == 0
            )

            if all_loaded and height_stable:
                stable_count += 1
                if stable_count >= self._config.image_stability_checks:
                    logger.info(
                        f"CompletenessChecker: Page complete! "
                        f"{result.loaded_images} images loaded, "
                        f"height={result.height}, "
                        f"took {elapsed:.2f}s"
                    )
                    return CompletenessResult(
                        is_complete=True,
                        total_images=result.total_images,
                        loaded_images=result.loaded_images,
                        failed_images=0,
                        placeholder_images=0,
                        height=result.height,
                        height_stable=True,
                        network_idle=True,
                        check_duration=elapsed
                    )
            else:
                # Any unstable poll resets the consecutive-success counter
                stable_count = 0

            # Wait before next check
            time.sleep(self._config.image_check_interval)

        # Timeout reached
        elapsed = time.time() - start

        if last_result is None:
            # The loop body never ran (non-positive effective timeout), so
            # there is nothing to report. This guard must come BEFORE the
            # warning below, which dereferences last_result.
            logger.warning(
                f"CompletenessChecker: Timeout after {elapsed:.1f}s before any check ran"
            )
            return CompletenessResult(
                is_complete=False,
                total_images=0, loaded_images=0, failed_images=0, placeholder_images=0,
                height=0, height_stable=False, network_idle=False,
                check_duration=elapsed
            )

        logger.warning(
            f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
            f"images={last_result.loaded_images}/{last_result.total_images} "
            f"pending={last_result.failed_images}"
        )

        # Return last result with is_complete=False.
        # Note: height_stable here requires every sample in the window to be
        # exactly equal, which is stricter than the in-loop threshold test.
        return CompletenessResult(
            is_complete=False,
            total_images=last_result.total_images,
            loaded_images=last_result.loaded_images,
            failed_images=last_result.failed_images,
            placeholder_images=last_result.placeholder_images,
            height=last_result.height,
            height_stable=len(set(height_history)) == 1,
            network_idle=True,
            check_duration=elapsed
        )

    def _check_images(self, tab: Any) -> Dict[str, Any]:
        """
        Run image check JavaScript and return results.

        Fills the IMAGE_CHECK_JS template with the configured minimum image
        size and the scan_full_page flag (as the JS literal 'true'/'false').
        Returns all-zero counts when the script yields a falsy result or
        raises; failures are logged, never propagated.
        """
        try:
            js = IMAGE_CHECK_JS % (
                self._config.min_image_size,
                'true' if self._config.scan_full_page else 'false'
            )
            result = tab.run_js(js, as_expr=True)
            return result or {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
        except Exception as e:
            logger.warning(f"CompletenessChecker: Image check failed: {e}")
            return {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}

    def _check_height(self, tab: Any) -> Dict[str, Any]:
        """
        Run height check JavaScript and return results.

        Returns {'height': 0, 'ready': False} when the script yields a falsy
        result or raises; failures are logged, never propagated.
        """
        try:
            result = tab.run_js(HEIGHT_CHECK_JS, as_expr=True)
            return result or {'height': 0, 'ready': False}
        except Exception as e:
            logger.warning(f"CompletenessChecker: Height check failed: {e}")
            return {'height': 0, 'ready': False}
326
+
327
+
328
def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
    """
    Scroll through page to trigger lazy-loaded images.

    Implements Crawl4AI's scan_full_page behavior.
    Strategy: Fast scroll with minimal delay (0.2s) per step to trigger network requests,
    then wait at the bottom for all images to settle.

    Does nothing when `config.scan_full_page` is false. Scrolling is capped at
    100 steps of `config.scroll_step` pixels, the settle wait at 4 polls of
    0.5s, and the page is scrolled back to the top afterwards. Any failure is
    logged as a warning and never raised to the caller.

    Args:
        tab: DrissionPage tab object
        config: Crawl configuration (a default CrawlConfig when omitted)
    """
    config = config or CrawlConfig()
    if not config.scan_full_page:
        return

    start = time.time()
    current_pos = 0

    logger.info("CompletenessChecker: Starting lazy load scroll (fast scroll + final wait)")

    try:
        max_scroll_steps = 100
        step_count = 0

        # 1. Fast Scroll Phase
        while step_count < max_scroll_steps:
            step_count += 1
            current_pos += config.scroll_step
            tab.run_js(f"window.scrollTo(0, {current_pos});")

            # Simple fixed delay per step (0.2s) as requested
            time.sleep(0.2)

            # Check if reached bottom. The height is re-read on every step
            # because lazy content can grow the page while scrolling.
            height = tab.run_js("""
                Math.max(
                    document.body.scrollHeight || 0,
                    document.documentElement.scrollHeight || 0
                )
            """, as_expr=True) or 0

            if current_pos >= height:
                logger.debug(f"CompletenessChecker: Reached bottom at position {current_pos}")
                break

        # 2. Wait Phase at Bottom (Wait for images to settle - reduced timeout)
        logger.debug("CompletenessChecker: Reached bottom, waiting for images to settle (max 2s)...")
        wait_start = time.time()

        # Quick check: just verify images are not placeholders (simplified check)
        check_all_images_js = """
        (() => {
            const imgs = Array.from(document.querySelectorAll('img'));
            if (imgs.length === 0) return true;

            // Quick check: count non-placeholder images that are loaded
            let loaded_count = 0;
            let total_count = 0;

            for (const img of imgs) {
                // Skip tiny images
                if (img.clientWidth < 50 && img.clientHeight < 50) continue;

                total_count++;
                const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || '';
                const src = img.src || '';

                // Check if placeholder
                const isPlaceholder = (
                    (dataSrc && img.src !== dataSrc) ||
                    (img.naturalWidth < 50 && img.clientWidth > 100) ||
                    src.includes('placeholder') || src.includes('loading')
                );

                // If not placeholder and loaded, count it
                if (!isPlaceholder && img.complete && img.naturalWidth > 0) {
                    loaded_count++;
                }
            }

            // If most images are loaded (80%+), consider it done
            return total_count === 0 || (loaded_count / total_count) >= 0.8;
        })()
        """

        # Quick check loop with shorter interval
        max_checks = 4  # 2s / 0.5s = 4 checks max
        for _ in range(max_checks):
            try:
                all_loaded = tab.run_js(check_all_images_js, as_expr=True)
            except Exception as e:
                # Best effort: one failed probe must not abort the settle
                # wait, but it is logged rather than silently dropped, and
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                logger.debug(f"CompletenessChecker: Settle check failed: {e}")
                all_loaded = False

            if all_loaded:
                elapsed_wait = time.time() - wait_start
                logger.debug(f"CompletenessChecker: Images settled at bottom in {elapsed_wait:.1f}s")
                break

            time.sleep(0.5)

        # Scroll back to top
        tab.run_js("window.scrollTo(0, 0);")

        elapsed = time.time() - start
        logger.info(f"CompletenessChecker: Lazy load scroll complete - {step_count} steps in {elapsed:.1f}s")

    except Exception as e:
        logger.warning(f"CompletenessChecker: Lazy load scroll failed: {e}")
@@ -0,0 +1,88 @@
1
+ """
2
+ Crawling Data Models
3
+
4
+ Core data structures for the intelligent crawling system.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import List, Optional, Set
9
+
10
+
11
@dataclass
class CrawlConfig:
    """
    Tunable knobs for a crawl, modelled on Crawl4AI's parameter names.

    The emphasis is on deciding when a page is complete, and in particular
    on making sure its images have finished loading.
    """
    # --- Wait strategy (image loading has priority) ---
    wait_for_images: bool = True
    wait_until: str = "networkidle"  # one of: domcontentloaded | networkidle | load
    delay_before_return: float = 0.1
    page_timeout: float = 30.0

    # --- Image loading ---
    image_load_timeout: float = 10.0   # upper bound on waiting for images
    image_stability_checks: int = 3    # consecutive stable polls required
    image_check_interval: float = 0.2  # pause between polls
    min_image_size: int = 50           # images smaller than this are ignored

    # --- Scrolling to trigger lazy loading ---
    scan_full_page: bool = True
    scroll_step: int = 800
    scroll_delay: float = 0.5
    scroll_timeout: float = 15.0

    # --- Height stability ---
    height_stability_checks: int = 3
    height_stability_threshold: int = 10  # in pixels

    # --- Reserved for future adaptive stop logic ---
    confidence_threshold: float = 0.75
    min_gain_threshold: float = 0.1
    max_pages: int = 20
+
45
+
46
@dataclass
class CompletenessResult:
    """Snapshot of the signals gathered by a completeness check."""
    is_complete: bool
    total_images: int
    loaded_images: int
    failed_images: int
    placeholder_images: int
    height: int
    height_stable: bool
    network_idle: bool
    check_duration: float

    @property
    def image_load_ratio(self) -> float:
        """Share of counted images that loaded; 1.0 when none were counted."""
        if self.total_images != 0:
            return self.loaded_images / self.total_images
        return 1.0
+
65
+
66
@dataclass
class PageResult:
    """Everything captured while fetching one page."""
    url: str
    final_url: str
    title: str
    html: str
    content: str  # markdown extracted from the page
    images: List[str] = field(default_factory=list)  # images as base64 strings
    screenshot: Optional[str] = None

    # Quality signals
    load_time: float = 0.0
    completeness: Optional[CompletenessResult] = None

    # Error reporting
    error: Optional[str] = None

    @property
    def is_complete(self) -> bool:
        """Completeness verdict of the attached check; False when none is attached."""
        return False if self.completeness is None else self.completeness.is_complete
hyw_core/search.py CHANGED
@@ -8,7 +8,6 @@ from loguru import logger
8
8
  from .browser_control.service import get_screenshot_service
9
9
  # Search engines from browser_control subpackage
10
10
  from .browser_control.engines.duckduckgo import DuckDuckGoEngine
11
- from .browser_control.engines.google import GoogleEngine
12
11
  from .browser_control.engines.default import DefaultEngine
13
12
 
14
13
  class SearchService:
@@ -21,15 +20,14 @@ class SearchService:
21
20
  # Domain blocking
22
21
  self._blocked_domains = getattr(config, "blocked_domains", []) or []
23
22
 
24
- # Select Engine - DefaultEngine when not specified
23
+ # Select Engine - DuckDuckGo is the default and only engine
25
24
  self._engine_name = getattr(config, "search_engine", None)
26
25
  if self._engine_name:
27
26
  self._engine_name = self._engine_name.lower()
28
27
 
29
- if self._engine_name == "google":
30
- self._engine = GoogleEngine()
31
- elif self._engine_name == "default_address_bar": # Explicitly requested address bar capability if needed
32
- self._engine = DefaultEngine()
28
+ if self._engine_name == "default_address_bar":
29
+ # Explicitly requested address bar capability if needed
30
+ self._engine = DefaultEngine()
33
31
  else:
34
32
  # Default: use DuckDuckGo
35
33
  self._engine = DuckDuckGoEngine()