entari-plugin-hyw: 0.3.5 (py3-none-any.whl) → 4.0.0rc14 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of entari-plugin-hyw might be problematic.

Files changed (78)
  1. entari_plugin_hyw/Untitled-1 +1865 -0
  2. entari_plugin_hyw/__init__.py +979 -116
  3. entari_plugin_hyw/filters.py +83 -0
  4. entari_plugin_hyw/history.py +251 -0
  5. entari_plugin_hyw/misc.py +214 -0
  6. entari_plugin_hyw/search_cache.py +154 -0
  7. entari_plugin_hyw-4.0.0rc14.dist-info/METADATA +118 -0
  8. entari_plugin_hyw-4.0.0rc14.dist-info/RECORD +72 -0
  9. {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/WHEEL +1 -1
  10. {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/top_level.txt +1 -0
  11. hyw_core/__init__.py +94 -0
  12. hyw_core/agent.py +768 -0
  13. hyw_core/browser_control/__init__.py +63 -0
  14. hyw_core/browser_control/assets/card-dist/index.html +425 -0
  15. hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +1 -0
  16. hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +9 -0
  17. hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
  18. hyw_core/browser_control/assets/card-dist/logos/gemini.svg +1 -0
  19. hyw_core/browser_control/assets/card-dist/logos/google.svg +1 -0
  20. hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
  21. hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
  22. hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +15 -0
  23. hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
  24. hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
  25. hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
  26. hyw_core/browser_control/assets/card-dist/logos/openai.svg +1 -0
  27. hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
  28. hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +24 -0
  29. hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
  30. hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
  31. hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
  32. hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
  33. hyw_core/browser_control/assets/card-dist/vite.svg +1 -0
  34. hyw_core/browser_control/assets/index.html +5691 -0
  35. hyw_core/browser_control/assets/logos/anthropic.svg +1 -0
  36. hyw_core/browser_control/assets/logos/cerebras.svg +9 -0
  37. hyw_core/browser_control/assets/logos/deepseek.png +0 -0
  38. hyw_core/browser_control/assets/logos/gemini.svg +1 -0
  39. hyw_core/browser_control/assets/logos/google.svg +1 -0
  40. hyw_core/browser_control/assets/logos/grok.png +0 -0
  41. hyw_core/browser_control/assets/logos/huggingface.png +0 -0
  42. hyw_core/browser_control/assets/logos/microsoft.svg +15 -0
  43. hyw_core/browser_control/assets/logos/minimax.png +0 -0
  44. hyw_core/browser_control/assets/logos/mistral.png +0 -0
  45. hyw_core/browser_control/assets/logos/nvida.png +0 -0
  46. hyw_core/browser_control/assets/logos/openai.svg +1 -0
  47. hyw_core/browser_control/assets/logos/openrouter.png +0 -0
  48. hyw_core/browser_control/assets/logos/perplexity.svg +24 -0
  49. hyw_core/browser_control/assets/logos/qwen.png +0 -0
  50. hyw_core/browser_control/assets/logos/xai.png +0 -0
  51. hyw_core/browser_control/assets/logos/xiaomi.png +0 -0
  52. hyw_core/browser_control/assets/logos/zai.png +0 -0
  53. hyw_core/browser_control/engines/__init__.py +15 -0
  54. hyw_core/browser_control/engines/base.py +13 -0
  55. hyw_core/browser_control/engines/default.py +166 -0
  56. hyw_core/browser_control/engines/duckduckgo.py +171 -0
  57. hyw_core/browser_control/landing.html +172 -0
  58. hyw_core/browser_control/manager.py +173 -0
  59. hyw_core/browser_control/renderer.py +446 -0
  60. hyw_core/browser_control/service.py +940 -0
  61. hyw_core/config.py +154 -0
  62. hyw_core/core.py +462 -0
  63. hyw_core/crawling/__init__.py +18 -0
  64. hyw_core/crawling/completeness.py +437 -0
  65. hyw_core/crawling/models.py +88 -0
  66. hyw_core/definitions.py +104 -0
  67. hyw_core/image_cache.py +274 -0
  68. hyw_core/pipeline.py +502 -0
  69. hyw_core/search.py +171 -0
  70. hyw_core/stages/__init__.py +21 -0
  71. hyw_core/stages/base.py +95 -0
  72. hyw_core/stages/summary.py +191 -0
  73. entari_plugin_hyw/agent.py +0 -419
  74. entari_plugin_hyw/compressor.py +0 -59
  75. entari_plugin_hyw/tools.py +0 -236
  76. entari_plugin_hyw/vision.py +0 -35
  77. entari_plugin_hyw-0.3.5.dist-info/METADATA +0 -112
  78. entari_plugin_hyw-0.3.5.dist-info/RECORD +0 -9
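
The bulk of the new code lands in hyw_core/browser_control/service.py (+940 lines), reproduced in full below. As an orientation aid, here is a minimal sketch of how the module's public entry point might be driven, assuming only the names visible in the diff that follows (the call pattern itself is not documented by the package):

    import asyncio
    from hyw_core.browser_control.service import get_screenshot_service

    async def main():
        # Lazily creates the process-wide ScreenshotService singleton
        svc = get_screenshot_service(headless=True)
        # fetch_page() returns a dict with "content", "html", "title", "url",
        # "raw_screenshot_b64" and "images" keys
        result = await svc.fetch_page("https://example.com", include_screenshot=False)
        print(result["title"], len(result["content"]))
        await svc.close()

    asyncio.run(main())
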
--- /dev/null
+++ b/hyw_core/browser_control/service.py
@@ -0,0 +1,940 @@
+ """
+ Browser Service (DrissionPage)
+
+ Provides page fetching and screenshot capabilities using DrissionPage.
+ """
+
+ import asyncio
+ import base64
+ import threading
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import Optional, Dict, Any, List
+ from loguru import logger
+ import trafilatura
+
+ # Import intelligent completeness checker
+ from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
+ from ..crawling.models import CrawlConfig
+
+ class ScreenshotService:
+     """
+     Browser Service using DrissionPage.
+     """
+
+     def __init__(self, headless: bool = True, auto_start: bool = True):
+         self.headless = headless
+         self._manager = None
+         self._executor = ThreadPoolExecutor(max_workers=10)
+
+         if auto_start:
+             self._ensure_ready()
+
+     def _get_tab(self, url: str) -> Any:
+         """Create a new tab and navigate to URL."""
+         self._ensure_ready()
+         return self._manager.new_tab(url)
+
+     def _release_tab(self, tab: Any):
+         """Close tab after use."""
+         if not tab:
+             return
+         try:
+             tab.close()
+         except Exception:
+             pass
+
+     async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
+         """
+         Execute concurrent searches using page inputs.
+         """
+         loop = asyncio.get_running_loop()
+         return await loop.run_in_executor(
+             self._executor,
+             self._search_via_page_input_batch_sync,
+             queries, url, selector
+         )
+
+     def _search_via_page_input_batch_sync(self, queries: List[str], url: str, selector: str) -> List[Dict[str, Any]]:
+         """Sync batch execution - create tabs sequentially, search in parallel."""
+         results = [None] * len(queries)
+         tabs = []
+
+         # Phase 1: Get/create tabs SEQUENTIALLY (DrissionPage isn't thread-safe for new_tab)
+         target_url = url or "https://www.google.com"
+         logger.info(f"ScreenshotService: Acquiring {len(queries)} tabs for parallel search...")
+
+         for i in range(len(queries)):
+             tab = None
+             # Try to get from pool first (using shared logic now)
+             try:
+                 tab = self._get_tab(target_url)
+             except Exception as e:
+                 logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")
+             tabs.append(tab)
+
+         logger.info(f"ScreenshotService: {len(tabs)} tabs ready, starting parallel searches...")
+
+         # Phase 2: Execute searches in PARALLEL
+         def run_search(index, tab, query):
+             if tab is None:
+                 # Tab creation failed in Phase 1; report instead of crashing the worker
+                 results[index] = {"content": "Error: tab unavailable", "title": "Error", "url": "", "html": ""}
+                 return
+             try:
+                 logger.debug(f"Search[{index}]: Starting for '{query}' on {tab.url}")
+
+                 # Wait for page to be ready first
+                 try:
+                     tab.wait.doc_loaded(timeout=10)
+                 except Exception:
+                     pass
+
+                 # Find input element with wait
+                 logger.debug(f"Search[{index}]: Looking for input with selector '{selector}'")
+                 ele = tab.ele(selector, timeout=5)
+                 if not ele:
+                     logger.debug(f"Search[{index}]: Primary selector failed, trying fallbacks")
+                     for fallback in ["textarea[name='q']", "#APjFqb", "input[name='q']", "input[type='text']"]:
+                         ele = tab.ele(fallback, timeout=2)
+                         if ele:
+                             logger.debug(f"Search[{index}]: Found input with fallback '{fallback}'")
+                             break
+
+                 if not ele:
+                     logger.error(f"Search[{index}]: No input element found on {tab.url}!")
+                     results[index] = {"content": "Error: input not found", "title": "Error", "url": tab.url, "html": tab.html[:5000]}
+                     return
+
+                 logger.debug(f"Search[{index}]: Typing query...")
+                 ele.input(query)
+
+                 logger.debug(f"Search[{index}]: Pressing Enter...")
+                 tab.actions.key_down('enter').key_up('enter')
+
+                 logger.debug(f"Search[{index}]: Waiting for search results...")
+                 tab.wait.doc_loaded(timeout=10)
+                 # Reduced settle wait for extraction
+                 time.sleep(0.1)
+
+                 logger.debug(f"Search[{index}]: Extracting content...")
+                 html = tab.html
+                 content = trafilatura.extract(
+                     html, include_links=True, include_images=True, include_comments=False,
+                     include_tables=True, favor_precision=False, output_format="markdown"
+                 ) or ""
+
+                 logger.info(f"ScreenshotService: Search '{query}' completed -> {tab.url}")
+
+                 results[index] = {
+                     "content": content,
+                     "html": html,
+                     "title": tab.title,
+                     "url": tab.url,
+                     "images": []
+                 }
+
+             except Exception as e:
+                 logger.error(f"ScreenshotService: Search error for '{query}': {e}")
+                 results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
+             finally:
+                 self._release_tab(tab)
+
+         threads = []
+         for i, (tab, query) in enumerate(zip(tabs, queries)):
+             t = threading.Thread(target=run_search, args=(i, tab, query))
+             t.start()
+             threads.append(t)
+
+         for t in threads:
+             t.join()
+
+         return results
+
+     def _ensure_ready(self):
+         """Ensure shared browser is ready."""
+         from .manager import get_shared_browser_manager
+         self._manager = get_shared_browser_manager(headless=self.headless)
+
+     async def fetch_page(self, url: str, timeout: float = 10.0, include_screenshot: bool = True) -> Dict[str, Any]:
+         """
+         Fetch page content (and optionally screenshot).
+         Runs in a thread executor to avoid blocking the async loop.
+         """
+         loop = asyncio.get_running_loop()
+         return await loop.run_in_executor(
+             self._executor,
+             self._fetch_page_sync,
+             url,
+             timeout,
+             include_screenshot
+         )
+
+     async def search_via_address_bar(self, query: str, timeout: float = 20.0) -> Dict[str, Any]:
+         """
+         Search using the browser's address bar (uses the browser's default search engine).
+         Simulates: Ctrl+L (focus address bar) -> type query -> Enter.
+         """
+         loop = asyncio.get_running_loop()
+         return await loop.run_in_executor(
+             self._executor,
+             self._search_via_address_bar_sync,
+             query,
+             timeout
+         )
+
+     def _search_via_address_bar_sync(self, query: str, timeout: float) -> Dict[str, Any]:
+         """Synchronous address bar search logic."""
+         if not query:
+             return {"content": "Error: missing query", "title": "Error", "url": "", "html": ""}
+
+         tab = None
+         try:
+             self._ensure_ready()
+             page = self._manager.page
+             if not page:
+                 return {"content": "Error: Browser not available", "title": "Error", "url": "", "html": ""}
+
+             # Open new blank tab
+             tab = page.new_tab()
+
+             # Focus address bar with Ctrl+L (or Cmd+L on Mac)
+             import platform
+             if platform.system() == "Darwin":
+                 tab.actions.key_down('cmd').key_down('l').key_up('l').key_up('cmd')
+             else:
+                 tab.actions.key_down('ctrl').key_down('l').key_up('l').key_up('ctrl')
+
+             # Small delay for the address bar to focus
+             time.sleep(0.05)
+
+             # Type the query
+             tab.actions.type(query)
+
+             # Press Enter to search
+             tab.actions.key_down('enter').key_up('enter')
+
+             # Wait for the page to load
+             try:
+                 tab.wait.doc_loaded(timeout=timeout)
+                 # Reduced wait for initial results
+                 time.sleep(0.2)
+             except Exception:
+                 pass
+
+             html = tab.html
+             title = tab.title
+             final_url = tab.url
+
+             # Extract content
+             content = trafilatura.extract(
+                 html, include_links=True, include_images=True, include_comments=False,
+                 include_tables=True, favor_precision=False, output_format="markdown"
+             ) or ""
+
+             logger.info(f"ScreenshotService: Address bar search completed -> {final_url}")
+
+             return {
+                 "content": content,
+                 "html": html,
+                 "title": title,
+                 "url": final_url,
+                 "images": []
+             }
+
+         except Exception as e:
+             logger.error(f"ScreenshotService: Address bar search failed: {e}")
+             return {"content": f"Error: search failed ({e})", "title": "Error", "url": "", "html": ""}
+         finally:
+             if tab:
+                 try:
+                     tab.close()
+                 except Exception:
+                     pass
+
+     def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
+         """
+         Scroll down gradually to trigger lazy loading.
+
+         Args:
+             delay: Max wait time per scroll step (seconds) if images aren't loading.
+         """
+         start = time.time()
+         current_pos = 0
+         try:
+             while time.time() - start < timeout:
+                 # Scroll down
+                 current_pos += step
+                 tab.run_js(f"window.scrollTo(0, {current_pos});")
+
+                 # Active wait: check if images in the viewport are loaded.
+                 # Poll every 100ms, up to 'delay' seconds.
+                 wait_start = time.time()
+                 while time.time() - wait_start < delay:
+                     all_loaded = tab.run_js("""
+                         return (async () => {
+                             const imgs = Array.from(document.querySelectorAll('img'));
+                             const viewportHeight = window.innerHeight;
+
+                             // 1. Identify images currently in viewport
+                             const visibleImgs = imgs.filter(img => {
+                                 const rect = img.getBoundingClientRect();
+                                 return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
+                             });
+
+                             if (visibleImgs.length === 0) return true;
+
+                             // 2. Check loading status using decode() AND a heuristic for placeholders.
+                             // Some sites load a tiny blurred placeholder first.
+                             const checks = visibleImgs.map(img => {
+                                 // Enhanced placeholder detection (matching completeness.py)
+                                 const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
+                                                 img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
+                                 const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
+                                 const loadingAttr = img.getAttribute('loading') || '';
+                                 const src = img.src || '';
+
+                                 const isPlaceholder = (
+                                     // data-src not yet loaded
+                                     (dataSrc && img.src !== dataSrc) ||
+                                     // Natural size much smaller than display (blurred placeholder)
+                                     (img.naturalWidth < 50 && img.clientWidth > 100) ||
+                                     (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
+                                     // Lazy-loading class indicators
+                                     className.includes('lazy') ||
+                                     className.includes('lazyload') ||
+                                     className.includes('lozad') ||
+                                     // CSS blur filter applied
+                                     (window.getComputedStyle(img).filter || '').includes('blur') ||
+                                     // loading="lazy" + not complete
+                                     (loadingAttr === 'lazy' && !img.complete)
+                                 );
+
+                                 if (isPlaceholder) {
+                                     // If it looks like a placeholder, report it as not loaded
+                                     // unless it stays like this for too long (handled by outer timeout)
+                                     return Promise.resolve(false);
+                                 }
+
+                                 if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);
+
+                                 return img.decode().then(() => true).catch(() => false);
+                             });
+
+                             // Race against a small timeout to avoid hanging on one broken image
+                             const allDecoded = Promise.all(checks);
+                             const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));
+
+                             // If any check returned false (placeholder or not decoded), the result is false
+                             return Promise.race([allDecoded, timeout]).then(results => {
+                                 if (!Array.isArray(results)) return results === true;
+                                 return results.every(res => res === true);
+                             });
+                         })();
+                     """)
+                     if all_loaded:
+                         break
+                     time.sleep(0.1)
+
+                 # Check if we reached the bottom
+                 height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
+                 if current_pos >= height:
+                     break
+
+             # Ensure final layout settle
+             time.sleep(0.2)
+
+         except Exception as e:
+             logger.warning(f"ScreenshotService: Scroll failed: {e}")
+
+     def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
+         """Synchronous fetch logic."""
+         if not url:
+             return {"content": "Error: missing url", "title": "Error", "url": ""}
+
+         tab = None
+         try:
+             self._ensure_ready()
+             page = self._manager.page
+             if not page:
+                 return {"content": "Error: Browser not available", "title": "Error", "url": url}
+
+             # Get from pool
+             tab = self._get_tab(url)
+
+             # Wait logic - optimized for search pages
+             is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
+             if is_search_page:
+                 # Optimized waiting: rapidly poll until actual results appear
+                 start_time = time.time()
+
+                 # Special fast path for DDG Lite (static HTML, no JS rendering needed)
+                 if 'lite.duckduckgo' in url:
+                     # Just wait for the body; it's static HTML
+                     try:
+                         tab.wait.doc_loaded(timeout=timeout)
+                     except Exception:
+                         pass
+                     # Sleep a tiny bit to ensure render
+                     time.sleep(0.5)
+                 else:
+                     while time.time() - start_time < timeout:
+                         found_results = False
+                         try:
+                             if 'google' in url.lower():
+                                 # Check for result items (.g, .MjjYud) or the main container (#search).
+                                 # Minimal per-check timeouts allow fast looping.
+                                 if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
+                                     found_results = True
+                             elif 'bing' in url.lower():
+                                 if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
+                                     found_results = True
+                             elif 'duckduckgo' in url.lower():
+                                 if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
+                                     found_results = True
+                             else:
+                                 # Generic fallback: wait for the body to be populated
+                                 if tab.ele('body', timeout=0.1):
+                                     found_results = True
+                         except Exception:
+                             pass
+
+                         if found_results:
+                             break
+                         time.sleep(0.05)  # Fast polling (50ms)
+             else:
+                 # 1. Wait for document to settle (fast dynamic wait)
+                 try:
+                     tab.wait.doc_loaded(timeout=timeout)
+                 except Exception:
+                     pass
+
+             html = tab.html
+             title = tab.title
+             final_url = tab.url
+
+             raw_screenshot_b64 = None
+             if include_screenshot:
+                 try:
+                     # Best-effort scrollbar hiding
+                     from .manager import SharedBrowserManager
+                     SharedBrowserManager.hide_scrollbars(tab)
+
+                     # Inject CSS
+                     tab.run_js("""
+                         const style = document.createElement('style');
+                         style.textContent = `
+                             ::-webkit-scrollbar { display: none !important; }
+                             html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
+                         `;
+                         document.head.appendChild(style);
+                         document.documentElement.style.overflow = 'hidden';
+                         document.body.style.overflow = 'hidden';
+                     """)
+
+                     raw_screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
+                 except Exception as e:
+                     logger.warning(f"ScreenshotService: Failed to capture screenshot: {e}")
+
+             # Extract content
+             content = trafilatura.extract(
+                 html, include_links=True, include_images=True, include_comments=False,
+                 include_tables=True, favor_precision=False, output_format="markdown"
+             ) or ""
+
+             # 2. Extract images via parallelized JS (gallery).
+             # Strategy: for search pages, use Canvas to grab already-loaded images (instant);
+             # for other pages, fall back to fetch (more robust for lazy load).
+             images_b64 = []
+             try:
+                 js_code = """
+                     (async () => {
+                         const blocklist = ['logo', 'icon', 'avatar', 'ad', 'pixel', 'tracker', 'button', 'menu', 'nav'];
+                         const candidates = Array.from(document.querySelectorAll('img'));
+                         const validImages = [];
+
+                         // Helper: get base64 from a loaded image via Canvas
+                         const getBase64 = (img) => {
+                             try {
+                                 const canvas = document.createElement('canvas');
+                                 canvas.width = img.naturalWidth;
+                                 canvas.height = img.naturalHeight;
+                                 const ctx = canvas.getContext('2d');
+                                 ctx.drawImage(img, 0, 0);
+                                 return canvas.toDataURL('image/jpeg').split(',')[1];
+                             } catch(e) { return null; }
+                         };
+
+                         for (const img of candidates) {
+                             if (validImages.length >= 8) break;
+
+                             if (img.naturalWidth < 100 || img.naturalHeight < 80) continue;
+
+                             const alt = (img.alt || '').toLowerCase();
+                             const cls = (typeof img.className === 'string' ? img.className : '').toLowerCase();
+                             const src = (img.src || '').toLowerCase();
+
+                             if (blocklist.some(b => alt.includes(b) || cls.includes(b) || src.includes(b))) continue;
+
+                             // 1. Try Canvas (instant for loaded images)
+                             if (img.complete && img.naturalHeight > 0) {
+                                 const b64 = getBase64(img);
+                                 if (b64) {
+                                     validImages.push(b64);
+                                     continue;
+                                 }
+                             }
+
+                             // 2. Fall back to fetch (skipped on search pages to ensure speed)
+                             if (!window.location.href.includes('google') && !window.location.href.includes('search')) {
+                                 try {
+                                     const controller = new AbortController();
+                                     const id = setTimeout(() => controller.abort(), 2000);
+                                     const resp = await fetch(img.src, { signal: controller.signal });
+                                     clearTimeout(id);
+                                     const blob = await resp.blob();
+                                     const b64 = await new Promise(resolve => {
+                                         const reader = new FileReader();
+                                         reader.onloadend = () => resolve(reader.result.split(',')[1]);
+                                         reader.onerror = () => resolve(null);
+                                         reader.readAsDataURL(blob);
+                                     });
+                                     if (b64) validImages.push(b64);
+                                 } catch(e) {}
+                             }
+                         }
+                         return validImages;
+                     })()
+                 """
+                 images_b64 = tab.run_js(js_code, as_expr=True) or []
+
+                 if images_b64:
+                     logger.info(f"ScreenshotService: Extracted {len(images_b64)} images for {url}")
+
+             except Exception as e:
+                 logger.warning(f"ScreenshotService: Image extraction failed: {e}")
+
+             return {
+                 "content": content,
+                 "html": html,
+                 "title": title,
+                 "url": final_url,
+                 "raw_screenshot_b64": raw_screenshot_b64,
+                 "images": images_b64
+             }
+
+         except Exception as e:
+             logger.error(f"ScreenshotService: Failed to fetch {url}: {e}")
+             return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
+         finally:
+             if tab:
+                 self._release_tab(tab)
+
+     async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
+         """Fetch multiple pages concurrently."""
+         if not urls:
+             return []
+         logger.info(f"ScreenshotService: Batch fetching {len(urls)} URLs (screenshots={include_screenshot})")
+         tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
+         return await asyncio.gather(*tasks, return_exceptions=True)
+
+     async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
+         """Take screenshots of multiple URLs concurrently."""
+         if not urls:
+             return []
+         logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
+         tasks = [self.screenshot_url(url, timeout=timeout, full_page=full_page) for url in urls]
+         return await asyncio.gather(*tasks, return_exceptions=True)
+
+     async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
+         """Screenshot URL (async wrapper for sync). Returns base64 string only."""
+         loop = asyncio.get_running_loop()
+         result = await loop.run_in_executor(
+             self._executor,
+             self._screenshot_sync,
+             url, wait_load, timeout, full_page, quality, False  # extract_content=False
+         )
+         # Backward compatible: return just the screenshot for old callers
+         if isinstance(result, dict):
+             return result.get("screenshot_b64")
+         return result
+
+     async def screenshot_with_content(self, url: str, timeout: float = 15.0, max_content_length: int = 8000) -> Dict[str, Any]:
+         """
+         Screenshot URL and extract page content.
+
+         Returns:
+             Dict with:
+             - screenshot_b64: base64 encoded screenshot
+             - content: trafilatura extracted text (truncated to max_content_length)
+             - title: page title
+             - url: final URL
+         """
+         loop = asyncio.get_running_loop()
+         result = await loop.run_in_executor(
+             self._executor,
+             self._screenshot_sync,
+             url, True, timeout, False, 65, True  # quality=65 for balance, extract_content=True
+         )
+
+         if not isinstance(result, dict):
+             return {"screenshot_b64": result, "content": "", "title": "", "url": url}
+
+         # Truncate content if needed
+         content = result.get("content", "") or ""
+         if len(content) > max_content_length:
+             content = content[:max_content_length] + "\n\n[Content truncated...]"
+         result["content"] = content
+
+         return result
+
+     def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int, extract_content: bool = False) -> Any:
+         """Synchronous screenshot. If extract_content=True, returns a Dict, else a str."""
+         if not url:
+             return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
+         tab = None
+         capture_width = 1440  # Higher resolution for readability
+
+         try:
+             self._ensure_ready()
+             page = self._manager.page
+             if not page:
+                 return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
+
+             # Create blank tab first
+             tab = page.new_tab()
+
+             # Set viewport BEFORE navigation so the page renders at target width from the start.
+             # This eliminates the need for post-load resize and reflow.
+             try:
+                 tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                             width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
+             except Exception:
+                 pass
+
+             # Now navigate to the URL - the page will render at target width
+             tab.get(url)
+
+             # Network monitoring removed - not needed for simplified wait logic
+
+             # Initialize crawl config for completeness checking
+             crawl_config = CrawlConfig(
+                 scan_full_page=True,
+                 scroll_step=800,
+                 scroll_delay=0.5,
+                 scroll_timeout=20.0,      # Increased for lazy-loading pages
+                 image_load_timeout=3.0,   # Image loading timeout: 3 seconds
+                 image_stability_checks=3,
+             )
+
+             # === Start scrolling immediately to trigger lazy loading ===
+             # Scroll first, then wait for DOM - this allows lazy loading to start in parallel
+             logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
+             trigger_lazy_load(tab, crawl_config)
+
+             # Now wait for the DOM to be ready (after scroll has triggered lazy loading)
+             try:
+                 # Wait for full page load (including JS execution)
+                 try:
+                     tab.wait.doc_loaded(timeout=timeout)
+                 except Exception:
+                     pass
+
+                 # Wait for actual content to appear (for CDN verification pages).
+                 # Simplified wait logic for screenshots: just check basic readiness.
+                 for i in range(20):  # Max 20 iterations (~1s)
+                     try:
+                         state = tab.run_js('''
+                             return {
+                                 ready: document.readyState === 'complete',
+                                 title: document.title,
+                                 text: document.body.innerText.substring(0, 500) || ""
+                             };
+                         ''') or {'ready': False, 'title': "", 'text': ""}
+
+                         is_ready = state.get('ready', False)
+                         title = state.get('title', "").lower()
+                         text_lower = state.get('text', "").lower()
+                         text_len = len(text_lower)
+
+                         # Check for verification pages
+                         is_verification = "checking your browser" in text_lower or \
+                                           "just a moment" in text_lower or \
+                                           "please wait" in text_lower or \
+                                           "security check" in title or \
+                                           "just a moment" in title or \
+                                           "loading..." in title
+
+                         # Basic content check
+                         has_content = text_len > 100
+
+                         # Pass if ready, not a verification page, and has content
+                         if is_ready and not is_verification and has_content:
+                             logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
+                             break
+
+                         time.sleep(0.05)
+                     except Exception:
+                         time.sleep(0.05)
+                         continue
+
+                 # DEBUG: Save HTML to inspect what happened (in data dir)
+                 try:
+                     import os
+                     log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
+                     with open(log_path, "w", encoding="utf-8") as f:
+                         f.write(f"<!-- URL: {url} -->\n")
+                         f.write(tab.html)
+                 except Exception:
+                     pass
+
+             except Exception as e:
+                 logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
+
+             # Hide scrollbars first (before any height calculation)
+             from .manager import SharedBrowserManager
+             SharedBrowserManager.hide_scrollbars(tab)
+
+             # Scroll back to top
+             tab.run_js("window.scrollTo(0, 0);")
+
+             # Image loading monitoring with time tracking - DISABLED.
+             # No longer waiting for images to load; kept for reference.
+             # image_tracking_js = """
+             # (() => {
+             #     if (!window._imageLoadTracker) {
+             #         window._imageLoadTracker = {
+             #             startTime: Date.now(),
+             #             images: new Map()
+             #         };
+             #
+             #         const imgs = Array.from(document.querySelectorAll('img'));
+             #         const minSize = 50;
+             #
+             #         imgs.forEach((img, idx) => {
+             #             if (img.clientWidth < minSize && img.clientHeight < minSize) return;
+             #
+             #             const src = img.src || img.getAttribute('data-src') || '';
+             #             const key = `${idx}_${src.substring(0, 100)}`;
+             #
+             #             if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
+             #                 // Already loaded
+             #                 window._imageLoadTracker.images.set(key, {
+             #                     src: src.substring(0, 150),
+             #                     status: 'loaded',
+             #                     loadTime: 0, // Already loaded before tracking
+             #                     naturalSize: [img.naturalWidth, img.naturalHeight],
+             #                     displaySize: [img.clientWidth, img.clientHeight]
+             #                 });
+             #             } else {
+             #                 // Track loading
+             #                 window._imageLoadTracker.images.set(key, {
+             #                     src: src.substring(0, 150),
+             #                     status: 'pending',
+             #                     startTime: Date.now(),
+             #                     naturalSize: [img.naturalWidth, img.naturalHeight],
+             #                     displaySize: [img.clientWidth, img.clientHeight]
+             #                 });
+             #
+             #                 // Add load event listener
+             #                 img.addEventListener('load', () => {
+             #                     const entry = window._imageLoadTracker.images.get(key);
+             #                     if (entry && entry.status === 'pending') {
+             #                         entry.status = 'loaded';
+             #                         entry.loadTime = Date.now() - entry.startTime;
+             #                     }
+             #                 });
+             #
+             #                 img.addEventListener('error', () => {
+             #                     const entry = window._imageLoadTracker.images.get(key);
+             #                     if (entry && entry.status === 'pending') {
+             #                         entry.status = 'failed';
+             #                         entry.loadTime = Date.now() - entry.startTime;
+             #                     }
+             #                 });
+             #             }
+             #         });
+             #     }
+             #
+             #     // Return current status
+             #     const results = [];
+             #     window._imageLoadTracker.images.forEach((value, key) => {
+             #         const entry = {
+             #             src: value.src,
+             #             status: value.status,
+             #             loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
+             #             naturalSize: value.naturalSize,
+             #             displaySize: value.displaySize
+             #         };
+             #         results.push(entry);
+             #     });
+             #
+             #     return {
+             #         total: results.length,
+             #         loaded: results.filter(r => r.status === 'loaded').length,
+             #         pending: results.filter(r => r.status === 'pending').length,
+             #         failed: results.filter(r => r.status === 'failed').length,
+             #         details: results
+             #     };
+             # })()
+             # """
+             #
+             # # Initialize tracking
+             # tab.run_js(image_tracking_js)
+             #
+             # # Monitor image loading with dynamic stop logic
+             # check_interval = 0.2  # Check every 200ms
+             # image_timeout = 3.0   # Image loading timeout: 3 seconds
+             # monitoring_start = time.time()
+             # loaded_times = []     # Track load times of completed images
+             #
+             # logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
+             #
+             # while True:
+             #     elapsed = time.time() - monitoring_start
+             #
+             #     # Check timeout first
+             #     if elapsed >= image_timeout:
+             #         logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
+             #         break
+             #
+             #     # Get current image status
+             #     status = tab.run_js(image_tracking_js, as_expr=True) or {
+             #         'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
+             #     }
+             #
+             #     # Log each image's status and load time
+             #     for img_detail in status.get('details', []):
+             #         src_short = img_detail.get('src', '')[:80]
+             #         status_str = img_detail.get('status', 'unknown')
+             #         load_time = img_detail.get('loadTime', 0)
+             #         logger.info(
+             #             f"ScreenshotService: Image [{status_str}] "
+             #             f"loadTime={load_time:.0f}ms "
+             #             f"src={src_short}"
+             #         )
+             #
+             #     # Collect load times of completed images
+             #     loaded_times = [
+             #         img.get('loadTime', 0)
+             #         for img in status.get('details', [])
+             #         if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
+             #     ]
+             #
+             #     pending_count = status.get('pending', 0)
+             #     loaded_count = status.get('loaded', 0)
+             #
+             #     # Check stop conditions
+             #     if pending_count == 0:
+             #         logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
+             #         break
+             #
+             #     # Check dynamic stop condition (if we have loaded images to calculate an average)
+             #     if loaded_times:
+             #         avg_load_time = sum(loaded_times) / len(loaded_times)
+             #         max_wait_time = avg_load_time * 2
+             #
+             #         # Check if any pending image has exceeded the max wait time
+             #         pending_images = [
+             #             img for img in status.get('details', [])
+             #             if img.get('status') == 'pending'
+             #         ]
+             #
+             #         should_stop = False
+             #         for pending_img in pending_images:
+             #             wait_time = pending_img.get('loadTime', 0)
+             #             if wait_time >= max_wait_time:
+             #                 should_stop = True
+             #                 logger.info(
+             #                     f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
+             #                     f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
+             #                 )
+             #                 break
+             #
+             #         if should_stop:
+             #             break
+             #
+             #     # Wait before next check
+             #     time.sleep(check_interval)
+
+             # Now calculate the final height ONCE after all content loaded.
+             # CompletenessChecker already verified height stability.
+             try:
+                 final_height = tab.run_js('''
+                     return Math.max(
+                         document.body.scrollHeight || 0,
+                         document.documentElement.scrollHeight || 0,
+                         document.body.offsetHeight || 0,
+                         document.documentElement.offsetHeight || 0
+                     );
+                 ''')
+                 h = min(int(final_height) + 50, 15000)
+                 tab.run_cdp('Emulation.setDeviceMetricsOverride',
+                             width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
+             except Exception:
+                 pass
+
+             # Final scroll to top
+             tab.run_js("window.scrollTo(0, 0);")
+
+             # Capture screenshot
+             screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
+
+             # Extract content if requested
+             if extract_content:
+                 try:
+                     html = tab.html
+                     title = tab.title
+                     final_url = tab.url
+
+                     # Minimal trafilatura settings to reduce token consumption
+                     content = trafilatura.extract(
+                         html,
+                         include_links=False,     # No links to reduce tokens
+                         include_images=False,    # No image descriptions
+                         include_comments=False,  # No comments
+                         include_tables=False,    # No tables (can be verbose)
+                         favor_precision=True,    # Favor precision over recall
+                         output_format="txt"      # Plain text (no markdown formatting)
+                     ) or ""
+
+                     return {
+                         "screenshot_b64": screenshot_b64,
+                         "content": content,
+                         "title": title,
+                         "url": final_url
+                     }
+                 except Exception as e:
+                     logger.warning(f"ScreenshotService: Content extraction failed: {e}")
+                     return {"screenshot_b64": screenshot_b64, "content": "", "title": "", "url": url}
+
+             return screenshot_b64
+
+         except Exception as e:
+             logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
+             return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
+         finally:
+             if tab:
+                 try:
+                     tab.close()
+                 except Exception:
+                     pass
+
+     async def close(self):
+         self._executor.shutdown(wait=False)
+         logger.info("ScreenshotService: Closed.")
+
+     async def close_async(self):
+         await self.close()
+
+ # Singleton
+ _screenshot_service: Optional[ScreenshotService] = None
+
+ def get_screenshot_service(headless: bool = True) -> ScreenshotService:
+     global _screenshot_service
+     if _screenshot_service is None:
+         _screenshot_service = ScreenshotService(headless=headless, auto_start=True)
+     return _screenshot_service
+
+ async def close_screenshot_service():
+     global _screenshot_service
+     if _screenshot_service:
+         await _screenshot_service.close()
+         _screenshot_service = None
+
+ def prestart_browser(headless: bool = True):
+     get_screenshot_service(headless=headless)
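
For completeness, a sketch of the batch search path defined above. The engine URL and input selector are illustrative assumptions (the service itself falls back to common Google selectors such as textarea[name='q'] when the primary selector fails):

    import asyncio
    from hyw_core.browser_control.service import get_screenshot_service

    async def demo():
        svc = get_screenshot_service(headless=True)
        # Tabs are created sequentially (new_tab is not thread-safe),
        # then each query runs in its own thread.
        results = await svc.search_via_page_input_batch(
            ["python asyncio", "drissionpage"],
            url="https://www.google.com",   # assumed engine start page
            selector="textarea[name='q']",  # assumed query-box selector
        )
        for r in results:
            print(r["url"], (r["content"] or "")[:80])
        await svc.close()

    asyncio.run(demo())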