entari-plugin-hyw 4.0.0rc17__py3-none-any.whl → 4.0.0rc19__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of entari-plugin-hyw might be problematic.

Files changed (55)
  1. entari_plugin_hyw-4.0.0rc19.dist-info/METADATA +26 -0
  2. entari_plugin_hyw-4.0.0rc19.dist-info/RECORD +4 -0
  3. entari_plugin_hyw-4.0.0rc19.dist-info/top_level.txt +1 -0
  4. entari_plugin_hyw/__init__.py +0 -914
  5. entari_plugin_hyw/filters.py +0 -83
  6. entari_plugin_hyw/history.py +0 -251
  7. entari_plugin_hyw/misc.py +0 -214
  8. entari_plugin_hyw/search_cache.py +0 -253
  9. entari_plugin_hyw-4.0.0rc17.dist-info/METADATA +0 -119
  10. entari_plugin_hyw-4.0.0rc17.dist-info/RECORD +0 -52
  11. entari_plugin_hyw-4.0.0rc17.dist-info/top_level.txt +0 -2
  12. hyw_core/__init__.py +0 -94
  13. hyw_core/agent.py +0 -876
  14. hyw_core/browser_control/__init__.py +0 -63
  15. hyw_core/browser_control/assets/card-dist/index.html +0 -429
  16. hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +0 -1
  17. hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +0 -9
  18. hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
  19. hyw_core/browser_control/assets/card-dist/logos/gemini.svg +0 -1
  20. hyw_core/browser_control/assets/card-dist/logos/google.svg +0 -1
  21. hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
  22. hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
  23. hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +0 -15
  24. hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
  25. hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
  26. hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
  27. hyw_core/browser_control/assets/card-dist/logos/openai.svg +0 -1
  28. hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
  29. hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +0 -24
  30. hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
  31. hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
  32. hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
  33. hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
  34. hyw_core/browser_control/assets/card-dist/vite.svg +0 -1
  35. hyw_core/browser_control/engines/__init__.py +0 -15
  36. hyw_core/browser_control/engines/base.py +0 -13
  37. hyw_core/browser_control/engines/default.py +0 -166
  38. hyw_core/browser_control/engines/duckduckgo.py +0 -171
  39. hyw_core/browser_control/landing.html +0 -172
  40. hyw_core/browser_control/manager.py +0 -173
  41. hyw_core/browser_control/renderer.py +0 -446
  42. hyw_core/browser_control/service.py +0 -1002
  43. hyw_core/config.py +0 -154
  44. hyw_core/core.py +0 -454
  45. hyw_core/crawling/__init__.py +0 -18
  46. hyw_core/crawling/completeness.py +0 -437
  47. hyw_core/crawling/models.py +0 -88
  48. hyw_core/definitions.py +0 -166
  49. hyw_core/image_cache.py +0 -274
  50. hyw_core/pipeline.py +0 -502
  51. hyw_core/search.py +0 -169
  52. hyw_core/stages/__init__.py +0 -21
  53. hyw_core/stages/base.py +0 -95
  54. hyw_core/stages/summary.py +0 -218
  55. {entari_plugin_hyw-4.0.0rc17.dist-info → entari_plugin_hyw-4.0.0rc19.dist-info}/WHEEL +0 -0
hyw_core/browser_control/service.py
@@ -1,1002 +0,0 @@
-"""
-Browser Service (DrissionPage)
-
-Provides page fetching and screenshot capabilities using DrissionPage.
-"""
-
-import asyncio
-import base64
-import threading
-import time
-from concurrent.futures import ThreadPoolExecutor
-from typing import Optional, Dict, Any, List
-from loguru import logger
-import trafilatura
-from PIL import Image
-from io import BytesIO
-
-# Import intelligent completeness checker
-from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
-from ..crawling.models import CrawlConfig
-
-class ScreenshotService:
-    """
-    Browser Service using DrissionPage.
-    """
-
-    def __init__(self, headless: bool = True, auto_start: bool = True):
-        self.headless = headless
-        self._manager = None
-        self._executor = ThreadPoolExecutor(max_workers=10)
-
-        if auto_start:
-            self._ensure_ready()
-
-    def _get_tab(self, url: str) -> Any:
-        """Create a new tab and navigate to URL."""
-        self._ensure_ready()
-        return self._manager.new_tab(url)
-
-    def _release_tab(self, tab: Any):
-        """Close tab after use."""
-        if not tab: return
-        try:
-            tab.close()
-        except:
-            pass
-
-    async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
-        """
-        Execute concurrent searches using page inputs.
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(
-            self._executor,
-            self._search_via_page_input_batch_sync,
-            queries, url, selector
-        )
-
-    def _search_via_page_input_batch_sync(self, queries: List[str], url: str, selector: str) -> List[Dict[str, Any]]:
-        """Sync batch execution - create tabs sequentially, search in parallel."""
-        results = [None] * len(queries)
-        tabs = []
-
-        # Phase 1: Get/create tabs SEQUENTIALLY (DrissionPage isn't thread-safe for new_tab)
-        target_url = url or "https://www.google.com"
-        logger.info(f"ScreenshotService: Acquiring {len(queries)} tabs for parallel search...")
-
-        for i in range(len(queries)):
-            tab = None
-            # Try to get from pool first (using shared logic now)
-            try:
-                tab = self._get_tab(target_url)
-            except Exception as e:
-                logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")
-
-
-
-            tabs.append(tab)
-
-        logger.info(f"ScreenshotService: {len(tabs)} tabs ready, starting parallel searches...")
-
-        # Phase 2: Execute searches in PARALLEL
-        def run_search(index, tab, query):
-            try:
-                logger.debug(f"Search[{index}]: Starting for '{query}' on {tab.url}")
-
-                # Wait for page to be ready first
-                try:
-                    tab.wait.doc_loaded(timeout=10)
-                except:
-                    pass
-
-                # Find input element with wait
-                logger.debug(f"Search[{index}]: Looking for input with selector '{selector}'")
-                ele = tab.ele(selector, timeout=5)
-                if not ele:
-                    logger.debug(f"Search[{index}]: Primary selector failed, trying fallbacks")
-                    for fallback in ["textarea[name='q']", "#APjFqb", "input[name='q']", "input[type='text']"]:
-                        ele = tab.ele(fallback, timeout=2)
-                        if ele:
-                            logger.debug(f"Search[{index}]: Found input with fallback '{fallback}'")
-                            break
-
-                if not ele:
-                    logger.error(f"Search[{index}]: No input element found on {tab.url}!")
-                    results[index] = {"content": "Error: input not found", "title": "Error", "url": tab.url, "html": tab.html[:5000]}
-                    return
-
-                logger.debug(f"Search[{index}]: Typing query...")
-                ele.input(query)
-
-                logger.debug(f"Search[{index}]: Pressing Enter...")
-                tab.actions.key_down('enter').key_up('enter')
-
-                logger.debug(f"Search[{index}]: Waiting for search results...")
-                tab.wait.doc_loaded(timeout=10)
-                # Reduced settle wait for extraction
-                time.sleep(0.1)
-
-                logger.debug(f"Search[{index}]: Extracting content...")
-                html = tab.html
-                content = trafilatura.extract(
-                    html, include_links=True, include_images=True, include_comments=False,
-                    include_tables=True, favor_precision=False, output_format="markdown"
-                ) or ""
-
-                logger.info(f"ScreenshotService: Search '{query}' completed -> {tab.url}")
-
-                results[index] = {
-                    "content": content,
-                    "html": html,
-                    "title": tab.title,
-                    "url": tab.url,
-                    "images": []
-                }
-
-            except Exception as e:
-                logger.error(f"ScreenshotService: Search error for '{query}': {e}")
-                results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
-            finally:
-                self._release_tab(tab)
-
-        threads = []
-        for i, (tab, query) in enumerate(zip(tabs, queries)):
-            t = threading.Thread(target=run_search, args=(i, tab, query))
-            t.start()
-            threads.append(t)
-
-        for t in threads:
-            t.join()
-
-        return results
-
-    def _ensure_ready(self):
-        """Ensure shared browser is ready."""
-        from .manager import get_shared_browser_manager
-        self._manager = get_shared_browser_manager(headless=self.headless)
-
-    async def fetch_page(self, url: str, timeout: float = 10.0, include_screenshot: bool = True) -> Dict[str, Any]:
-        """
-        Fetch page content (and optionally screenshot).
-        Runs in a thread executor to avoid blocking the async loop.
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(
-            self._executor,
-            self._fetch_page_sync,
-            url,
-            timeout,
-            include_screenshot
-        )
-
-    async def search_via_address_bar(self, query: str, timeout: float = 20.0) -> Dict[str, Any]:
-        """
-        Search using browser's address bar (uses browser's default search engine).
-        Simulates: Ctrl+L (focus address bar) -> type query -> Enter
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(
-            self._executor,
-            self._search_via_address_bar_sync,
-            query,
-            timeout
-        )
-
-    def _search_via_address_bar_sync(self, query: str, timeout: float) -> Dict[str, Any]:
-        """Synchronous address bar search logic."""
-        if not query:
-            return {"content": "Error: missing query", "title": "Error", "url": "", "html": ""}
-
-        tab = None
-        try:
-            self._ensure_ready()
-            page = self._manager.page
-            if not page:
-                return {"content": "Error: Browser not available", "title": "Error", "url": "", "html": ""}
-
-            # Open new blank tab
-            tab = page.new_tab()
-
-            # Focus address bar with Ctrl+L (or Cmd+L on Mac)
-            import platform
-            if platform.system() == "Darwin":
-                tab.actions.key_down('cmd').key_down('l').key_up('l').key_up('cmd')
-            else:
-                tab.actions.key_down('ctrl').key_down('l').key_up('l').key_up('ctrl')
-
-            # Small delay for address bar to focus
-            import time as _time
-            _time.sleep(0.05)
-
-            # Type the query
-            tab.actions.type(query)
-
-            # Press Enter to search
-            tab.actions.key_down('enter').key_up('enter')
-
-            # Wait for page to load
-            try:
-                tab.wait.doc_loaded(timeout=timeout)
-                # Reduced wait for initial results
-                _time.sleep(0.2)
-            except:
-                pass
-
-            html = tab.html
-            title = tab.title
-            final_url = tab.url
-
-            # Extract content
-            content = trafilatura.extract(
-                html, include_links=True, include_images=True, include_comments=False,
-                include_tables=True, favor_precision=False, output_format="markdown"
-            ) or ""
-
-            logger.info(f"ScreenshotService: Address bar search completed -> {final_url}")
-
-            return {
-                "content": content,
-                "html": html,
-                "title": title,
-                "url": final_url,
-                "images": []
-            }
-
-        except Exception as e:
-            logger.error(f"ScreenshotService: Address bar search failed: {e}")
-            return {"content": f"Error: search failed ({e})", "title": "Error", "url": "", "html": ""}
-        finally:
-            if tab:
-                try: tab.close()
-                except: pass
-
-    def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
-        """
-        Scroll down gradually to trigger lazy loading.
-
-        Args:
-            delay: Max wait time per scroll step (seconds) if images aren't loading.
-        """
-        import time
-        start = time.time()
-        current_pos = 0
-        try:
-            while time.time() - start < timeout:
-                # Scroll down
-                current_pos += step
-                tab.run_js(f"window.scrollTo(0, {current_pos});")
-
-                # Active Wait: Check if images in viewport are loaded
-                # Poll every 100ms, up to 'delay' seconds
-                wait_start = time.time()
-                while time.time() - wait_start < delay:
-                    all_loaded = tab.run_js("""
-                        return (async () => {
-                            const imgs = Array.from(document.querySelectorAll('img'));
-                            const viewportHeight = window.innerHeight;
-
-                            // 1. Identify images currently in viewport
-                            const visibleImgs = imgs.filter(img => {
-                                const rect = img.getBoundingClientRect();
-                                return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
-                            });
-
-                            if (visibleImgs.length === 0) return true;
-
-                            // 2. Check loading status using decode() AND heuristic for placeholders
-                            // Some sites load a tiny blurred placeholder first.
-                            const checks = visibleImgs.map(img => {
-                                // Enhanced placeholder detection (matching completeness.py)
-                                const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
-                                    img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
-                                const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
-                                const loadingAttr = img.getAttribute('loading') || '';
-                                const src = img.src || '';
-
-                                const isPlaceholder = (
-                                    // data-src not yet loaded
-                                    (dataSrc && img.src !== dataSrc) ||
-                                    // Natural size much smaller than display (blurred placeholder)
-                                    (img.naturalWidth < 50 && img.clientWidth > 100) ||
-                                    (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
-                                    // Lazy-loading class indicators
-                                    className.includes('lazy') ||
-                                    className.includes('lazyload') ||
-                                    className.includes('lozad') ||
-                                    // CSS blur filter applied
-                                    (window.getComputedStyle(img).filter || '').includes('blur') ||
-                                    // loading="lazy" + not complete
-                                    (loadingAttr === 'lazy' && !img.complete)
-                                );
-
-                                if (isPlaceholder) {
-                                    // If it looks like a placeholder, we return false (not loaded)
-                                    // unless it stays like this for too long (handled by outer timeout)
-                                    return Promise.resolve(false);
-                                }
-
-                                if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);
-
-                                return img.decode().then(() => true).catch(() => false);
-                            });
-
-                            // Race against a small timeout to avoid hanging on one broken image
-                            const allDecoded = Promise.all(checks);
-                            const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));
-
-                            // If any check returned false (meaning placeholder or not decoded), result is false
-                            return Promise.race([allDecoded, timeout]).then(results => {
-                                if (!Array.isArray(results)) return results === true;
-                                return results.every(res => res === true);
-                            });
-                        })();
-                    """)
-                    if all_loaded:
-                        break
-                    time.sleep(0.1)
-
-                # Check if reached bottom
-                height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
-                if current_pos >= height:
-                    break
-
-            # Ensure final layout settle
-            time.sleep(0.2)
-
-        except Exception as e:
-            logger.warning(f"ScreenshotService: Scroll failed: {e}")
-
-    def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
-        """Synchronous fetch logic."""
-        if not url:
-            return {"content": "Error: missing url", "title": "Error", "url": ""}
-
-        tab = None
-        try:
-            self._ensure_ready()
-            page = self._manager.page
-            if not page:
-                return {"content": "Error: Browser not available", "title": "Error", "url": url}
-
-            # Get from pool
-            tab = self._get_tab(url)
-
-            # Wait logic - optimized for search pages
-            is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
-            if is_search_page:
-                # Optimized waiting: Rapidly poll for ACTUAL results > 0
-                start_time = time.time()
-
-                # Special fast-path for DDG Lite (HTML only, no JS rendering needed)
-                if 'lite.duckduckgo' in url:
-                    # just wait for body, it's static HTML
-                    try:
-                        tab.wait.doc_loaded(timeout=timeout)
-                    except: pass
-                    # Sleep tiny bit to ensure render
-                    time.sleep(0.5)
-                else:
-                    while time.time() - start_time < timeout:
-                        found_results = False
-                        try:
-                            if 'google' in url.lower():
-                                # Check if we have any result items (.g, .MjjYud) or the main container (#search)
-                                # Using checks with minimal timeout to allow fast looping
-                                if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
-                                    found_results = True
-                            elif 'bing' in url.lower():
-                                if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
-                                    found_results = True
-                            elif 'duckduckgo' in url.lower():
-                                if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
-                                    found_results = True
-                            else:
-                                # Generic fallback: wait for body to be populated
-                                if tab.ele('body', timeout=0.1):
-                                    found_results = True
-                        except:
-                            pass
-
-                        if found_results:
-                            break
-                        time.sleep(0.05)  # Faster polling (50ms) as requested
-            else:
-                # 1. Wait for document to settle (Fast Dynamic Wait)
-                try:
-                    tab.wait.doc_loaded(timeout=timeout)
-                except: pass
-
-            html = tab.html
-            title = tab.title
-            final_url = tab.url
-
-            raw_screenshot_b64 = None
-            if include_screenshot:
-                try:
-                    # Scrollbar Hiding Best Effort
-                    from .manager import SharedBrowserManager
-                    SharedBrowserManager.hide_scrollbars(tab)
-
-                    # Inject CSS
-                    tab.run_js("""
-                        const style = document.createElement('style');
-                        style.textContent = `
-                            ::-webkit-scrollbar { display: none !important; }
-                            html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
-                        `;
-                        document.head.appendChild(style);
-                        document.documentElement.style.overflow = 'hidden';
-                        document.body.style.overflow = 'hidden';
-                    """)
-
-                    raw_screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
-                except Exception as e:
-                    logger.warning(f"ScreenshotService: Failed to capture screenshot: {e}")
-
-            # Extract content
-            content = trafilatura.extract(
-                html, include_links=True, include_images=True, include_comments=False,
-                include_tables=True, favor_precision=False, output_format="markdown"
-            ) or ""
-
-            # 2. Extract Images via Parallelized JS (Gallery)
-            # Strategy: For search pages, use Canvas to grab already loaded images (Instant)
-            # For other pages, use fetch (more robust for lazy load)
-            images_b64 = []
-            try:
-                js_code = """
-                (async () => {
-                    const blocklist = ['logo', 'icon', 'avatar', 'ad', 'pixel', 'tracker', 'button', 'menu', 'nav'];
-                    const candidates = Array.from(document.querySelectorAll('img'));
-                    const validImages = [];
-
-                    // Helper: Get base64 from loaded image via Canvas
-                    const getBase64 = (img) => {
-                        try {
-                            const canvas = document.createElement('canvas');
-                            canvas.width = img.naturalWidth;
-                            canvas.height = img.naturalHeight;
-                            const ctx = canvas.getContext('2d');
-                            ctx.drawImage(img, 0, 0);
-                            return canvas.toDataURL('image/jpeg').split(',')[1];
-                        } catch(e) { return null; }
-                    };
-
-                    for (const img of candidates) {
-                        if (validImages.length >= 8) break;
-
-                        if (img.naturalWidth < 100 || img.naturalHeight < 80) continue;
-
-                        const alt = (img.alt || '').toLowerCase();
-                        const cls = (typeof img.className === 'string' ? img.className : '').toLowerCase();
-                        const src = (img.src || '').toLowerCase();
-
-                        if (blocklist.some(b => alt.includes(b) || cls.includes(b) || src.includes(b))) continue;
-
-                        // 1. Try Canvas (Instant for loaded images)
-                        if (img.complete && img.naturalHeight > 0) {
-                            const b64 = getBase64(img);
-                            if (b64) {
-                                validImages.push(b64);
-                                continue;
-                            }
-                        }
-
-                        // 2. Fallback to fetch (only for non-search pages to avoid delay)
-                        // We skip fetch for search pages to ensure speed
-                        if (!window.location.href.includes('google') && !window.location.href.includes('search')) {
-                            try {
-                                const controller = new AbortController();
-                                const id = setTimeout(() => controller.abort(), 2000);
-                                const resp = await fetch(img.src, { signal: controller.signal });
-                                clearTimeout(id);
-                                const blob = await resp.blob();
-                                const b64 = await new Promise(resolve => {
-                                    const reader = new FileReader();
-                                    reader.onloadend = () => resolve(reader.result.split(',')[1]);
-                                    reader.onerror = () => resolve(null);
-                                    reader.readAsDataURL(blob);
-                                });
-                                if (b64) validImages.push(b64);
-                            } catch(e) {}
-                        }
-                    }
-                    return validImages;
-                })()
-                """
-                images_b64 = tab.run_js(js_code, as_expr=True) or []
-
-                if images_b64:
-                    logger.info(f"ScreenshotService: Extracted {len(images_b64)} images for {url}")
-
-            except Exception as e:
-                logger.warning(f"ScreenshotService: Image extraction failed: {e}")
-
-            return {
-                "content": content,
-                "html": html,
-                "title": title,
-                "url": final_url,
-                "raw_screenshot_b64": raw_screenshot_b64,
-                "images": images_b64
-            }
-
-        except Exception as e:
-            logger.error(f"ScreenshotService: Failed to fetch {url}: {e}")
-            return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
-        finally:
-            if tab:
-                self._release_tab(tab)
-
-    async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
-        """Fetch multiple pages concurrently."""
-        if not urls: return []
-        logger.info(f"ScreenshotService: Batch fetching {len(urls)} URLs (screenshots={include_screenshot})")
-        tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
-        return await asyncio.gather(*tasks, return_exceptions=True)
-
-    async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
-        """Take screenshots of multiple URLs concurrently."""
-        if not urls: return []
-        logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
-        tasks = [self.screenshot_url(url, timeout=timeout, full_page=full_page) for url in urls]
-        return await asyncio.gather(*tasks, return_exceptions=True)
-
-    async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 90, scale: int = 1) -> Optional[str]:
-        """Screenshot URL (Async wrapper for sync). Returns base64 string only."""
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            self._executor,
-            self._screenshot_sync,
-            url, wait_load, timeout, full_page, quality, scale, False  # extract_content=False
-        )
-        # Backward compatible: return just the screenshot for old callers
-        if isinstance(result, dict):
-            return result.get("screenshot_b64")
-        return result
-
-    async def screenshot_with_content(self, url: str, timeout: float = 15.0, max_content_length: int = 8000) -> Dict[str, Any]:
-        """
-        Screenshot URL and extract page content.
-
-        Returns:
-            Dict with:
-            - screenshot_b64: base64 encoded screenshot
-            - content: trafilatura extracted text (truncated to max_content_length)
-            - title: page title
-            - url: final URL
-        """
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            self._executor,
-            self._screenshot_sync,
-            url, True, timeout, False, 80, 2, True  # quality=80 for balance, scale=2, extract_content=True
-        )
-
-        if not isinstance(result, dict):
-            return {"screenshot_b64": result, "content": "", "title": "", "url": url}
-
-        # Truncate content if needed
-        content = result.get("content", "") or ""
-        if len(content) > max_content_length:
-            content = content[:max_content_length] + "\n\n[内容已截断...]"
-        result["content"] = content
-
-        return result
-
-
-    def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int, scale: int = 1, extract_content: bool = False) -> Any:
-        """Synchronous screenshot. If extract_content=True, returns Dict else str."""
-        if not url:
-            return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
-        tab = None
-        capture_width = 3000  # Increased for more comfortable page size while maintaining high resolution
-
-        try:
-            self._ensure_ready()
-            page = self._manager.page
-            if not page: return None
-
-            # Create blank tab first
-            tab = page.new_tab()
-
-            # Set viewport BEFORE navigation so page renders at target width from the start
-            # This eliminates the need for post-load resize and reflow
-            try:
-                tab.run_cdp('Emulation.setDeviceMetricsOverride',
-                            width=capture_width, height=900, deviceScaleFactor=scale, mobile=False)
-            except:
-                pass
-
-            # Now navigate to the URL - page will render at target width
-            tab.get(url)
-
-            # Network monitoring removed - not needed for simplified wait logic
-
-            # Initialize crawl config for completeness checking (defined outside try for scope)
-            crawl_config = CrawlConfig(
-                scan_full_page=True,
-                scroll_step=800,
-                scroll_delay=0.5,
-                scroll_timeout=20.0,  # Increased for lazy-loading pages
-                image_load_timeout=3.0,  # Image loading timeout: 3 seconds
-                image_stability_checks=3,
-            )
-
-            # === Start scrolling immediately to trigger lazy loading ===
-            # Scroll first, then wait for DOM - this allows lazy loading to start in parallel
-            logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
-            trigger_lazy_load(tab, crawl_config)
-
-            # Now wait for DOM to be ready (after scroll has triggered lazy loading)
-            try:
-                # Wait for full page load (including JS execution)
-                try:
-                    tab.wait.doc_loaded(timeout=timeout)
-                except:
-                    pass
-
-                # Wait for actual content to appear (for CDN verification pages)
-                # Simplified wait logic for screenshot: just check basic readiness
-
-                for i in range(20):  # Max 20 iterations (~1s) - much faster
-                    try:
-                        state = tab.run_js('''
-                            return {
-                                ready: document.readyState === 'complete',
-                                title: document.title,
-                                text: document.body.innerText.substring(0, 500) || ""
-                            };
-                        ''') or {'ready': False, 'title': "", 'text': ""}
-
-                        is_ready = state.get('ready', False)
-                        title = state.get('title', "").lower()
-                        text_lower = state.get('text', "").lower()
-                        text_len = len(text_lower)
-
-                        # Check for verification pages
-                        is_verification = "checking your browser" in text_lower or \
-                            "just a moment" in text_lower or \
-                            "please wait" in text_lower or \
-                            "security check" in title or \
-                            "just a moment" in title or \
-                            "loading..." in title
-
-                        # Basic content check
-                        has_content = text_len > 100
-
-                        # Pass if ready, not verification, and has content
-                        if is_ready and not is_verification and has_content:
-                            logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
-                            break
-
-                        time.sleep(0.05)
-                    except Exception:
-                        time.sleep(0.05)
-                        continue
-
-                # DEBUG: Save HTML to inspect what happened (in data dir)
-                try:
-                    import os
-                    log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
-                    with open(log_path, "w", encoding="utf-8") as f:
-                        f.write(f"<!-- URL: {url} -->\n")
-                        f.write(tab.html)
-                except: pass
-
-            except Exception as e:
-                logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
-
-            # Scrollbar Hiding first (before any height calculation)
-            from .manager import SharedBrowserManager
-            SharedBrowserManager.hide_scrollbars(tab)
-
-            # Scroll back to top
-            tab.run_js("window.scrollTo(0, 0);")
-
-            # Image loading monitoring with time tracking - DISABLED
-            # No longer waiting for images to load
-            # Initialize image tracking JavaScript
-            # image_tracking_js = """
-            # (() => {
-            #     if (!window._imageLoadTracker) {
-            #         window._imageLoadTracker = {
-            #             startTime: Date.now(),
-            #             images: new Map()
-            #         };
-            #
-            #         const imgs = Array.from(document.querySelectorAll('img'));
-            #         const minSize = 50;
-            #
-            #         imgs.forEach((img, idx) => {
-            #             if (img.clientWidth < minSize && img.clientHeight < minSize) return;
-            #
-            #             const src = img.src || img.getAttribute('data-src') || '';
-            #             const key = `${idx}_${src.substring(0, 100)}`;
-            #
-            #             if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
-            #                 // Already loaded
-            #                 window._imageLoadTracker.images.set(key, {
-            #                     src: src.substring(0, 150),
-            #                     status: 'loaded',
-            #                     loadTime: 0, // Already loaded before tracking
-            #                     naturalSize: [img.naturalWidth, img.naturalHeight],
-            #                     displaySize: [img.clientWidth, img.clientHeight]
-            #                 });
-            #             } else {
-            #                 // Track loading
-            #                 window._imageLoadTracker.images.set(key, {
-            #                     src: src.substring(0, 150),
-            #                     status: 'pending',
-            #                     startTime: Date.now(),
-            #                     naturalSize: [img.naturalWidth, img.naturalHeight],
-            #                     displaySize: [img.clientWidth, img.clientHeight]
-            #                 });
-            #
-            #                 // Add load event listener
-            #                 img.addEventListener('load', () => {
-            #                     const entry = window._imageLoadTracker.images.get(key);
-            #                     if (entry && entry.status === 'pending') {
-            #                         entry.status = 'loaded';
-            #                         entry.loadTime = Date.now() - entry.startTime;
-            #                     }
-            #                 });
-            #
-            #                 img.addEventListener('error', () => {
-            #                     const entry = window._imageLoadTracker.images.get(key);
-            #                     if (entry && entry.status === 'pending') {
-            #                         entry.status = 'failed';
-            #                         entry.loadTime = Date.now() - entry.startTime;
-            #                     }
-            #                 });
-            #             }
-            #         });
-            #     }
-            #
-            #     // Return current status
-            #     const results = [];
-            #     window._imageLoadTracker.images.forEach((value, key) => {
-            #         const entry = {
-            #             src: value.src,
-            #             status: value.status,
-            #             loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
-            #             naturalSize: value.naturalSize,
-            #             displaySize: value.displaySize
-            #         };
-            #         results.push(entry);
-            #     });
-            #
-            #     return {
-            #         total: results.length,
-            #         loaded: results.filter(r => r.status === 'loaded').length,
-            #         pending: results.filter(r => r.status === 'pending').length,
-            #         failed: results.filter(r => r.status === 'failed').length,
-            #         details: results
-            #     };
-            # })()
-            # """
-            #
-            # # Initialize tracking
-            # tab.run_js(image_tracking_js)
-            #
-            # # Monitor image loading with dynamic stop logic
-            # check_interval = 0.2  # Check every 200ms
-            # image_timeout = 3.0  # Image loading timeout: 3 seconds
-            # monitoring_start = time.time()
-            # loaded_times = []  # Track load times of completed images
-            #
-            # logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
-            #
-            # while True:
-            #     elapsed = time.time() - monitoring_start
-            #
-            #     # Check timeout first
-            #     if elapsed >= image_timeout:
-            #         logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
-            #         break
-            #
-            #     # Get current image status
-            #     status = tab.run_js(image_tracking_js, as_expr=True) or {
-            #         'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
-            #     }
-            #
-            #     # Log each image's status and load time
-            #     for img_detail in status.get('details', []):
-            #         src_short = img_detail.get('src', '')[:80]
-            #         status_str = img_detail.get('status', 'unknown')
-            #         load_time = img_detail.get('loadTime', 0)
-            #         logger.info(
-            #             f"ScreenshotService: Image [{status_str}] "
-            #             f"loadTime={load_time:.0f}ms "
-            #             f"src={src_short}"
-            #         )
-            #
-            #     # Collect load times of completed images
-            #     loaded_times = [
-            #         img.get('loadTime', 0)
-            #         for img in status.get('details', [])
-            #         if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
-            #     ]
-            #
-            #     pending_count = status.get('pending', 0)
-            #     loaded_count = status.get('loaded', 0)
-            #
-            #     # Check stop conditions
-            #     if pending_count == 0:
-            #         logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
-            #         break
-            #
-            #     # Check dynamic stop condition (if we have loaded images to calculate average)
-            #     if loaded_times:
-            #         avg_load_time = sum(loaded_times) / len(loaded_times)
-            #         max_wait_time = avg_load_time * 2
-            #
-            #         # Check if any pending image has exceeded max wait time
-            #         pending_images = [
-            #             img for img in status.get('details', [])
-            #             if img.get('status') == 'pending'
-            #         ]
-            #
-            #         should_stop = False
-            #         for pending_img in pending_images:
-            #             wait_time = pending_img.get('loadTime', 0)
-            #             if wait_time >= max_wait_time:
-            #                 should_stop = True
-            #                 logger.info(
-            #                     f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
-            #                     f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
-            #                 )
-            #                 break
-            #
-            #         if should_stop:
-            #             break
-            #
-            #     # Wait before next check
-            #     time.sleep(check_interval)
-
-            # Now calculate final height ONCE after all content loaded
-            # CompletenessChecker already verified height stability
-            try:
-                final_height = tab.run_js('''
-                    return Math.max(
-                        document.body.scrollHeight || 0,
-                        document.documentElement.scrollHeight || 0,
-                        document.body.offsetHeight || 0,
-                        document.documentElement.offsetHeight || 0
-                    );
-                ''')
-                h = min(int(final_height) + 50, 15000)
-                tab.run_cdp('Emulation.setDeviceMetricsOverride',
-                            width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
-            except:
-                pass
-
-            # Final scroll to top
-            tab.run_js("window.scrollTo(0, 0);")
-
-            # Capture screenshot
-            screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
-
-            # Use Pillow for intelligent compression
-            if screenshot_b64 and quality < 95:  # Only compress if quality is not near maximum
-                try:
-                    img_bytes = base64.b64decode(screenshot_b64)
-                    img = Image.open(BytesIO(img_bytes))
-
-                    # Convert to RGB if not already (some images might be RGBA, which JPEG doesn't support)
-                    if img.mode in ("RGBA", "P"):
-                        img = img.convert("RGB")
-
-                    output_buffer = BytesIO()
-                    img.save(output_buffer, format="WebP", quality=quality, optimize=True)  # Output as WebP format
-                    screenshot_b64 = base64.b64encode(output_buffer.getvalue()).decode()
-                    logger.debug(f"ScreenshotService: Applied Pillow compression with quality={quality}")
-                except Exception as e:
-                    logger.warning(f"ScreenshotService: Pillow compression failed: {e}")
-
-            # Extract content if requested
-            if extract_content:
-                try:
-                    html = tab.html
-                    title = tab.title
-                    final_url = tab.url
-
-                    # Minimal trafilatura settings to reduce token consumption
-                    content = trafilatura.extract(
-                        html,
-                        include_links=False,  # No links to reduce tokens
-                        include_images=False,  # No image descriptions
-                        include_comments=False,  # No comments
-                        include_tables=False,  # No tables (can be verbose)
-                        favor_precision=True,  # Favor precision over recall
-                        output_format="txt"  # Plain text (no markdown formatting)
-                    ) or ""
-
-                    return {
-                        "screenshot_b64": screenshot_b64,
-                        "content": content,
-                        "title": title,
-                        "url": final_url
-                    }
-                except Exception as e:
-                    logger.warning(f"ScreenshotService: Content extraction failed: {e}")
-                    return {"screenshot_b64": screenshot_b64, "content": "", "title": "", "url": url}
-
-            return screenshot_b64
-
-        except Exception as e:
-            logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
-            return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
-        finally:
-            if tab:
-                try: tab.close()
-                except: pass
-
-
-    async def execute_script(self, script: str) -> Dict[str, Any]:
-        """
-        Execute JavaScript in the current active page context.
-        This reuses the shared browser instance.
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(
-            self._executor,
-            self._execute_script_sync,
-            script
-        )
-
-    def _execute_script_sync(self, script: str) -> Dict[str, Any]:
-        """Synchronous JS execution."""
-        try:
-            self._ensure_ready()
-            page = self._manager.page
-            if not page:
-                return {"success": False, "error": "Browser not available"}
-
-            # Get current active tab or first tab
-            # Fix: ChromiumPage object has no attribute 'tabs'
-            # We use the page object itself as it represents the active tab controller
-            tab = page
-            if not tab:
-                return {"success": False, "error": "No active tab"}
-
-            logger.info(f"ScreenshotService: Executing JS on {tab.url}")
-
-            # Execute JS
-            result = tab.run_js(script)
-
-            return {
-                "success": True,
-                "result": result,
-                "url": tab.url,
-                "title": tab.title
-            }
-        except Exception as e:
-            logger.error(f"ScreenshotService: JS execution failed: {e}")
-            return {"success": False, "error": str(e)}
-
-    async def close(self):
-        self._executor.shutdown(wait=False)
-        logger.info("ScreenshotService: Closed.")
-
-    async def close_async(self):
-        await self.close()
-
-# Singleton
-_screenshot_service: Optional[ScreenshotService] = None
-
-def get_screenshot_service(headless: bool = True) -> ScreenshotService:
-    global _screenshot_service
-    if _screenshot_service is None:
-        _screenshot_service = ScreenshotService(headless=headless, auto_start=True)
-    return _screenshot_service
-
-async def close_screenshot_service():
-    global _screenshot_service
-    if _screenshot_service:
-        await _screenshot_service.close()
-        _screenshot_service = None
-
-def prestart_browser(headless: bool = True):
-    get_screenshot_service(headless=headless)
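
For reference, this is a minimal sketch of how the module-level API removed above could be driven, assuming the rc17 wheel and its dependencies (DrissionPage with a local Chromium, trafilatura, Pillow, loguru) are installed; the import path is taken from the file layout shown in this diff and is not verified against the package's public exports.

    # Usage sketch for the removed service.py API (assumptions noted above).
    import asyncio

    from hyw_core.browser_control.service import (
        get_screenshot_service,
        close_screenshot_service,
    )

    async def main() -> None:
        # get_screenshot_service() lazily creates the singleton and, with
        # auto_start=True, spins up the shared browser manager.
        service = get_screenshot_service(headless=True)
        try:
            # fetch_page() pushes the blocking DrissionPage work onto the
            # service's ThreadPoolExecutor and returns a dict with
            # content/html/title/url (and raw_screenshot_b64 if requested).
            page = await service.fetch_page(
                "https://example.com", timeout=10.0, include_screenshot=False
            )
            print(page["title"], len(page["content"]))
        finally:
            # Shuts down the executor and clears the singleton.
            await close_screenshot_service()

    asyncio.run(main())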
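The removed _search_via_page_input_batch_sync also illustrates a reusable two-phase pattern: acquire resources sequentially (because the factory, here new_tab, is not thread-safe), then fan the work out across threads that write into a pre-sized results list so output order matches input order. A generic restatement of that pattern, independent of DrissionPage (acquire and work are hypothetical stand-ins for new_tab and run_search):

    # Generic sketch of the sequential-acquire / parallel-work pattern.
    import threading
    from typing import Any, Callable, List, Optional

    def two_phase_batch(items: List[str],
                        acquire: Callable[[], Any],
                        work: Callable[[Any, str], Any]) -> List[Optional[Any]]:
        # Pre-sized results list: each thread writes only to its own slot,
        # so no lock is needed and output order matches input order.
        results: List[Optional[Any]] = [None] * len(items)

        # Phase 1: acquire resources one at a time on the calling thread,
        # since the factory must not be invoked concurrently.
        handles = [acquire() for _ in items]

        # Phase 2: each thread owns exactly one handle and runs independently.
        def run(index: int, handle: Any, item: str) -> None:
            results[index] = work(handle, item)

        threads = [threading.Thread(target=run, args=(i, h, it))
                   for i, (h, it) in enumerate(zip(handles, items))]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return results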