entari-plugin-hyw 4.0.0rc6__py3-none-any.whl → 4.0.0rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

Files changed (114)
  1. entari_plugin_hyw/Untitled-1 +1865 -0
  2. entari_plugin_hyw/__init__.py +733 -379
  3. entari_plugin_hyw/history.py +60 -57
  4. entari_plugin_hyw/misc.py +3 -0
  5. entari_plugin_hyw/search_cache.py +154 -0
  6. {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/METADATA +3 -1
  7. entari_plugin_hyw-4.0.0rc8.dist-info/RECORD +68 -0
  8. {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/WHEEL +1 -1
  9. {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/top_level.txt +1 -0
  10. hyw_core/__init__.py +94 -0
  11. hyw_core/browser_control/__init__.py +65 -0
  12. hyw_core/browser_control/assets/card-dist/index.html +409 -0
  13. hyw_core/browser_control/assets/index.html +5691 -0
  14. hyw_core/browser_control/engines/__init__.py +17 -0
  15. hyw_core/browser_control/engines/default.py +166 -0
  16. {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/duckduckgo.py +42 -8
  17. {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/google.py +1 -1
  18. {entari_plugin_hyw/browser → hyw_core/browser_control}/manager.py +15 -8
  19. entari_plugin_hyw/render_vue.py → hyw_core/browser_control/renderer.py +29 -14
  20. hyw_core/browser_control/service.py +720 -0
  21. hyw_core/config.py +154 -0
  22. hyw_core/core.py +322 -0
  23. hyw_core/definitions.py +83 -0
  24. entari_plugin_hyw/modular_pipeline.py → hyw_core/pipeline.py +204 -86
  25. {entari_plugin_hyw → hyw_core}/search.py +60 -19
  26. hyw_core/stages/__init__.py +21 -0
  27. entari_plugin_hyw/stage_base.py → hyw_core/stages/base.py +3 -0
  28. entari_plugin_hyw/stage_summary.py → hyw_core/stages/summary.py +36 -7
  29. entari_plugin_hyw/assets/card-dist/index.html +0 -387
  30. entari_plugin_hyw/browser/__init__.py +0 -10
  31. entari_plugin_hyw/browser/engines/bing.py +0 -95
  32. entari_plugin_hyw/browser/service.py +0 -304
  33. entari_plugin_hyw/card-ui/.gitignore +0 -24
  34. entari_plugin_hyw/card-ui/README.md +0 -5
  35. entari_plugin_hyw/card-ui/index.html +0 -16
  36. entari_plugin_hyw/card-ui/package-lock.json +0 -2342
  37. entari_plugin_hyw/card-ui/package.json +0 -31
  38. entari_plugin_hyw/card-ui/public/logos/anthropic.svg +0 -1
  39. entari_plugin_hyw/card-ui/public/logos/cerebras.svg +0 -9
  40. entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
  41. entari_plugin_hyw/card-ui/public/logos/gemini.svg +0 -1
  42. entari_plugin_hyw/card-ui/public/logos/google.svg +0 -1
  43. entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
  44. entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
  45. entari_plugin_hyw/card-ui/public/logos/microsoft.svg +0 -15
  46. entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
  47. entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
  48. entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
  49. entari_plugin_hyw/card-ui/public/logos/openai.svg +0 -1
  50. entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
  51. entari_plugin_hyw/card-ui/public/logos/perplexity.svg +0 -24
  52. entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
  53. entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
  54. entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
  55. entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
  56. entari_plugin_hyw/card-ui/public/vite.svg +0 -1
  57. entari_plugin_hyw/card-ui/src/App.vue +0 -756
  58. entari_plugin_hyw/card-ui/src/assets/vue.svg +0 -1
  59. entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +0 -41
  60. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +0 -382
  61. entari_plugin_hyw/card-ui/src/components/SectionCard.vue +0 -41
  62. entari_plugin_hyw/card-ui/src/components/StageCard.vue +0 -240
  63. entari_plugin_hyw/card-ui/src/main.ts +0 -5
  64. entari_plugin_hyw/card-ui/src/style.css +0 -29
  65. entari_plugin_hyw/card-ui/src/test_regex.js +0 -103
  66. entari_plugin_hyw/card-ui/src/types.ts +0 -61
  67. entari_plugin_hyw/card-ui/tsconfig.app.json +0 -16
  68. entari_plugin_hyw/card-ui/tsconfig.json +0 -7
  69. entari_plugin_hyw/card-ui/tsconfig.node.json +0 -26
  70. entari_plugin_hyw/card-ui/vite.config.ts +0 -16
  71. entari_plugin_hyw/definitions.py +0 -155
  72. entari_plugin_hyw/stage_instruct.py +0 -345
  73. entari_plugin_hyw/stage_instruct_deepsearch.py +0 -104
  74. entari_plugin_hyw-4.0.0rc6.dist-info/RECORD +0 -100
  75. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/anthropic.svg +0 -0
  76. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/cerebras.svg +0 -0
  77. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/deepseek.png +0 -0
  78. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/gemini.svg +0 -0
  79. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/google.svg +0 -0
  80. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/grok.png +0 -0
  81. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/huggingface.png +0 -0
  82. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/microsoft.svg +0 -0
  83. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/minimax.png +0 -0
  84. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/mistral.png +0 -0
  85. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/nvida.png +0 -0
  86. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openai.svg +0 -0
  87. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openrouter.png +0 -0
  88. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/perplexity.svg +0 -0
  89. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/qwen.png +0 -0
  90. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xai.png +0 -0
  91. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xiaomi.png +0 -0
  92. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/zai.png +0 -0
  93. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/vite.svg +0 -0
  94. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/anthropic.svg +0 -0
  95. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/cerebras.svg +0 -0
  96. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/deepseek.png +0 -0
  97. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/gemini.svg +0 -0
  98. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/google.svg +0 -0
  99. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/grok.png +0 -0
  100. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/huggingface.png +0 -0
  101. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/microsoft.svg +0 -0
  102. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/minimax.png +0 -0
  103. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/mistral.png +0 -0
  104. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/nvida.png +0 -0
  105. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openai.svg +0 -0
  106. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openrouter.png +0 -0
  107. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/perplexity.svg +0 -0
  108. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/qwen.png +0 -0
  109. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xai.png +0 -0
  110. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xiaomi.png +0 -0
  111. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/zai.png +0 -0
  112. {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/base.py +0 -0
  113. {entari_plugin_hyw/browser → hyw_core/browser_control}/landing.html +0 -0
  114. {entari_plugin_hyw → hyw_core}/image_cache.py +0 -0
@@ -0,0 +1,720 @@
1
+ """
2
+ Browser Service (DrissionPage)
3
+
4
+ Provides page fetching and screenshot capabilities using DrissionPage.
5
+ """
6
+
7
+ import asyncio
8
+ import base64
9
+ import threading
10
+ import time
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ from typing import Optional, Dict, Any, List
13
+ from loguru import logger
14
+ import trafilatura
15
+
16
+ class ScreenshotService:
17
+ """
18
+ Browser Service using DrissionPage.
19
+ """
20
+
21
+ def __init__(self, headless: bool = True, auto_start: bool = True):
22
+ self.headless = headless
23
+ self._manager = None
24
+ self._executor = ThreadPoolExecutor(max_workers=10)
25
+
26
+ if auto_start:
27
+ self._ensure_ready()
28
+
29
+ def _get_tab(self, url: str) -> Any:
30
+ """Create a new tab and navigate to URL."""
31
+ self._ensure_ready()
32
+ return self._manager.new_tab(url)
33
+
34
+ def _release_tab(self, tab: Any):
35
+ """Close tab after use."""
36
+ if not tab: return
37
+ try:
38
+ tab.close()
39
+ except:
40
+ pass
41
+
42
async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
    """Run one page-input search per query without blocking the event loop.

    The blocking work is handed to ``_search_via_page_input_batch_sync`` on
    the service's thread pool; this coroutine only awaits its result.
    """
    worker = self._search_via_page_input_batch_sync
    pending = asyncio.get_running_loop().run_in_executor(
        self._executor, worker, queries, url, selector
    )
    return await pending
52
+
53
def _search_via_page_input_batch_sync(self, queries: List[str], url: str, selector: str) -> List[Dict[str, Any]]:
    """Run one search per query: open tabs sequentially, then search in parallel threads.

    Args:
        queries: Search strings; one tab is opened per query.
        url: Page hosting the search input. Falls back to
            ``https://www.google.com`` when empty.
        selector: Locator of the search input, tried before the built-in
            fallback locators.

    Returns:
        A list aligned with ``queries``. Each entry is a dict with
        ``content`` (markdown extracted by trafilatura), ``html``,
        ``title``, ``url`` and ``images`` on success, or a dict titled
        ``"Error"`` when the tab could not be opened, no input element was
        found, or the search raised.
    """
    results: List[Optional[Dict[str, Any]]] = [None] * len(queries)
    fallback_selectors = ("textarea[name='q']", "#APjFqb", "input[name='q']", "input[type='text']")

    # Phase 1: create tabs SEQUENTIALLY (DrissionPage isn't thread-safe for new_tab)
    target_url = url or "https://www.google.com"
    logger.info(f"ScreenshotService: Acquiring {len(queries)} tabs for parallel search...")

    tabs = []
    for _ in queries:
        tab = None
        try:
            tab = self._get_tab(target_url)
        except Exception as e:
            logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")
        # Keep a None placeholder so tabs stay aligned with queries.
        tabs.append(tab)

    ready_count = sum(1 for tab in tabs if tab is not None)
    logger.info(f"ScreenshotService: {ready_count} tabs ready, starting parallel searches...")

    # Phase 2: execute searches in PARALLEL, one thread per query
    def run_search(index, tab, query):
        if tab is None:
            # Tab creation failed in phase 1: report that explicitly rather
            # than relying on an AttributeError from using None below.
            results[index] = {"content": "Error: tab creation failed", "title": "Error", "url": "", "html": ""}
            return
        try:
            logger.debug(f"Search[{index}]: Starting for '{query}' on {tab.url}")

            # Wait for page to be ready first; a slow load is not fatal,
            # the input lookup below has its own timeout.
            try:
                tab.wait.doc_loaded(timeout=10)
            except Exception:
                pass

            # Find input element with wait
            logger.debug(f"Search[{index}]: Looking for input with selector '{selector}'")
            ele = tab.ele(selector, timeout=5)
            if not ele:
                logger.debug(f"Search[{index}]: Primary selector failed, trying fallbacks")
                for fallback in fallback_selectors:
                    ele = tab.ele(fallback, timeout=2)
                    if ele:
                        logger.debug(f"Search[{index}]: Found input with fallback '{fallback}'")
                        break

            if not ele:
                logger.error(f"Search[{index}]: No input element found on {tab.url}!")
                results[index] = {"content": "Error: input not found", "title": "Error", "url": tab.url, "html": tab.html[:5000]}
                return

            logger.debug(f"Search[{index}]: Typing query...")
            ele.input(query)

            logger.debug(f"Search[{index}]: Pressing Enter...")
            tab.actions.key_down('enter').key_up('enter')

            logger.debug(f"Search[{index}]: Waiting for search results...")
            tab.wait.doc_loaded(timeout=10)
            # Reduced settle wait for extraction
            time.sleep(0.1)

            logger.debug(f"Search[{index}]: Extracting content...")
            html = tab.html
            content = trafilatura.extract(
                html, include_links=True, include_images=True, include_comments=False,
                include_tables=True, favor_precision=False, output_format="markdown"
            ) or ""

            logger.info(f"ScreenshotService: Search '{query}' completed -> {tab.url}")

            results[index] = {
                "content": content,
                "html": html,
                "title": tab.title,
                "url": tab.url,
                "images": []
            }

        except Exception as e:
            logger.error(f"ScreenshotService: Search error for '{query}': {e}")
            results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
        finally:
            self._release_tab(tab)

    threads = []
    for i, (tab, query) in enumerate(zip(tabs, queries)):
        t = threading.Thread(target=run_search, args=(i, tab, query))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    return results
147
+
148
def _ensure_ready(self):
    """Point ``self._manager`` at the shared browser manager for this service's headless setting."""
    # Local import, resolved each time the method runs.
    from .manager import get_shared_browser_manager

    manager = get_shared_browser_manager(headless=self.headless)
    self._manager = manager
152
+
153
async def fetch_page(self, url: str, timeout: float = 10.0, include_screenshot: bool = True) -> Dict[str, Any]:
    """Fetch a page's content, and optionally a screenshot, off the event loop.

    The blocking ``_fetch_page_sync`` runs on the service's thread pool so
    the asyncio loop is not blocked while the browser works.
    """
    worker = self._fetch_page_sync
    pending = asyncio.get_running_loop().run_in_executor(
        self._executor, worker, url, timeout, include_screenshot
    )
    return await pending
166
+
167
async def search_via_address_bar(self, query: str, timeout: float = 20.0) -> Dict[str, Any]:
    """Search through the browser's address bar (its default search engine).

    Simulates: Ctrl+L (focus address bar) -> type query -> Enter. The
    blocking steps live in ``_search_via_address_bar_sync`` and run on the
    service's thread pool.
    """
    worker = self._search_via_address_bar_sync
    pending = asyncio.get_running_loop().run_in_executor(
        self._executor, worker, query, timeout
    )
    return await pending
179
+
180
+ def _search_via_address_bar_sync(self, query: str, timeout: float) -> Dict[str, Any]:
181
+ """Synchronous address bar search logic."""
182
+ if not query:
183
+ return {"content": "Error: missing query", "title": "Error", "url": "", "html": ""}
184
+
185
+ tab = None
186
+ try:
187
+ self._ensure_ready()
188
+ page = self._manager.page
189
+ if not page:
190
+ return {"content": "Error: Browser not available", "title": "Error", "url": "", "html": ""}
191
+
192
+ # Open new blank tab
193
+ tab = page.new_tab()
194
+
195
+ # Focus address bar with Ctrl+L (or Cmd+L on Mac)
196
+ import platform
197
+ if platform.system() == "Darwin":
198
+ tab.actions.key_down('cmd').key_down('l').key_up('l').key_up('cmd')
199
+ else:
200
+ tab.actions.key_down('ctrl').key_down('l').key_up('l').key_up('ctrl')
201
+
202
+ # Small delay for address bar to focus
203
+ import time as _time
204
+ _time.sleep(0.05)
205
+
206
+ # Type the query
207
+ tab.actions.type(query)
208
+
209
+ # Press Enter to search
210
+ tab.actions.key_down('enter').key_up('enter')
211
+
212
+ # Wait for page to load
213
+ try:
214
+ tab.wait.doc_loaded(timeout=timeout)
215
+ # Reduced wait for initial results
216
+ _time.sleep(0.2)
217
+ except:
218
+ pass
219
+
220
+ html = tab.html
221
+ title = tab.title
222
+ final_url = tab.url
223
+
224
+ # Extract content
225
+ content = trafilatura.extract(
226
+ html, include_links=True, include_images=True, include_comments=False,
227
+ include_tables=True, favor_precision=False, output_format="markdown"
228
+ ) or ""
229
+
230
+ logger.info(f"ScreenshotService: Address bar search completed -> {final_url}")
231
+
232
+ return {
233
+ "content": content,
234
+ "html": html,
235
+ "title": title,
236
+ "url": final_url,
237
+ "images": []
238
+ }
239
+
240
+ except Exception as e:
241
+ logger.error(f"ScreenshotService: Address bar search failed: {e}")
242
+ return {"content": f"Error: search failed ({e})", "title": "Error", "url": "", "html": ""}
243
+ finally:
244
+ if tab:
245
+ try: tab.close()
246
+ except: pass
247
+
248
+ def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
249
+ """
250
+ Scroll down gradually to trigger lazy loading.
251
+
252
+ Args:
253
+ delay: Max wait time per scroll step (seconds) if images aren't loading.
254
+ """
255
+ import time
256
+ start = time.time()
257
+ current_pos = 0
258
+ try:
259
+ while time.time() - start < timeout:
260
+ # Scroll down
261
+ current_pos += step
262
+ tab.run_js(f"window.scrollTo(0, {current_pos});")
263
+
264
+ # Active Wait: Check if images in viewport are loaded
265
+ # Poll every 100ms, up to 'delay' seconds
266
+ wait_start = time.time()
267
+ while time.time() - wait_start < delay:
268
+ all_loaded = tab.run_js("""
269
+ return (async () => {
270
+ const imgs = Array.from(document.querySelectorAll('img'));
271
+ const viewportHeight = window.innerHeight;
272
+
273
+ // 1. Identify images currently in viewport
274
+ const visibleImgs = imgs.filter(img => {
275
+ const rect = img.getBoundingClientRect();
276
+ return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
277
+ });
278
+
279
+ if (visibleImgs.length === 0) return true;
280
+
281
+ // 2. Check loading status using decode() AND heuristic for placeholders
282
+ // Some sites load a tiny blurred placeholder first.
283
+ const checks = visibleImgs.map(img => {
284
+ // HEURISTIC: content is likely not ready if:
285
+ // - img has 'data-src' but src is different (or src is empty)
286
+ // - img has 'loading="lazy"' and is not complete
287
+ // - naturalWidth is very small (placeholder) compared to display width
288
+
289
+ const isPlaceholder = (
290
+ (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
291
+ (img.naturalWidth < 50 && img.clientWidth > 100)
292
+ );
293
+
294
+ if (isPlaceholder) {
295
+ // If it looks like a placeholder, we return false (not loaded)
296
+ // unless it stays like this for too long (handled by outer timeout)
297
+ return Promise.resolve(false);
298
+ }
299
+
300
+ if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);
301
+
302
+ return img.decode().then(() => true).catch(() => false);
303
+ });
304
+
305
+ // Race against a small timeout to avoid hanging on one broken image
306
+ const allDecoded = Promise.all(checks);
307
+ const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));
308
+
309
+ // If any check returned false (meaning placeholder or not decoded), result is false
310
+ return Promise.race([allDecoded, timeout]).then(results => {
311
+ if (!Array.isArray(results)) return results === true;
312
+ return results.every(res => res === true);
313
+ });
314
+ })();
315
+ """)
316
+ if all_loaded:
317
+ break
318
+ time.sleep(0.1)
319
+
320
+ # Check if reached bottom
321
+ height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
322
+ if current_pos >= height:
323
+ break
324
+
325
+ # Ensure final layout settle
326
+ time.sleep(0.2)
327
+
328
+ except Exception as e:
329
+ logger.warning(f"ScreenshotService: Scroll failed: {e}")
330
+
331
+ def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
332
+ """Synchronous fetch logic."""
333
+ if not url:
334
+ return {"content": "Error: missing url", "title": "Error", "url": ""}
335
+
336
+ tab = None
337
+ try:
338
+ self._ensure_ready()
339
+ page = self._manager.page
340
+ if not page:
341
+ return {"content": "Error: Browser not available", "title": "Error", "url": url}
342
+
343
+ # Get from pool
344
+ tab = self._get_tab(url)
345
+
346
+ # Wait logic - optimized for search pages
347
+ is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
348
+ if is_search_page:
349
+ # Optimized waiting: Rapidly poll for ACTUAL results > 0
350
+ start_time = time.time()
351
+
352
+ # Special fast-path for DDG Lite (HTML only, no JS rendering needed)
353
+ if 'lite.duckduckgo' in url:
354
+ # just wait for body, it's static HTML
355
+ try:
356
+ tab.wait.doc_loaded(timeout=timeout)
357
+ except: pass
358
+ # Sleep tiny bit to ensure render
359
+ time.sleep(0.5)
360
+ else:
361
+ while time.time() - start_time < timeout:
362
+ found_results = False
363
+ try:
364
+ if 'google' in url.lower():
365
+ # Check if we have any result items (.g, .MjjYud) or the main container (#search)
366
+ # Using checks with minimal timeout to allow fast looping
367
+ if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
368
+ found_results = True
369
+ elif 'bing' in url.lower():
370
+ if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
371
+ found_results = True
372
+ elif 'duckduckgo' in url.lower():
373
+ if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
374
+ found_results = True
375
+ else:
376
+ # Generic fallback: wait for body to be populated
377
+ if tab.ele('body', timeout=0.1):
378
+ found_results = True
379
+ except:
380
+ pass
381
+
382
+ if found_results:
383
+ break
384
+ time.sleep(0.05) # Faster polling (50ms) as requested
385
+ else:
386
+ # 1. Wait for document to settle (Fast Dynamic Wait)
387
+ try:
388
+ tab.wait.doc_loaded(timeout=timeout)
389
+ except: pass
390
+
391
+ html = tab.html
392
+ title = tab.title
393
+ final_url = tab.url
394
+
395
+ raw_screenshot_b64 = None
396
+ if include_screenshot:
397
+ try:
398
+ # Scrollbar Hiding Best Effort
399
+ from .manager import SharedBrowserManager
400
+ SharedBrowserManager.hide_scrollbars(tab)
401
+
402
+ # Inject CSS
403
+ tab.run_js("""
404
+ const style = document.createElement('style');
405
+ style.textContent = `
406
+ ::-webkit-scrollbar { display: none !important; }
407
+ html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
408
+ `;
409
+ document.head.appendChild(style);
410
+ document.documentElement.style.overflow = 'hidden';
411
+ document.body.style.overflow = 'hidden';
412
+ """)
413
+
414
+ raw_screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
415
+ except Exception as e:
416
+ logger.warning(f"ScreenshotService: Failed to capture screenshot: {e}")
417
+
418
+ # Extract content
419
+ content = trafilatura.extract(
420
+ html, include_links=True, include_images=True, include_comments=False,
421
+ include_tables=True, favor_precision=False, output_format="markdown"
422
+ ) or ""
423
+
424
+ # 2. Extract Images via Parallelized JS (Gallery)
425
+ # Strategy: For search pages, use Canvas to grab already loaded images (Instant)
426
+ # For other pages, use fetch (more robust for lazy load)
427
+ images_b64 = []
428
+ try:
429
+ js_code = """
430
+ (async () => {
431
+ const blocklist = ['logo', 'icon', 'avatar', 'ad', 'pixel', 'tracker', 'button', 'menu', 'nav'];
432
+ const candidates = Array.from(document.querySelectorAll('img'));
433
+ const validImages = [];
434
+
435
+ // Helper: Get base64 from loaded image via Canvas
436
+ const getBase64 = (img) => {
437
+ try {
438
+ const canvas = document.createElement('canvas');
439
+ canvas.width = img.naturalWidth;
440
+ canvas.height = img.naturalHeight;
441
+ const ctx = canvas.getContext('2d');
442
+ ctx.drawImage(img, 0, 0);
443
+ return canvas.toDataURL('image/jpeg').split(',')[1];
444
+ } catch(e) { return null; }
445
+ };
446
+
447
+ for (const img of candidates) {
448
+ if (validImages.length >= 8) break;
449
+
450
+ if (img.naturalWidth < 100 || img.naturalHeight < 80) continue;
451
+
452
+ const alt = (img.alt || '').toLowerCase();
453
+ const cls = (typeof img.className === 'string' ? img.className : '').toLowerCase();
454
+ const src = (img.src || '').toLowerCase();
455
+
456
+ if (blocklist.some(b => alt.includes(b) || cls.includes(b) || src.includes(b))) continue;
457
+
458
+ // 1. Try Canvas (Instant for loaded images)
459
+ if (img.complete && img.naturalHeight > 0) {
460
+ const b64 = getBase64(img);
461
+ if (b64) {
462
+ validImages.push(b64);
463
+ continue;
464
+ }
465
+ }
466
+
467
+ // 2. Fallback to fetch (only for non-search pages to avoid delay)
468
+ // We skip fetch for search pages to ensure speed
469
+ if (!window.location.href.includes('google') && !window.location.href.includes('search')) {
470
+ try {
471
+ const controller = new AbortController();
472
+ const id = setTimeout(() => controller.abort(), 2000);
473
+ const resp = await fetch(img.src, { signal: controller.signal });
474
+ clearTimeout(id);
475
+ const blob = await resp.blob();
476
+ const b64 = await new Promise(resolve => {
477
+ const reader = new FileReader();
478
+ reader.onloadend = () => resolve(reader.result.split(',')[1]);
479
+ reader.onerror = () => resolve(null);
480
+ reader.readAsDataURL(blob);
481
+ });
482
+ if (b64) validImages.push(b64);
483
+ } catch(e) {}
484
+ }
485
+ }
486
+ return validImages;
487
+ })()
488
+ """
489
+ images_b64 = tab.run_js(js_code, as_expr=True) or []
490
+
491
+ if images_b64:
492
+ logger.info(f"ScreenshotService: Extracted {len(images_b64)} images for {url}")
493
+
494
+ except Exception as e:
495
+ logger.warning(f"ScreenshotService: Image extraction failed: {e}")
496
+
497
+ return {
498
+ "content": content,
499
+ "html": html,
500
+ "title": title,
501
+ "url": final_url,
502
+ "raw_screenshot_b64": raw_screenshot_b64,
503
+ "images": images_b64
504
+ }
505
+
506
+ except Exception as e:
507
+ logger.error(f"ScreenshotService: Failed to fetch {url}: {e}")
508
+ return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
509
+ finally:
510
+ if tab:
511
+ self._release_tab(tab)
512
+
513
async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
    """Fetch several pages concurrently via ``fetch_page``.

    Results keep the order of ``urls``. Because the gather call uses
    ``return_exceptions=True``, a failed fetch appears as an exception
    object in the returned list instead of being raised.
    """
    if not urls:
        return []

    logger.info(f"ScreenshotService: Batch fetching {len(urls)} URLs (screenshots={include_screenshot})")
    pending = []
    for target in urls:
        pending.append(self.fetch_page(target, timeout, include_screenshot))
    return await asyncio.gather(*pending, return_exceptions=True)
519
+
520
async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
    """Screenshot several URLs concurrently via ``screenshot_url``.

    Results keep the order of ``urls``. Because the gather call uses
    ``return_exceptions=True``, a failed screenshot appears as an exception
    object in the returned list instead of being raised.
    """
    if not urls:
        return []

    logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
    pending = []
    for target in urls:
        pending.append(self.screenshot_url(target, timeout=timeout, full_page=full_page))
    return await asyncio.gather(*pending, return_exceptions=True)
526
+
527
async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
    """Screenshot a URL off the event loop.

    All arguments are forwarded positionally to ``_screenshot_sync``, which
    runs on the service's thread pool.
    """
    worker = self._screenshot_sync
    pending = asyncio.get_running_loop().run_in_executor(
        self._executor, worker, url, wait_load, timeout, full_page, quality
    )
    return await pending
535
+
536
+ def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int) -> Optional[str]:
537
+ """Synchronous screenshot."""
538
+ if not url: return None
539
+ tab = None
540
+ try:
541
+ self._ensure_ready()
542
+ page = self._manager.page
543
+ if not page: return None
544
+
545
+ tab = page.new_tab(url)
546
+ try:
547
+ # Wait for full page load (including JS execution)
548
+ tab.wait.load_complete(timeout=timeout)
549
+
550
+ # Wait for actual content to appear (for CDN verification pages)
551
+ # Smart Wait Logic (Final Robust):
552
+ # 1. FORCED WAIT: 1.5s to allow initial redirects/rendering to start.
553
+ # 2. Browser ReadyState Complete
554
+ # 3. Height Stable for 2.0 seconds (20 checks)
555
+ # 4. Text > 100 chars (Crucial: Distinguishes stable content from stable spinners)
556
+ # 5. No Blacklist phrases
557
+
558
+ time.sleep(1.5) # user request: force wait 1.5s before detection
559
+
560
+ last_h = 0
561
+ stable_count = 0
562
+
563
+ for i in range(200): # Max 200 iterations (~20s)
564
+ try:
565
+ state = tab.run_js('''
566
+ return {
567
+ ready: document.readyState === 'complete',
568
+ title: document.title,
569
+ height: Math.max(
570
+ document.body.scrollHeight || 0,
571
+ document.documentElement.scrollHeight || 0
572
+ ),
573
+ text: document.body.innerText.substring(0, 1000) || "",
574
+ html: document.body.innerHTML.substring(0, 500) // Debug intro
575
+ };
576
+ ''') or {'ready': False, 'title': "", 'height': 0, 'text': ""}
577
+
578
+ is_ready = state.get('ready', False)
579
+ title = state.get('title', "").lower()
580
+ current_h = int(state.get('height', 0))
581
+ text_content = state.get('text', "")
582
+ text_len = len(text_content)
583
+ text_lower = text_content.lower()
584
+
585
+ # Blacklist check
586
+ is_verification = "checking your browser" in text_lower or \
587
+ "just a moment" in text_lower or \
588
+ "please wait" in text_lower or \
589
+ "security check" in title or \
590
+ "just a moment" in title
591
+
592
+ # Stability check
593
+ if current_h == last_h:
594
+ stable_count += 1
595
+ else:
596
+ stable_count = 0
597
+
598
+ # Conditions
599
+ has_content = text_len > 100 # At least 100 real chars
600
+ is_stable = stable_count >= 20 # Always require 2s stability
601
+
602
+ # Pass if all conditions met
603
+ if is_ready and not is_verification and has_content and is_stable:
604
+ break
605
+
606
+ last_h = current_h
607
+
608
+ # Wait timing
609
+ try: tab.wait.eles_loaded(timeout=0.1)
610
+ except: pass
611
+
612
+ except Exception:
613
+ stable_count = 0
614
+ try: time.sleep(0.1)
615
+ except: pass
616
+ continue
617
+
618
+ # DEBUG: Save HTML to inspect what happened (in data dir)
619
+ try:
620
+ import os
621
+ log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
622
+ with open(log_path, "w", encoding="utf-8") as f:
623
+ f.write(f"<!-- URL: {url} -->\n")
624
+ f.write(tab.html)
625
+ except: pass
626
+
627
+ # Use faster scroll step (800) to ensure lazy loaded images appear
628
+ self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
629
+
630
+ except:
631
+ pass
632
+
633
+ # Refine calculation: Set viewport width to 1024
634
+ capture_width = 1024
635
+
636
+ # Calculate actual content height after lazy loading
637
+ try:
638
+ # Use a robust height calculation
639
+ content_height = tab.run_js('''
640
+ return Math.max(
641
+ document.body.scrollHeight || 0,
642
+ document.documentElement.scrollHeight || 0,
643
+ document.body.offsetHeight || 0,
644
+ document.documentElement.offsetHeight || 0,
645
+ document.documentElement.clientHeight || 0
646
+ );
647
+ ''')
648
+ # Add a small buffer and cap at 15000px to prevent memory issues
649
+ h = min(int(content_height) + 50, 15000)
650
+ except:
651
+ h = 1000 # Fallback
652
+
653
+ # Set viewport to full content size for single-shot capture
654
+ try:
655
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
656
+ width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
657
+ except:
658
+ pass
659
+
660
+ # Scrollbar Hiding
661
+ from .manager import SharedBrowserManager
662
+ SharedBrowserManager.hide_scrollbars(tab)
663
+
664
+ # Scroll back to top before screenshot
665
+ tab.run_js("window.scrollTo(0, 0);")
666
+
667
+ # Content is already loaded by _scroll_to_bottom which waits for images
668
+ # Just recalculate final height (content may have grown during scrolling)
669
+ try:
670
+ final_height = tab.run_js('''
671
+ return Math.max(
672
+ document.body.scrollHeight || 0,
673
+ document.documentElement.scrollHeight || 0,
674
+ document.body.offsetHeight || 0,
675
+ document.documentElement.offsetHeight || 0
676
+ );
677
+ ''')
678
+ final_h = min(int(final_height) + 50, 15000)
679
+ if final_h != h:
680
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
681
+ width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
682
+ except:
683
+ pass
684
+
685
+ # Use full_page=False because we manually set the viewport to the full height
686
+ # This avoids stitching artifacts and blank spaces
687
+ return tab.get_screenshot(as_base64='jpg', full_page=False)
688
+
689
+ except Exception as e:
690
+ logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
691
+ return None
692
+ finally:
693
+ if tab:
694
+ try: tab.close()
695
+ except: pass
696
+
697
async def close(self):
    """Shut down the worker thread pool without waiting on it, then log the closure."""
    pool = self._executor
    pool.shutdown(wait=False)
    logger.info("ScreenshotService: Closed.")
700
+
701
async def close_async(self):
    """Alias for ``close``: awaits it and returns nothing."""
    closing = self.close()
    await closing
703
+
704
+ # Singleton
705
# Module-wide service instance; created lazily by get_screenshot_service().
_screenshot_service: Optional[ScreenshotService] = None


def get_screenshot_service(headless: bool = True) -> ScreenshotService:
    """Return the shared ``ScreenshotService``, creating it on first call.

    ``headless`` is only used when the instance is created; later calls
    return the existing instance unchanged.
    """
    global _screenshot_service
    service = _screenshot_service
    if service is None:
        service = ScreenshotService(headless=headless, auto_start=True)
        _screenshot_service = service
    return service
712
+
713
async def close_screenshot_service():
    """Close the shared service, if one exists, and clear the module-level reference."""
    global _screenshot_service
    service = _screenshot_service
    if not service:
        return
    await service.close()
    _screenshot_service = None
718
+
719
def prestart_browser(headless: bool = True):
    """Create the shared service ahead of first use; the instance is not returned."""
    _ = get_screenshot_service(headless=headless)