entari-plugin-hyw 0.3.5__py3-none-any.whl → 4.0.0rc14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/Untitled-1 +1865 -0
- entari_plugin_hyw/__init__.py +979 -116
- entari_plugin_hyw/filters.py +83 -0
- entari_plugin_hyw/history.py +251 -0
- entari_plugin_hyw/misc.py +214 -0
- entari_plugin_hyw/search_cache.py +154 -0
- entari_plugin_hyw-4.0.0rc14.dist-info/METADATA +118 -0
- entari_plugin_hyw-4.0.0rc14.dist-info/RECORD +72 -0
- {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/WHEEL +1 -1
- {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/top_level.txt +1 -0
- hyw_core/__init__.py +94 -0
- hyw_core/agent.py +768 -0
- hyw_core/browser_control/__init__.py +63 -0
- hyw_core/browser_control/assets/card-dist/index.html +425 -0
- hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +9 -0
- hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/gemini.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/google.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +15 -0
- hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/openai.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +24 -0
- hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
- hyw_core/browser_control/assets/card-dist/vite.svg +1 -0
- hyw_core/browser_control/assets/index.html +5691 -0
- hyw_core/browser_control/assets/logos/anthropic.svg +1 -0
- hyw_core/browser_control/assets/logos/cerebras.svg +9 -0
- hyw_core/browser_control/assets/logos/deepseek.png +0 -0
- hyw_core/browser_control/assets/logos/gemini.svg +1 -0
- hyw_core/browser_control/assets/logos/google.svg +1 -0
- hyw_core/browser_control/assets/logos/grok.png +0 -0
- hyw_core/browser_control/assets/logos/huggingface.png +0 -0
- hyw_core/browser_control/assets/logos/microsoft.svg +15 -0
- hyw_core/browser_control/assets/logos/minimax.png +0 -0
- hyw_core/browser_control/assets/logos/mistral.png +0 -0
- hyw_core/browser_control/assets/logos/nvida.png +0 -0
- hyw_core/browser_control/assets/logos/openai.svg +1 -0
- hyw_core/browser_control/assets/logos/openrouter.png +0 -0
- hyw_core/browser_control/assets/logos/perplexity.svg +24 -0
- hyw_core/browser_control/assets/logos/qwen.png +0 -0
- hyw_core/browser_control/assets/logos/xai.png +0 -0
- hyw_core/browser_control/assets/logos/xiaomi.png +0 -0
- hyw_core/browser_control/assets/logos/zai.png +0 -0
- hyw_core/browser_control/engines/__init__.py +15 -0
- hyw_core/browser_control/engines/base.py +13 -0
- hyw_core/browser_control/engines/default.py +166 -0
- hyw_core/browser_control/engines/duckduckgo.py +171 -0
- hyw_core/browser_control/landing.html +172 -0
- hyw_core/browser_control/manager.py +173 -0
- hyw_core/browser_control/renderer.py +446 -0
- hyw_core/browser_control/service.py +940 -0
- hyw_core/config.py +154 -0
- hyw_core/core.py +462 -0
- hyw_core/crawling/__init__.py +18 -0
- hyw_core/crawling/completeness.py +437 -0
- hyw_core/crawling/models.py +88 -0
- hyw_core/definitions.py +104 -0
- hyw_core/image_cache.py +274 -0
- hyw_core/pipeline.py +502 -0
- hyw_core/search.py +171 -0
- hyw_core/stages/__init__.py +21 -0
- hyw_core/stages/base.py +95 -0
- hyw_core/stages/summary.py +191 -0
- entari_plugin_hyw/agent.py +0 -419
- entari_plugin_hyw/compressor.py +0 -59
- entari_plugin_hyw/tools.py +0 -236
- entari_plugin_hyw/vision.py +0 -35
- entari_plugin_hyw-0.3.5.dist-info/METADATA +0 -112
- entari_plugin_hyw-0.3.5.dist-info/RECORD +0 -9
@@ -0,0 +1,940 @@
"""
Browser Service (DrissionPage)

Provides page fetching and screenshot capabilities using DrissionPage.
"""

import asyncio
import base64
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Optional, Dict, Any, List
from loguru import logger
import trafilatura

# Import intelligent completeness checker
from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
from ..crawling.models import CrawlConfig

class ScreenshotService:
    """
    Browser Service using DrissionPage.
    """

    def __init__(self, headless: bool = True, auto_start: bool = True):
        self.headless = headless
        self._manager = None
        self._executor = ThreadPoolExecutor(max_workers=10)

        if auto_start:
            self._ensure_ready()

    def _get_tab(self, url: str) -> Any:
        """Create a new tab and navigate to URL."""
        self._ensure_ready()
        return self._manager.new_tab(url)

    def _release_tab(self, tab: Any):
        """Close tab after use."""
        if not tab: return
        try:
            tab.close()
        except:
            pass

    async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
        """
        Execute concurrent searches using page inputs.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._executor,
            self._search_via_page_input_batch_sync,
            queries, url, selector
        )

    def _search_via_page_input_batch_sync(self, queries: List[str], url: str, selector: str) -> List[Dict[str, Any]]:
        """Sync batch execution - create tabs sequentially, search in parallel."""
        results = [None] * len(queries)
        tabs = []

        # Phase 1: Get/create tabs SEQUENTIALLY (DrissionPage isn't thread-safe for new_tab)
        target_url = url or "https://www.google.com"
        logger.info(f"ScreenshotService: Acquiring {len(queries)} tabs for parallel search...")

        for i in range(len(queries)):
            tab = None
            # Try to get from pool first (using shared logic now)
            try:
                tab = self._get_tab(target_url)
            except Exception as e:
                logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")

            tabs.append(tab)

        logger.info(f"ScreenshotService: {len(tabs)} tabs ready, starting parallel searches...")

        # Phase 2: Execute searches in PARALLEL
        def run_search(index, tab, query):
            try:
                logger.debug(f"Search[{index}]: Starting for '{query}' on {tab.url}")

                # Wait for page to be ready first
                try:
                    tab.wait.doc_loaded(timeout=10)
                except:
                    pass

                # Find input element with wait
                logger.debug(f"Search[{index}]: Looking for input with selector '{selector}'")
                ele = tab.ele(selector, timeout=5)
                if not ele:
                    logger.debug(f"Search[{index}]: Primary selector failed, trying fallbacks")
                    for fallback in ["textarea[name='q']", "#APjFqb", "input[name='q']", "input[type='text']"]:
                        ele = tab.ele(fallback, timeout=2)
                        if ele:
                            logger.debug(f"Search[{index}]: Found input with fallback '{fallback}'")
                            break

                if not ele:
                    logger.error(f"Search[{index}]: No input element found on {tab.url}!")
                    results[index] = {"content": "Error: input not found", "title": "Error", "url": tab.url, "html": tab.html[:5000]}
                    return

                logger.debug(f"Search[{index}]: Typing query...")
                ele.input(query)

                logger.debug(f"Search[{index}]: Pressing Enter...")
                tab.actions.key_down('enter').key_up('enter')

                logger.debug(f"Search[{index}]: Waiting for search results...")
                tab.wait.doc_loaded(timeout=10)
                # Reduced settle wait for extraction
                time.sleep(0.1)

                logger.debug(f"Search[{index}]: Extracting content...")
                html = tab.html
                content = trafilatura.extract(
                    html, include_links=True, include_images=True, include_comments=False,
                    include_tables=True, favor_precision=False, output_format="markdown"
                ) or ""

                logger.info(f"ScreenshotService: Search '{query}' completed -> {tab.url}")

                results[index] = {
                    "content": content,
                    "html": html,
                    "title": tab.title,
                    "url": tab.url,
                    "images": []
                }

            except Exception as e:
                logger.error(f"ScreenshotService: Search error for '{query}': {e}")
                results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
            finally:
                self._release_tab(tab)

        threads = []
        for i, (tab, query) in enumerate(zip(tabs, queries)):
            t = threading.Thread(target=run_search, args=(i, tab, query))
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        return results

    def _ensure_ready(self):
        """Ensure shared browser is ready."""
        from .manager import get_shared_browser_manager
        self._manager = get_shared_browser_manager(headless=self.headless)

    async def fetch_page(self, url: str, timeout: float = 10.0, include_screenshot: bool = True) -> Dict[str, Any]:
        """
        Fetch page content (and optionally screenshot).
        Runs in a thread executor to avoid blocking the async loop.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._executor,
            self._fetch_page_sync,
            url,
            timeout,
            include_screenshot
        )

    async def search_via_address_bar(self, query: str, timeout: float = 20.0) -> Dict[str, Any]:
        """
        Search using browser's address bar (uses browser's default search engine).
        Simulates: Ctrl+L (focus address bar) -> type query -> Enter
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._executor,
            self._search_via_address_bar_sync,
            query,
            timeout
        )

    def _search_via_address_bar_sync(self, query: str, timeout: float) -> Dict[str, Any]:
        """Synchronous address bar search logic."""
        if not query:
            return {"content": "Error: missing query", "title": "Error", "url": "", "html": ""}

        tab = None
        try:
            self._ensure_ready()
            page = self._manager.page
            if not page:
                return {"content": "Error: Browser not available", "title": "Error", "url": "", "html": ""}

            # Open new blank tab
            tab = page.new_tab()

            # Focus address bar with Ctrl+L (or Cmd+L on Mac)
            import platform
            if platform.system() == "Darwin":
                tab.actions.key_down('cmd').key_down('l').key_up('l').key_up('cmd')
            else:
                tab.actions.key_down('ctrl').key_down('l').key_up('l').key_up('ctrl')

            # Small delay for address bar to focus
            import time as _time
            _time.sleep(0.05)

            # Type the query
            tab.actions.type(query)

            # Press Enter to search
            tab.actions.key_down('enter').key_up('enter')

            # Wait for page to load
            try:
                tab.wait.doc_loaded(timeout=timeout)
                # Reduced wait for initial results
                _time.sleep(0.2)
            except:
                pass

            html = tab.html
            title = tab.title
            final_url = tab.url

            # Extract content
            content = trafilatura.extract(
                html, include_links=True, include_images=True, include_comments=False,
                include_tables=True, favor_precision=False, output_format="markdown"
            ) or ""

            logger.info(f"ScreenshotService: Address bar search completed -> {final_url}")

            return {
                "content": content,
                "html": html,
                "title": title,
                "url": final_url,
                "images": []
            }

        except Exception as e:
            logger.error(f"ScreenshotService: Address bar search failed: {e}")
            return {"content": f"Error: search failed ({e})", "title": "Error", "url": "", "html": ""}
        finally:
            if tab:
                try: tab.close()
                except: pass

    def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
        """
        Scroll down gradually to trigger lazy loading.

        Args:
            delay: Max wait time per scroll step (seconds) if images aren't loading.
        """
        import time
        start = time.time()
        current_pos = 0
        try:
            while time.time() - start < timeout:
                # Scroll down
                current_pos += step
                tab.run_js(f"window.scrollTo(0, {current_pos});")

                # Active Wait: Check if images in viewport are loaded
                # Poll every 100ms, up to 'delay' seconds
                wait_start = time.time()
                while time.time() - wait_start < delay:
                    all_loaded = tab.run_js("""
                        return (async () => {
                            const imgs = Array.from(document.querySelectorAll('img'));
                            const viewportHeight = window.innerHeight;

                            // 1. Identify images currently in viewport
                            const visibleImgs = imgs.filter(img => {
                                const rect = img.getBoundingClientRect();
                                return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
                            });

                            if (visibleImgs.length === 0) return true;

                            // 2. Check loading status using decode() AND heuristic for placeholders
                            // Some sites load a tiny blurred placeholder first.
                            const checks = visibleImgs.map(img => {
                                // Enhanced placeholder detection (matching completeness.py)
                                const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
                                                img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
                                const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
                                const loadingAttr = img.getAttribute('loading') || '';
                                const src = img.src || '';

                                const isPlaceholder = (
                                    // data-src not yet loaded
                                    (dataSrc && img.src !== dataSrc) ||
                                    // Natural size much smaller than display (blurred placeholder)
                                    (img.naturalWidth < 50 && img.clientWidth > 100) ||
                                    (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
                                    // Lazy-loading class indicators
                                    className.includes('lazy') ||
                                    className.includes('lazyload') ||
                                    className.includes('lozad') ||
                                    // CSS blur filter applied
                                    (window.getComputedStyle(img).filter || '').includes('blur') ||
                                    // loading="lazy" + not complete
                                    (loadingAttr === 'lazy' && !img.complete)
                                );

                                if (isPlaceholder) {
                                    // If it looks like a placeholder, we return false (not loaded)
                                    // unless it stays like this for too long (handled by outer timeout)
                                    return Promise.resolve(false);
                                }

                                if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);

                                return img.decode().then(() => true).catch(() => false);
                            });

                            // Race against a small timeout to avoid hanging on one broken image
                            const allDecoded = Promise.all(checks);
                            const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));

                            // If any check returned false (meaning placeholder or not decoded), result is false
                            return Promise.race([allDecoded, timeout]).then(results => {
                                if (!Array.isArray(results)) return results === true;
                                return results.every(res => res === true);
                            });
                        })();
                    """)
                    if all_loaded:
                        break
                    time.sleep(0.1)

                # Check if reached bottom
                height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
                if current_pos >= height:
                    break

            # Ensure final layout settle
            time.sleep(0.2)

        except Exception as e:
            logger.warning(f"ScreenshotService: Scroll failed: {e}")

    def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
        """Synchronous fetch logic."""
        if not url:
            return {"content": "Error: missing url", "title": "Error", "url": ""}

        tab = None
        try:
            self._ensure_ready()
            page = self._manager.page
            if not page:
                return {"content": "Error: Browser not available", "title": "Error", "url": url}

            # Get from pool
            tab = self._get_tab(url)

            # Wait logic - optimized for search pages
            is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
            if is_search_page:
                # Optimized waiting: Rapidly poll for ACTUAL results > 0
                start_time = time.time()

                # Special fast-path for DDG Lite (HTML only, no JS rendering needed)
                if 'lite.duckduckgo' in url:
                    # just wait for body, it's static HTML
                    try:
                        tab.wait.doc_loaded(timeout=timeout)
                    except: pass
                    # Sleep tiny bit to ensure render
                    time.sleep(0.5)
                else:
                    while time.time() - start_time < timeout:
                        found_results = False
                        try:
                            if 'google' in url.lower():
                                # Check if we have any result items (.g, .MjjYud) or the main container (#search)
                                # Using checks with minimal timeout to allow fast looping
                                if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
                                    found_results = True
                            elif 'bing' in url.lower():
                                if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
                                    found_results = True
                            elif 'duckduckgo' in url.lower():
                                if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
                                    found_results = True
                            else:
                                # Generic fallback: wait for body to be populated
                                if tab.ele('body', timeout=0.1):
                                    found_results = True
                        except:
                            pass

                        if found_results:
                            break
                        time.sleep(0.05)  # Faster polling (50ms) as requested
            else:
                # 1. Wait for document to settle (Fast Dynamic Wait)
                try:
                    tab.wait.doc_loaded(timeout=timeout)
                except: pass

            html = tab.html
            title = tab.title
            final_url = tab.url

            raw_screenshot_b64 = None
            if include_screenshot:
                try:
                    # Scrollbar Hiding Best Effort
                    from .manager import SharedBrowserManager
                    SharedBrowserManager.hide_scrollbars(tab)

                    # Inject CSS
                    tab.run_js("""
                        const style = document.createElement('style');
                        style.textContent = `
                            ::-webkit-scrollbar { display: none !important; }
                            html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
                        `;
                        document.head.appendChild(style);
                        document.documentElement.style.overflow = 'hidden';
                        document.body.style.overflow = 'hidden';
                    """)

                    raw_screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
                except Exception as e:
                    logger.warning(f"ScreenshotService: Failed to capture screenshot: {e}")

            # Extract content
            content = trafilatura.extract(
                html, include_links=True, include_images=True, include_comments=False,
                include_tables=True, favor_precision=False, output_format="markdown"
            ) or ""

            # 2. Extract Images via Parallelized JS (Gallery)
            # Strategy: For search pages, use Canvas to grab already loaded images (Instant)
            # For other pages, use fetch (more robust for lazy load)
            images_b64 = []
            try:
                js_code = """
                (async () => {
                    const blocklist = ['logo', 'icon', 'avatar', 'ad', 'pixel', 'tracker', 'button', 'menu', 'nav'];
                    const candidates = Array.from(document.querySelectorAll('img'));
                    const validImages = [];

                    // Helper: Get base64 from loaded image via Canvas
                    const getBase64 = (img) => {
                        try {
                            const canvas = document.createElement('canvas');
                            canvas.width = img.naturalWidth;
                            canvas.height = img.naturalHeight;
                            const ctx = canvas.getContext('2d');
                            ctx.drawImage(img, 0, 0);
                            return canvas.toDataURL('image/jpeg').split(',')[1];
                        } catch(e) { return null; }
                    };

                    for (const img of candidates) {
                        if (validImages.length >= 8) break;

                        if (img.naturalWidth < 100 || img.naturalHeight < 80) continue;

                        const alt = (img.alt || '').toLowerCase();
                        const cls = (typeof img.className === 'string' ? img.className : '').toLowerCase();
                        const src = (img.src || '').toLowerCase();

                        if (blocklist.some(b => alt.includes(b) || cls.includes(b) || src.includes(b))) continue;

                        // 1. Try Canvas (Instant for loaded images)
                        if (img.complete && img.naturalHeight > 0) {
                            const b64 = getBase64(img);
                            if (b64) {
                                validImages.push(b64);
                                continue;
                            }
                        }

                        // 2. Fallback to fetch (only for non-search pages to avoid delay)
                        // We skip fetch for search pages to ensure speed
                        if (!window.location.href.includes('google') && !window.location.href.includes('search')) {
                            try {
                                const controller = new AbortController();
                                const id = setTimeout(() => controller.abort(), 2000);
                                const resp = await fetch(img.src, { signal: controller.signal });
                                clearTimeout(id);
                                const blob = await resp.blob();
                                const b64 = await new Promise(resolve => {
                                    const reader = new FileReader();
                                    reader.onloadend = () => resolve(reader.result.split(',')[1]);
                                    reader.onerror = () => resolve(null);
                                    reader.readAsDataURL(blob);
                                });
                                if (b64) validImages.push(b64);
                            } catch(e) {}
                        }
                    }
                    return validImages;
                })()
                """
                images_b64 = tab.run_js(js_code, as_expr=True) or []

                if images_b64:
                    logger.info(f"ScreenshotService: Extracted {len(images_b64)} images for {url}")

            except Exception as e:
                logger.warning(f"ScreenshotService: Image extraction failed: {e}")

            return {
                "content": content,
                "html": html,
                "title": title,
                "url": final_url,
                "raw_screenshot_b64": raw_screenshot_b64,
                "images": images_b64
            }

        except Exception as e:
            logger.error(f"ScreenshotService: Failed to fetch {url}: {e}")
            return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
        finally:
            if tab:
                self._release_tab(tab)

    async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
        """Fetch multiple pages concurrently."""
        if not urls: return []
        logger.info(f"ScreenshotService: Batch fetching {len(urls)} URLs (screenshots={include_screenshot})")
        tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)

    async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
        """Take screenshots of multiple URLs concurrently."""
        if not urls: return []
        logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
        tasks = [self.screenshot_url(url, timeout=timeout, full_page=full_page) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)

    async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
        """Screenshot URL (Async wrapper for sync). Returns base64 string only."""
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            self._executor,
            self._screenshot_sync,
            url, wait_load, timeout, full_page, quality, False  # extract_content=False
        )
        # Backward compatible: return just the screenshot for old callers
        if isinstance(result, dict):
            return result.get("screenshot_b64")
        return result

    async def screenshot_with_content(self, url: str, timeout: float = 15.0, max_content_length: int = 8000) -> Dict[str, Any]:
        """
        Screenshot URL and extract page content.

        Returns:
            Dict with:
            - screenshot_b64: base64 encoded screenshot
            - content: trafilatura extracted text (truncated to max_content_length)
            - title: page title
            - url: final URL
        """
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            self._executor,
            self._screenshot_sync,
            url, True, timeout, False, 65, True  # quality=65 for balance, extract_content=True
        )

        if not isinstance(result, dict):
            return {"screenshot_b64": result, "content": "", "title": "", "url": url}

        # Truncate content if needed
        content = result.get("content", "") or ""
        if len(content) > max_content_length:
            content = content[:max_content_length] + "\n\n[内容已截断...]"
        result["content"] = content

        return result

    def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int, extract_content: bool = False) -> Any:
        """Synchronous screenshot. If extract_content=True, returns Dict else str."""
        if not url:
            return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
        tab = None
        capture_width = 1440  # Higher resolution for readability

        try:
            self._ensure_ready()
            page = self._manager.page
            if not page: return None

            # Create blank tab first
            tab = page.new_tab()

            # Set viewport BEFORE navigation so page renders at target width from the start
            # This eliminates the need for post-load resize and reflow
            try:
                tab.run_cdp('Emulation.setDeviceMetricsOverride',
                            width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
            except:
                pass

            # Now navigate to the URL - page will render at target width
            tab.get(url)

            # Network monitoring removed - not needed for simplified wait logic

            # Initialize crawl config for completeness checking (defined outside try for scope)
            crawl_config = CrawlConfig(
                scan_full_page=True,
                scroll_step=800,
                scroll_delay=0.5,
                scroll_timeout=20.0,  # Increased for lazy-loading pages
                image_load_timeout=3.0,  # Image loading timeout: 3 seconds
                image_stability_checks=3,
            )

            # === Start scrolling immediately to trigger lazy loading ===
            # Scroll first, then wait for DOM - this allows lazy loading to start in parallel
            logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
            trigger_lazy_load(tab, crawl_config)

            # Now wait for DOM to be ready (after scroll has triggered lazy loading)
            try:
                # Wait for full page load (including JS execution)
                try:
                    tab.wait.doc_loaded(timeout=timeout)
                except:
                    pass

                # Wait for actual content to appear (for CDN verification pages)
                # Simplified wait logic for screenshot: just check basic readiness

                for i in range(20):  # Max 20 iterations (~1s) - much faster
                    try:
                        state = tab.run_js('''
                            return {
                                ready: document.readyState === 'complete',
                                title: document.title,
                                text: document.body.innerText.substring(0, 500) || ""
                            };
                        ''') or {'ready': False, 'title': "", 'text': ""}

                        is_ready = state.get('ready', False)
                        title = state.get('title', "").lower()
                        text_lower = state.get('text', "").lower()
                        text_len = len(text_lower)

                        # Check for verification pages
                        is_verification = "checking your browser" in text_lower or \
                                          "just a moment" in text_lower or \
                                          "please wait" in text_lower or \
                                          "security check" in title or \
                                          "just a moment" in title or \
                                          "loading..." in title

                        # Basic content check
                        has_content = text_len > 100

                        # Pass if ready, not verification, and has content
                        if is_ready and not is_verification and has_content:
                            logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
                            break

                        time.sleep(0.05)
                    except Exception:
                        time.sleep(0.05)
                        continue

                # DEBUG: Save HTML to inspect what happened (in data dir)
                try:
                    import os
                    log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
                    with open(log_path, "w", encoding="utf-8") as f:
                        f.write(f"<!-- URL: {url} -->\n")
                        f.write(tab.html)
                except: pass

            except Exception as e:
                logger.warning(f"ScreenshotService: Page readiness check failed: {e}")

            # Scrollbar Hiding first (before any height calculation)
            from .manager import SharedBrowserManager
            SharedBrowserManager.hide_scrollbars(tab)

            # Scroll back to top
            tab.run_js("window.scrollTo(0, 0);")

            # Image loading monitoring with time tracking - DISABLED
            # No longer waiting for images to load
            # Initialize image tracking JavaScript
            # image_tracking_js = """
            # (() => {
            #     if (!window._imageLoadTracker) {
            #         window._imageLoadTracker = {
            #             startTime: Date.now(),
            #             images: new Map()
            #         };
            #
            #         const imgs = Array.from(document.querySelectorAll('img'));
            #         const minSize = 50;
            #
            #         imgs.forEach((img, idx) => {
            #             if (img.clientWidth < minSize && img.clientHeight < minSize) return;
            #
            #             const src = img.src || img.getAttribute('data-src') || '';
            #             const key = `${idx}_${src.substring(0, 100)}`;
            #
            #             if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
            #                 // Already loaded
            #                 window._imageLoadTracker.images.set(key, {
            #                     src: src.substring(0, 150),
            #                     status: 'loaded',
            #                     loadTime: 0, // Already loaded before tracking
            #                     naturalSize: [img.naturalWidth, img.naturalHeight],
            #                     displaySize: [img.clientWidth, img.clientHeight]
            #                 });
            #             } else {
            #                 // Track loading
            #                 window._imageLoadTracker.images.set(key, {
            #                     src: src.substring(0, 150),
            #                     status: 'pending',
            #                     startTime: Date.now(),
            #                     naturalSize: [img.naturalWidth, img.naturalHeight],
            #                     displaySize: [img.clientWidth, img.clientHeight]
            #                 });
            #
            #                 // Add load event listener
            #                 img.addEventListener('load', () => {
            #                     const entry = window._imageLoadTracker.images.get(key);
            #                     if (entry && entry.status === 'pending') {
            #                         entry.status = 'loaded';
            #                         entry.loadTime = Date.now() - entry.startTime;
            #                     }
            #                 });
            #
            #                 img.addEventListener('error', () => {
            #                     const entry = window._imageLoadTracker.images.get(key);
            #                     if (entry && entry.status === 'pending') {
            #                         entry.status = 'failed';
            #                         entry.loadTime = Date.now() - entry.startTime;
            #                     }
            #                 });
            #             }
            #         });
            #     }
            #
            #     // Return current status
            #     const results = [];
            #     window._imageLoadTracker.images.forEach((value, key) => {
            #         const entry = {
            #             src: value.src,
            #             status: value.status,
            #             loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
            #             naturalSize: value.naturalSize,
            #             displaySize: value.displaySize
            #         };
            #         results.push(entry);
            #     });
            #
            #     return {
            #         total: results.length,
            #         loaded: results.filter(r => r.status === 'loaded').length,
            #         pending: results.filter(r => r.status === 'pending').length,
            #         failed: results.filter(r => r.status === 'failed').length,
            #         details: results
            #     };
            # })()
            # """
            #
            # # Initialize tracking
            # tab.run_js(image_tracking_js)
            #
            # # Monitor image loading with dynamic stop logic
            # check_interval = 0.2  # Check every 200ms
            # image_timeout = 3.0  # Image loading timeout: 3 seconds
            # monitoring_start = time.time()
            # loaded_times = []  # Track load times of completed images
            #
            # logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
            #
            # while True:
            #     elapsed = time.time() - monitoring_start
            #
            #     # Check timeout first
            #     if elapsed >= image_timeout:
            #         logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
            #         break
            #
            #     # Get current image status
            #     status = tab.run_js(image_tracking_js, as_expr=True) or {
            #         'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
            #     }
            #
            #     # Log each image's status and load time
            #     for img_detail in status.get('details', []):
            #         src_short = img_detail.get('src', '')[:80]
            #         status_str = img_detail.get('status', 'unknown')
            #         load_time = img_detail.get('loadTime', 0)
            #         logger.info(
            #             f"ScreenshotService: Image [{status_str}] "
            #             f"loadTime={load_time:.0f}ms "
            #             f"src={src_short}"
            #         )
            #
            #     # Collect load times of completed images
            #     loaded_times = [
            #         img.get('loadTime', 0)
            #         for img in status.get('details', [])
            #         if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
            #     ]
            #
            #     pending_count = status.get('pending', 0)
            #     loaded_count = status.get('loaded', 0)
            #
            #     # Check stop conditions
            #     if pending_count == 0:
            #         logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
            #         break
            #
            #     # Check dynamic stop condition (if we have loaded images to calculate average)
            #     if loaded_times:
            #         avg_load_time = sum(loaded_times) / len(loaded_times)
            #         max_wait_time = avg_load_time * 2
            #
            #         # Check if any pending image has exceeded max wait time
            #         pending_images = [
            #             img for img in status.get('details', [])
            #             if img.get('status') == 'pending'
            #         ]
            #
            #         should_stop = False
            #         for pending_img in pending_images:
            #             wait_time = pending_img.get('loadTime', 0)
            #             if wait_time >= max_wait_time:
            #                 should_stop = True
            #                 logger.info(
            #                     f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
            #                     f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
            #                 )
            #                 break
            #
            #         if should_stop:
            #             break
            #
            #     # Wait before next check
            #     time.sleep(check_interval)

            # Now calculate final height ONCE after all content loaded
            # CompletenessChecker already verified height stability
            try:
                final_height = tab.run_js('''
                    return Math.max(
                        document.body.scrollHeight || 0,
                        document.documentElement.scrollHeight || 0,
                        document.body.offsetHeight || 0,
                        document.documentElement.offsetHeight || 0
                    );
                ''')
                h = min(int(final_height) + 50, 15000)
                tab.run_cdp('Emulation.setDeviceMetricsOverride',
                            width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
            except:
                pass

            # Final scroll to top
            tab.run_js("window.scrollTo(0, 0);")

            # Capture screenshot
            screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)

            # Extract content if requested
            if extract_content:
                try:
                    html = tab.html
                    title = tab.title
                    final_url = tab.url

                    # Minimal trafilatura settings to reduce token consumption
                    content = trafilatura.extract(
                        html,
                        include_links=False,  # No links to reduce tokens
                        include_images=False,  # No image descriptions
                        include_comments=False,  # No comments
                        include_tables=False,  # No tables (can be verbose)
                        favor_precision=True,  # Favor precision over recall
                        output_format="txt"  # Plain text (no markdown formatting)
                    ) or ""

                    return {
                        "screenshot_b64": screenshot_b64,
                        "content": content,
                        "title": title,
                        "url": final_url
                    }
                except Exception as e:
                    logger.warning(f"ScreenshotService: Content extraction failed: {e}")
                    return {"screenshot_b64": screenshot_b64, "content": "", "title": "", "url": url}

            return screenshot_b64

        except Exception as e:
            logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
            return {"screenshot_b64": None, "content": "", "title": "", "url": url} if extract_content else None
        finally:
            if tab:
                try: tab.close()
                except: pass

    async def close(self):
        self._executor.shutdown(wait=False)
        logger.info("ScreenshotService: Closed.")

    async def close_async(self):
        await self.close()

# Singleton
_screenshot_service: Optional[ScreenshotService] = None

def get_screenshot_service(headless: bool = True) -> ScreenshotService:
    global _screenshot_service
    if _screenshot_service is None:
        _screenshot_service = ScreenshotService(headless=headless, auto_start=True)
    return _screenshot_service

async def close_screenshot_service():
    global _screenshot_service
    if _screenshot_service:
        await _screenshot_service.close()
        _screenshot_service = None

def prestart_browser(headless: bool = True):
    get_screenshot_service(headless=headless)