entari-plugin-hyw 4.0.0rc7__py3-none-any.whl → 4.0.0rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- entari_plugin_hyw/Untitled-1 +1865 -0
- entari_plugin_hyw/__init__.py +726 -394
- entari_plugin_hyw/history.py +26 -13
- entari_plugin_hyw/misc.py +3 -0
- entari_plugin_hyw/search_cache.py +154 -0
- {entari_plugin_hyw-4.0.0rc7.dist-info → entari_plugin_hyw-4.0.0rc9.dist-info}/METADATA +3 -1
- entari_plugin_hyw-4.0.0rc9.dist-info/RECORD +68 -0
- {entari_plugin_hyw-4.0.0rc7.dist-info → entari_plugin_hyw-4.0.0rc9.dist-info}/WHEEL +1 -1
- {entari_plugin_hyw-4.0.0rc7.dist-info → entari_plugin_hyw-4.0.0rc9.dist-info}/top_level.txt +1 -0
- hyw_core/__init__.py +94 -0
- hyw_core/browser_control/__init__.py +65 -0
- hyw_core/browser_control/assets/card-dist/index.html +409 -0
- hyw_core/browser_control/assets/index.html +5691 -0
- hyw_core/browser_control/engines/__init__.py +17 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/duckduckgo.py +42 -8
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/google.py +1 -1
- {entari_plugin_hyw/browser → hyw_core/browser_control}/manager.py +15 -8
- entari_plugin_hyw/render_vue.py → hyw_core/browser_control/renderer.py +29 -14
- {entari_plugin_hyw/browser → hyw_core/browser_control}/service.py +340 -112
- hyw_core/config.py +154 -0
- hyw_core/core.py +322 -0
- hyw_core/definitions.py +83 -0
- entari_plugin_hyw/modular_pipeline.py → hyw_core/pipeline.py +121 -97
- {entari_plugin_hyw → hyw_core}/search.py +19 -14
- hyw_core/stages/__init__.py +21 -0
- entari_plugin_hyw/stage_base.py → hyw_core/stages/base.py +2 -2
- entari_plugin_hyw/stage_summary.py → hyw_core/stages/summary.py +34 -11
- entari_plugin_hyw/assets/card-dist/index.html +0 -387
- entari_plugin_hyw/browser/__init__.py +0 -10
- entari_plugin_hyw/browser/engines/bing.py +0 -95
- entari_plugin_hyw/card-ui/.gitignore +0 -24
- entari_plugin_hyw/card-ui/README.md +0 -5
- entari_plugin_hyw/card-ui/index.html +0 -16
- entari_plugin_hyw/card-ui/package-lock.json +0 -2342
- entari_plugin_hyw/card-ui/package.json +0 -31
- entari_plugin_hyw/card-ui/public/logos/anthropic.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/cerebras.svg +0 -9
- entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/gemini.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/google.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/microsoft.svg +0 -15
- entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/openai.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/perplexity.svg +0 -24
- entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
- entari_plugin_hyw/card-ui/public/vite.svg +0 -1
- entari_plugin_hyw/card-ui/src/App.vue +0 -787
- entari_plugin_hyw/card-ui/src/assets/vue.svg +0 -1
- entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +0 -41
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +0 -382
- entari_plugin_hyw/card-ui/src/components/SectionCard.vue +0 -41
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +0 -240
- entari_plugin_hyw/card-ui/src/main.ts +0 -5
- entari_plugin_hyw/card-ui/src/style.css +0 -29
- entari_plugin_hyw/card-ui/src/test_regex.js +0 -103
- entari_plugin_hyw/card-ui/src/types.ts +0 -61
- entari_plugin_hyw/card-ui/tsconfig.app.json +0 -16
- entari_plugin_hyw/card-ui/tsconfig.json +0 -7
- entari_plugin_hyw/card-ui/tsconfig.node.json +0 -26
- entari_plugin_hyw/card-ui/vite.config.ts +0 -16
- entari_plugin_hyw/definitions.py +0 -174
- entari_plugin_hyw/stage_instruct.py +0 -355
- entari_plugin_hyw/stage_instruct_deepsearch.py +0 -104
- entari_plugin_hyw/stage_vision.py +0 -113
- entari_plugin_hyw-4.0.0rc7.dist-info/RECORD +0 -102
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/anthropic.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/cerebras.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/deepseek.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/gemini.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/google.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/grok.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/huggingface.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/microsoft.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/minimax.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/mistral.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/nvida.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openai.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openrouter.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/perplexity.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/qwen.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xai.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xiaomi.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/zai.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/vite.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/anthropic.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/cerebras.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/deepseek.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/gemini.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/google.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/grok.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/huggingface.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/microsoft.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/minimax.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/mistral.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/nvida.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openai.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openrouter.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/perplexity.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/qwen.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xai.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xiaomi.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/zai.png +0 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/base.py +0 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/default.py +0 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/landing.html +0 -0
- {entari_plugin_hyw → hyw_core}/image_cache.py +0 -0
|
@@ -22,65 +22,22 @@ class ScreenshotService:
|
|
|
22
22
|
self.headless = headless
|
|
23
23
|
self._manager = None
|
|
24
24
|
self._executor = ThreadPoolExecutor(max_workers=10)
|
|
25
|
-
self._search_tab_pool = [] # List of Tab objects
|
|
26
|
-
self._pool_lock = threading.Lock()
|
|
27
25
|
|
|
28
26
|
if auto_start:
|
|
29
27
|
self._ensure_ready()
|
|
30
|
-
|
|
31
|
-
def prepare_search_tabs_background(self, count: int, url: str = "https://www.google.com") -> None:
|
|
32
|
-
"""
|
|
33
|
-
Pre-launch tabs for search (BACKGROUND - fire and forget).
|
|
34
|
-
Tabs are created in background thread, may not be ready immediately.
|
|
35
|
-
"""
|
|
36
|
-
self._executor.submit(self._prepare_search_tabs_sync, count, url)
|
|
37
28
|
|
|
38
|
-
def
|
|
39
|
-
"""
|
|
29
|
+
def _get_tab(self, url: str) -> Any:
|
|
30
|
+
"""Create a new tab and navigate to URL."""
|
|
31
|
+
self._ensure_ready()
|
|
32
|
+
return self._manager.new_tab(url)
|
|
33
|
+
|
|
34
|
+
def _release_tab(self, tab: Any):
|
|
35
|
+
"""Close tab after use."""
|
|
36
|
+
if not tab: return
|
|
40
37
|
try:
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
with self._pool_lock:
|
|
46
|
-
current_count = len(self._search_tab_pool)
|
|
47
|
-
needed = count - current_count
|
|
48
|
-
|
|
49
|
-
if needed <= 0:
|
|
50
|
-
return
|
|
51
|
-
|
|
52
|
-
logger.info(f"ScreenshotService: Pre-launching {needed} search tabs for {url} (parallel)...")
|
|
53
|
-
|
|
54
|
-
# Create tabs in parallel using threads
|
|
55
|
-
created_tabs = [None] * needed
|
|
56
|
-
|
|
57
|
-
def create_single_tab(index):
|
|
58
|
-
try:
|
|
59
|
-
tab = page.new_tab(url)
|
|
60
|
-
created_tabs[index] = tab
|
|
61
|
-
logger.debug(f"ScreenshotService: Tab {index} ready")
|
|
62
|
-
except Exception as e:
|
|
63
|
-
logger.error(f"ScreenshotService: Failed to create tab {index}: {e}")
|
|
64
|
-
|
|
65
|
-
threads = []
|
|
66
|
-
for i in range(needed):
|
|
67
|
-
t = threading.Thread(target=create_single_tab, args=(i,))
|
|
68
|
-
t.start()
|
|
69
|
-
threads.append(t)
|
|
70
|
-
|
|
71
|
-
# Wait for all threads to complete
|
|
72
|
-
for t in threads:
|
|
73
|
-
t.join()
|
|
74
|
-
|
|
75
|
-
# Add successfully created tabs to pool
|
|
76
|
-
with self._pool_lock:
|
|
77
|
-
for tab in created_tabs:
|
|
78
|
-
if tab:
|
|
79
|
-
self._search_tab_pool.append(tab)
|
|
80
|
-
logger.info(f"ScreenshotService: Tab pool ready ({len(self._search_tab_pool)} tabs)")
|
|
81
|
-
|
|
82
|
-
except Exception as e:
|
|
83
|
-
logger.error(f"ScreenshotService: Failed to prepare tabs: {e}")
|
|
38
|
+
tab.close()
|
|
39
|
+
except:
|
|
40
|
+
pass
|
|
84
41
|
|
|
85
42
|
async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
|
|
86
43
|
"""
|
|
@@ -104,17 +61,13 @@ class ScreenshotService:
|
|
|
104
61
|
|
|
105
62
|
for i in range(len(queries)):
|
|
106
63
|
tab = None
|
|
107
|
-
# Try to get from pool first
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
64
|
+
# Try to get from pool first (using shared logic now)
|
|
65
|
+
try:
|
|
66
|
+
tab = self._get_tab(target_url)
|
|
67
|
+
except Exception as e:
|
|
68
|
+
logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")
|
|
112
69
|
|
|
113
|
-
|
|
114
|
-
# Create new
|
|
115
|
-
self._ensure_ready()
|
|
116
|
-
tab = self._manager.page.new_tab(target_url)
|
|
117
|
-
logger.debug(f"ScreenshotService: Created tab {i} for {target_url}")
|
|
70
|
+
|
|
118
71
|
|
|
119
72
|
tabs.append(tab)
|
|
120
73
|
|
|
@@ -155,7 +108,8 @@ class ScreenshotService:
|
|
|
155
108
|
|
|
156
109
|
logger.debug(f"Search[{index}]: Waiting for search results...")
|
|
157
110
|
tab.wait.doc_loaded(timeout=10)
|
|
158
|
-
|
|
111
|
+
# Reduced settle wait for extraction
|
|
112
|
+
time.sleep(0.1)
|
|
159
113
|
|
|
160
114
|
logger.debug(f"Search[{index}]: Extracting content...")
|
|
161
115
|
html = tab.html
|
|
@@ -178,8 +132,7 @@ class ScreenshotService:
|
|
|
178
132
|
logger.error(f"ScreenshotService: Search error for '{query}': {e}")
|
|
179
133
|
results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
|
|
180
134
|
finally:
|
|
181
|
-
|
|
182
|
-
except: pass
|
|
135
|
+
self._release_tab(tab)
|
|
183
136
|
|
|
184
137
|
threads = []
|
|
185
138
|
for i, (tab, query) in enumerate(zip(tabs, queries)):
|
|
@@ -248,7 +201,7 @@ class ScreenshotService:
|
|
|
248
201
|
|
|
249
202
|
# Small delay for address bar to focus
|
|
250
203
|
import time as _time
|
|
251
|
-
_time.sleep(0.
|
|
204
|
+
_time.sleep(0.05)
|
|
252
205
|
|
|
253
206
|
# Type the query
|
|
254
207
|
tab.actions.type(query)
|
|
@@ -259,8 +212,8 @@ class ScreenshotService:
|
|
|
259
212
|
# Wait for page to load
|
|
260
213
|
try:
|
|
261
214
|
tab.wait.doc_loaded(timeout=timeout)
|
|
262
|
-
#
|
|
263
|
-
_time.sleep(
|
|
215
|
+
# Reduced wait for initial results
|
|
216
|
+
_time.sleep(0.2)
|
|
264
217
|
except:
|
|
265
218
|
pass
|
|
266
219
|
|
|
@@ -292,6 +245,89 @@ class ScreenshotService:
|
|
|
292
245
|
try: tab.close()
|
|
293
246
|
except: pass
|
|
294
247
|
|
|
248
|
+
def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
|
|
249
|
+
"""
|
|
250
|
+
Scroll down gradually to trigger lazy loading.
|
|
251
|
+
|
|
252
|
+
Args:
|
|
253
|
+
delay: Max wait time per scroll step (seconds) if images aren't loading.
|
|
254
|
+
"""
|
|
255
|
+
import time
|
|
256
|
+
start = time.time()
|
|
257
|
+
current_pos = 0
|
|
258
|
+
try:
|
|
259
|
+
while time.time() - start < timeout:
|
|
260
|
+
# Scroll down
|
|
261
|
+
current_pos += step
|
|
262
|
+
tab.run_js(f"window.scrollTo(0, {current_pos});")
|
|
263
|
+
|
|
264
|
+
# Active Wait: Check if images in viewport are loaded
|
|
265
|
+
# Poll every 100ms, up to 'delay' seconds
|
|
266
|
+
wait_start = time.time()
|
|
267
|
+
while time.time() - wait_start < delay:
|
|
268
|
+
all_loaded = tab.run_js("""
|
|
269
|
+
return (async () => {
|
|
270
|
+
const imgs = Array.from(document.querySelectorAll('img'));
|
|
271
|
+
const viewportHeight = window.innerHeight;
|
|
272
|
+
|
|
273
|
+
// 1. Identify images currently in viewport
|
|
274
|
+
const visibleImgs = imgs.filter(img => {
|
|
275
|
+
const rect = img.getBoundingClientRect();
|
|
276
|
+
return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
|
|
277
|
+
});
|
|
278
|
+
|
|
279
|
+
if (visibleImgs.length === 0) return true;
|
|
280
|
+
|
|
281
|
+
// 2. Check loading status using decode() AND heuristic for placeholders
|
|
282
|
+
// Some sites load a tiny blurred placeholder first.
|
|
283
|
+
const checks = visibleImgs.map(img => {
|
|
284
|
+
// HEURISTIC: content is likely not ready if:
|
|
285
|
+
// - img has 'data-src' but src is different (or src is empty)
|
|
286
|
+
// - img has 'loading="lazy"' and is not complete
|
|
287
|
+
// - naturalWidth is very small (placeholder) compared to display width
|
|
288
|
+
|
|
289
|
+
const isPlaceholder = (
|
|
290
|
+
(img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
|
|
291
|
+
(img.naturalWidth < 50 && img.clientWidth > 100)
|
|
292
|
+
);
|
|
293
|
+
|
|
294
|
+
if (isPlaceholder) {
|
|
295
|
+
// If it looks like a placeholder, we return false (not loaded)
|
|
296
|
+
// unless it stays like this for too long (handled by outer timeout)
|
|
297
|
+
return Promise.resolve(false);
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);
|
|
301
|
+
|
|
302
|
+
return img.decode().then(() => true).catch(() => false);
|
|
303
|
+
});
|
|
304
|
+
|
|
305
|
+
// Race against a small timeout to avoid hanging on one broken image
|
|
306
|
+
const allDecoded = Promise.all(checks);
|
|
307
|
+
const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));
|
|
308
|
+
|
|
309
|
+
// If any check returned false (meaning placeholder or not decoded), result is false
|
|
310
|
+
return Promise.race([allDecoded, timeout]).then(results => {
|
|
311
|
+
if (!Array.isArray(results)) return results === true;
|
|
312
|
+
return results.every(res => res === true);
|
|
313
|
+
});
|
|
314
|
+
})();
|
|
315
|
+
""")
|
|
316
|
+
if all_loaded:
|
|
317
|
+
break
|
|
318
|
+
time.sleep(0.1)
|
|
319
|
+
|
|
320
|
+
# Check if reached bottom
|
|
321
|
+
height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
|
|
322
|
+
if current_pos >= height:
|
|
323
|
+
break
|
|
324
|
+
|
|
325
|
+
# Ensure final layout settle
|
|
326
|
+
time.sleep(0.2)
|
|
327
|
+
|
|
328
|
+
except Exception as e:
|
|
329
|
+
logger.warning(f"ScreenshotService: Scroll failed: {e}")
|
|
330
|
+
|
|
295
331
|
def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
|
|
296
332
|
"""Synchronous fetch logic."""
|
|
297
333
|
if not url:
|
|
@@ -304,27 +340,48 @@ class ScreenshotService:
|
|
|
304
340
|
if not page:
|
|
305
341
|
return {"content": "Error: Browser not available", "title": "Error", "url": url}
|
|
306
342
|
|
|
307
|
-
#
|
|
308
|
-
tab =
|
|
343
|
+
# Get from pool
|
|
344
|
+
tab = self._get_tab(url)
|
|
309
345
|
|
|
310
346
|
# Wait logic - optimized for search pages
|
|
311
347
|
is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
|
|
312
348
|
if is_search_page:
|
|
313
|
-
# Optimized waiting for
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
349
|
+
# Optimized waiting: Rapidly poll for ACTUAL results > 0
|
|
350
|
+
start_time = time.time()
|
|
351
|
+
|
|
352
|
+
# Special fast-path for DDG Lite (HTML only, no JS rendering needed)
|
|
353
|
+
if 'lite.duckduckgo' in url:
|
|
354
|
+
# just wait for body, it's static HTML
|
|
355
|
+
try:
|
|
356
|
+
tab.wait.doc_loaded(timeout=timeout)
|
|
357
|
+
except: pass
|
|
358
|
+
# Sleep tiny bit to ensure render
|
|
359
|
+
time.sleep(0.5)
|
|
360
|
+
else:
|
|
361
|
+
while time.time() - start_time < timeout:
|
|
362
|
+
found_results = False
|
|
363
|
+
try:
|
|
364
|
+
if 'google' in url.lower():
|
|
365
|
+
# Check if we have any result items (.g, .MjjYud) or the main container (#search)
|
|
366
|
+
# Using checks with minimal timeout to allow fast looping
|
|
367
|
+
if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
|
|
368
|
+
found_results = True
|
|
369
|
+
elif 'bing' in url.lower():
|
|
370
|
+
if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
|
|
371
|
+
found_results = True
|
|
372
|
+
elif 'duckduckgo' in url.lower():
|
|
373
|
+
if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
|
|
374
|
+
found_results = True
|
|
375
|
+
else:
|
|
376
|
+
# Generic fallback: wait for body to be populated
|
|
377
|
+
if tab.ele('body', timeout=0.1):
|
|
378
|
+
found_results = True
|
|
379
|
+
except:
|
|
380
|
+
pass
|
|
381
|
+
|
|
382
|
+
if found_results:
|
|
383
|
+
break
|
|
384
|
+
time.sleep(0.05) # Faster polling (50ms) as requested
|
|
328
385
|
else:
|
|
329
386
|
# 1. Wait for document to settle (Fast Dynamic Wait)
|
|
330
387
|
try:
|
|
@@ -451,8 +508,7 @@ class ScreenshotService:
|
|
|
451
508
|
return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
|
|
452
509
|
finally:
|
|
453
510
|
if tab:
|
|
454
|
-
|
|
455
|
-
except: pass
|
|
511
|
+
self._release_tab(tab)
|
|
456
512
|
|
|
457
513
|
async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
|
|
458
514
|
"""Fetch multiple pages concurrently."""
|
|
@@ -461,6 +517,13 @@ class ScreenshotService:
|
|
|
461
517
|
tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
|
|
462
518
|
return await asyncio.gather(*tasks, return_exceptions=True)
|
|
463
519
|
|
|
520
|
+
async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
|
|
521
|
+
"""Take screenshots of multiple URLs concurrently."""
|
|
522
|
+
if not urls: return []
|
|
523
|
+
logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
|
|
524
|
+
tasks = [self.screenshot_url(url, timeout=timeout, full_page=full_page) for url in urls]
|
|
525
|
+
return await asyncio.gather(*tasks, return_exceptions=True)
|
|
526
|
+
|
|
464
527
|
async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
|
|
465
528
|
"""Screenshot URL (Async wrapper for sync)."""
|
|
466
529
|
loop = asyncio.get_running_loop()
|
|
@@ -480,36 +543,201 @@ class ScreenshotService:
|
|
|
480
543
|
if not page: return None
|
|
481
544
|
|
|
482
545
|
tab = page.new_tab(url)
|
|
546
|
+
|
|
547
|
+
# Start monitoring network traffic
|
|
483
548
|
try:
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
549
|
+
# Listen for data packets (XHR/Fetch/POST)
|
|
550
|
+
# Targets: xhr, fetch. POST usually falls under these or Document.
|
|
551
|
+
tab.listen.start(targets=True) # Listen to everything for now to be safe
|
|
552
|
+
except Exception as e:
|
|
553
|
+
logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
|
|
554
|
+
|
|
555
|
+
try:
|
|
556
|
+
# Wait for full page load (including JS execution)
|
|
557
|
+
tab.wait.load_complete(timeout=timeout)
|
|
558
|
+
|
|
559
|
+
# Wait for actual content to appear (for CDN verification pages)
|
|
560
|
+
# Smart Wait Logic (Final Robust):
|
|
561
|
+
# 1. Network Idle: Wait for silence in XHR/POST
|
|
562
|
+
# 2. Stability: Wait for Height/Text/DOM stability
|
|
563
|
+
|
|
564
|
+
time.sleep(1.5) # user request: force wait 1.5s before detection
|
|
565
|
+
|
|
566
|
+
last_h = 0
|
|
567
|
+
last_text_len = 0
|
|
568
|
+
last_html_len = 0
|
|
569
|
+
stable_count = 0
|
|
570
|
+
|
|
571
|
+
for i in range(200): # Max 200 iterations (~20s)
|
|
572
|
+
try:
|
|
573
|
+
# 1. Check Network Activity
|
|
574
|
+
has_recent_network = False
|
|
575
|
+
try:
|
|
576
|
+
# Iterate over any captured packets since last check
|
|
577
|
+
for packet in tab.listen.steps(timeout=0.01):
|
|
578
|
+
# Check if it's a significant request (POST or XHR/Fetch)
|
|
579
|
+
method = packet.method.upper()
|
|
580
|
+
r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
|
|
581
|
+
|
|
582
|
+
# Interested in: POST requests OR any XHR/Fetch response
|
|
583
|
+
if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
|
|
584
|
+
# Ignore some common noise? (Optional: analytics, tracking)
|
|
585
|
+
# For now, simplistic approach: any API traffic resets stability
|
|
586
|
+
has_recent_network = True
|
|
587
|
+
# logger.debug(f"Network Activity: {method} {packet.url[:50]}")
|
|
588
|
+
break
|
|
589
|
+
except:
|
|
590
|
+
pass
|
|
591
|
+
|
|
592
|
+
# 2. Check DOM State
|
|
593
|
+
state = tab.run_js('''
|
|
594
|
+
return {
|
|
595
|
+
ready: document.readyState === 'complete',
|
|
596
|
+
title: document.title,
|
|
597
|
+
height: Math.max(
|
|
598
|
+
document.body.scrollHeight || 0,
|
|
599
|
+
document.documentElement.scrollHeight || 0
|
|
600
|
+
),
|
|
601
|
+
text: document.body.innerText.substring(0, 1000) || "",
|
|
602
|
+
html_len: document.body.innerHTML.length || 0
|
|
603
|
+
};
|
|
604
|
+
''') or {'ready': False, 'title': "", 'height': 0, 'text': "", 'html_len': 0}
|
|
605
|
+
|
|
606
|
+
is_ready = state.get('ready', False)
|
|
607
|
+
title = state.get('title', "").lower()
|
|
608
|
+
current_h = int(state.get('height', 0))
|
|
609
|
+
text_content = state.get('text', "")
|
|
610
|
+
text_len = len(text_content)
|
|
611
|
+
html_len = int(state.get('html_len', 0))
|
|
612
|
+
text_lower = text_content.lower()
|
|
613
|
+
|
|
614
|
+
# Blacklist check (Loading indicators)
|
|
615
|
+
is_verification = "checking your browser" in text_lower or \
|
|
616
|
+
"just a moment" in text_lower or \
|
|
617
|
+
"please wait" in text_lower or \
|
|
618
|
+
"security check" in title or \
|
|
619
|
+
"just a moment" in title or \
|
|
620
|
+
"loading..." in title
|
|
621
|
+
|
|
622
|
+
# Stability check (Multi-metric + Network)
|
|
623
|
+
# We require STABILITY across Height, Text Length, DOM Size AND Network Silence
|
|
624
|
+
is_height_stable = current_h == last_h
|
|
625
|
+
is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
|
|
626
|
+
is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
|
|
627
|
+
|
|
628
|
+
if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
|
|
629
|
+
stable_count += 1
|
|
630
|
+
else:
|
|
631
|
+
# Reset if ANY metric changed or NETWORK active
|
|
632
|
+
stable_count = 0
|
|
633
|
+
# if has_recent_network: logger.debug("Stability reset: Network Activity")
|
|
634
|
+
|
|
635
|
+
# Conditions
|
|
636
|
+
has_content = text_len > 100 # At least 100 real chars
|
|
637
|
+
|
|
638
|
+
# Dynamic Stability Requirement:
|
|
639
|
+
# If page looks like it's loading (small content), require longer stability
|
|
640
|
+
if has_recent_network:
|
|
641
|
+
# If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
|
|
642
|
+
required_stability = max(10, 40 if text_len < 500 else 25)
|
|
643
|
+
else:
|
|
644
|
+
required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
|
|
645
|
+
|
|
646
|
+
is_stable = stable_count >= required_stability
|
|
647
|
+
|
|
648
|
+
# Pass if all conditions met
|
|
649
|
+
if is_ready and not is_verification and has_content and is_stable:
|
|
650
|
+
break
|
|
651
|
+
|
|
652
|
+
last_h = current_h
|
|
653
|
+
last_text_len = text_len
|
|
654
|
+
last_html_len = html_len
|
|
655
|
+
|
|
656
|
+
# Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
|
|
657
|
+
try: time.sleep(0.05)
|
|
658
|
+
except: pass
|
|
659
|
+
|
|
660
|
+
except Exception:
|
|
661
|
+
stable_count = 0
|
|
662
|
+
try: time.sleep(0.1)
|
|
663
|
+
except: pass
|
|
664
|
+
continue
|
|
665
|
+
|
|
666
|
+
# Cleanup listener
|
|
667
|
+
try: tab.listen.stop()
|
|
668
|
+
except: pass
|
|
669
|
+
|
|
670
|
+
# DEBUG: Save HTML to inspect what happened (in data dir)
|
|
671
|
+
try:
|
|
672
|
+
import os
|
|
673
|
+
log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
|
|
674
|
+
with open(log_path, "w", encoding="utf-8") as f:
|
|
675
|
+
f.write(f"<!-- URL: {url} -->\n")
|
|
676
|
+
f.write(tab.html)
|
|
677
|
+
except: pass
|
|
678
|
+
|
|
679
|
+
# Use faster scroll step (800) to ensure lazy loaded images appear
|
|
680
|
+
self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
|
|
681
|
+
|
|
682
|
+
except:
|
|
683
|
+
pass
|
|
489
684
|
|
|
490
|
-
#
|
|
491
|
-
|
|
492
|
-
|
|
685
|
+
# Refine calculation: Set viewport width to 1024
|
|
686
|
+
|
|
687
|
+
capture_width = 1024
|
|
493
688
|
|
|
689
|
+
# Calculate actual content height after lazy loading
|
|
690
|
+
try:
|
|
691
|
+
# Use a robust height calculation
|
|
692
|
+
content_height = tab.run_js('''
|
|
693
|
+
return Math.max(
|
|
694
|
+
document.body.scrollHeight || 0,
|
|
695
|
+
document.documentElement.scrollHeight || 0,
|
|
696
|
+
document.body.offsetHeight || 0,
|
|
697
|
+
document.documentElement.offsetHeight || 0,
|
|
698
|
+
document.documentElement.clientHeight || 0
|
|
699
|
+
);
|
|
700
|
+
''')
|
|
701
|
+
# Add a small buffer and cap at 15000px to prevent memory issues
|
|
702
|
+
h = min(int(content_height) + 50, 15000)
|
|
703
|
+
except:
|
|
704
|
+
h = 1000 # Fallback
|
|
705
|
+
|
|
706
|
+
# Set viewport to full content size for single-shot capture
|
|
707
|
+
try:
|
|
708
|
+
tab.run_cdp('Emulation.setDeviceMetricsOverride',
|
|
709
|
+
width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
|
|
710
|
+
except:
|
|
711
|
+
pass
|
|
712
|
+
|
|
494
713
|
# Scrollbar Hiding
|
|
495
714
|
from .manager import SharedBrowserManager
|
|
496
715
|
SharedBrowserManager.hide_scrollbars(tab)
|
|
497
|
-
tab.run_js("""
|
|
498
|
-
const style = document.createElement('style');
|
|
499
|
-
style.textContent = `
|
|
500
|
-
::-webkit-scrollbar { display: none !important; }
|
|
501
|
-
html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
|
|
502
|
-
`;
|
|
503
|
-
document.head.appendChild(style);
|
|
504
|
-
document.documentElement.style.overflow = 'hidden';
|
|
505
|
-
document.body.style.overflow = 'hidden';
|
|
506
|
-
""")
|
|
507
716
|
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
717
|
+
# Scroll back to top before screenshot
|
|
718
|
+
tab.run_js("window.scrollTo(0, 0);")
|
|
719
|
+
|
|
720
|
+
# Content is already loaded by _scroll_to_bottom which waits for images
|
|
721
|
+
# Just recalculate final height (content may have grown during scrolling)
|
|
722
|
+
try:
|
|
723
|
+
final_height = tab.run_js('''
|
|
724
|
+
return Math.max(
|
|
725
|
+
document.body.scrollHeight || 0,
|
|
726
|
+
document.documentElement.scrollHeight || 0,
|
|
727
|
+
document.body.offsetHeight || 0,
|
|
728
|
+
document.documentElement.offsetHeight || 0
|
|
729
|
+
);
|
|
730
|
+
''')
|
|
731
|
+
final_h = min(int(final_height) + 50, 15000)
|
|
732
|
+
if final_h != h:
|
|
733
|
+
tab.run_cdp('Emulation.setDeviceMetricsOverride',
|
|
734
|
+
width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
|
|
735
|
+
except:
|
|
736
|
+
pass
|
|
737
|
+
|
|
738
|
+
# Use full_page=False because we manually set the viewport to the full height
|
|
739
|
+
# This avoids stitching artifacts and blank spaces
|
|
740
|
+
return tab.get_screenshot(as_base64='jpg', full_page=False)
|
|
513
741
|
|
|
514
742
|
except Exception as e:
|
|
515
743
|
logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
|