entari-plugin-hyw 4.0.0rc6__py3-none-any.whl → 4.0.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic. Click here for more details.
- entari_plugin_hyw/Untitled-1 +1865 -0
- entari_plugin_hyw/__init__.py +733 -379
- entari_plugin_hyw/history.py +60 -57
- entari_plugin_hyw/misc.py +3 -0
- entari_plugin_hyw/search_cache.py +154 -0
- {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/METADATA +3 -1
- entari_plugin_hyw-4.0.0rc8.dist-info/RECORD +68 -0
- {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/WHEEL +1 -1
- {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/top_level.txt +1 -0
- hyw_core/__init__.py +94 -0
- hyw_core/browser_control/__init__.py +65 -0
- hyw_core/browser_control/assets/card-dist/index.html +409 -0
- hyw_core/browser_control/assets/index.html +5691 -0
- hyw_core/browser_control/engines/__init__.py +17 -0
- hyw_core/browser_control/engines/default.py +166 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/duckduckgo.py +42 -8
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/google.py +1 -1
- {entari_plugin_hyw/browser → hyw_core/browser_control}/manager.py +15 -8
- entari_plugin_hyw/render_vue.py → hyw_core/browser_control/renderer.py +29 -14
- hyw_core/browser_control/service.py +720 -0
- hyw_core/config.py +154 -0
- hyw_core/core.py +322 -0
- hyw_core/definitions.py +83 -0
- entari_plugin_hyw/modular_pipeline.py → hyw_core/pipeline.py +204 -86
- {entari_plugin_hyw → hyw_core}/search.py +60 -19
- hyw_core/stages/__init__.py +21 -0
- entari_plugin_hyw/stage_base.py → hyw_core/stages/base.py +3 -0
- entari_plugin_hyw/stage_summary.py → hyw_core/stages/summary.py +36 -7
- entari_plugin_hyw/assets/card-dist/index.html +0 -387
- entari_plugin_hyw/browser/__init__.py +0 -10
- entari_plugin_hyw/browser/engines/bing.py +0 -95
- entari_plugin_hyw/browser/service.py +0 -304
- entari_plugin_hyw/card-ui/.gitignore +0 -24
- entari_plugin_hyw/card-ui/README.md +0 -5
- entari_plugin_hyw/card-ui/index.html +0 -16
- entari_plugin_hyw/card-ui/package-lock.json +0 -2342
- entari_plugin_hyw/card-ui/package.json +0 -31
- entari_plugin_hyw/card-ui/public/logos/anthropic.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/cerebras.svg +0 -9
- entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/gemini.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/google.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/microsoft.svg +0 -15
- entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/openai.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/perplexity.svg +0 -24
- entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
- entari_plugin_hyw/card-ui/public/vite.svg +0 -1
- entari_plugin_hyw/card-ui/src/App.vue +0 -756
- entari_plugin_hyw/card-ui/src/assets/vue.svg +0 -1
- entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +0 -41
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +0 -382
- entari_plugin_hyw/card-ui/src/components/SectionCard.vue +0 -41
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +0 -240
- entari_plugin_hyw/card-ui/src/main.ts +0 -5
- entari_plugin_hyw/card-ui/src/style.css +0 -29
- entari_plugin_hyw/card-ui/src/test_regex.js +0 -103
- entari_plugin_hyw/card-ui/src/types.ts +0 -61
- entari_plugin_hyw/card-ui/tsconfig.app.json +0 -16
- entari_plugin_hyw/card-ui/tsconfig.json +0 -7
- entari_plugin_hyw/card-ui/tsconfig.node.json +0 -26
- entari_plugin_hyw/card-ui/vite.config.ts +0 -16
- entari_plugin_hyw/definitions.py +0 -155
- entari_plugin_hyw/stage_instruct.py +0 -345
- entari_plugin_hyw/stage_instruct_deepsearch.py +0 -104
- entari_plugin_hyw-4.0.0rc6.dist-info/RECORD +0 -100
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/anthropic.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/cerebras.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/deepseek.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/gemini.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/google.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/grok.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/huggingface.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/microsoft.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/minimax.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/mistral.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/nvida.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openai.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openrouter.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/perplexity.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/qwen.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xai.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xiaomi.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/zai.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/vite.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/anthropic.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/cerebras.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/deepseek.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/gemini.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/google.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/grok.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/huggingface.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/microsoft.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/minimax.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/mistral.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/nvida.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openai.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openrouter.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/perplexity.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/qwen.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xai.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xiaomi.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/zai.png +0 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/base.py +0 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/landing.html +0 -0
- {entari_plugin_hyw → hyw_core}/image_cache.py +0 -0
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Browser Service (DrissionPage)
|
|
3
|
+
|
|
4
|
+
Provides page fetching and screenshot capabilities using DrissionPage.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import base64
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
12
|
+
from typing import Optional, Dict, Any, List
|
|
13
|
+
from loguru import logger
|
|
14
|
+
import trafilatura
|
|
15
|
+
|
|
16
|
+
class ScreenshotService:
|
|
17
|
+
"""
|
|
18
|
+
Browser Service using DrissionPage.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
def __init__(self, headless: bool = True, auto_start: bool = True):
|
|
22
|
+
self.headless = headless
|
|
23
|
+
self._manager = None
|
|
24
|
+
self._executor = ThreadPoolExecutor(max_workers=10)
|
|
25
|
+
|
|
26
|
+
if auto_start:
|
|
27
|
+
self._ensure_ready()
|
|
28
|
+
|
|
29
|
+
def _get_tab(self, url: str) -> Any:
|
|
30
|
+
"""Create a new tab and navigate to URL."""
|
|
31
|
+
self._ensure_ready()
|
|
32
|
+
return self._manager.new_tab(url)
|
|
33
|
+
|
|
34
|
+
def _release_tab(self, tab: Any):
|
|
35
|
+
"""Close tab after use."""
|
|
36
|
+
if not tab: return
|
|
37
|
+
try:
|
|
38
|
+
tab.close()
|
|
39
|
+
except:
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
    """Run several page-input searches concurrently.

    Delegates to the synchronous batch implementation on the thread
    pool so the blocking browser automation never stalls the event loop.
    Results come back in the same order as *queries*.
    """
    return await asyncio.get_running_loop().run_in_executor(
        self._executor, self._search_via_page_input_batch_sync, queries, url, selector
    )
def _search_via_page_input_batch_sync(self, queries: List[str], url: str, selector: str) -> List[Dict[str, Any]]:
    """Sync batch execution - create tabs sequentially, search in parallel.

    Phase 1 opens one tab per query sequentially (DrissionPage's
    new_tab is not thread-safe); phase 2 runs each search on its own
    thread.  Results are returned in query order; a query whose tab
    could not be created, or whose search failed, yields an
    error-shaped dict instead of raising.
    """
    results: List[Optional[Dict[str, Any]]] = [None] * len(queries)
    tabs: List[Any] = []

    # Phase 1: Get/create tabs SEQUENTIALLY (DrissionPage isn't thread-safe for new_tab)
    target_url = url or "https://www.google.com"
    logger.info(f"ScreenshotService: Acquiring {len(queries)} tabs for parallel search...")

    for i in range(len(queries)):
        tab = None
        try:
            tab = self._get_tab(target_url)
        except Exception as e:
            logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")
        tabs.append(tab)

    logger.info(f"ScreenshotService: {len(tabs)} tabs ready, starting parallel searches...")

    # Phase 2: Execute searches in PARALLEL (one thread per tab).
    def run_search(index: int, tab: Any, query: str) -> None:
        try:
            logger.debug(f"Search[{index}]: Starting for '{query}' on {tab.url}")

            # Wait for page to be ready first; a timeout here is non-fatal
            # because the element lookup below has its own wait.
            try:
                tab.wait.doc_loaded(timeout=10)
            except Exception:
                pass

            # Find input element with wait
            logger.debug(f"Search[{index}]: Looking for input with selector '{selector}'")
            ele = tab.ele(selector, timeout=5)
            if not ele:
                logger.debug(f"Search[{index}]: Primary selector failed, trying fallbacks")
                for fallback in ["textarea[name='q']", "#APjFqb", "input[name='q']", "input[type='text']"]:
                    ele = tab.ele(fallback, timeout=2)
                    if ele:
                        logger.debug(f"Search[{index}]: Found input with fallback '{fallback}'")
                        break

            if not ele:
                logger.error(f"Search[{index}]: No input element found on {tab.url}!")
                results[index] = {"content": "Error: input not found", "title": "Error", "url": tab.url, "html": tab.html[:5000]}
                return

            logger.debug(f"Search[{index}]: Typing query...")
            ele.input(query)

            logger.debug(f"Search[{index}]: Pressing Enter...")
            tab.actions.key_down('enter').key_up('enter')

            logger.debug(f"Search[{index}]: Waiting for search results...")
            tab.wait.doc_loaded(timeout=10)
            # Reduced settle wait for extraction
            time.sleep(0.1)

            logger.debug(f"Search[{index}]: Extracting content...")
            html = tab.html
            content = trafilatura.extract(
                html, include_links=True, include_images=True, include_comments=False,
                include_tables=True, favor_precision=False, output_format="markdown"
            ) or ""

            logger.info(f"ScreenshotService: Search '{query}' completed -> {tab.url}")

            results[index] = {
                "content": content,
                "html": html,
                "title": tab.title,
                "url": tab.url,
                "images": []
            }

        except Exception as e:
            logger.error(f"ScreenshotService: Search error for '{query}': {e}")
            results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
        finally:
            self._release_tab(tab)

    threads = []
    for i, (tab, query) in enumerate(zip(tabs, queries)):
        # FIX: don't dispatch a worker for a tab that failed to open;
        # record a clear error instead of letting the thread crash on None.
        if tab is None:
            results[i] = {"content": "Error: tab creation failed", "title": "Error", "url": "", "html": ""}
            continue
        t = threading.Thread(target=run_search, args=(i, tab, query))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    return results
def _ensure_ready(self):
    """Bind the shared browser manager to this service."""
    # NOTE(review): imported here rather than at module top — presumably
    # to defer browser startup / avoid an import cycle; confirm.
    from .manager import get_shared_browser_manager

    manager = get_shared_browser_manager(headless=self.headless)
    self._manager = manager
async def fetch_page(self, url: str, timeout: float = 10.0, include_screenshot: bool = True) -> Dict[str, Any]:
|
|
154
|
+
"""
|
|
155
|
+
Fetch page content (and optionally screenshot).
|
|
156
|
+
Runs in a thread executor to avoid blocking the async loop.
|
|
157
|
+
"""
|
|
158
|
+
loop = asyncio.get_running_loop()
|
|
159
|
+
return await loop.run_in_executor(
|
|
160
|
+
self._executor,
|
|
161
|
+
self._fetch_page_sync,
|
|
162
|
+
url,
|
|
163
|
+
timeout,
|
|
164
|
+
include_screenshot
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
async def search_via_address_bar(self, query: str, timeout: float = 20.0) -> Dict[str, Any]:
|
|
168
|
+
"""
|
|
169
|
+
Search using browser's address bar (uses browser's default search engine).
|
|
170
|
+
Simulates: Ctrl+L (focus address bar) -> type query -> Enter
|
|
171
|
+
"""
|
|
172
|
+
loop = asyncio.get_running_loop()
|
|
173
|
+
return await loop.run_in_executor(
|
|
174
|
+
self._executor,
|
|
175
|
+
self._search_via_address_bar_sync,
|
|
176
|
+
query,
|
|
177
|
+
timeout
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
def _search_via_address_bar_sync(self, query: str, timeout: float) -> Dict[str, Any]:
|
|
181
|
+
"""Synchronous address bar search logic."""
|
|
182
|
+
if not query:
|
|
183
|
+
return {"content": "Error: missing query", "title": "Error", "url": "", "html": ""}
|
|
184
|
+
|
|
185
|
+
tab = None
|
|
186
|
+
try:
|
|
187
|
+
self._ensure_ready()
|
|
188
|
+
page = self._manager.page
|
|
189
|
+
if not page:
|
|
190
|
+
return {"content": "Error: Browser not available", "title": "Error", "url": "", "html": ""}
|
|
191
|
+
|
|
192
|
+
# Open new blank tab
|
|
193
|
+
tab = page.new_tab()
|
|
194
|
+
|
|
195
|
+
# Focus address bar with Ctrl+L (or Cmd+L on Mac)
|
|
196
|
+
import platform
|
|
197
|
+
if platform.system() == "Darwin":
|
|
198
|
+
tab.actions.key_down('cmd').key_down('l').key_up('l').key_up('cmd')
|
|
199
|
+
else:
|
|
200
|
+
tab.actions.key_down('ctrl').key_down('l').key_up('l').key_up('ctrl')
|
|
201
|
+
|
|
202
|
+
# Small delay for address bar to focus
|
|
203
|
+
import time as _time
|
|
204
|
+
_time.sleep(0.05)
|
|
205
|
+
|
|
206
|
+
# Type the query
|
|
207
|
+
tab.actions.type(query)
|
|
208
|
+
|
|
209
|
+
# Press Enter to search
|
|
210
|
+
tab.actions.key_down('enter').key_up('enter')
|
|
211
|
+
|
|
212
|
+
# Wait for page to load
|
|
213
|
+
try:
|
|
214
|
+
tab.wait.doc_loaded(timeout=timeout)
|
|
215
|
+
# Reduced wait for initial results
|
|
216
|
+
_time.sleep(0.2)
|
|
217
|
+
except:
|
|
218
|
+
pass
|
|
219
|
+
|
|
220
|
+
html = tab.html
|
|
221
|
+
title = tab.title
|
|
222
|
+
final_url = tab.url
|
|
223
|
+
|
|
224
|
+
# Extract content
|
|
225
|
+
content = trafilatura.extract(
|
|
226
|
+
html, include_links=True, include_images=True, include_comments=False,
|
|
227
|
+
include_tables=True, favor_precision=False, output_format="markdown"
|
|
228
|
+
) or ""
|
|
229
|
+
|
|
230
|
+
logger.info(f"ScreenshotService: Address bar search completed -> {final_url}")
|
|
231
|
+
|
|
232
|
+
return {
|
|
233
|
+
"content": content,
|
|
234
|
+
"html": html,
|
|
235
|
+
"title": title,
|
|
236
|
+
"url": final_url,
|
|
237
|
+
"images": []
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
except Exception as e:
|
|
241
|
+
logger.error(f"ScreenshotService: Address bar search failed: {e}")
|
|
242
|
+
return {"content": f"Error: search failed ({e})", "title": "Error", "url": "", "html": ""}
|
|
243
|
+
finally:
|
|
244
|
+
if tab:
|
|
245
|
+
try: tab.close()
|
|
246
|
+
except: pass
|
|
247
|
+
|
|
248
|
+
def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
|
|
249
|
+
"""
|
|
250
|
+
Scroll down gradually to trigger lazy loading.
|
|
251
|
+
|
|
252
|
+
Args:
|
|
253
|
+
delay: Max wait time per scroll step (seconds) if images aren't loading.
|
|
254
|
+
"""
|
|
255
|
+
import time
|
|
256
|
+
start = time.time()
|
|
257
|
+
current_pos = 0
|
|
258
|
+
try:
|
|
259
|
+
while time.time() - start < timeout:
|
|
260
|
+
# Scroll down
|
|
261
|
+
current_pos += step
|
|
262
|
+
tab.run_js(f"window.scrollTo(0, {current_pos});")
|
|
263
|
+
|
|
264
|
+
# Active Wait: Check if images in viewport are loaded
|
|
265
|
+
# Poll every 100ms, up to 'delay' seconds
|
|
266
|
+
wait_start = time.time()
|
|
267
|
+
while time.time() - wait_start < delay:
|
|
268
|
+
all_loaded = tab.run_js("""
|
|
269
|
+
return (async () => {
|
|
270
|
+
const imgs = Array.from(document.querySelectorAll('img'));
|
|
271
|
+
const viewportHeight = window.innerHeight;
|
|
272
|
+
|
|
273
|
+
// 1. Identify images currently in viewport
|
|
274
|
+
const visibleImgs = imgs.filter(img => {
|
|
275
|
+
const rect = img.getBoundingClientRect();
|
|
276
|
+
return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
|
|
277
|
+
});
|
|
278
|
+
|
|
279
|
+
if (visibleImgs.length === 0) return true;
|
|
280
|
+
|
|
281
|
+
// 2. Check loading status using decode() AND heuristic for placeholders
|
|
282
|
+
// Some sites load a tiny blurred placeholder first.
|
|
283
|
+
const checks = visibleImgs.map(img => {
|
|
284
|
+
// HEURISTIC: content is likely not ready if:
|
|
285
|
+
// - img has 'data-src' but src is different (or src is empty)
|
|
286
|
+
// - img has 'loading="lazy"' and is not complete
|
|
287
|
+
// - naturalWidth is very small (placeholder) compared to display width
|
|
288
|
+
|
|
289
|
+
const isPlaceholder = (
|
|
290
|
+
(img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
|
|
291
|
+
(img.naturalWidth < 50 && img.clientWidth > 100)
|
|
292
|
+
);
|
|
293
|
+
|
|
294
|
+
if (isPlaceholder) {
|
|
295
|
+
// If it looks like a placeholder, we return false (not loaded)
|
|
296
|
+
// unless it stays like this for too long (handled by outer timeout)
|
|
297
|
+
return Promise.resolve(false);
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);
|
|
301
|
+
|
|
302
|
+
return img.decode().then(() => true).catch(() => false);
|
|
303
|
+
});
|
|
304
|
+
|
|
305
|
+
// Race against a small timeout to avoid hanging on one broken image
|
|
306
|
+
const allDecoded = Promise.all(checks);
|
|
307
|
+
const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));
|
|
308
|
+
|
|
309
|
+
// If any check returned false (meaning placeholder or not decoded), result is false
|
|
310
|
+
return Promise.race([allDecoded, timeout]).then(results => {
|
|
311
|
+
if (!Array.isArray(results)) return results === true;
|
|
312
|
+
return results.every(res => res === true);
|
|
313
|
+
});
|
|
314
|
+
})();
|
|
315
|
+
""")
|
|
316
|
+
if all_loaded:
|
|
317
|
+
break
|
|
318
|
+
time.sleep(0.1)
|
|
319
|
+
|
|
320
|
+
# Check if reached bottom
|
|
321
|
+
height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
|
|
322
|
+
if current_pos >= height:
|
|
323
|
+
break
|
|
324
|
+
|
|
325
|
+
# Ensure final layout settle
|
|
326
|
+
time.sleep(0.2)
|
|
327
|
+
|
|
328
|
+
except Exception as e:
|
|
329
|
+
logger.warning(f"ScreenshotService: Scroll failed: {e}")
|
|
330
|
+
|
|
331
|
+
def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
|
|
332
|
+
"""Synchronous fetch logic."""
|
|
333
|
+
if not url:
|
|
334
|
+
return {"content": "Error: missing url", "title": "Error", "url": ""}
|
|
335
|
+
|
|
336
|
+
tab = None
|
|
337
|
+
try:
|
|
338
|
+
self._ensure_ready()
|
|
339
|
+
page = self._manager.page
|
|
340
|
+
if not page:
|
|
341
|
+
return {"content": "Error: Browser not available", "title": "Error", "url": url}
|
|
342
|
+
|
|
343
|
+
# Get from pool
|
|
344
|
+
tab = self._get_tab(url)
|
|
345
|
+
|
|
346
|
+
# Wait logic - optimized for search pages
|
|
347
|
+
is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
|
|
348
|
+
if is_search_page:
|
|
349
|
+
# Optimized waiting: Rapidly poll for ACTUAL results > 0
|
|
350
|
+
start_time = time.time()
|
|
351
|
+
|
|
352
|
+
# Special fast-path for DDG Lite (HTML only, no JS rendering needed)
|
|
353
|
+
if 'lite.duckduckgo' in url:
|
|
354
|
+
# just wait for body, it's static HTML
|
|
355
|
+
try:
|
|
356
|
+
tab.wait.doc_loaded(timeout=timeout)
|
|
357
|
+
except: pass
|
|
358
|
+
# Sleep tiny bit to ensure render
|
|
359
|
+
time.sleep(0.5)
|
|
360
|
+
else:
|
|
361
|
+
while time.time() - start_time < timeout:
|
|
362
|
+
found_results = False
|
|
363
|
+
try:
|
|
364
|
+
if 'google' in url.lower():
|
|
365
|
+
# Check if we have any result items (.g, .MjjYud) or the main container (#search)
|
|
366
|
+
# Using checks with minimal timeout to allow fast looping
|
|
367
|
+
if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
|
|
368
|
+
found_results = True
|
|
369
|
+
elif 'bing' in url.lower():
|
|
370
|
+
if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
|
|
371
|
+
found_results = True
|
|
372
|
+
elif 'duckduckgo' in url.lower():
|
|
373
|
+
if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
|
|
374
|
+
found_results = True
|
|
375
|
+
else:
|
|
376
|
+
# Generic fallback: wait for body to be populated
|
|
377
|
+
if tab.ele('body', timeout=0.1):
|
|
378
|
+
found_results = True
|
|
379
|
+
except:
|
|
380
|
+
pass
|
|
381
|
+
|
|
382
|
+
if found_results:
|
|
383
|
+
break
|
|
384
|
+
time.sleep(0.05) # Faster polling (50ms) as requested
|
|
385
|
+
else:
|
|
386
|
+
# 1. Wait for document to settle (Fast Dynamic Wait)
|
|
387
|
+
try:
|
|
388
|
+
tab.wait.doc_loaded(timeout=timeout)
|
|
389
|
+
except: pass
|
|
390
|
+
|
|
391
|
+
html = tab.html
|
|
392
|
+
title = tab.title
|
|
393
|
+
final_url = tab.url
|
|
394
|
+
|
|
395
|
+
raw_screenshot_b64 = None
|
|
396
|
+
if include_screenshot:
|
|
397
|
+
try:
|
|
398
|
+
# Scrollbar Hiding Best Effort
|
|
399
|
+
from .manager import SharedBrowserManager
|
|
400
|
+
SharedBrowserManager.hide_scrollbars(tab)
|
|
401
|
+
|
|
402
|
+
# Inject CSS
|
|
403
|
+
tab.run_js("""
|
|
404
|
+
const style = document.createElement('style');
|
|
405
|
+
style.textContent = `
|
|
406
|
+
::-webkit-scrollbar { display: none !important; }
|
|
407
|
+
html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
|
|
408
|
+
`;
|
|
409
|
+
document.head.appendChild(style);
|
|
410
|
+
document.documentElement.style.overflow = 'hidden';
|
|
411
|
+
document.body.style.overflow = 'hidden';
|
|
412
|
+
""")
|
|
413
|
+
|
|
414
|
+
raw_screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
|
|
415
|
+
except Exception as e:
|
|
416
|
+
logger.warning(f"ScreenshotService: Failed to capture screenshot: {e}")
|
|
417
|
+
|
|
418
|
+
# Extract content
|
|
419
|
+
content = trafilatura.extract(
|
|
420
|
+
html, include_links=True, include_images=True, include_comments=False,
|
|
421
|
+
include_tables=True, favor_precision=False, output_format="markdown"
|
|
422
|
+
) or ""
|
|
423
|
+
|
|
424
|
+
# 2. Extract Images via Parallelized JS (Gallery)
|
|
425
|
+
# Strategy: For search pages, use Canvas to grab already loaded images (Instant)
|
|
426
|
+
# For other pages, use fetch (more robust for lazy load)
|
|
427
|
+
images_b64 = []
|
|
428
|
+
try:
|
|
429
|
+
js_code = """
|
|
430
|
+
(async () => {
|
|
431
|
+
const blocklist = ['logo', 'icon', 'avatar', 'ad', 'pixel', 'tracker', 'button', 'menu', 'nav'];
|
|
432
|
+
const candidates = Array.from(document.querySelectorAll('img'));
|
|
433
|
+
const validImages = [];
|
|
434
|
+
|
|
435
|
+
// Helper: Get base64 from loaded image via Canvas
|
|
436
|
+
const getBase64 = (img) => {
|
|
437
|
+
try {
|
|
438
|
+
const canvas = document.createElement('canvas');
|
|
439
|
+
canvas.width = img.naturalWidth;
|
|
440
|
+
canvas.height = img.naturalHeight;
|
|
441
|
+
const ctx = canvas.getContext('2d');
|
|
442
|
+
ctx.drawImage(img, 0, 0);
|
|
443
|
+
return canvas.toDataURL('image/jpeg').split(',')[1];
|
|
444
|
+
} catch(e) { return null; }
|
|
445
|
+
};
|
|
446
|
+
|
|
447
|
+
for (const img of candidates) {
|
|
448
|
+
if (validImages.length >= 8) break;
|
|
449
|
+
|
|
450
|
+
if (img.naturalWidth < 100 || img.naturalHeight < 80) continue;
|
|
451
|
+
|
|
452
|
+
const alt = (img.alt || '').toLowerCase();
|
|
453
|
+
const cls = (typeof img.className === 'string' ? img.className : '').toLowerCase();
|
|
454
|
+
const src = (img.src || '').toLowerCase();
|
|
455
|
+
|
|
456
|
+
if (blocklist.some(b => alt.includes(b) || cls.includes(b) || src.includes(b))) continue;
|
|
457
|
+
|
|
458
|
+
// 1. Try Canvas (Instant for loaded images)
|
|
459
|
+
if (img.complete && img.naturalHeight > 0) {
|
|
460
|
+
const b64 = getBase64(img);
|
|
461
|
+
if (b64) {
|
|
462
|
+
validImages.push(b64);
|
|
463
|
+
continue;
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
// 2. Fallback to fetch (only for non-search pages to avoid delay)
|
|
468
|
+
// We skip fetch for search pages to ensure speed
|
|
469
|
+
if (!window.location.href.includes('google') && !window.location.href.includes('search')) {
|
|
470
|
+
try {
|
|
471
|
+
const controller = new AbortController();
|
|
472
|
+
const id = setTimeout(() => controller.abort(), 2000);
|
|
473
|
+
const resp = await fetch(img.src, { signal: controller.signal });
|
|
474
|
+
clearTimeout(id);
|
|
475
|
+
const blob = await resp.blob();
|
|
476
|
+
const b64 = await new Promise(resolve => {
|
|
477
|
+
const reader = new FileReader();
|
|
478
|
+
reader.onloadend = () => resolve(reader.result.split(',')[1]);
|
|
479
|
+
reader.onerror = () => resolve(null);
|
|
480
|
+
reader.readAsDataURL(blob);
|
|
481
|
+
});
|
|
482
|
+
if (b64) validImages.push(b64);
|
|
483
|
+
} catch(e) {}
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
return validImages;
|
|
487
|
+
})()
|
|
488
|
+
"""
|
|
489
|
+
images_b64 = tab.run_js(js_code, as_expr=True) or []
|
|
490
|
+
|
|
491
|
+
if images_b64:
|
|
492
|
+
logger.info(f"ScreenshotService: Extracted {len(images_b64)} images for {url}")
|
|
493
|
+
|
|
494
|
+
except Exception as e:
|
|
495
|
+
logger.warning(f"ScreenshotService: Image extraction failed: {e}")
|
|
496
|
+
|
|
497
|
+
return {
|
|
498
|
+
"content": content,
|
|
499
|
+
"html": html,
|
|
500
|
+
"title": title,
|
|
501
|
+
"url": final_url,
|
|
502
|
+
"raw_screenshot_b64": raw_screenshot_b64,
|
|
503
|
+
"images": images_b64
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
except Exception as e:
|
|
507
|
+
logger.error(f"ScreenshotService: Failed to fetch {url}: {e}")
|
|
508
|
+
return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
|
|
509
|
+
finally:
|
|
510
|
+
if tab:
|
|
511
|
+
self._release_tab(tab)
|
|
512
|
+
|
|
513
|
+
async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
    """Fetch every URL in *urls* concurrently via fetch_page.

    One result is returned per URL, in order; failures come back as
    exception objects rather than raising (return_exceptions=True).
    """
    if not urls:
        return []
    logger.info(f"ScreenshotService: Batch fetching {len(urls)} URLs (screenshots={include_screenshot})")
    pending = (self.fetch_page(u, timeout, include_screenshot) for u in urls)
    return await asyncio.gather(*pending, return_exceptions=True)
async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
    """Screenshot many URLs at once; result order matches *urls*.

    Failures surface as exception objects in the returned list
    (return_exceptions=True) instead of aborting the whole batch.
    """
    if not urls:
        return []
    logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
    pending = (self.screenshot_url(u, timeout=timeout, full_page=full_page) for u in urls)
    return await asyncio.gather(*pending, return_exceptions=True)
async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
    """Screenshot a single URL (async wrapper around the sync capture).

    The blocking capture runs on the thread pool; returns a base64
    string on success or None on failure.
    """
    worker_args = (url, wait_load, timeout, full_page, quality)
    return await asyncio.get_running_loop().run_in_executor(
        self._executor, self._screenshot_sync, *worker_args
    )
def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int) -> Optional[str]:
    """Synchronous full-page screenshot of `url`.

    Opens a fresh tab, waits for the page to stabilise (redirects,
    CDN/anti-bot interstitials, lazy-loaded content), sizes the viewport
    to the full content height, and captures a single-shot JPEG returned
    as a base64 string. Returns None on any failure.

    NOTE(review): `wait_load`, `full_page` and `quality` are not used by
    the capture logic below — confirm whether they should be honoured or
    dropped from the signature.

    Fix: bare `except:` clauses narrowed to `except Exception:` so that
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    if not url:
        return None
    tab = None
    try:
        self._ensure_ready()
        page = self._manager.page
        if not page:
            return None

        tab = page.new_tab(url)
        try:
            # Wait for full page load (including JS execution)
            tab.wait.load_complete(timeout=timeout)

            # Smart wait (distinguishes stable content from stable spinners):
            # 1. FORCED WAIT: 1.5s to allow initial redirects/rendering to start.
            # 2. Browser readyState === 'complete'
            # 3. Height stable for 2.0 seconds (20 checks)
            # 4. Text > 100 chars (real content, not a spinner)
            # 5. No blacklist/verification phrases
            time.sleep(1.5)  # user request: force wait 1.5s before detection

            last_h = 0
            stable_count = 0

            for _ in range(200):  # Max 200 iterations (~20s)
                try:
                    state = tab.run_js('''
                        return {
                            ready: document.readyState === 'complete',
                            title: document.title,
                            height: Math.max(
                                document.body.scrollHeight || 0,
                                document.documentElement.scrollHeight || 0
                            ),
                            text: document.body.innerText.substring(0, 1000) || "",
                            html: document.body.innerHTML.substring(0, 500) // Debug intro
                        };
                    ''') or {'ready': False, 'title': "", 'height': 0, 'text': ""}

                    is_ready = state.get('ready', False)
                    title = state.get('title', "").lower()
                    current_h = int(state.get('height', 0))
                    text_content = state.get('text', "")
                    text_len = len(text_content)
                    text_lower = text_content.lower()

                    # Blacklist check: anti-bot interstitials mean the real page
                    # has not rendered yet.
                    is_verification = "checking your browser" in text_lower or \
                                      "just a moment" in text_lower or \
                                      "please wait" in text_lower or \
                                      "security check" in title or \
                                      "just a moment" in title

                    # Height-stability counter (each poll is ~0.1s).
                    if current_h == last_h:
                        stable_count += 1
                    else:
                        stable_count = 0

                    # Conditions
                    has_content = text_len > 100    # At least 100 real chars
                    is_stable = stable_count >= 20  # Always require 2s stability

                    # Pass if all conditions met
                    if is_ready and not is_verification and has_content and is_stable:
                        break

                    last_h = current_h

                    # Cheap ~0.1s pacing between polls.
                    try:
                        tab.wait.eles_loaded(timeout=0.1)
                    except Exception:
                        pass

                except Exception:
                    # JS probe failed (e.g. mid-navigation): reset stability and retry.
                    stable_count = 0
                    try:
                        time.sleep(0.1)
                    except Exception:
                        pass
                    continue

            # DEBUG: Save HTML to inspect what happened (in data dir, best-effort).
            try:
                import os
                log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
                with open(log_path, "w", encoding="utf-8") as f:
                    f.write(f"<!-- URL: {url} -->\n")
                    f.write(tab.html)
            except Exception:
                pass

            # Use faster scroll step (800) to ensure lazy loaded images appear
            self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))

        except Exception:
            # Best-effort stabilisation: proceed to capture whatever rendered.
            pass

        # Fixed capture viewport width.
        capture_width = 1024

        # Calculate actual content height after lazy loading.
        try:
            content_height = tab.run_js('''
                return Math.max(
                    document.body.scrollHeight || 0,
                    document.documentElement.scrollHeight || 0,
                    document.body.offsetHeight || 0,
                    document.documentElement.offsetHeight || 0,
                    document.documentElement.clientHeight || 0
                );
            ''')
            # Add a small buffer and cap at 15000px to prevent memory issues.
            h = min(int(content_height) + 50, 15000)
        except Exception:
            h = 1000  # Fallback

        # Set viewport to full content size for single-shot capture.
        try:
            tab.run_cdp('Emulation.setDeviceMetricsOverride',
                        width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
        except Exception:
            pass

        # Hide scrollbars so they don't appear in the capture.
        from .manager import SharedBrowserManager
        SharedBrowserManager.hide_scrollbars(tab)

        # Scroll back to top before screenshot.
        tab.run_js("window.scrollTo(0, 0);")

        # Content may have grown during scrolling; re-measure and resize once.
        try:
            final_height = tab.run_js('''
                return Math.max(
                    document.body.scrollHeight || 0,
                    document.documentElement.scrollHeight || 0,
                    document.body.offsetHeight || 0,
                    document.documentElement.offsetHeight || 0
                );
            ''')
            final_h = min(int(final_height) + 50, 15000)
            if final_h != h:
                tab.run_cdp('Emulation.setDeviceMetricsOverride',
                            width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
        except Exception:
            pass

        # full_page=False because the viewport already covers the whole
        # content — avoids stitching artifacts and blank spaces.
        return tab.get_screenshot(as_base64='jpg', full_page=False)

    except Exception as e:
        logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
        return None
    finally:
        if tab:
            try:
                tab.close()
            except Exception:
                pass
|
697
|
+
async def close(self):
    """Shut the worker thread pool down without waiting for running jobs."""
    pool = self._executor
    pool.shutdown(wait=False)
    logger.info("ScreenshotService: Closed.")
|
701
|
+
async def close_async(self):
    """Alias that delegates to close() — kept for API compatibility."""
    await self.close()
|
704
|
+
# Singleton: lazily-created process-wide ScreenshotService instance,
# managed by get_screenshot_service() / close_screenshot_service().
_screenshot_service: Optional[ScreenshotService] = None
|
+
def get_screenshot_service(headless: bool = True) -> ScreenshotService:
    """Return the process-wide ScreenshotService, creating it on first call.

    Note: `headless` only takes effect on the call that creates the
    singleton; subsequent calls return the existing instance unchanged.
    """
    global _screenshot_service
    if _screenshot_service is not None:
        return _screenshot_service
    _screenshot_service = ScreenshotService(headless=headless, auto_start=True)
    return _screenshot_service
|
713
|
+
async def close_screenshot_service():
    """Dispose of the singleton ScreenshotService, if one was created."""
    global _screenshot_service
    service = _screenshot_service
    if service:
        await service.close()
        _screenshot_service = None
|
719
|
+
def prestart_browser(headless: bool = True):
    """Eagerly instantiate the screenshot service so the browser is warm."""
    get_screenshot_service(headless=headless)
|