entari-plugin-hyw 4.0.0rc4__py3-none-any.whl → 4.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

Files changed (30) hide show
  1. entari_plugin_hyw/__init__.py +216 -75
  2. entari_plugin_hyw/assets/card-dist/index.html +70 -79
  3. entari_plugin_hyw/browser/__init__.py +10 -0
  4. entari_plugin_hyw/browser/engines/base.py +13 -0
  5. entari_plugin_hyw/browser/engines/bing.py +95 -0
  6. entari_plugin_hyw/browser/engines/duckduckgo.py +137 -0
  7. entari_plugin_hyw/browser/engines/google.py +155 -0
  8. entari_plugin_hyw/browser/landing.html +172 -0
  9. entari_plugin_hyw/browser/manager.py +153 -0
  10. entari_plugin_hyw/browser/service.py +304 -0
  11. entari_plugin_hyw/card-ui/src/App.vue +526 -182
  12. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +7 -11
  13. entari_plugin_hyw/card-ui/src/components/StageCard.vue +33 -30
  14. entari_plugin_hyw/card-ui/src/types.ts +9 -0
  15. entari_plugin_hyw/definitions.py +155 -0
  16. entari_plugin_hyw/history.py +111 -33
  17. entari_plugin_hyw/misc.py +34 -0
  18. entari_plugin_hyw/modular_pipeline.py +384 -0
  19. entari_plugin_hyw/render_vue.py +326 -239
  20. entari_plugin_hyw/search.py +95 -708
  21. entari_plugin_hyw/stage_base.py +92 -0
  22. entari_plugin_hyw/stage_instruct.py +345 -0
  23. entari_plugin_hyw/stage_instruct_deepsearch.py +104 -0
  24. entari_plugin_hyw/stage_summary.py +164 -0
  25. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/METADATA +4 -4
  26. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/RECORD +28 -16
  27. entari_plugin_hyw/pipeline.py +0 -1219
  28. entari_plugin_hyw/prompts.py +0 -47
  29. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/WHEEL +0 -0
  30. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,153 @@
1
+ """
2
+ Shared Browser Manager (DrissionPage)
3
+
4
+ Manages a single ChromiumPage browser instance.
5
+ Supports multi-tab concurrency via DrissionPage's built-in tab management.
6
+ """
7
+
8
+ import threading
9
+ from typing import Optional, Any
10
+ from loguru import logger
11
+ from DrissionPage import ChromiumPage, ChromiumOptions
12
+ from DrissionPage.errors import PageDisconnectedError
13
+
14
class SharedBrowserManager:
    """
    Manages a shared DrissionPage Chromium browser.

    One ``ChromiumPage`` is created lazily and reused by every caller.
    ``get_instance`` hands out a process-wide singleton; ``start`` launches
    the browser (or confirms that the existing one still answers CDP calls)
    and ``close`` quits it.
    """

    _instance: Optional["SharedBrowserManager"] = None
    # Guards creation of the singleton in get_instance().
    _lock = threading.Lock()

    def __init__(self, headless: bool = True):
        self.headless = headless
        self._page: Optional[ChromiumPage] = None
        # True only while a launch is in progress (informational flag).
        self._starting = False
        self._tab_lock = threading.Lock()
        # Serialises start() and close() for this instance so that concurrent
        # callers wait for an in-flight launch instead of observing "no page".
        self._start_lock = threading.Lock()

    @classmethod
    def get_instance(cls, headless: bool = True) -> "SharedBrowserManager":
        """Get or create the singleton SharedBrowserManager.

        ``headless`` is only applied when the singleton is first created;
        later calls return the existing instance unchanged.
        """
        with cls._lock:
            if cls._instance is None:
                cls._instance = cls(headless=headless)
            return cls._instance

    def _is_alive(self) -> bool:
        """Return True when a page exists and still answers a CDP request."""
        page = self._page
        if page is None:
            return False
        try:
            return bool(page.run_cdp('Browser.getVersion'))
        except (PageDisconnectedError, Exception):
            # Any failure here is treated as "browser is gone".
            return False

    def _show_landing_page(self) -> None:
        """Best-effort: load the bundled landing.html into the main page."""
        try:
            import os
            from pathlib import Path

            landing_path = Path(
                os.path.abspath(os.path.join(os.path.dirname(__file__), 'landing.html'))
            )
            if landing_path.exists():
                # as_uri() yields a well-formed, percent-encoded file:// URL on
                # every platform (plain "file://" + path is wrong on Windows
                # and for paths containing spaces or other special characters).
                self._page.get(landing_path.as_uri())
        except Exception as e:
            logger.warning(f"SharedBrowserManager: Failed to show landing page: {e}")

    def start(self) -> bool:
        """Start the Chromium browser.

        Returns True once a live browser is available. If another thread is
        already launching the browser, this call waits for that launch and
        then reuses its result.

        Raises:
            Exception: whatever the browser launch raised; the page reference
                is reset to None before re-raising.
        """
        # Fast path: an existing browser that still responds.
        if self._is_alive():
            return True

        with self._start_lock:
            # Another thread may have completed the launch while we waited.
            if self._is_alive():
                return True

            self._page = None
            self._starting = True
            try:
                logger.info("SharedBrowserManager: Starting DrissionPage browser...")

                # Configure options
                co = ChromiumOptions()
                co.headless(self.headless)
                co.auto_port()  # Auto find available port

                # NOTE(review): --no-sandbox, --disable-web-security and
                # --allow-file-access-from-files weaken browser isolation.
                # Confirm this is acceptable for the URLs this browser loads.
                co.set_argument('--no-sandbox')
                co.set_argument('--disable-gpu')
                # Essential for loading local files and avoiding CORS issues
                co.set_argument('--allow-file-access-from-files')
                co.set_argument('--disable-web-security')
                # Hide scrollbars globally
                co.set_argument('--hide-scrollbars')
                # Original note (translated): "the reason for one hundred
                # thousand is scrollbar suppression (probably)". The height
                # actually used is 20000 -- presumably a very tall viewport so
                # that pages do not need a vertical scrollbar; TODO confirm.
                co.set_argument('--window-size=1280,20000')
                self._page = ChromiumPage(addr_or_opts=co)

                self._show_landing_page()

                logger.success(f"SharedBrowserManager: Browser ready (port={self._page.address})")
                return True

            except Exception as e:
                logger.error(f"SharedBrowserManager: Failed to start browser: {e}")
                self._page = None
                raise
            finally:
                self._starting = False

    @property
    def page(self) -> Optional["ChromiumPage"]:
        """Get the main ChromiumPage instance, starting the browser if needed."""
        if self._page is None:
            self.start()
        return self._page

    def new_tab(self, url: str) -> Any:
        """Thread-safe tab creation.

        Raises:
            RuntimeError: if no browser page is available.
        """
        with self._tab_lock:
            page = self.page
            if not page:
                raise RuntimeError("Browser not available")
            return page.new_tab(url)

    def close(self):
        """Shutdown the browser."""
        # Shares the lock with start() so a shutdown cannot interleave with a
        # launch that is still in progress.
        with self._start_lock:
            if self._page:
                try:
                    self._page.quit()
                    logger.info("SharedBrowserManager: Browser closed.")
                except Exception as e:
                    logger.warning(f"SharedBrowserManager: Error closing browser: {e}")
                finally:
                    self._page = None

    @staticmethod
    def hide_scrollbars(page: "ChromiumPage"):
        """
        Robustly hide scrollbars using CDP commands.
        This eliminates the reserved space/gutter that standard CSS might miss.

        Failures are logged as warnings and never raised.
        """
        try:
            # Emulation.setScrollbarsHidden is a CDP command
            page.run_cdp('Emulation.setScrollbarsHidden', hidden=True)
            logger.debug("SharedBrowserManager: CDP scrollbars hidden.")
        except Exception as e:
            logger.warning(f"SharedBrowserManager: Failed to hide scrollbars via CDP: {e}")
129
+
130
+
131
# Module-level cache of the manager handed out by get_shared_browser_manager().
_shared_manager: Optional[SharedBrowserManager] = None


def get_shared_browser_manager(headless: bool = True) -> SharedBrowserManager:
    """
    Return the shared browser manager, creating and starting it on first use.

    ``headless`` only matters for the call that creates the manager; once a
    manager is cached it is returned as-is.
    """
    global _shared_manager

    # Already initialised: hand back the cached manager untouched.
    if _shared_manager is not None:
        return _shared_manager

    _shared_manager = SharedBrowserManager.get_instance(headless=headless)
    _shared_manager.start()
    return _shared_manager
146
+
147
+
148
def close_shared_browser():
    """Shut down the cached shared browser manager, if any, and forget it."""
    global _shared_manager
    if not _shared_manager:
        return
    _shared_manager.close()
    _shared_manager = None
@@ -0,0 +1,304 @@
1
+ """
2
+ Browser Service (DrissionPage)
3
+
4
+ Provides page fetching and screenshot capabilities using DrissionPage.
5
+ """
6
+
7
+ import asyncio
8
+ import base64
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from typing import Optional, Dict, Any, List
11
+ from loguru import logger
12
+ import trafilatura
13
+
14
class ScreenshotService:
    """
    Browser Service using DrissionPage.

    Fetches pages (HTML, extracted markdown, optional screenshot and a small
    image gallery) and takes screenshots of URLs. The blocking browser work
    runs in a thread pool so the async entry points do not block the event
    loop.
    """

    # Polls (up to 15 times, 100 ms apart) until no visible element mentions
    # "loading" in its text, id or class.
    _WAIT_FOR_LOADERS_JS = """
    (async () => {
        const isVisible = (el) => !!(el.offsetWidth || el.offsetHeight || el.getClientRects().length);
        for (let i = 0; i < 15; i++) {
            const indicators = Array.from(document.querySelectorAll('*')).filter(el => {
                try {
                    const text = (el.textContent || '').toLowerCase();
                    const id = (el.id || '').toLowerCase();
                    const cls = (el.getAttribute('class') || '').toLowerCase();
                    return (text.includes('loading') || id.includes('loading') || cls.includes('loading')) && isVisible(el);
                } catch(e) { return false; }
            });
            if (indicators.length === 0) break;
            await new Promise(r => setTimeout(r, 100));
        }
    })()
    """

    # Injects CSS that hides scrollbars and disables overflow on html/body.
    _HIDE_SCROLLBARS_JS = """
    const style = document.createElement('style');
    style.textContent = `
        ::-webkit-scrollbar { display: none !important; }
        html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
    `;
    document.head.appendChild(style);
    document.documentElement.style.overflow = 'hidden';
    document.body.style.overflow = 'hidden';
    """

    # Collects up to 8 content images as base64 JPEG strings.
    # Strategy: use Canvas to grab already loaded images (instant); on pages
    # whose URL does not contain "google" or "search", fall back to fetch().
    _EXTRACT_IMAGES_JS = """
    (async () => {
        const blocklist = ['logo', 'icon', 'avatar', 'ad', 'pixel', 'tracker', 'button', 'menu', 'nav'];
        const candidates = Array.from(document.querySelectorAll('img'));
        const validImages = [];

        // Helper: Get base64 from loaded image via Canvas
        const getBase64 = (img) => {
            try {
                const canvas = document.createElement('canvas');
                canvas.width = img.naturalWidth;
                canvas.height = img.naturalHeight;
                const ctx = canvas.getContext('2d');
                ctx.drawImage(img, 0, 0);
                return canvas.toDataURL('image/jpeg').split(',')[1];
            } catch(e) { return null; }
        };

        for (const img of candidates) {
            if (validImages.length >= 8) break;

            if (img.naturalWidth < 100 || img.naturalHeight < 80) continue;

            const alt = (img.alt || '').toLowerCase();
            const cls = (typeof img.className === 'string' ? img.className : '').toLowerCase();
            const src = (img.src || '').toLowerCase();

            if (blocklist.some(b => alt.includes(b) || cls.includes(b) || src.includes(b))) continue;

            // 1. Try Canvas (Instant for loaded images)
            if (img.complete && img.naturalHeight > 0) {
                const b64 = getBase64(img);
                if (b64) {
                    validImages.push(b64);
                    continue;
                }
            }

            // 2. Fallback to fetch (only for non-search pages to avoid delay)
            // We skip fetch for search pages to ensure speed
            if (!window.location.href.includes('google') && !window.location.href.includes('search')) {
                try {
                    const controller = new AbortController();
                    const id = setTimeout(() => controller.abort(), 2000);
                    const resp = await fetch(img.src, { signal: controller.signal });
                    clearTimeout(id);
                    const blob = await resp.blob();
                    const b64 = await new Promise(resolve => {
                        const reader = new FileReader();
                        reader.onloadend = () => resolve(reader.result.split(',')[1]);
                        reader.onerror = () => resolve(null);
                        reader.readAsDataURL(blob);
                    });
                    if (b64) validImages.push(b64);
                } catch(e) {}
            }
        }
        return validImages;
    })()
    """

    def __init__(self, headless: bool = True, auto_start: bool = True):
        self.headless = headless
        self._manager = None
        self._executor = ThreadPoolExecutor(max_workers=10)

        if auto_start:
            self._ensure_ready()

    def _ensure_ready(self):
        """Ensure shared browser is ready."""
        from .manager import get_shared_browser_manager
        self._manager = get_shared_browser_manager(headless=self.headless)

    @staticmethod
    def _error_result(url: str, message: str) -> Dict[str, Any]:
        """Build the error-shaped result dict returned when a fetch fails."""
        return {"content": message, "title": "Error", "url": url}

    def _hide_scrollbars(self, tab: Any) -> None:
        """Hide scrollbars on ``tab`` via CDP and then via injected CSS."""
        from .manager import SharedBrowserManager
        SharedBrowserManager.hide_scrollbars(tab)
        tab.run_js(self._HIDE_SCROLLBARS_JS)

    @staticmethod
    def _close_tab(tab: Any) -> None:
        """Close ``tab`` if one was opened; failures are logged, not raised."""
        if not tab:
            return
        try:
            tab.close()
        except Exception as e:
            logger.debug(f"ScreenshotService: Failed to close tab: {e}")

    async def fetch_page(self, url: str, timeout: float = 20.0, include_screenshot: bool = True) -> Dict[str, Any]:
        """
        Fetch page content (and optionally screenshot).
        Runs in a thread executor to avoid blocking the async loop.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._executor,
            self._fetch_page_sync,
            url,
            timeout,
            include_screenshot,
        )

    def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
        """Synchronous fetch logic.

        Returns a dict with ``content``, ``html``, ``title``, ``url``,
        ``raw_screenshot_b64`` and ``images`` on success, or an error dict
        (``content``/``title``/``url`` only) on failure. Exceptions are caught
        and reported through the error dict.

        NOTE: ``timeout`` is accepted for interface compatibility but the
        waits below use their own fixed timeouts.
        """
        if not url:
            return self._error_result("", "Error: missing url")

        tab = None
        try:
            self._ensure_ready()
            page = self._manager.page
            if not page:
                return self._error_result(url, "Error: Browser not available")

            # New Tab
            tab = page.new_tab(url)

            # Wait logic
            is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
            if is_search_page:
                # Quick check for results
                result_selectors = ['#results', '#b_results', '#search', '#links', '.result']
                for selector in result_selectors:
                    if tab.ele(selector, timeout=1):
                        break
            else:
                # 1. Wait for document to settle (Fast Dynamic Wait)
                try:
                    tab.wait.doc_loaded(timeout=5)
                    # Brief check for loading overlays (fast skip if none)
                    tab.run_js(self._WAIT_FOR_LOADERS_JS, as_expr=True)
                except Exception as e:
                    # Best effort: continue with whatever has loaded so far.
                    logger.debug(f"ScreenshotService: Page settle wait failed for {url}: {e}")

            html = tab.html
            title = tab.title
            final_url = tab.url

            raw_screenshot_b64 = None
            if include_screenshot:
                try:
                    # Scrollbar Hiding Best Effort
                    self._hide_scrollbars(tab)
                    raw_screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
                except Exception as e:
                    logger.warning(f"ScreenshotService: Failed to capture screenshot: {e}")

            # Extract content
            content = trafilatura.extract(
                html, include_links=True, include_images=True, include_comments=False,
                include_tables=True, favor_precision=False, output_format="markdown"
            ) or ""

            # 2. Extract Images via JS (Gallery)
            images_b64 = []
            try:
                images_b64 = tab.run_js(self._EXTRACT_IMAGES_JS, as_expr=True) or []

                if images_b64:
                    logger.info(f"ScreenshotService: Extracted {len(images_b64)} images for {url}")

            except Exception as e:
                logger.warning(f"ScreenshotService: Image extraction failed: {e}")

            return {
                "content": content,
                "html": html,
                "title": title,
                "url": final_url,
                "raw_screenshot_b64": raw_screenshot_b64,
                "images": images_b64
            }

        except Exception as e:
            logger.error(f"ScreenshotService: Failed to fetch {url}: {e}")
            return self._error_result(url, f"Error: fetch failed ({e})")
        finally:
            self._close_tab(tab)

    async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
        """Fetch multiple pages concurrently.

        Results are returned in the same order as ``urls``. A fetch that
        raised (for example because the executor was already shut down) is
        reported as an error dict rather than as an exception object, so every
        entry has the same shape.
        """
        if not urls:
            return []
        logger.info(f"ScreenshotService: Batch fetching {len(urls)} URLs (screenshots={include_screenshot})")
        tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)

        results: List[Dict[str, Any]] = []
        for url, outcome in zip(urls, outcomes):
            if isinstance(outcome, BaseException):
                logger.error(f"ScreenshotService: Failed to fetch {url}: {outcome}")
                outcome = self._error_result(url, f"Error: fetch failed ({outcome})")
            results.append(outcome)
        return results

    async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
        """Screenshot URL (Async wrapper for sync)."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._executor,
            self._screenshot_sync,
            url, wait_load, timeout, full_page, quality
        )

    def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int) -> Optional[str]:
        """Synchronous screenshot.

        Returns the screenshot as returned by DrissionPage for
        ``as_base64='jpg'``, or None when ``url`` is empty, the browser is
        unavailable, or any step fails. The ``#main-container`` element is
        captured when present; otherwise the tab itself is captured.
        """
        if not url:
            return None
        tab = None
        try:
            self._ensure_ready()
            page = self._manager.page
            if not page:
                return None

            tab = page.new_tab(url)
            try:
                if wait_load:
                    tab.wait.load_complete(timeout=timeout)
                else:
                    tab.wait.doc_loaded(timeout=timeout)
            except Exception as e:
                # Best effort: screenshot whatever has rendered.
                logger.debug(f"ScreenshotService: Load wait failed for {url}: {e}")

            # Wait for the main element once and reuse the handle below, so a
            # page without it does not pay the lookup timeout twice.
            ele = tab.ele("#main-container")

            # Scrollbar Hiding
            self._hide_scrollbars(tab)

            if ele:
                return ele.get_screenshot(as_base64='jpg', quality=quality)
            return tab.get_screenshot(as_base64='jpg', full_page=full_page, quality=quality)

        except Exception as e:
            logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
            return None
        finally:
            self._close_tab(tab)

    async def close(self):
        """Shut down the worker thread pool without waiting for running jobs."""
        self._executor.shutdown(wait=False)
        logger.info("ScreenshotService: Closed.")

    async def close_async(self):
        """Alias for :meth:`close`."""
        await self.close()
287
+
288
# Lazily created, module-wide ScreenshotService instance.
_screenshot_service: Optional[ScreenshotService] = None


def get_screenshot_service(headless: bool = True) -> ScreenshotService:
    """Return the module-wide ScreenshotService, creating it on first use.

    ``headless`` is only used when the service is created; an existing
    service is returned unchanged.
    """
    global _screenshot_service
    if _screenshot_service is not None:
        return _screenshot_service
    _screenshot_service = ScreenshotService(headless=headless, auto_start=True)
    return _screenshot_service
296
+
297
async def close_screenshot_service():
    """Close the module-wide ScreenshotService, if one exists, and drop it."""
    global _screenshot_service
    if not _screenshot_service:
        return
    await _screenshot_service.close()
    _screenshot_service = None
302
+
303
def prestart_browser(headless: bool = True):
    """Create the screenshot service up front so the browser is started early."""
    # The returned service is not needed here; only the creation side effect is.
    _ = get_screenshot_service(headless=headless)