entari-plugin-hyw 4.0.0rc6__py3-none-any.whl → 4.0.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic. Click here for more details.
- entari_plugin_hyw/Untitled-1 +1865 -0
- entari_plugin_hyw/__init__.py +733 -379
- entari_plugin_hyw/history.py +60 -57
- entari_plugin_hyw/misc.py +3 -0
- entari_plugin_hyw/search_cache.py +154 -0
- {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/METADATA +3 -1
- entari_plugin_hyw-4.0.0rc8.dist-info/RECORD +68 -0
- {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/WHEEL +1 -1
- {entari_plugin_hyw-4.0.0rc6.dist-info → entari_plugin_hyw-4.0.0rc8.dist-info}/top_level.txt +1 -0
- hyw_core/__init__.py +94 -0
- hyw_core/browser_control/__init__.py +65 -0
- hyw_core/browser_control/assets/card-dist/index.html +409 -0
- hyw_core/browser_control/assets/index.html +5691 -0
- hyw_core/browser_control/engines/__init__.py +17 -0
- hyw_core/browser_control/engines/default.py +166 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/duckduckgo.py +42 -8
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/google.py +1 -1
- {entari_plugin_hyw/browser → hyw_core/browser_control}/manager.py +15 -8
- entari_plugin_hyw/render_vue.py → hyw_core/browser_control/renderer.py +29 -14
- hyw_core/browser_control/service.py +720 -0
- hyw_core/config.py +154 -0
- hyw_core/core.py +322 -0
- hyw_core/definitions.py +83 -0
- entari_plugin_hyw/modular_pipeline.py → hyw_core/pipeline.py +204 -86
- {entari_plugin_hyw → hyw_core}/search.py +60 -19
- hyw_core/stages/__init__.py +21 -0
- entari_plugin_hyw/stage_base.py → hyw_core/stages/base.py +3 -0
- entari_plugin_hyw/stage_summary.py → hyw_core/stages/summary.py +36 -7
- entari_plugin_hyw/assets/card-dist/index.html +0 -387
- entari_plugin_hyw/browser/__init__.py +0 -10
- entari_plugin_hyw/browser/engines/bing.py +0 -95
- entari_plugin_hyw/browser/service.py +0 -304
- entari_plugin_hyw/card-ui/.gitignore +0 -24
- entari_plugin_hyw/card-ui/README.md +0 -5
- entari_plugin_hyw/card-ui/index.html +0 -16
- entari_plugin_hyw/card-ui/package-lock.json +0 -2342
- entari_plugin_hyw/card-ui/package.json +0 -31
- entari_plugin_hyw/card-ui/public/logos/anthropic.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/cerebras.svg +0 -9
- entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/gemini.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/google.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/microsoft.svg +0 -15
- entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/openai.svg +0 -1
- entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/perplexity.svg +0 -24
- entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
- entari_plugin_hyw/card-ui/public/vite.svg +0 -1
- entari_plugin_hyw/card-ui/src/App.vue +0 -756
- entari_plugin_hyw/card-ui/src/assets/vue.svg +0 -1
- entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +0 -41
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +0 -382
- entari_plugin_hyw/card-ui/src/components/SectionCard.vue +0 -41
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +0 -240
- entari_plugin_hyw/card-ui/src/main.ts +0 -5
- entari_plugin_hyw/card-ui/src/style.css +0 -29
- entari_plugin_hyw/card-ui/src/test_regex.js +0 -103
- entari_plugin_hyw/card-ui/src/types.ts +0 -61
- entari_plugin_hyw/card-ui/tsconfig.app.json +0 -16
- entari_plugin_hyw/card-ui/tsconfig.json +0 -7
- entari_plugin_hyw/card-ui/tsconfig.node.json +0 -26
- entari_plugin_hyw/card-ui/vite.config.ts +0 -16
- entari_plugin_hyw/definitions.py +0 -155
- entari_plugin_hyw/stage_instruct.py +0 -345
- entari_plugin_hyw/stage_instruct_deepsearch.py +0 -104
- entari_plugin_hyw-4.0.0rc6.dist-info/RECORD +0 -100
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/anthropic.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/cerebras.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/deepseek.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/gemini.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/google.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/grok.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/huggingface.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/microsoft.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/minimax.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/mistral.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/nvida.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openai.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openrouter.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/perplexity.svg +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/qwen.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xai.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xiaomi.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/zai.png +0 -0
- {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/vite.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/anthropic.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/cerebras.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/deepseek.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/gemini.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/google.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/grok.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/huggingface.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/microsoft.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/minimax.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/mistral.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/nvida.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openai.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openrouter.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/perplexity.svg +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/qwen.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xai.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xiaomi.png +0 -0
- {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/zai.png +0 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/base.py +0 -0
- {entari_plugin_hyw/browser → hyw_core/browser_control}/landing.html +0 -0
- {entari_plugin_hyw → hyw_core}/image_cache.py +0 -0
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
from .manager import get_shared_browser_manager, close_shared_browser
|
|
2
|
-
from .service import get_screenshot_service, close_screenshot_service, prestart_browser
|
|
3
|
-
|
|
4
|
-
__all__ = [
|
|
5
|
-
"get_shared_browser_manager",
|
|
6
|
-
"close_shared_browser",
|
|
7
|
-
"get_screenshot_service",
|
|
8
|
-
"close_screenshot_service",
|
|
9
|
-
"prestart_browser",
|
|
10
|
-
]
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
import urllib.parse
|
|
3
|
-
import re
|
|
4
|
-
from typing import List, Dict, Any
|
|
5
|
-
from loguru import logger
|
|
6
|
-
from .base import SearchEngine
|
|
7
|
-
|
|
8
|
-
class BingEngine(SearchEngine):
|
|
9
|
-
"""
|
|
10
|
-
Search engine implementation for Bing.
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
def build_url(self, query: str, limit: int = 10) -> str:
|
|
14
|
-
encoded_query = urllib.parse.quote(query)
|
|
15
|
-
base = "https://www.bing.com/search"
|
|
16
|
-
return f"{base}?form=&q={encoded_query}"
|
|
17
|
-
|
|
18
|
-
def parse(self, content: str) -> List[Dict[str, Any]]:
|
|
19
|
-
results = []
|
|
20
|
-
# Split by b_algo to isolate results
|
|
21
|
-
chunks = content.split('class="b_algo"')
|
|
22
|
-
|
|
23
|
-
# Helper to decode Bing URLs roughly
|
|
24
|
-
def decode_bing_url(u):
|
|
25
|
-
if "bing.com/ck/a?" not in u: return u
|
|
26
|
-
try:
|
|
27
|
-
# Url is usually like ...&u=a1<base64>&...
|
|
28
|
-
# We look for &u=...
|
|
29
|
-
import base64
|
|
30
|
-
match = re.search(r'[?&]u=a1([^&]+)', u)
|
|
31
|
-
if match:
|
|
32
|
-
# Bing uses a modified base64 (url safe) and adds 'a1' prefix
|
|
33
|
-
# We stripped 'a1' in regex match group
|
|
34
|
-
b64 = match.group(1)
|
|
35
|
-
# padding
|
|
36
|
-
b64 += '=' * (-len(b64) % 4)
|
|
37
|
-
# url safe
|
|
38
|
-
b64 = b64.replace('-', '+').replace('_', '/')
|
|
39
|
-
decoded = base64.b64decode(b64).decode('utf-8')
|
|
40
|
-
return decoded
|
|
41
|
-
except Exception:
|
|
42
|
-
pass
|
|
43
|
-
return u
|
|
44
|
-
|
|
45
|
-
seen_urls = set()
|
|
46
|
-
|
|
47
|
-
for chunk in chunks[1:]:
|
|
48
|
-
# Exact regexes for title and snippet within the chunk
|
|
49
|
-
# Title: <h2><a href="...">...</a></h2>
|
|
50
|
-
link_match = re.search(r'<h2[^>]*>.*?<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', chunk, re.IGNORECASE | re.DOTALL)
|
|
51
|
-
if not link_match:
|
|
52
|
-
# Fallback: pure a tag
|
|
53
|
-
link_match = re.search(r'<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', chunk, re.IGNORECASE | re.DOTALL)
|
|
54
|
-
|
|
55
|
-
if link_match:
|
|
56
|
-
raw_url = link_match.group(1)
|
|
57
|
-
title_html = link_match.group(2)
|
|
58
|
-
title = re.sub(r'<[^>]+>', '', title_html).strip()
|
|
59
|
-
|
|
60
|
-
url = decode_bing_url(raw_url)
|
|
61
|
-
|
|
62
|
-
if url in seen_urls: continue
|
|
63
|
-
seen_urls.add(url)
|
|
64
|
-
|
|
65
|
-
# Snippet: class="b_caption" ... <p> ... </p> or just div text
|
|
66
|
-
snippet = ""
|
|
67
|
-
caption_match = re.search(r'class="b_caption"[^>]*>(.*?)</div>', chunk, re.IGNORECASE | re.DOTALL)
|
|
68
|
-
if caption_match:
|
|
69
|
-
snippet_html = caption_match.group(1)
|
|
70
|
-
snippet = re.sub(r'<[^>]+>', ' ', snippet_html).strip()
|
|
71
|
-
else:
|
|
72
|
-
# Fallback snippet
|
|
73
|
-
start = link_match.end()
|
|
74
|
-
snippet = re.sub(r'<[^>]+>', ' ', chunk[start:start+600]).strip()
|
|
75
|
-
|
|
76
|
-
snippet = re.sub(r'\s+', ' ', snippet).strip()
|
|
77
|
-
|
|
78
|
-
# Image extraction (basic)
|
|
79
|
-
images = []
|
|
80
|
-
img_matches = re.findall(r'<img[^>]+src=["\'](http[^"\']+)["\']', chunk)
|
|
81
|
-
for img_url in img_matches:
|
|
82
|
-
if not any(x in img_url for x in ['favicon', 'icon', 'tracking', 'pixel']):
|
|
83
|
-
images.append(img_url)
|
|
84
|
-
|
|
85
|
-
if url and title:
|
|
86
|
-
results.append({
|
|
87
|
-
"title": title,
|
|
88
|
-
"url": url,
|
|
89
|
-
"domain": urllib.parse.urlparse(url).hostname or "",
|
|
90
|
-
"content": snippet[:5000],
|
|
91
|
-
"images": images[:3]
|
|
92
|
-
})
|
|
93
|
-
|
|
94
|
-
logger.info(f"BingEngine parsed {len(results)} results.")
|
|
95
|
-
return results
|
|
@@ -1,304 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Browser Service (DrissionPage)
|
|
3
|
-
|
|
4
|
-
Provides page fetching and screenshot capabilities using DrissionPage.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import asyncio
|
|
8
|
-
import base64
|
|
9
|
-
from concurrent.futures import ThreadPoolExecutor
|
|
10
|
-
from typing import Optional, Dict, Any, List
|
|
11
|
-
from loguru import logger
|
|
12
|
-
import trafilatura
|
|
13
|
-
|
|
14
|
-
class ScreenshotService:
|
|
15
|
-
"""
|
|
16
|
-
Browser Service using DrissionPage.
|
|
17
|
-
"""
|
|
18
|
-
|
|
19
|
-
def __init__(self, headless: bool = True, auto_start: bool = True):
|
|
20
|
-
self.headless = headless
|
|
21
|
-
self._manager = None
|
|
22
|
-
self._executor = ThreadPoolExecutor(max_workers=10)
|
|
23
|
-
|
|
24
|
-
if auto_start:
|
|
25
|
-
self._ensure_ready()
|
|
26
|
-
|
|
27
|
-
def _ensure_ready(self):
|
|
28
|
-
"""Ensure shared browser is ready."""
|
|
29
|
-
from .manager import get_shared_browser_manager
|
|
30
|
-
self._manager = get_shared_browser_manager(headless=self.headless)
|
|
31
|
-
|
|
32
|
-
async def fetch_page(self, url: str, timeout: float = 20.0, include_screenshot: bool = True) -> Dict[str, Any]:
|
|
33
|
-
"""
|
|
34
|
-
Fetch page content (and optionally screenshot).
|
|
35
|
-
Runs in a thread executor to avoid blocking the async loop.
|
|
36
|
-
"""
|
|
37
|
-
loop = asyncio.get_running_loop()
|
|
38
|
-
return await loop.run_in_executor(
|
|
39
|
-
self._executor,
|
|
40
|
-
self._fetch_page_sync,
|
|
41
|
-
url,
|
|
42
|
-
timeout,
|
|
43
|
-
include_screenshot
|
|
44
|
-
)
|
|
45
|
-
|
|
46
|
-
def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
|
|
47
|
-
"""Synchronous fetch logic."""
|
|
48
|
-
if not url:
|
|
49
|
-
return {"content": "Error: missing url", "title": "Error", "url": ""}
|
|
50
|
-
|
|
51
|
-
tab = None
|
|
52
|
-
try:
|
|
53
|
-
self._ensure_ready()
|
|
54
|
-
page = self._manager.page
|
|
55
|
-
if not page:
|
|
56
|
-
return {"content": "Error: Browser not available", "title": "Error", "url": url}
|
|
57
|
-
|
|
58
|
-
# New Tab
|
|
59
|
-
tab = page.new_tab(url)
|
|
60
|
-
|
|
61
|
-
# Wait logic
|
|
62
|
-
is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
|
|
63
|
-
if is_search_page:
|
|
64
|
-
# Quick check for results
|
|
65
|
-
result_selectors = ['#results', '#b_results', '#search', '#links', '.result']
|
|
66
|
-
for selector in result_selectors:
|
|
67
|
-
if tab.ele(selector, timeout=1):
|
|
68
|
-
break
|
|
69
|
-
else:
|
|
70
|
-
# 1. Wait for document to settle (Fast Dynamic Wait)
|
|
71
|
-
try:
|
|
72
|
-
tab.wait.doc_loaded(timeout=5)
|
|
73
|
-
# Brief check for loading overlays (fast skip if none)
|
|
74
|
-
tab.run_js("""
|
|
75
|
-
(async () => {
|
|
76
|
-
const isVisible = (el) => !!(el.offsetWidth || el.offsetHeight || el.getClientRects().length);
|
|
77
|
-
for (let i = 0; i < 15; i++) {
|
|
78
|
-
const indicators = Array.from(document.querySelectorAll('*')).filter(el => {
|
|
79
|
-
try {
|
|
80
|
-
const text = (el.textContent || '').toLowerCase();
|
|
81
|
-
const id = (el.id || '').toLowerCase();
|
|
82
|
-
const cls = (el.getAttribute('class') || '').toLowerCase();
|
|
83
|
-
return (text.includes('loading') || id.includes('loading') || cls.includes('loading')) && isVisible(el);
|
|
84
|
-
} catch(e) { return false; }
|
|
85
|
-
});
|
|
86
|
-
if (indicators.length === 0) break;
|
|
87
|
-
await new Promise(r => setTimeout(r, 100));
|
|
88
|
-
}
|
|
89
|
-
})()
|
|
90
|
-
""", as_expr=True)
|
|
91
|
-
except: pass
|
|
92
|
-
|
|
93
|
-
html = tab.html
|
|
94
|
-
title = tab.title
|
|
95
|
-
final_url = tab.url
|
|
96
|
-
|
|
97
|
-
raw_screenshot_b64 = None
|
|
98
|
-
if include_screenshot:
|
|
99
|
-
try:
|
|
100
|
-
# Scrollbar Hiding Best Effort
|
|
101
|
-
from .manager import SharedBrowserManager
|
|
102
|
-
SharedBrowserManager.hide_scrollbars(tab)
|
|
103
|
-
|
|
104
|
-
# Inject CSS
|
|
105
|
-
tab.run_js("""
|
|
106
|
-
const style = document.createElement('style');
|
|
107
|
-
style.textContent = `
|
|
108
|
-
::-webkit-scrollbar { display: none !important; }
|
|
109
|
-
html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
|
|
110
|
-
`;
|
|
111
|
-
document.head.appendChild(style);
|
|
112
|
-
document.documentElement.style.overflow = 'hidden';
|
|
113
|
-
document.body.style.overflow = 'hidden';
|
|
114
|
-
""")
|
|
115
|
-
|
|
116
|
-
raw_screenshot_b64 = tab.get_screenshot(as_base64='jpg', full_page=False)
|
|
117
|
-
except Exception as e:
|
|
118
|
-
logger.warning(f"ScreenshotService: Failed to capture screenshot: {e}")
|
|
119
|
-
|
|
120
|
-
# Extract content
|
|
121
|
-
content = trafilatura.extract(
|
|
122
|
-
html, include_links=True, include_images=True, include_comments=False,
|
|
123
|
-
include_tables=True, favor_precision=False, output_format="markdown"
|
|
124
|
-
) or ""
|
|
125
|
-
|
|
126
|
-
# 2. Extract Images via Parallelized JS (Gallery)
|
|
127
|
-
# Strategy: For search pages, use Canvas to grab already loaded images (Instant)
|
|
128
|
-
# For other pages, use fetch (more robust for lazy load)
|
|
129
|
-
images_b64 = []
|
|
130
|
-
try:
|
|
131
|
-
js_code = """
|
|
132
|
-
(async () => {
|
|
133
|
-
const blocklist = ['logo', 'icon', 'avatar', 'ad', 'pixel', 'tracker', 'button', 'menu', 'nav'];
|
|
134
|
-
const candidates = Array.from(document.querySelectorAll('img'));
|
|
135
|
-
const validImages = [];
|
|
136
|
-
|
|
137
|
-
// Helper: Get base64 from loaded image via Canvas
|
|
138
|
-
const getBase64 = (img) => {
|
|
139
|
-
try {
|
|
140
|
-
const canvas = document.createElement('canvas');
|
|
141
|
-
canvas.width = img.naturalWidth;
|
|
142
|
-
canvas.height = img.naturalHeight;
|
|
143
|
-
const ctx = canvas.getContext('2d');
|
|
144
|
-
ctx.drawImage(img, 0, 0);
|
|
145
|
-
return canvas.toDataURL('image/jpeg').split(',')[1];
|
|
146
|
-
} catch(e) { return null; }
|
|
147
|
-
};
|
|
148
|
-
|
|
149
|
-
for (const img of candidates) {
|
|
150
|
-
if (validImages.length >= 8) break;
|
|
151
|
-
|
|
152
|
-
if (img.naturalWidth < 100 || img.naturalHeight < 80) continue;
|
|
153
|
-
|
|
154
|
-
const alt = (img.alt || '').toLowerCase();
|
|
155
|
-
const cls = (typeof img.className === 'string' ? img.className : '').toLowerCase();
|
|
156
|
-
const src = (img.src || '').toLowerCase();
|
|
157
|
-
|
|
158
|
-
if (blocklist.some(b => alt.includes(b) || cls.includes(b) || src.includes(b))) continue;
|
|
159
|
-
|
|
160
|
-
// 1. Try Canvas (Instant for loaded images)
|
|
161
|
-
if (img.complete && img.naturalHeight > 0) {
|
|
162
|
-
const b64 = getBase64(img);
|
|
163
|
-
if (b64) {
|
|
164
|
-
validImages.push(b64);
|
|
165
|
-
continue;
|
|
166
|
-
}
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
// 2. Fallback to fetch (only for non-search pages to avoid delay)
|
|
170
|
-
// We skip fetch for search pages to ensure speed
|
|
171
|
-
if (!window.location.href.includes('google') && !window.location.href.includes('search')) {
|
|
172
|
-
try {
|
|
173
|
-
const controller = new AbortController();
|
|
174
|
-
const id = setTimeout(() => controller.abort(), 2000);
|
|
175
|
-
const resp = await fetch(img.src, { signal: controller.signal });
|
|
176
|
-
clearTimeout(id);
|
|
177
|
-
const blob = await resp.blob();
|
|
178
|
-
const b64 = await new Promise(resolve => {
|
|
179
|
-
const reader = new FileReader();
|
|
180
|
-
reader.onloadend = () => resolve(reader.result.split(',')[1]);
|
|
181
|
-
reader.onerror = () => resolve(null);
|
|
182
|
-
reader.readAsDataURL(blob);
|
|
183
|
-
});
|
|
184
|
-
if (b64) validImages.push(b64);
|
|
185
|
-
} catch(e) {}
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
return validImages;
|
|
189
|
-
})()
|
|
190
|
-
"""
|
|
191
|
-
images_b64 = tab.run_js(js_code, as_expr=True) or []
|
|
192
|
-
|
|
193
|
-
if images_b64:
|
|
194
|
-
logger.info(f"ScreenshotService: Extracted {len(images_b64)} images for {url}")
|
|
195
|
-
|
|
196
|
-
except Exception as e:
|
|
197
|
-
logger.warning(f"ScreenshotService: Image extraction failed: {e}")
|
|
198
|
-
|
|
199
|
-
return {
|
|
200
|
-
"content": content,
|
|
201
|
-
"html": html,
|
|
202
|
-
"title": title,
|
|
203
|
-
"url": final_url,
|
|
204
|
-
"raw_screenshot_b64": raw_screenshot_b64,
|
|
205
|
-
"images": images_b64
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
except Exception as e:
|
|
209
|
-
logger.error(f"ScreenshotService: Failed to fetch {url}: {e}")
|
|
210
|
-
return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
|
|
211
|
-
finally:
|
|
212
|
-
if tab:
|
|
213
|
-
try: tab.close()
|
|
214
|
-
except: pass
|
|
215
|
-
|
|
216
|
-
async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
|
|
217
|
-
"""Fetch multiple pages concurrently."""
|
|
218
|
-
if not urls: return []
|
|
219
|
-
logger.info(f"ScreenshotService: Batch fetching {len(urls)} URLs (screenshots={include_screenshot})")
|
|
220
|
-
tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
|
|
221
|
-
return await asyncio.gather(*tasks, return_exceptions=True)
|
|
222
|
-
|
|
223
|
-
async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
|
|
224
|
-
"""Screenshot URL (Async wrapper for sync)."""
|
|
225
|
-
loop = asyncio.get_running_loop()
|
|
226
|
-
return await loop.run_in_executor(
|
|
227
|
-
self._executor,
|
|
228
|
-
self._screenshot_sync,
|
|
229
|
-
url, wait_load, timeout, full_page, quality
|
|
230
|
-
)
|
|
231
|
-
|
|
232
|
-
def _screenshot_sync(self, url: str, wait_load: bool, timeout: float, full_page: bool, quality: int) -> Optional[str]:
|
|
233
|
-
"""Synchronous screenshot."""
|
|
234
|
-
if not url: return None
|
|
235
|
-
tab = None
|
|
236
|
-
try:
|
|
237
|
-
self._ensure_ready()
|
|
238
|
-
page = self._manager.page
|
|
239
|
-
if not page: return None
|
|
240
|
-
|
|
241
|
-
tab = page.new_tab(url)
|
|
242
|
-
try:
|
|
243
|
-
if wait_load:
|
|
244
|
-
tab.wait.load_complete(timeout=timeout)
|
|
245
|
-
else:
|
|
246
|
-
tab.wait.doc_loaded(timeout=timeout)
|
|
247
|
-
except: pass
|
|
248
|
-
|
|
249
|
-
# Wait for main element
|
|
250
|
-
if tab.ele("#main-container"):
|
|
251
|
-
pass
|
|
252
|
-
|
|
253
|
-
# Scrollbar Hiding
|
|
254
|
-
from .manager import SharedBrowserManager
|
|
255
|
-
SharedBrowserManager.hide_scrollbars(tab)
|
|
256
|
-
tab.run_js("""
|
|
257
|
-
const style = document.createElement('style');
|
|
258
|
-
style.textContent = `
|
|
259
|
-
::-webkit-scrollbar { display: none !important; }
|
|
260
|
-
html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
|
|
261
|
-
`;
|
|
262
|
-
document.head.appendChild(style);
|
|
263
|
-
document.documentElement.style.overflow = 'hidden';
|
|
264
|
-
document.body.style.overflow = 'hidden';
|
|
265
|
-
""")
|
|
266
|
-
|
|
267
|
-
ele = tab.ele("#main-container")
|
|
268
|
-
if ele:
|
|
269
|
-
return ele.get_screenshot(as_base64='jpg', quality=quality)
|
|
270
|
-
else:
|
|
271
|
-
return tab.get_screenshot(as_base64='jpg', full_page=full_page, quality=quality)
|
|
272
|
-
|
|
273
|
-
except Exception as e:
|
|
274
|
-
logger.error(f"ScreenshotService: Screenshot URL failed: {e}")
|
|
275
|
-
return None
|
|
276
|
-
finally:
|
|
277
|
-
if tab:
|
|
278
|
-
try: tab.close()
|
|
279
|
-
except: pass
|
|
280
|
-
|
|
281
|
-
async def close(self):
|
|
282
|
-
self._executor.shutdown(wait=False)
|
|
283
|
-
logger.info("ScreenshotService: Closed.")
|
|
284
|
-
|
|
285
|
-
async def close_async(self):
|
|
286
|
-
await self.close()
|
|
287
|
-
|
|
288
|
-
# Singleton
|
|
289
|
-
_screenshot_service: Optional[ScreenshotService] = None
|
|
290
|
-
|
|
291
|
-
def get_screenshot_service(headless: bool = True) -> ScreenshotService:
|
|
292
|
-
global _screenshot_service
|
|
293
|
-
if _screenshot_service is None:
|
|
294
|
-
_screenshot_service = ScreenshotService(headless=headless, auto_start=True)
|
|
295
|
-
return _screenshot_service
|
|
296
|
-
|
|
297
|
-
async def close_screenshot_service():
|
|
298
|
-
global _screenshot_service
|
|
299
|
-
if _screenshot_service:
|
|
300
|
-
await _screenshot_service.close()
|
|
301
|
-
_screenshot_service = None
|
|
302
|
-
|
|
303
|
-
def prestart_browser(headless: bool = True):
|
|
304
|
-
get_screenshot_service(headless=headless)
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
# Logs
|
|
2
|
-
logs
|
|
3
|
-
*.log
|
|
4
|
-
npm-debug.log*
|
|
5
|
-
yarn-debug.log*
|
|
6
|
-
yarn-error.log*
|
|
7
|
-
pnpm-debug.log*
|
|
8
|
-
lerna-debug.log*
|
|
9
|
-
|
|
10
|
-
node_modules
|
|
11
|
-
dist
|
|
12
|
-
dist-ssr
|
|
13
|
-
*.local
|
|
14
|
-
|
|
15
|
-
# Editor directories and files
|
|
16
|
-
.vscode/*
|
|
17
|
-
!.vscode/extensions.json
|
|
18
|
-
.idea
|
|
19
|
-
.DS_Store
|
|
20
|
-
*.suo
|
|
21
|
-
*.ntvs*
|
|
22
|
-
*.njsproj
|
|
23
|
-
*.sln
|
|
24
|
-
*.sw?
|
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
# Vue 3 + TypeScript + Vite
|
|
2
|
-
|
|
3
|
-
This template should help get you started developing with Vue 3 and TypeScript in Vite. The template uses Vue 3 `<script setup>` SFCs, check out the [script setup docs](https://v3.vuejs.org/api/sfc-script-setup.html#sfc-script-setup) to learn more.
|
|
4
|
-
|
|
5
|
-
Learn more about the recommended Project Setup and IDE Support in the [Vue Docs TypeScript Guide](https://vuejs.org/guide/typescript/overview.html#project-setup).
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
<!DOCTYPE html>
|
|
2
|
-
<html lang="zh-CN">
|
|
3
|
-
|
|
4
|
-
<head>
|
|
5
|
-
<meta charset="UTF-8" />
|
|
6
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
|
-
<title>Card Renderer</title>
|
|
8
|
-
<script>window.RENDER_DATA = {}</script>
|
|
9
|
-
</head>
|
|
10
|
-
|
|
11
|
-
<body>
|
|
12
|
-
<div id="app"></div>
|
|
13
|
-
<script type="module" src="/src/main.ts"></script>
|
|
14
|
-
</body>
|
|
15
|
-
|
|
16
|
-
</html>
|