entari-plugin-hyw 4.0.0rc4__py3-none-any.whl → 4.0.0rc6__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

Potentially problematic release.

Files changed (30)
  1. entari_plugin_hyw/__init__.py +216 -75
  2. entari_plugin_hyw/assets/card-dist/index.html +70 -79
  3. entari_plugin_hyw/browser/__init__.py +10 -0
  4. entari_plugin_hyw/browser/engines/base.py +13 -0
  5. entari_plugin_hyw/browser/engines/bing.py +95 -0
  6. entari_plugin_hyw/browser/engines/duckduckgo.py +137 -0
  7. entari_plugin_hyw/browser/engines/google.py +155 -0
  8. entari_plugin_hyw/browser/landing.html +172 -0
  9. entari_plugin_hyw/browser/manager.py +153 -0
  10. entari_plugin_hyw/browser/service.py +304 -0
  11. entari_plugin_hyw/card-ui/src/App.vue +526 -182
  12. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +7 -11
  13. entari_plugin_hyw/card-ui/src/components/StageCard.vue +33 -30
  14. entari_plugin_hyw/card-ui/src/types.ts +9 -0
  15. entari_plugin_hyw/definitions.py +155 -0
  16. entari_plugin_hyw/history.py +111 -33
  17. entari_plugin_hyw/misc.py +34 -0
  18. entari_plugin_hyw/modular_pipeline.py +384 -0
  19. entari_plugin_hyw/render_vue.py +326 -239
  20. entari_plugin_hyw/search.py +95 -708
  21. entari_plugin_hyw/stage_base.py +92 -0
  22. entari_plugin_hyw/stage_instruct.py +345 -0
  23. entari_plugin_hyw/stage_instruct_deepsearch.py +104 -0
  24. entari_plugin_hyw/stage_summary.py +164 -0
  25. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/METADATA +4 -4
  26. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/RECORD +28 -16
  27. entari_plugin_hyw/pipeline.py +0 -1219
  28. entari_plugin_hyw/prompts.py +0 -47
  29. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/WHEEL +0 -0
  30. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/browser/__init__.py
@@ -0,0 +1,10 @@
+from .manager import get_shared_browser_manager, close_shared_browser
+from .service import get_screenshot_service, close_screenshot_service, prestart_browser
+
+__all__ = [
+    "get_shared_browser_manager",
+    "close_shared_browser",
+    "get_screenshot_service",
+    "close_screenshot_service",
+    "prestart_browser",
+]
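These re-exports make `entari_plugin_hyw.browser` the public entry point, so callers never import the `manager` or `service` modules directly. A minimal usage, using only names the package actually exports:

```python
# Import the browser helpers through the package root rather than the
# private manager/service modules; these names come from __all__ above.
from entari_plugin_hyw.browser import prestart_browser, get_screenshot_service
```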
entari_plugin_hyw/browser/engines/base.py
@@ -0,0 +1,13 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any
+
+class SearchEngine(ABC):
+    @abstractmethod
+    def build_url(self, query: str, limit: int = 10) -> str:
+        """Build the search URL for the given query."""
+        pass
+
+    @abstractmethod
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        """Parse the raw HTML/Markdown content into a list of results."""
+        pass
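The ABC is the seam between URL construction and result parsing; fetching the page happens elsewhere (in `browser/service.py`, whose body is not part of this diff). A rough sketch of how a caller might drive any engine, where `fetch_page` is an assumed stand-in for the plugin's real fetch:

```python
# Hedged sketch: fetch_page is a placeholder callable (str -> str), not a
# real API from this package; the engine classes themselves are real.
from typing import Any, Callable, Dict, List

from entari_plugin_hyw.browser.engines.base import SearchEngine

def run_search(engine: SearchEngine, fetch_page: Callable[[str], str],
               query: str) -> List[Dict[str, Any]]:
    url = engine.build_url(query, limit=10)  # engine-specific query URL
    page = fetch_page(url)                   # raw HTML/Markdown of the results page
    return engine.parse(page)                # engine-specific extraction
```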
entari_plugin_hyw/browser/engines/bing.py
@@ -0,0 +1,95 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+class BingEngine(SearchEngine):
+    """
+    Search engine implementation for Bing.
+    """
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        encoded_query = urllib.parse.quote(query)
+        base = "https://www.bing.com/search"
+        return f"{base}?form=&q={encoded_query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        # Split on the b_algo class to isolate individual results
+        chunks = content.split('class="b_algo"')
+
+        # Helper to decode Bing redirect URLs roughly
+        def decode_bing_url(u):
+            if "bing.com/ck/a?" not in u: return u
+            try:
+                # URL is usually like ...&u=a1<base64>&...
+                # We look for &u=...
+                import base64
+                match = re.search(r'[?&]u=a1([^&]+)', u)
+                if match:
+                    # Bing uses URL-safe base64 with an 'a1' prefix;
+                    # the prefix is already stripped by the regex group
+                    b64 = match.group(1)
+                    # restore stripped '=' padding
+                    b64 += '=' * (-len(b64) % 4)
+                    # base64url -> standard base64
+                    b64 = b64.replace('-', '+').replace('_', '/')
+                    decoded = base64.b64decode(b64).decode('utf-8')
+                    return decoded
+            except Exception:
+                pass
+            return u
+
+        seen_urls = set()
+
+        for chunk in chunks[1:]:
+            # Extract title and snippet from within the chunk
+            # Title: <h2><a href="...">...</a></h2>
+            link_match = re.search(r'<h2[^>]*>.*?<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', chunk, re.IGNORECASE | re.DOTALL)
+            if not link_match:
+                # Fallback: plain <a> tag
+                link_match = re.search(r'<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', chunk, re.IGNORECASE | re.DOTALL)
+
+            if link_match:
+                raw_url = link_match.group(1)
+                title_html = link_match.group(2)
+                title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+                url = decode_bing_url(raw_url)
+
+                if url in seen_urls: continue
+                seen_urls.add(url)
+
+                # Snippet: class="b_caption" ... <p> ... </p> or just div text
+                snippet = ""
+                caption_match = re.search(r'class="b_caption"[^>]*>(.*?)</div>', chunk, re.IGNORECASE | re.DOTALL)
+                if caption_match:
+                    snippet_html = caption_match.group(1)
+                    snippet = re.sub(r'<[^>]+>', ' ', snippet_html).strip()
+                else:
+                    # Fallback: take the text right after the link
+                    start = link_match.end()
+                    snippet = re.sub(r'<[^>]+>', ' ', chunk[start:start+600]).strip()
+
+                snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+                # Image extraction (basic)
+                images = []
+                img_matches = re.findall(r'<img[^>]+src=["\'](http[^"\']+)["\']', chunk)
+                for img_url in img_matches:
+                    if not any(x in img_url for x in ['favicon', 'icon', 'tracking', 'pixel']):
+                        images.append(img_url)
+
+                if url and title:
+                    results.append({
+                        "title": title,
+                        "url": url,
+                        "domain": urllib.parse.urlparse(url).hostname or "",
+                        "content": snippet[:5000],
+                        "images": images[:3]
+                    })
+
+        logger.info(f"BingEngine parsed {len(results)} results.")
+        return results
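Bing wraps organic result links in `bing.com/ck/a?` redirects whose `u` parameter is `a1` followed by a URL-safe base64 payload; `decode_bing_url` reverses that. A standalone round-trip check of the same steps, using a synthetic redirect URL rather than a live capture:

```python
# Re-create the a1+base64url scheme that decode_bing_url assumes, end to end.
import base64
import re

target = "https://example.com/some/page?x=1"
b64 = base64.urlsafe_b64encode(target.encode()).decode().rstrip("=")
redirect = f"https://www.bing.com/ck/a?!&&p=abc&u=a1{b64}&ntb=1"

match = re.search(r"[?&]u=a1([^&]+)", redirect)
payload = match.group(1)
payload += "=" * (-len(payload) % 4)                   # restore stripped padding
payload = payload.replace("-", "+").replace("_", "/")  # base64url -> standard base64
assert base64.b64decode(payload).decode("utf-8") == target
```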
entari_plugin_hyw/browser/engines/duckduckgo.py
@@ -0,0 +1,137 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+class DuckDuckGoEngine(SearchEngine):
+    """
+    Parser for DuckDuckGo Lite results.
+    Handles both Markdown (from Crawl4AI) and HTML (fallback).
+    """
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        encoded_query = urllib.parse.quote(query)
+        # Default endpoint (not configurable per instance; effectively what we support as a "searxng"-style backend)
+        base = "https://lite.duckduckgo.com/lite/"
+        return f"{base}?q={encoded_query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        # Prioritize HTML parsing if content looks like HTML
+        if "<html" in content.lower() or "<!doctype" in content.lower() or "<div" in content.lower():
+            results = self._parse_html(content)
+            if results:
+                return results
+
+        # Fallback to Markdown
+        return self._parse_markdown(content)
+
+    def _parse_html(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        seen_urls = set()
+
+        # Simple regex for DDG Lite / SearXNG HTML structure
+        link_regex = re.compile(r'<a[^>]+href=["\'](http[^"\']+)["\'][^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
+
+        pos = 0
+        while True:
+            match = link_regex.search(content, pos)
+            if not match:
+                break
+
+            href = match.group(1)
+            title_html = match.group(2)
+
+            # Clean title
+            title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+            pos = match.end()
+
+            # Filter junk
+            if "search" in href and "q=" in href: continue
+            if "google.com" in href or "bing.com" in href: continue
+            if href in seen_urls: continue
+
+            # Look ahead for snippet
+            snippet_chunk = content[pos:pos+1000]
+            snippet_match = re.search(r'(.*?)<a', snippet_chunk, re.DOTALL | re.IGNORECASE)
+            raw_snippet = snippet_match.group(1) if snippet_match else snippet_chunk
+
+            # Clean HTML tags from snippet
+            snippet = re.sub(r'<[^>]+>', ' ', raw_snippet)
+            snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+            # Effectively no truncation, just a very generous safety limit
+            snippet = snippet[:5000]
+
+            # Valid result check
+            if title and len(title) > 2 and snippet:
+                # Extract images from the result block (rough heuristic)
+                images = []
+                img_matches = re.findall(r'<img[^>]+src=["\'](http[^"\']+)["\']', snippet_match.group(0) if snippet_match else snippet_chunk)
+                for img_url in img_matches:
+                    if not any(x in img_url for x in ['favicon', 'icon', 'tracking', 'pixel']):
+                        images.append(img_url)
+
+                results.append({
+                    "title": title,
+                    "url": href,
+                    "domain": urllib.parse.urlparse(href).hostname or "",
+                    "content": snippet,
+                    "images": images[:3]  # Limit per result
+                })
+                seen_urls.add(href)
+
+        logger.info(f"DuckDuckGo Parser(HTML) found {len(results)} results.")
+        return results
+
+    def _parse_markdown(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        seen_urls = set()
+
+        # Link regex: [Title](URL)
+        link_regex = re.compile(r'\[(.*?)\]\((https?://.*?)\)')
+
+        lines = content.split('\n')
+        current_result = None
+
+        for line in lines:
+            line = line.strip()
+            if not line: continue
+
+            # Check for link
+            match = link_regex.search(line)
+            if match:
+                # Save previous result
+                if current_result:
+                    results.append(current_result)
+
+                title, href = match.groups()
+
+                # Filter junk (reset current_result so the saved result is not appended twice)
+                if "search" in href and "q=" in href: current_result = None; continue
+                if "google.com" in href or "bing.com" in href: current_result = None; continue
+                if href in seen_urls:
+                    current_result = None
+                    continue
+
+                seen_urls.add(href)
+
+                current_result = {
+                    "title": title,
+                    "url": href,
+                    "domain": urllib.parse.urlparse(href).hostname or "",
+                    "content": ""
+                }
+            elif current_result:
+                # Append snippet
+                if len(current_result["content"]) < 5000:
+                    current_result["content"] += " " + line
+
+        # Append last
+        if current_result:
+            results.append(current_result)
+
+        logger.info(f"DuckDuckGo Parser(Markdown) found {len(results)} results.")
+        return results
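When the fetched content carries no HTML markers, `parse` falls straight through to the Markdown path, which treats each `[Title](URL)` line as a new result and folds the following lines into its snippet. A small fabricated example of that flow:

```python
# Fabricated Crawl4AI-style Markdown; no "<html"/"<div" markers, so
# parse() dispatches to _parse_markdown.
from entari_plugin_hyw.browser.engines.duckduckgo import DuckDuckGoEngine

md = """
[Example Domain](https://example.com/article)
A short snippet describing the page.
[Another Hit](https://example.org/post)
More snippet text here.
"""

for r in DuckDuckGoEngine().parse(md):
    print(r["domain"], "|", r["title"], "|", r["content"].strip())
# example.com | Example Domain | A short snippet describing the page.
# example.org | Another Hit | More snippet text here.
```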
entari_plugin_hyw/browser/engines/google.py
@@ -0,0 +1,155 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+
+class GoogleEngine(SearchEngine):
+    """
+    Search engine implementation for Google.
+    Parses Google Search HTML results.
+    """
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        encoded_query = urllib.parse.quote(query)
+        return f"https://www.google.com/search?q={encoded_query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        seen_urls = set()
+
+        # Google search results are in blocks with class="MjjYud" or similar containers
+        # Split by result blocks first for more accurate extraction
+
+        # Method 1: Split by common result block classes
+        block_patterns = [
+            r'<div class="MjjYud"[^>]*>',
+            r'<div class="tF2Cxc"[^>]*>',
+            r'<div class="g Ww4FFb"[^>]*>',
+        ]
+
+        blocks = [content]
+        for bp in block_patterns:
+            new_blocks = []
+            for block in blocks:
+                parts = re.split(bp, block)
+                new_blocks.extend(parts)
+            blocks = new_blocks
+
+        for block in blocks:
+            if len(block) < 100:
+                continue
+
+            # Find URL in this block - prefer links with h3 nearby
+            url_match = re.search(r'<a[^>]+href="(https?://(?!www\.google\.|google\.|webcache\.googleusercontent\.)[^"]+)"[^>]*>', block)
+            if not url_match:
+                continue
+
+            url = url_match.group(1)
+            if url in seen_urls or self._should_skip_url(url):
+                continue
+
+            # Find h3 title in this block
+            h3_match = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.IGNORECASE | re.DOTALL)
+            if not h3_match:
+                continue
+
+            title = re.sub(r'<[^>]+>', '', h3_match.group(1)).strip()
+            if not title or len(title) < 2:
+                continue
+
+            seen_urls.add(url)
+
+            # Extract snippet from VwiC3b class (Google's snippet container)
+            snippet = ""
+            snippet_match = re.search(r'<div[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>(.*?)</div>', block, re.IGNORECASE | re.DOTALL)
+            if snippet_match:
+                snippet = re.sub(r'<[^>]+>', ' ', snippet_match.group(1)).strip()
+                snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+            # Fallback: look for any text after h3
+            if not snippet:
+                # Try other common snippet patterns
+                alt_patterns = [
+                    r'<span[^>]*class="[^"]*aCOpRe[^"]*"[^>]*>(.*?)</span>',
+                    r'<div[^>]*data-snc[^>]*>(.*?)</div>',
+                ]
+                for ap in alt_patterns:
+                    am = re.search(ap, block, re.IGNORECASE | re.DOTALL)
+                    if am:
+                        snippet = re.sub(r'<[^>]+>', ' ', am.group(1)).strip()
+                        snippet = re.sub(r'\s+', ' ', snippet).strip()
+                        break
+
+            # Extract images from this block
+            images = []
+            # Pattern 1: Regular img src (excluding data: and tracking pixels)
+            # Note: gstatic.com/images/branding is logo, but encrypted-tbn*.gstatic.com are thumbnails
+            img_matches = re.findall(r'<img[^>]+src="(https?://[^"]+)"', block)
+            for img_url in img_matches:
+                # Decode HTML entities
+                img_url = img_url.replace('&amp;', '&')
+                # Skip tracking/icon/small images (but allow encrypted-tbn which are valid thumbnails)
+                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo', 'gstatic.com/images/branding', '1x1', 'transparent', 'gstatic.com/images/icons']):
+                    continue
+                if img_url not in images:
+                    images.append(img_url)
+
+            # Pattern 2: data-src (lazy loaded images)
+            data_src_matches = re.findall(r'data-src="(https?://[^"]+)"', block)
+            for img_url in data_src_matches:
+                img_url = img_url.replace('&amp;', '&')
+                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo']):
+                    continue
+                if img_url not in images:
+                    images.append(img_url)
+
+            results.append({
+                "title": title,
+                "url": url,
+                "domain": urllib.parse.urlparse(url).hostname or "",
+                "content": snippet[:1000],
+                "images": images[:3]  # Limit to 3 images per result
+            })
+
+            if len(results) >= 15:
+                break
+
+        total_images = sum(len(r.get("images", [])) for r in results)
+        logger.info(f"GoogleEngine parsed {len(results)} results with {total_images} images total.")
+        return results
+
+    def _should_skip_url(self, url: str) -> bool:
+        """Check if URL should be skipped."""
+        skip_patterns = [
+            "google.com",
+            "googleusercontent.com",
+            "gstatic.com",
+            "youtube.com/watch",  # Keep channel/playlist but skip individual videos
+            "maps.google",
+            "translate.google",
+            "accounts.google",
+            "support.google",
+            "policies.google",
+            "schema.org",
+            "javascript:",
+            "data:",
+            "#",
+        ]
+
+        for pattern in skip_patterns:
+            if pattern in url.lower():
+                return True
+
+        # Skip very short URLs (likely invalid)
+        if len(url) < 20:
+            return True
+
+        # Skip URLs that are just root domains without path
+        parsed = urllib.parse.urlparse(url)
+        if not parsed.path or parsed.path == "/":
+            return True
+
+        return False
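`_should_skip_url` errs toward precision: beyond Google's own properties it also rejects bare domain roots and very short URLs, keeping only links that point at an actual path. A few spot checks with made-up inputs:

```python
# Illustrative inputs only; the filter itself is the real method above.
from entari_plugin_hyw.browser.engines.google import GoogleEngine

engine = GoogleEngine()
assert engine._should_skip_url("https://translate.google.com/?sl=en")     # Google property
assert engine._should_skip_url("https://example.com/")                    # bare root domain
assert engine._should_skip_url("http://a.io/x")                           # shorter than 20 chars
assert not engine._should_skip_url("https://example.com/blog/2024/post")  # kept: real article path
```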
entari_plugin_hyw/browser/landing.html
@@ -0,0 +1,172 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+
+<head>
+    <meta charset="UTF-8">
+    <title>entari-plugin-hyw Browser</title>
+    <style>
+        :root {
+            --theme-color: #ef4444;
+            --text-primary: #2c2c2e;
+            --text-body: #3a3a3c;
+            --text-muted: #86868b;
+            --bg-border: #f2f2f2;
+        }
+
+        body {
+            background-color: var(--bg-border);
+            color: var(--text-primary);
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+            margin: 0;
+            padding: 0;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            height: 100vh;
+            overflow: hidden;
+        }
+
+        #main-container {
+            width: 560px;
+            background: white;
+            padding: 40px;
+            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
+            position: relative;
+            transform: scale(1.1);
+            /* Slightly larger for visibility */
+        }
+
+        .corner-badge {
+            position: absolute;
+            top: -10px;
+            left: -10px;
+            height: 28px;
+            padding: 0 12px;
+            background-color: var(--theme-color);
+            color: white;
+            display: flex;
+            align-items: center;
+            font-size: 12px;
+            font-weight: 800;
+            text-transform: uppercase;
+            letter-spacing: 0.5px;
+            box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.15);
+            z-index: 10;
+        }
+
+        h1 {
+            font-size: 32px;
+            font-weight: 900;
+            text-transform: uppercase;
+            letter-spacing: -1px;
+            margin: 0 0 24px 0;
+            line-height: 1.1;
+        }
+
+        .brand-text {
+            color: var(--theme-color);
+        }
+
+        .section-title {
+            font-size: 17px;
+            font-weight: 700;
+            text-transform: uppercase;
+            letter-spacing: -0.5px;
+            margin-bottom: 12px;
+            color: var(--text-primary);
+        }
+
+        .content-box {
+            padding-top: 24px;
+            border-top: 1px solid #eee;
+            margin-top: 32px;
+        }
+
+        .summary {
+            font-size: 15px;
+            line-height: 1.6;
+            color: var(--text-body);
+            text-align: justify;
+        }
+
+        .summary-en {
+            margin-top: 16px;
+            color: var(--text-muted);
+            font-size: 13px;
+            font-style: italic;
+            line-height: 1.5;
+        }
+
+        b {
+            color: var(--theme-color);
+            font-weight: 700;
+        }
+
+        /* Subtle loading animation to show it's "live" */
+        .progress-bar {
+            height: 3px;
+            background: #f3f3f3;
+            width: 100%;
+            margin-top: 32px;
+            position: relative;
+            overflow: hidden;
+        }
+
+        .progress-fill {
+            position: absolute;
+            height: 100%;
+            background: var(--theme-color);
+            width: 30%;
+            left: -30%;
+            animation: moveProgress 2s infinite ease-in-out;
+        }
+
+        @keyframes moveProgress {
+            0% {
+                left: -30%;
+                width: 20%;
+            }
+
+            50% {
+                width: 50%;
+            }
+
+            100% {
+                left: 100%;
+                width: 20%;
+            }
+        }
+    </style>
+</head>
+
+<body>
+    <div id="main-container">
+        <!-- Corner Badge Style -->
+        <div class="corner-badge">
+            Status: Ready
+        </div>
+
+        <h1>entari-plugin-<span class="brand-text">hyw</span></h1>
+
+        <div class="content-box">
+            <div class="section-title">Browser Service</div>
+            <div class="summary">
+                这是一个受 <b>entari-plugin-hyw</b> 插件控制的自动化浏览器实例。<br>
+                它负责网络搜索、内容爬取以及卡片 UI 的实时渲染。<br>
+                请勿关闭此窗口,以确保插件功能正常运行。
+            </div>
+
+            <div class="summary-en">
+                This is an automated browser instance controlled by the <b>entari-plugin-hyw</b> plugin.
+                It handles web searches, content crawling, and real-time Card UI rendering.
+                Please do not close this window to ensure the plugin functions correctly.
+            </div>
+        </div>

+        <div class="progress-bar">
+            <div class="progress-fill"></div>
+        </div>
+    </div>
+</body>
+
+</html>