entari-plugin-hyw 4.0.0rc5__py3-none-any.whl → 4.0.0rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +71 -9
- entari_plugin_hyw/assets/card-dist/index.html +26 -26
- entari_plugin_hyw/browser/engines/default.py +166 -0
- entari_plugin_hyw/browser/engines/{searxng.py → duckduckgo.py} +4 -4
- entari_plugin_hyw/browser/engines/google.py +155 -0
- entari_plugin_hyw/browser/manager.py +1 -1
- entari_plugin_hyw/browser/service.py +323 -53
- entari_plugin_hyw/card-ui/src/App.vue +32 -1
- entari_plugin_hyw/definitions.py +55 -11
- entari_plugin_hyw/history.py +34 -44
- entari_plugin_hyw/misc.py +34 -0
- entari_plugin_hyw/modular_pipeline.py +177 -50
- entari_plugin_hyw/search.py +67 -25
- entari_plugin_hyw/stage_base.py +7 -0
- entari_plugin_hyw/stage_instruct.py +34 -7
- entari_plugin_hyw/stage_instruct_deepsearch.py +104 -0
- entari_plugin_hyw/stage_summary.py +6 -0
- entari_plugin_hyw/stage_vision.py +113 -0
- {entari_plugin_hyw-4.0.0rc5.dist-info → entari_plugin_hyw-4.0.0rc7.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-4.0.0rc5.dist-info → entari_plugin_hyw-4.0.0rc7.dist-info}/RECORD +22 -19
- entari_plugin_hyw/stage_instruct_review.py +0 -92
- {entari_plugin_hyw-4.0.0rc5.dist-info → entari_plugin_hyw-4.0.0rc7.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-4.0.0rc5.dist-info → entari_plugin_hyw-4.0.0rc7.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/browser/engines/default.py (new file)

@@ -0,0 +1,166 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+
+class DefaultEngine(SearchEngine):
+    """
+    Default browser address bar search engine.
+    Uses the browser's address bar to search (Ctrl+L -> type -> Enter).
+    This uses whatever default search engine the browser is configured with.
+    """
+
+    # Special marker to indicate this engine uses address bar input
+    USE_ADDRESS_BAR = True
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        """
+        For address bar search, we don't build a URL.
+        Return the raw query - SearchService will handle the address bar input.
+        """
+        # Return a special marker so SearchService knows to use address bar
+        return f"__ADDRESS_BAR_SEARCH__:{query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        """
+        Parse search results from whatever search engine the browser uses.
+        We detect the engine from the HTML and use appropriate parsing.
+        """
+        results = []
+        seen_urls = set()
+
+        # Detect which search engine based on content
+        is_google = 'google' in content.lower() and ('class="g"' in content or 'data-hveid' in content)
+        is_bing = 'bing' in content.lower() and 'b_algo' in content
+        is_duckduckgo = 'duckduckgo' in content.lower()
+
+        if is_google:
+            results = self._parse_google(content, seen_urls)
+        elif is_bing:
+            results = self._parse_bing(content, seen_urls)
+        elif is_duckduckgo:
+            results = self._parse_duckduckgo(content, seen_urls)
+        else:
+            # Generic fallback
+            results = self._parse_generic(content, seen_urls)
+
+        logger.info(f"DefaultEngine parsed {len(results)} results (detected: {'google' if is_google else 'bing' if is_bing else 'ddg' if is_duckduckgo else 'generic'})")
+        return results
+
+    def _parse_google(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+        """Parse Google search results."""
+        results = []
+        # Look for result links
+        link_regex = re.compile(
+            r'<a[^>]+href="(https?://(?!google\.com|accounts\.google)[^"]+)"[^>]*>([^<]+)</a>',
+            re.IGNORECASE
+        )
+
+        for match in link_regex.finditer(content):
+            if len(results) >= 15:
+                break
+            href = match.group(1)
+            title = match.group(2).strip()
+
+            if href in seen_urls or not title or len(title) < 3:
+                continue
+            if any(x in href for x in ['google.com', 'gstatic.com', 'youtube.com/redirect']):
+                continue
+
+            seen_urls.add(href)
+            results.append({
+                "title": re.sub(r'<[^>]+>', '', title),
+                "url": href,
+                "domain": urllib.parse.urlparse(href).hostname or "",
+                "content": "",
+            })
+        return results
+
+    def _parse_bing(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+        """Parse Bing search results."""
+        results = []
+        link_regex = re.compile(
+            r'<a[^>]+href="(https?://(?!bing\.com|microsoft\.com)[^"]+)"[^>]*>(.*?)</a>',
+            re.IGNORECASE | re.DOTALL
+        )
+
+        for match in link_regex.finditer(content):
+            if len(results) >= 15:
+                break
+            href = match.group(1)
+            title_html = match.group(2)
+            title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+            if href in seen_urls or not title or len(title) < 3:
+                continue
+            if any(x in href for x in ['bing.com', 'microsoft.com', 'msn.com']):
+                continue
+
+            seen_urls.add(href)
+            results.append({
+                "title": title,
+                "url": href,
+                "domain": urllib.parse.urlparse(href).hostname or "",
+                "content": "",
+            })
+        return results
+
+    def _parse_duckduckgo(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+        """Parse DuckDuckGo results."""
+        results = []
+        link_regex = re.compile(
+            r'<a[^>]+href="(https?://(?!duckduckgo\.com)[^"]+)"[^>]*>(.*?)</a>',
+            re.IGNORECASE | re.DOTALL
+        )
+
+        for match in link_regex.finditer(content):
+            if len(results) >= 15:
+                break
+            href = match.group(1)
+            title_html = match.group(2)
+            title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+            if href in seen_urls or not title or len(title) < 3:
+                continue
+
+            seen_urls.add(href)
+            results.append({
+                "title": title,
+                "url": href,
+                "domain": urllib.parse.urlparse(href).hostname or "",
+                "content": "",
+            })
+        return results
+
+    def _parse_generic(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+        """Generic link parser for unknown search engines."""
+        results = []
+        link_regex = re.compile(
+            r'<a[^>]+href="(https?://[^"]+)"[^>]*>([^<]+)</a>',
+            re.IGNORECASE
+        )
+
+        for match in link_regex.finditer(content):
+            if len(results) >= 15:
+                break
+            href = match.group(1)
+            title = match.group(2).strip()
+
+            if href in seen_urls or not title or len(title) < 5:
+                continue
+            # Skip common non-result URLs
+            if any(x in href for x in ['javascript:', 'mailto:', '#', 'login', 'signin', 'account']):
+                continue
+
+            seen_urls.add(href)
+            results.append({
+                "title": title,
+                "url": href,
+                "domain": urllib.parse.urlparse(href).hostname or "",
+                "content": "",
+            })
+        return results
+
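Note on the new DefaultEngine: build_url() deliberately returns the query wrapped in an "__ADDRESS_BAR_SEARCH__:" marker instead of a URL, and the docstring says SearchService is expected to detect that marker and drive the address bar. A minimal sketch of how a caller might split the marker back out; the resolve_target helper is hypothetical (the real dispatch lives in browser/service.py, not shown in this hunk), only the marker format comes from the diff:

ADDRESS_BAR_PREFIX = "__ADDRESS_BAR_SEARCH__:"  # marker emitted by DefaultEngine.build_url()

def resolve_target(built: str) -> tuple[bool, str]:
    """Hypothetical helper: decide whether a build_url() value is a URL or an address-bar query."""
    if built.startswith(ADDRESS_BAR_PREFIX):
        # Address-bar mode: type the raw query into the address bar (Ctrl+L -> type -> Enter)
        return True, built[len(ADDRESS_BAR_PREFIX):]
    # Normal mode: navigate straight to the engine's results URL
    return False, built

print(resolve_target("__ADDRESS_BAR_SEARCH__:entari plugin"))      # (True, 'entari plugin')
print(resolve_target("https://www.google.com/search?q=entari"))    # (False, 'https://www.google.com/search?q=entari')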
entari_plugin_hyw/browser/engines/searxng.py → entari_plugin_hyw/browser/engines/duckduckgo.py (renamed)

@@ -5,9 +5,9 @@ from typing import List, Dict, Any
 from loguru import logger
 from .base import SearchEngine
 
-class SearXNGEngine(SearchEngine):
+class DuckDuckGoEngine(SearchEngine):
     """
-    Parser for DuckDuckGo
+    Parser for DuckDuckGo Lite results.
     Handles both Markdown (from Crawl4AI) and HTML (fallback).
     """
 
@@ -83,7 +83,7 @@ class SearXNGEngine(SearchEngine):
             })
             seen_urls.add(href)
 
-        logger.info(f"
+        logger.info(f"DuckDuckGo Parser(HTML) found {len(results)} results.")
         return results
 
     def _parse_markdown(self, content: str) -> List[Dict[str, Any]]:
@@ -133,5 +133,5 @@ class SearXNGEngine(SearchEngine):
         if current_result:
             results.append(current_result)
 
-        logger.info(f"
+        logger.info(f"DuckDuckGo Parser(Markdown) found {len(results)} results.")
         return results
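All of these engines share the two-step contract visible in the hunks above: build_url() produces what the browser should load and parse() turns the fetched content (Markdown from Crawl4AI or raw HTML) into result dicts. A rough usage sketch, where render_page is a hypothetical stand-in for the fetch that the service layer actually performs:

def run_engine(engine, query: str, render_page) -> list:
    """Hedged sketch of the engine contract; render_page is an assumed callable, not part of the package API."""
    url = engine.build_url(query, limit=10)   # e.g. a DuckDuckGo or Google results URL
    content = render_page(url)                # Markdown or HTML, depending on the fetcher
    return engine.parse(content)              # [{"title": ..., "url": ..., "domain": ..., "content": ...}, ...]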
entari_plugin_hyw/browser/engines/google.py (new file)

@@ -0,0 +1,155 @@
+
+import urllib.parse
+import re
+from typing import List, Dict, Any
+from loguru import logger
+from .base import SearchEngine
+
+
+class GoogleEngine(SearchEngine):
+    """
+    Search engine implementation for Google.
+    Parses Google Search HTML results.
+    """
+
+    def build_url(self, query: str, limit: int = 10) -> str:
+        encoded_query = urllib.parse.quote(query)
+        return f"https://www.google.com/search?q={encoded_query}"
+
+    def parse(self, content: str) -> List[Dict[str, Any]]:
+        results = []
+        seen_urls = set()
+
+        # Google search results are in blocks with class="MjjYud" or similar containers
+        # Split by result blocks first for more accurate extraction
+
+        # Method 1: Split by common result block classes
+        block_patterns = [
+            r'<div class="MjjYud"[^>]*>',
+            r'<div class="tF2Cxc"[^>]*>',
+            r'<div class="g Ww4FFb"[^>]*>',
+        ]
+
+        blocks = [content]
+        for bp in block_patterns:
+            new_blocks = []
+            for block in blocks:
+                parts = re.split(bp, block)
+                new_blocks.extend(parts)
+            blocks = new_blocks
+
+        for block in blocks:
+            if len(block) < 100:
+                continue
+
+            # Find URL in this block - prefer links with h3 nearby
+            url_match = re.search(r'<a[^>]+href="(https?://(?!www\.google\.|google\.|webcache\.googleusercontent\.)[^"]+)"[^>]*>', block)
+            if not url_match:
+                continue
+
+            url = url_match.group(1)
+            if url in seen_urls or self._should_skip_url(url):
+                continue
+
+            # Find h3 title in this block
+            h3_match = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.IGNORECASE | re.DOTALL)
+            if not h3_match:
+                continue
+
+            title = re.sub(r'<[^>]+>', '', h3_match.group(1)).strip()
+            if not title or len(title) < 2:
+                continue
+
+            seen_urls.add(url)
+
+            # Extract snippet from VwiC3b class (Google's snippet container)
+            snippet = ""
+            snippet_match = re.search(r'<div[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>(.*?)</div>', block, re.IGNORECASE | re.DOTALL)
+            if snippet_match:
+                snippet = re.sub(r'<[^>]+>', ' ', snippet_match.group(1)).strip()
+                snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+            # Fallback: look for any text after h3
+            if not snippet:
+                # Try other common snippet patterns
+                alt_patterns = [
+                    r'<span[^>]*class="[^"]*aCOpRe[^"]*"[^>]*>(.*?)</span>',
+                    r'<div[^>]*data-snc[^>]*>(.*?)</div>',
+                ]
+                for ap in alt_patterns:
+                    am = re.search(ap, block, re.IGNORECASE | re.DOTALL)
+                    if am:
+                        snippet = re.sub(r'<[^>]+>', ' ', am.group(1)).strip()
+                        snippet = re.sub(r'\s+', ' ', snippet).strip()
+                        break
+
+            # Extract images from this block
+            images = []
+            # Pattern 1: Regular img src (excluding data: and tracking pixels)
+            # Note: gstatic.com/images/branding is logo, but encrypted-tbn*.gstatic.com are thumbnails
+            img_matches = re.findall(r'<img[^>]+src="(https?://[^"]+)"', block)
+            for img_url in img_matches:
+                # Decode HTML entities
+                img_url = img_url.replace('&amp;', '&')
+                # Skip tracking/icon/small images (but allow encrypted-tbn which are valid thumbnails)
+                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo', 'gstatic.com/images/branding', '1x1', 'transparent', 'gstatic.com/images/icons']):
+                    continue
+                if img_url not in images:
+                    images.append(img_url)
+
+            # Pattern 2: data-src (lazy loaded images)
+            data_src_matches = re.findall(r'data-src="(https?://[^"]+)"', block)
+            for img_url in data_src_matches:
+                img_url = img_url.replace('&amp;', '&')
+                if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo']):
+                    continue
+                if img_url not in images:
+                    images.append(img_url)
+
+            results.append({
+                "title": title,
+                "url": url,
+                "domain": urllib.parse.urlparse(url).hostname or "",
+                "content": snippet[:1000],
+                "images": images[:3]  # Limit to 3 images per result
+            })
+
+            if len(results) >= 15:
+                break
+
+        total_images = sum(len(r.get("images", [])) for r in results)
+        logger.info(f"GoogleEngine parsed {len(results)} results with {total_images} images total.")
+        return results
+
+    def _should_skip_url(self, url: str) -> bool:
+        """Check if URL should be skipped."""
+        skip_patterns = [
+            "google.com",
+            "googleusercontent.com",
+            "gstatic.com",
+            "youtube.com/watch",  # Keep channel/playlist but skip individual videos
+            "maps.google",
+            "translate.google",
+            "accounts.google",
+            "support.google",
+            "policies.google",
+            "schema.org",
+            "javascript:",
+            "data:",
+            "#",
+        ]
+
+        for pattern in skip_patterns:
+            if pattern in url.lower():
+                return True
+
+        # Skip very short URLs (likely invalid)
+        if len(url) < 20:
+            return True
+
+        # Skip URLs that are just root domains without path
+        parsed = urllib.parse.urlparse(url)
+        if not parsed.path or parsed.path == "/":
+            return True
+
+        return False
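GoogleEngine stays regex-only: it splits the page on known result-container classes (MjjYud / tF2Cxc / g Ww4FFb), then pulls the first external link, the h3 title and the VwiC3b snippet out of each block. A standalone sketch of that block-split-then-extract idea on made-up markup; the HTML below is invented for illustration, only the class names and regexes mirror the parser above:

import re
import urllib.parse

html = (
    '<div class="MjjYud"><a href="https://example.com/articles/search-parsing">'
    '<h3>Parsing search results</h3></a>'
    '<div class="VwiC3b">A short snippet about parsing.</div></div>'
    '<div class="MjjYud"><a href="https://www.google.com/preferences"><h3>Settings</h3></a></div>'
)

results = []
for block in re.split(r'<div class="MjjYud"[^>]*>', html):
    url_m = re.search(r'<a[^>]+href="(https?://(?!www\.google\.|google\.)[^"]+)"', block)
    h3_m = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.DOTALL)
    if not url_m or not h3_m:
        continue  # skip blocks without an external link and a heading, as the real parser does
    snippet_m = re.search(r'<div[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>(.*?)</div>', block, re.DOTALL)
    results.append({
        "title": re.sub(r'<[^>]+>', '', h3_m.group(1)).strip(),
        "url": url_m.group(1),
        "domain": urllib.parse.urlparse(url_m.group(1)).hostname or "",
        "content": re.sub(r'<[^>]+>', ' ', snippet_m.group(1)).strip() if snippet_m else "",
    })

print(results)  # one result for example.com; the google.com block is filtered out by the lookahead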
entari_plugin_hyw/browser/manager.py

@@ -65,7 +65,7 @@ class SharedBrowserManager:
         # Hide scrollbars globally
         co.set_argument('--hide-scrollbars')
         # The reason for the 100,000 is scrollbar suppression (probably)
-        co.set_argument('--window-size=1280,
+        co.set_argument('--window-size=1280,9000')
         self._page = ChromiumPage(addr_or_opts=co)
 
         # Show Landing Page