entari-plugin-hyw 4.0.0rc5__py3-none-any.whl → 4.0.0rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of entari-plugin-hyw might be problematic.

@@ -0,0 +1,166 @@
+
+ import urllib.parse
+ import re
+ from typing import List, Dict, Any
+ from loguru import logger
+ from .base import SearchEngine
+
+
+ class DefaultEngine(SearchEngine):
+     """
+     Default browser address bar search engine.
+     Uses the browser's address bar to search (Ctrl+L -> type -> Enter).
+     This uses whatever default search engine the browser is configured with.
+     """
+
+     # Special marker to indicate this engine uses address bar input
+     USE_ADDRESS_BAR = True
+
+     def build_url(self, query: str, limit: int = 10) -> str:
+         """
+         For address bar search, we don't build a URL.
+         Return the raw query - SearchService will handle the address bar input.
+         """
+         # Return a special marker so SearchService knows to use the address bar
+         return f"__ADDRESS_BAR_SEARCH__:{query}"
+
+     def parse(self, content: str) -> List[Dict[str, Any]]:
+         """
+         Parse search results from whatever search engine the browser uses.
+         We detect the engine from the HTML and use the appropriate parsing.
+         """
+         results = []
+         seen_urls = set()
+
+         # Detect which search engine based on content
+         is_google = 'google' in content.lower() and ('class="g"' in content or 'data-hveid' in content)
+         is_bing = 'bing' in content.lower() and 'b_algo' in content
+         is_duckduckgo = 'duckduckgo' in content.lower()
+
+         if is_google:
+             results = self._parse_google(content, seen_urls)
+         elif is_bing:
+             results = self._parse_bing(content, seen_urls)
+         elif is_duckduckgo:
+             results = self._parse_duckduckgo(content, seen_urls)
+         else:
+             # Generic fallback
+             results = self._parse_generic(content, seen_urls)
+
+         logger.info(f"DefaultEngine parsed {len(results)} results (detected: {'google' if is_google else 'bing' if is_bing else 'ddg' if is_duckduckgo else 'generic'})")
+         return results
+
+     def _parse_google(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+         """Parse Google search results."""
+         results = []
+         # Look for result links
+         link_regex = re.compile(
+             r'<a[^>]+href="(https?://(?!google\.com|accounts\.google)[^"]+)"[^>]*>([^<]+)</a>',
+             re.IGNORECASE
+         )
+
+         for match in link_regex.finditer(content):
+             if len(results) >= 15:
+                 break
+             href = match.group(1)
+             title = match.group(2).strip()
+
+             if href in seen_urls or not title or len(title) < 3:
+                 continue
+             if any(x in href for x in ['google.com', 'gstatic.com', 'youtube.com/redirect']):
+                 continue
+
+             seen_urls.add(href)
+             results.append({
+                 "title": re.sub(r'<[^>]+>', '', title),
+                 "url": href,
+                 "domain": urllib.parse.urlparse(href).hostname or "",
+                 "content": "",
+             })
+         return results
+
+     def _parse_bing(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+         """Parse Bing search results."""
+         results = []
+         link_regex = re.compile(
+             r'<a[^>]+href="(https?://(?!bing\.com|microsoft\.com)[^"]+)"[^>]*>(.*?)</a>',
+             re.IGNORECASE | re.DOTALL
+         )
+
+         for match in link_regex.finditer(content):
+             if len(results) >= 15:
+                 break
+             href = match.group(1)
+             title_html = match.group(2)
+             title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+             if href in seen_urls or not title or len(title) < 3:
+                 continue
+             if any(x in href for x in ['bing.com', 'microsoft.com', 'msn.com']):
+                 continue
+
+             seen_urls.add(href)
+             results.append({
+                 "title": title,
+                 "url": href,
+                 "domain": urllib.parse.urlparse(href).hostname or "",
+                 "content": "",
+             })
+         return results
+
+     def _parse_duckduckgo(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+         """Parse DuckDuckGo results."""
+         results = []
+         link_regex = re.compile(
+             r'<a[^>]+href="(https?://(?!duckduckgo\.com)[^"]+)"[^>]*>(.*?)</a>',
+             re.IGNORECASE | re.DOTALL
+         )
+
+         for match in link_regex.finditer(content):
+             if len(results) >= 15:
+                 break
+             href = match.group(1)
+             title_html = match.group(2)
+             title = re.sub(r'<[^>]+>', '', title_html).strip()
+
+             if href in seen_urls or not title or len(title) < 3:
+                 continue
+
+             seen_urls.add(href)
+             results.append({
+                 "title": title,
+                 "url": href,
+                 "domain": urllib.parse.urlparse(href).hostname or "",
+                 "content": "",
+             })
+         return results
+
+     def _parse_generic(self, content: str, seen_urls: set) -> List[Dict[str, Any]]:
+         """Generic link parser for unknown search engines."""
+         results = []
+         link_regex = re.compile(
+             r'<a[^>]+href="(https?://[^"]+)"[^>]*>([^<]+)</a>',
+             re.IGNORECASE
+         )
+
+         for match in link_regex.finditer(content):
+             if len(results) >= 15:
+                 break
+             href = match.group(1)
+             title = match.group(2).strip()
+
+             if href in seen_urls or not title or len(title) < 5:
+                 continue
+             # Skip common non-result URLs
+             if any(x in href for x in ['javascript:', 'mailto:', '#', 'login', 'signin', 'account']):
+                 continue
+
+             seen_urls.add(href)
+             results.append({
+                 "title": title,
+                 "url": href,
+                 "domain": urllib.parse.urlparse(href).hostname or "",
+                 "content": "",
+             })
+         return results
+
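
The "__ADDRESS_BAR_SEARCH__:" marker returned by build_url is a contract with the plugin's SearchService, which the docstrings reference but which is not part of this diff. A minimal sketch of how a caller might honour that contract; the resolve_target helper below is hypothetical and only illustrates the marker handling:

    # Hypothetical helper; SearchService's real handling is not shown in this diff.
    ADDRESS_BAR_PREFIX = "__ADDRESS_BAR_SEARCH__:"

    def resolve_target(engine, query: str):
        target = engine.build_url(query)
        if getattr(engine, "USE_ADDRESS_BAR", False) and target.startswith(ADDRESS_BAR_PREFIX):
            # Type the raw query into the browser's address bar (Ctrl+L -> type -> Enter).
            return ("address_bar", target[len(ADDRESS_BAR_PREFIX):])
        # Otherwise navigate to the built URL directly.
        return ("url", target)
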
@@ -5,9 +5,9 @@ from typing import List, Dict, Any
  from loguru import logger
  from .base import SearchEngine

- class SearXNGEngine(SearchEngine):
+ class DuckDuckGoEngine(SearchEngine):
      """
-     Parser for DuckDuckGo and SearXNG results.
+     Parser for DuckDuckGo Lite results.
      Handles both Markdown (from Crawl4AI) and HTML (fallback).
      """

@@ -83,7 +83,7 @@ class SearXNGEngine(SearchEngine):
              })
              seen_urls.add(href)

-         logger.info(f"SearXNG Parser(HTML) found {len(results)} results.")
+         logger.info(f"DuckDuckGo Parser(HTML) found {len(results)} results.")
          return results

      def _parse_markdown(self, content: str) -> List[Dict[str, Any]]:
@@ -133,5 +133,5 @@ class SearXNGEngine(SearchEngine):
          if current_result:
              results.append(current_result)

-         logger.info(f"SearXNG Parser(Markdown) found {len(results)} results.")
+         logger.info(f"DuckDuckGo Parser(Markdown) found {len(results)} results.")
          return results
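
The renamed DuckDuckGoEngine keeps two parsing paths per its docstring: Markdown (from Crawl4AI) and HTML as fallback. The dispatch between them sits outside these hunks; a sketch of what it could look like, where the _parse_html method name and the content heuristic are assumptions:

    # Sketch only: the real dispatch is not shown in this diff,
    # and _parse_html is an assumed name for the HTML fallback parser.
    def parse(self, content: str):
        looks_like_html = "<a " in content.lower() or "<html" in content.lower()
        if looks_like_html:
            return self._parse_html(content)      # HTML fallback path
        return self._parse_markdown(content)      # Markdown produced by Crawl4AI
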
@@ -0,0 +1,155 @@
+
+ import urllib.parse
+ import re
+ from typing import List, Dict, Any
+ from loguru import logger
+ from .base import SearchEngine
+
+
+ class GoogleEngine(SearchEngine):
+     """
+     Search engine implementation for Google.
+     Parses Google Search HTML results.
+     """
+
+     def build_url(self, query: str, limit: int = 10) -> str:
+         encoded_query = urllib.parse.quote(query)
+         return f"https://www.google.com/search?q={encoded_query}"
+
+     def parse(self, content: str) -> List[Dict[str, Any]]:
+         results = []
+         seen_urls = set()
+
+         # Google search results are in blocks with class="MjjYud" or similar containers
+         # Split by result blocks first for more accurate extraction
+
+         # Method 1: Split by common result block classes
+         block_patterns = [
+             r'<div class="MjjYud"[^>]*>',
+             r'<div class="tF2Cxc"[^>]*>',
+             r'<div class="g Ww4FFb"[^>]*>',
+         ]
+
+         blocks = [content]
+         for bp in block_patterns:
+             new_blocks = []
+             for block in blocks:
+                 parts = re.split(bp, block)
+                 new_blocks.extend(parts)
+             blocks = new_blocks
+
+         for block in blocks:
+             if len(block) < 100:
+                 continue
+
+             # Find URL in this block - prefer links with h3 nearby
+             url_match = re.search(r'<a[^>]+href="(https?://(?!www\.google\.|google\.|webcache\.googleusercontent\.)[^"]+)"[^>]*>', block)
+             if not url_match:
+                 continue
+
+             url = url_match.group(1)
+             if url in seen_urls or self._should_skip_url(url):
+                 continue
+
+             # Find h3 title in this block
+             h3_match = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.IGNORECASE | re.DOTALL)
+             if not h3_match:
+                 continue
+
+             title = re.sub(r'<[^>]+>', '', h3_match.group(1)).strip()
+             if not title or len(title) < 2:
+                 continue
+
+             seen_urls.add(url)
+
+             # Extract snippet from VwiC3b class (Google's snippet container)
+             snippet = ""
+             snippet_match = re.search(r'<div[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>(.*?)</div>', block, re.IGNORECASE | re.DOTALL)
+             if snippet_match:
+                 snippet = re.sub(r'<[^>]+>', ' ', snippet_match.group(1)).strip()
+                 snippet = re.sub(r'\s+', ' ', snippet).strip()
+
+             # Fallback: look for any text after h3
+             if not snippet:
+                 # Try other common snippet patterns
+                 alt_patterns = [
+                     r'<span[^>]*class="[^"]*aCOpRe[^"]*"[^>]*>(.*?)</span>',
+                     r'<div[^>]*data-snc[^>]*>(.*?)</div>',
+                 ]
+                 for ap in alt_patterns:
+                     am = re.search(ap, block, re.IGNORECASE | re.DOTALL)
+                     if am:
+                         snippet = re.sub(r'<[^>]+>', ' ', am.group(1)).strip()
+                         snippet = re.sub(r'\s+', ' ', snippet).strip()
+                         break
+
+             # Extract images from this block
+             images = []
+             # Pattern 1: Regular img src (excluding data: and tracking pixels)
+             # Note: gstatic.com/images/branding is logo, but encrypted-tbn*.gstatic.com are thumbnails
+             img_matches = re.findall(r'<img[^>]+src="(https?://[^"]+)"', block)
+             for img_url in img_matches:
+                 # Decode HTML entities
+                 img_url = img_url.replace('&amp;', '&')
+                 # Skip tracking/icon/small images (but allow encrypted-tbn which are valid thumbnails)
+                 if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo', 'gstatic.com/images/branding', '1x1', 'transparent', 'gstatic.com/images/icons']):
+                     continue
+                 if img_url not in images:
+                     images.append(img_url)
+
+             # Pattern 2: data-src (lazy loaded images)
+             data_src_matches = re.findall(r'data-src="(https?://[^"]+)"', block)
+             for img_url in data_src_matches:
+                 img_url = img_url.replace('&amp;', '&')
+                 if any(x in img_url.lower() for x in ['favicon', 'icon', 'tracking', 'pixel', 'logo']):
+                     continue
+                 if img_url not in images:
+                     images.append(img_url)
+
+             results.append({
+                 "title": title,
+                 "url": url,
+                 "domain": urllib.parse.urlparse(url).hostname or "",
+                 "content": snippet[:1000],
+                 "images": images[:3]  # Limit to 3 images per result
+             })
+
+             if len(results) >= 15:
+                 break
+
+         total_images = sum(len(r.get("images", [])) for r in results)
+         logger.info(f"GoogleEngine parsed {len(results)} results with {total_images} images total.")
+         return results
+
+     def _should_skip_url(self, url: str) -> bool:
+         """Check if URL should be skipped."""
+         skip_patterns = [
+             "google.com",
+             "googleusercontent.com",
+             "gstatic.com",
+             "youtube.com/watch",  # Keep channel/playlist but skip individual videos
+             "maps.google",
+             "translate.google",
+             "accounts.google",
+             "support.google",
+             "policies.google",
+             "schema.org",
+             "javascript:",
+             "data:",
+             "#",
+         ]
+
+         for pattern in skip_patterns:
+             if pattern in url.lower():
+                 return True
+
+         # Skip very short URLs (likely invalid)
+         if len(url) < 20:
+             return True
+
+         # Skip URLs that are just root domains without path
+         parsed = urllib.parse.urlparse(url)
+         if not parsed.path or parsed.path == "/":
+             return True
+
+         return False
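
A short usage sketch of how this new GoogleEngine is likely driven: build_url produces the search URL, the plugin's browser layer fetches the rendered HTML, and parse returns up to 15 dicts with title, url, domain, snippet, and at most three image URLs each. fetch_html below is a placeholder for that browser fetch, not an API defined in this diff:

    engine = GoogleEngine()
    url = engine.build_url("entari plugin hyw")   # https://www.google.com/search?q=...
    html = fetch_html(url)                        # placeholder for the plugin's browser fetch
    for item in engine.parse(html):               # at most 15 results
        print(item["title"], item["url"], item["domain"], len(item["images"]))
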
@@ -65,7 +65,7 @@ class SharedBrowserManager:
          # Hide scrollbars globally
          co.set_argument('--hide-scrollbars')
          # The oversized window height is (probably) there to suppress the scrollbar
-         co.set_argument('--window-size=1280,20000')
+         co.set_argument('--window-size=1280,9000')
          self._page = ChromiumPage(addr_or_opts=co)

          # Show Landing Page
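
This hunk narrows the Chromium window height from 20000 to 9000 pixels while keeping the tall viewport that hides the scrollbar on long result pages. A minimal sketch of the resulting browser options, reusing only the DrissionPage calls already visible in this diff (the import line is an assumption, since it falls outside the hunk):

    from DrissionPage import ChromiumOptions, ChromiumPage  # import path assumed, not shown in the diff

    co = ChromiumOptions()
    co.set_argument('--hide-scrollbars')
    co.set_argument('--window-size=1280,9000')  # tall viewport so long pages render without a scrollbar
    page = ChromiumPage(addr_or_opts=co)
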