chatgpt-mirai-qq-bot-web-search 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: chatgpt-mirai-qq-bot-web-search
3
- Version: 0.1.14
3
+ Version: 0.2.1
4
4
  Summary: WebSearch adapter for lss233/chatgpt-mirai-qq-bot
5
5
  Home-page: https://github.com/chuanSir123/web_search
6
6
  Author: chuanSir
@@ -0,0 +1,11 @@
1
+ web_search/__init__.py,sha256=zVZLb5A-im5XETwohgxyE-UCxjSvYl6I2OC3LnEQhdQ,4360
2
+ web_search/blocks.py,sha256=S3RsV9CCTKAsKUNhewg__ejEpJRDz7DTawtH05WRgE8,6732
3
+ web_search/config.py,sha256=DhLiERBJR2V5Boglf7Aq9Rbc4vsvLIh67CrLDIPeqA0,398
4
+ web_search/web_searcher.py,sha256=0zLgMsWCK71gStyWpFjup5WfxHx3tBTf3rGwM7Ae7Zs,13332
5
+ web_search/example/roleplayWithWebSearch.yaml,sha256=C-dGy3z8gcRcmxzurssP-kPRLqMf1TYR-nnNUaJjISE,7468
6
+ chatgpt_mirai_qq_bot_web_search-0.2.1.dist-info/LICENSE,sha256=ILBn-G3jdarm2w8oOrLmXeJNU3czuJvVhDLBASWdhM8,34522
7
+ chatgpt_mirai_qq_bot_web_search-0.2.1.dist-info/METADATA,sha256=Gt59c1F8TCJFClQ0qqdMvrCQ2gpeHHcIVH9cbVli-zw,1738
8
+ chatgpt_mirai_qq_bot_web_search-0.2.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
9
+ chatgpt_mirai_qq_bot_web_search-0.2.1.dist-info/entry_points.txt,sha256=o3kRDSdSmSdnCKlK6qS57aN0WpI4ab-Nxub2NwUrjf0,64
10
+ chatgpt_mirai_qq_bot_web_search-0.2.1.dist-info/top_level.txt,sha256=PoNm8MJYw_y8RTMaNlY0ePLoNHxVUAE2IHDuL5fFubI,11
11
+ chatgpt_mirai_qq_bot_web_search-0.2.1.dist-info/RECORD,,
web_search/blocks.py CHANGED
@@ -1,16 +1,17 @@
1
- from typing import Any, Dict, List, Optional
1
+ from typing import Any, Dict, List, Optional,Annotated
2
2
  import asyncio
3
- from kirara_ai.workflow.core.block import Block
4
- from kirara_ai.workflow.core.block.input_output import Input, Output
3
+ from kirara_ai.workflow.core.block import Block, Input, Output, ParamMeta
5
4
  from .web_searcher import WebSearcher
6
5
  from .config import WebSearchConfig
7
6
  from kirara_ai.llm.format.message import LLMChatMessage
8
7
  from kirara_ai.llm.format.response import LLMChatResponse
8
+ from kirara_ai.ioc.container import DependencyContainer
9
9
 
10
def get_options_provider(container: DependencyContainer, block: Block) -> List[str]:
    """Return the search engine names selectable for a block's ``engine`` parameter.

    Both arguments are accepted to satisfy the provider call signature but are
    not used; the list is fixed.
    """
    supported_engines = ["bing", "google", "baidu"]
    return supported_engines
10
12
  class WebSearchBlock(Block):
11
13
  """Web搜索Block"""
12
14
  name = "web_search"
13
-
14
15
  inputs = {
15
16
  "llm_resp": Input(name="llm_resp",label="LLM 响应", data_type=LLMChatResponse, description="搜索关键词")
16
17
  }
@@ -19,13 +20,16 @@ class WebSearchBlock(Block):
19
20
  "results": Output(name="results",label="搜索结果",data_type= str, description="搜索结果")
20
21
  }
21
22
 
22
- def __init__(self, name: str = None, max_results: Optional[int] = None, timeout: Optional[int] = None, fetch_content: Optional[bool] = None):
23
+ def __init__(self, name: str = None, max_results: Optional[int] = 3, timeout: Optional[int] = 10, fetch_content: Optional[bool] = True
24
+ ,engine: Annotated[Optional[str],ParamMeta(label="搜索引擎", description="要使用的搜索引擎", options_provider=get_options_provider),] = "bing", proxy: str = None,):
23
25
  super().__init__(name)
24
26
  self.searcher = None
25
27
  self.config = WebSearchConfig()
26
28
  self.max_results = max_results
27
29
  self.timeout = timeout
28
30
  self.fetch_content = fetch_content
31
+ self.engine=engine
32
+ self.proxy = proxy
29
33
 
30
34
  def _ensure_searcher(self):
31
35
  """同步方式初始化searcher"""
@@ -62,11 +66,81 @@ class WebSearchBlock(Block):
62
66
  query=query,
63
67
  max_results=max_results,
64
68
  timeout=timeout,
65
- fetch_content=fetch_content
69
+ fetch_content=fetch_content,
70
+ engine=self.engine,
71
+ proxy = self.proxy,
72
+ )
73
+ )
74
+ return {"results": "\n以下是联网搜索的结果:\n-- 搜索结果开始 --"+results+"\n-- 搜索结果结束 --"}
75
+ except Exception as e:
76
+ print(e)
77
+ return {"results": f"搜索失败: {str(e)}"}
78
class WebSearchByKeywordBlock(Block):
    """Workflow block that runs a web search for a plain keyword string.

    Input ``keyword`` is the search query; output ``results`` is the search
    result text wrapped in start/end markers, an empty string when the keyword
    is empty or starts with "无", or a failure message when the search raises.
    """

    name = "web_search_by_keyword"
    description = "网络搜索,通过关键词进行网络搜索"

    inputs = {
        "keyword": Input(name="keyword", label="搜索关键字", data_type=str, description="搜索关键词")
    }

    outputs = {
        "results": Output(name="results", label="搜索结果", data_type=str, description="搜索结果")
    }

    def __init__(
        self,
        name: str = None,
        max_results: Optional[int] = 3,
        timeout: Optional[int] = 10,
        fetch_content: Optional[bool] = True,
        engine: Annotated[
            Optional[str],
            ParamMeta(label="搜索引擎", description="要使用的搜索引擎", options_provider=get_options_provider),
        ] = "bing",
        proxy: str = None,
    ):
        """Store the search settings; the searcher itself is created lazily.

        Args:
            name: Block name forwarded to the base ``Block`` constructor.
            max_results: Passed to ``WebSearcher.search`` as ``max_results``.
            timeout: Passed to ``WebSearcher.search`` as ``timeout``.
            fetch_content: Passed to ``WebSearcher.search`` as ``fetch_content``.
            engine: Search engine name; the selectable values come from
                ``get_options_provider``.
            proxy: Passed to ``WebSearcher.search`` as ``proxy``.
        """
        super().__init__(name)
        self.searcher = None
        self.config = WebSearchConfig()
        self.max_results = max_results
        self.timeout = timeout
        self.fetch_content = fetch_content
        self.engine = engine
        self.proxy = proxy

    @staticmethod
    def _get_event_loop():
        """Return the current thread's event loop, creating one if none exists."""
        try:
            return asyncio.get_event_loop()
        except RuntimeError:
            # A thread without an event loop: create one and register it.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    def _ensure_searcher(self):
        """Create the ``WebSearcher`` synchronously on first use."""
        if not self.searcher:
            self.searcher = self._get_event_loop().run_until_complete(WebSearcher.create())

    def execute(self, **kwargs) -> Dict[str, Any]:
        """Search for ``kwargs["keyword"]`` and return ``{"results": <text>}``.

        Raises:
            KeyError: If ``keyword`` is not supplied.
        """
        query = kwargs["keyword"]

        # A missing (None) or empty keyword, or one starting with "无",
        # means there is nothing to search for.
        if not query or query.startswith("无"):
            return {"results": ""}

        self._ensure_searcher()

        try:
            results = self._get_event_loop().run_until_complete(
                self.searcher.search(
                    query=query,
                    max_results=self.max_results,
                    timeout=self.timeout,
                    fetch_content=self.fetch_content,
                    engine=self.engine,
                    proxy=self.proxy,
                )
            )
            return {"results": "\n以下是联网搜索的结果:\n-- 搜索结果开始 --" + results + "\n-- 搜索结果结束 --"}
        except Exception as e:
            # Failures are reported through the output instead of raising.
            print(e)
            return {"results": f"搜索失败: {str(e)}"}
71
145
 
72
146
  class AppendSystemPromptBlock(Block):
@@ -1,237 +1,325 @@
1
- from playwright.async_api import async_playwright
2
- import trafilatura
3
- import random
4
- import time
5
- import urllib.parse
6
- import asyncio
7
- import subprocess
8
- import sys
9
- from kirara_ai.logger import get_logger
10
-
11
- logger = get_logger("WebSearchPlugin")
12
-
13
class WebSearcher:
    """Bing search helper backed by a Playwright-driven Chromium browser.

    Each call to :meth:`search` starts its own Playwright driver and browser,
    scrapes the Bing result page (and optionally the linked pages) and shuts
    the browser down again before returning, so no processes are left behind.
    """

    # Chromium flags used for every launch attempt.
    _LAUNCH_ARGS = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu']

    def __init__(self):
        self.playwright = None
        self.browser = None
        self.context = None

    @classmethod
    async def create(cls):
        """Build and return a new instance; no browser is started yet."""
        return cls()

    async def _launch_browser(self):
        """Launch headless Chromium with the sandbox disabled."""
        return await self.playwright.chromium.launch(
            headless=True,
            chromium_sandbox=False,
            args=list(self._LAUNCH_ARGS),
        )

    async def _ensure_initialized(self):
        """Start Playwright and a browser and return a new browser context.

        If the launch fails because the Chromium executable is missing, the
        browser is installed with ``python -m playwright install chromium``
        and the launch is retried once.

        Raises:
            RuntimeError: If the browser installation exits with a non-zero code.
            Exception: Any other start-up error, re-raised after cleanup.
        """
        try:
            self.playwright = await async_playwright().start()
            try:
                self.browser = await self._launch_browser()
            except Exception as e:
                if "Executable doesn't exist" not in str(e):
                    raise
                logger.info("Installing playwright browsers...")
                # NOTE: this synchronous call blocks the event loop until the
                # installer process exits.
                completed = subprocess.run(
                    [sys.executable, "-m", "playwright", "install", "chromium"],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                if completed.returncode != 0:
                    raise RuntimeError(f"Failed to install playwright browsers: {completed.stderr.decode()}")

                # Retry with the same (headless) settings as the first attempt.
                self.browser = await self._launch_browser()
            return await self.browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            )
        except Exception as e:
            logger.error(f"Failed to initialize WebSearcher: {e}")
            await self.close()
            raise

    async def random_sleep(self, min_time=1, max_time=3):
        """Sleep for a random duration between ``min_time`` and ``max_time`` seconds."""
        await asyncio.sleep(random.uniform(min_time, max_time))

    async def simulate_human_scroll(self, page):
        """Scroll the page down three times by random amounts with short pauses."""
        for _ in range(3):
            await page.mouse.wheel(0, random.randint(300, 700))
            await self.random_sleep(0.3, 0.7)

    async def get_webpage_content(self, url: str, timeout: int, context) -> str:
        """Open ``url`` in a new tab and return its main text.

        Args:
            url: Page address to load.
            timeout: Navigation timeout in seconds.
            context: Browser context used to open the tab.

        Returns:
            The text extracted by ``trafilatura``, or ``""`` when nothing could
            be extracted or any step failed.
        """
        start_time = time.time()
        try:
            page = await context.new_page()
            try:
                # Skip images, stylesheets, fonts and media requests.
                await page.route("**/*", lambda route: route.abort()
                    if route.request.resource_type in ['image', 'stylesheet', 'font', 'media']
                    else route.continue_())

                # Wait for domcontentloaded rather than networkidle.
                await page.goto(url, wait_until='domcontentloaded', timeout=timeout * 1000)

                # Give the main content a short, bounded time to load.
                try:
                    await page.wait_for_load_state('domcontentloaded', timeout=5000)
                except Exception as e:
                    logger.warning(f"Load state timeout for {url}, continuing anyway: {e}")

                await self.random_sleep(1, 2)
                await self.simulate_human_scroll(page)

                text = trafilatura.extract(await page.content())
            except Exception as e:
                logger.error(f"Failed to fetch content - URL: {url} - Error: {e}")
                return ""
            finally:
                # The tab is closed on both the success and the failure path.
                await page.close()
            logger.info(f"Content fetched - URL: {url} - Time: {time.time() - start_time:.2f}s")
            return text or ""
        except Exception as e:
            logger.error(f"Failed to create page - URL: {url} - Error: {e}")
            return ""

    async def process_search_result(self, result, idx: int, timeout: int, fetch_content: bool, context):
        """Turn one Bing result element into a formatted text entry.

        Returns:
            The formatted entry, or ``None`` when the element has no title or
            link, or when processing raised.
        """
        try:
            title_element = await result.query_selector('h2')
            link_element = await result.query_selector('h2 a')
            snippet_element = await result.query_selector('.b_caption p')

            if not title_element or not link_element:
                return None

            title = await title_element.inner_text()
            link = await link_element.get_attribute('href')
            snippet = await snippet_element.inner_text() if snippet_element else "无简介"

            if not link:
                return None

            result_text = f"[{idx+1}] {title}\nURL: {link}\n搜索简介: {snippet}"

            if fetch_content:
                content = await self.get_webpage_content(link, timeout, context)
                if content:
                    result_text += f"\n内容详情:\n{content}"

            return result_text

        except Exception as e:
            logger.error(f"Failed to process result {idx}: {e}")
            return None

    async def search(self, query: str, max_results: int = 3, timeout: int = 10, fetch_content: bool = True) -> str:
        """Search Bing for ``query`` and return the formatted results.

        Args:
            query: Search text; it is URL-quoted before use.
            max_results: Maximum number of result elements to process.
            timeout: Navigation timeout in seconds.
            fetch_content: Whether to also load each result page and append
                its extracted text.

        Returns:
            The result entries joined by ``---`` lines, or a message string
            when nothing was found or the search failed.
        """
        context = await self._ensure_initialized()

        search_start_time = time.time()
        page = None
        try:
            encoded_query = urllib.parse.quote(query)
            page = await context.new_page()

            # Load the result page, retrying while it contains no results.
            max_retries = 3
            for attempt in range(max_retries):
                is_last_attempt = attempt == max_retries - 1
                try:
                    logger.info(f"Attempting to load search page (attempt {attempt + 1}/{max_retries})")
                    await page.goto(
                        f"https://www.bing.com/search?q={encoded_query}",
                        wait_until='domcontentloaded',
                        timeout=timeout * 1000
                    )

                    # 'b_algo' marks result entries; its absence means an empty page.
                    content = await page.content()
                    if 'b_algo' in content:
                        break
                    if not is_last_attempt:
                        await page.reload()
                        await self.random_sleep(1, 2)
                except Exception as e:
                    logger.warning(f"Page navigation failed on attempt {attempt + 1}: {e}")
                    if is_last_attempt:
                        raise
                    await self.random_sleep(1, 2)

            # Try progressively more specific selectors until one matches.
            results = None
            for selector in ('.b_algo', '#b_results .b_algo', 'main .b_algo'):
                try:
                    await page.wait_for_selector(selector, timeout=5000)
                    results = await page.query_selector_all(selector)
                    if results:
                        break
                except Exception:
                    continue

            if not results:
                logger.error("No search results found with any selector")
                return "搜索结果加载失败"

            logger.info(f"Found {len(results)} search results")

            # Process the selected results concurrently.
            completed_results = await asyncio.gather(*[
                self.process_search_result(result, idx, timeout, fetch_content, context)
                for idx, result in enumerate(results[:max_results])
            ])
            detailed_results = [entry for entry in completed_results if entry]

            total_time = time.time() - search_start_time
            results = "\n---\n".join(detailed_results) if detailed_results else "未找到相关结果"
            logger.info(f"Search completed - Query: {query} - Time: {total_time:.2f}s - Found {len(detailed_results)} valid results")
            return results

        except Exception as e:
            logger.error(f"Search failed - Query: {query} - Error: {e}", exc_info=True)
            return f"搜索失败: {str(e)}"
        finally:
            if page:
                try:
                    await page.close()
                except Exception as e:
                    logger.error(f"Error closing page: {e}")
            # Shut down the browser and driver started for this search so
            # repeated searches do not accumulate processes.
            try:
                await self.close()
            except Exception as e:
                logger.error(f"Error closing browser: {e}")

    async def close(self):
        """Close the browser and stop the Playwright driver, if present."""
        browser, self.browser = self.browser, None
        playwright, self.playwright = self.playwright, None
        try:
            if browser:
                await browser.close()
        finally:
            # Stop the driver even when closing the browser raised.
            if playwright:
                await playwright.stop()
1
+ from playwright.async_api import async_playwright
2
+ import trafilatura
3
+ import random
4
+ import time
5
+ import urllib.parse
6
+ import asyncio
7
+ import subprocess
8
+ import sys
9
+ from kirara_ai.logger import get_logger
10
+ import os
11
+
12
+ logger = get_logger("WebSearchPlugin")
13
+
14
class WebSearcher:
    """Scrape Bing, Google or Baidu result pages with Playwright Chromium.

    Each call to :meth:`search` launches a persistent browser context on a
    fixed user-data directory and closes it again before returning, so the
    directory is free for the next search and no processes are left behind.
    """

    def __init__(self):
        self.playwright = None
        self.browser = None
        self.context = None
        # Engine name used by the most recent search() call.
        self.current_engine = None
        # Per-engine URL template and the CSS selectors for result
        # containers, titles, links and snippets.
        self.search_engines = {
            'bing': {
                'url': 'https://www.bing.com/search?q={}',
                'selectors': ['.b_algo', '#b_results .b_algo', 'main .b_algo'],
                'title_selector': 'h2',
                'link_selector': 'h2 a',
                'snippet_selector': '.b_caption p'
            },
            'google': {
                'url': 'https://www.google.com/search?q={}',
                'selectors': ['.MjjYud', 'div.g', 'div[data-hveid]'],
                'title_selector': 'h3.LC20lb',
                'link_selector': 'a[jsname="UWckNb"], div.yuRUbf a',
                'snippet_selector': 'div.VwiC3b'
            },
            'baidu': {
                'url': 'https://www.baidu.com/s?wd={}',
                'selectors': ['.result', '.result-op'],
                'title_selector': 'h3',
                'link_selector': 'h3 a',
                'snippet_selector': '.content-right_8Zs40'
            }
        }

    @classmethod
    async def create(cls):
        """Build and return a new instance; no browser is started yet."""
        return cls()

    async def _launch_context(self, user_data_dir, context_options):
        """Launch a persistent Chromium context with the given options."""
        return await self.playwright.chromium.launch_persistent_context(
            user_data_dir=user_data_dir,
            **context_options
        )

    async def _ensure_initialized(self, proxy):
        """Start Playwright and return a persistent browser context.

        If the launch fails because the Chromium executable is missing, the
        browser is installed with ``python -m playwright install chromium``
        and the launch is retried once.

        Args:
            proxy: Proxy server address; when truthy it is applied to the
                context regardless of the search engine.

        Raises:
            RuntimeError: If the browser installation exits with a non-zero code.
            Exception: Any other start-up error, re-raised after cleanup.
        """
        try:
            self.playwright = await async_playwright().start()

            # Profile directory shared by all launches.
            user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright_user_data")
            os.makedirs(user_data_dir, exist_ok=True)

            # Launch and context options merged into a single dict.
            context_options = {
                'headless': True,
                'chromium_sandbox': False,
                'slow_mo': 50,  # slow operations down to look more human
                'args': [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',  # hide automation traces
                    '--disable-features=IsolateOrigins,site-per-process',
                ],
                'ignore_default_args': ['--enable-automation'],  # drop the automation flag
                'viewport': {'width': 1920, 'height': 1080},
                'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
                'locale': 'zh-CN',
                'timezone_id': 'Asia/Shanghai',
                'color_scheme': 'dark',
                'device_scale_factor': 1.75,
                'has_touch': True,
                'is_mobile': False,
                'reduced_motion': 'no-preference'
            }

            if proxy:
                context_options['proxy'] = {
                    'server': proxy
                }

            try:
                # launch_persistent_context replaces separate launch + new_context.
                self.context = await self._launch_context(user_data_dir, context_options)
            except Exception as e:
                if "Executable doesn't exist" not in str(e):
                    raise
                logger.info("Installing playwright browsers...")
                # NOTE: this synchronous call blocks the event loop until the
                # installer process exits.
                completed = subprocess.run(
                    [sys.executable, "-m", "playwright", "install", "chromium"],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                if completed.returncode != 0:
                    raise RuntimeError(f"Failed to install playwright browsers: {completed.stderr.decode()}")

                self.context = await self._launch_context(user_data_dir, context_options)

            # No separate browser handle exists with a persistent context.
            self.browser = None

            # Script injected into every page to mask automation markers.
            await self.context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => false,
                });

                // 防止 iframe 检测
                window.parent.document;

                // 防止检测到 Chrome Devtools 协议
                delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
                delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
                delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
            """)

            return self.context

        except Exception as e:
            logger.error(f"Failed to initialize WebSearcher: {e}")
            await self.close()
            raise

    async def simulate_human_scroll(self, page):
        """Scroll the page down three times by random amounts."""
        for _ in range(3):
            await page.mouse.wheel(0, random.randint(300, 700))

    async def get_webpage_content(self, url: str, timeout: int, context) -> str:
        """Open ``url`` in a new tab and return its main text.

        Args:
            url: Page address to load.
            timeout: Navigation timeout in seconds.
            context: Browser context used to open the tab.

        Returns:
            The text extracted by ``trafilatura``, or ``""`` when nothing could
            be extracted or any step failed.
        """
        start_time = time.time()
        try:
            page = await context.new_page()
            try:
                # Skip images, stylesheets, fonts and media requests.
                await page.route("**/*", lambda route: route.abort()
                    if route.request.resource_type in ['image', 'stylesheet', 'font', 'media']
                    else route.continue_())

                # Wait for domcontentloaded rather than networkidle.
                await page.goto(url, wait_until='domcontentloaded', timeout=timeout * 1000)

                # Give the main content a short, bounded time to load.
                try:
                    await page.wait_for_load_state('domcontentloaded', timeout=5000)
                except Exception as e:
                    logger.warning(f"Load state timeout for {url}, continuing anyway: {e}")

                await self.simulate_human_scroll(page)

                text = trafilatura.extract(await page.content())
            except Exception as e:
                logger.error(f"Failed to fetch content - URL: {url} - Error: {e}")
                return ""
            finally:
                # The tab is closed on both the success and the failure path.
                await page.close()
            logger.info(f"Content fetched - URL: {url} - Time: {time.time() - start_time:.2f}s")
            return text or ""
        except Exception as e:
            logger.error(f"Failed to create page - URL: {url} - Error: {e}")
            return ""

    async def process_search_result(self, result, idx: int, timeout: int, fetch_content: bool, context, engine='bing'):
        """Turn one result element into a formatted text entry.

        For Baidu the link is opened in a temporary tab and replaced by the
        URL the tab ends up on; if that fails the original link is kept.

        Returns:
            The formatted entry, or ``None`` when the element has no title or
            link, or when processing raised.
        """
        try:
            engine_config = self.search_engines[engine]
            title_element = await result.query_selector(engine_config['title_selector'])
            link_element = await result.query_selector(engine_config['link_selector'])
            snippet_element = await result.query_selector(engine_config['snippet_selector'])

            if not title_element or not link_element:
                return None

            title = await title_element.inner_text()
            link = await link_element.get_attribute('href')

            # Without a link there is nothing to report or resolve.
            if not link:
                return None

            if engine == 'baidu':
                try:
                    redirect_page = await context.new_page()
                    try:
                        await redirect_page.goto(link, wait_until='domcontentloaded', timeout=5000)
                        real_url = redirect_page.url
                    finally:
                        # Close the temporary tab even when navigation failed.
                        await redirect_page.close()
                    link = real_url
                except Exception as e:
                    logger.warning(f"Failed to get real URL from Baidu: {e}")

            snippet = await snippet_element.inner_text() if snippet_element else "无简介"

            result_text = f"[{idx+1}] {title}\nURL: {link}\n搜索简介: {snippet}"

            if fetch_content:
                content = await self.get_webpage_content(link, timeout, context)
                if content:
                    result_text += f"\n内容详情:\n{content}"

            return result_text

        except Exception as e:
            logger.error(f"Failed to process result {idx}: {e}")
            return None

    async def _find_result_elements(self, page, selectors):
        """Return the elements matched by the first selector that yields any."""
        for selector in selectors:
            try:
                logger.info(f"Trying selector: {selector}")
                await page.wait_for_selector(selector, timeout=8000)
                elements = await page.query_selector_all(selector)
                if elements:
                    logger.info(f"Found {len(elements)} results with selector {selector}")
                    return elements
            except Exception as e:
                logger.warning(f"Selector {selector} failed: {e}")
        return None

    async def search(self, query: str, max_results: int = 3, timeout: int = 10, fetch_content: bool = True, engine: str = 'bing', proxy: str = None) -> str:
        """Search ``engine`` for ``query`` and return the formatted results.

        Args:
            query: Search text; it is URL-quoted before use.
            max_results: Maximum number of result elements to process.
            timeout: Navigation timeout in seconds.
            fetch_content: Whether to also load each result page and append
                its extracted text.
            engine: Key of ``self.search_engines`` to use.
            proxy: Optional proxy server address for the browser.

        Returns:
            The result entries joined by ``---`` lines, or a message string
            when the engine is unsupported, nothing was found or the search
            failed.
        """
        if engine not in self.search_engines:
            return f"不支持的搜索引擎: {engine}"

        self.current_engine = engine
        context = await self._ensure_initialized(proxy)
        engine_config = self.search_engines[engine]
        search_start_time = time.time()
        page = None

        try:
            encoded_query = urllib.parse.quote(query)
            page = await context.new_page()

            await page.goto(
                engine_config['url'].format(encoded_query),
                wait_until='load',
                timeout=timeout * 1000
            )

            # Scroll on Google before looking for results.
            if engine == 'google':
                await self.simulate_human_scroll(page)

            results = await self._find_result_elements(page, engine_config['selectors'])

            if not results:
                if engine == 'google':
                    # Diagnostic only: log how many candidate elements a
                    # broader selector sees; the count is not used further.
                    try:
                        element_count = await page.evaluate("""
                            () => {
                                const elements = document.querySelectorAll('div[data-sokoban-container], div.g, .MjjYud');
                                return Array.from(elements).length;
                            }
                        """)
                        logger.info(f"JavaScript found {element_count} elements")
                    except Exception as e:
                        logger.error(f"JavaScript evaluation failed: {e}")

                logger.error("No search results found with any selector")
                # Best-effort screenshot; a failure here must not change the
                # message returned to the caller.
                try:
                    await page.screenshot(path=f'search_failed_{engine}.png')
                except Exception as e:
                    logger.warning(f"Failed to save screenshot: {e}")
                return "搜索结果加载失败"

            logger.info(f"Found {len(results)} search results")

            # Process the selected results concurrently.
            completed_results = await asyncio.gather(*[
                self.process_search_result(result, idx, timeout, fetch_content, context, engine)
                for idx, result in enumerate(results[:max_results])
            ])
            detailed_results = [entry for entry in completed_results if entry]

            total_time = time.time() - search_start_time
            results = "\n---\n".join(detailed_results) if detailed_results else "未找到相关结果"
            logger.info(f"Search completed - Query: {query} - Time: {total_time:.2f}s - Found {len(detailed_results)} valid results")
            return results

        except Exception as e:
            logger.error(f"Search failed - Query: {query} - Error: {e}", exc_info=True)
            return f"搜索失败: {str(e)}"
        finally:
            if page:
                try:
                    await page.close()
                except Exception as e:
                    logger.error(f"Error closing page: {e}")
            # Release the persistent context and driver so the next search
            # can launch on the same user-data directory.
            try:
                await self.close()
            except Exception as e:
                logger.error(f"Error closing browser: {e}")

    async def close(self):
        """Close the browser context and stop the Playwright driver, if present."""
        context, self.context = self.context, None
        playwright, self.playwright = self.playwright, None
        try:
            if context:
                await context.close()
        finally:
            # Stop the driver even when closing the context raised.
            if playwright:
                await playwright.stop()
@@ -1,11 +0,0 @@
1
- web_search/__init__.py,sha256=zVZLb5A-im5XETwohgxyE-UCxjSvYl6I2OC3LnEQhdQ,4360
2
- web_search/blocks.py,sha256=F1XJt7n1mHqMhkoU5Du1IB7NEVCx3Cl9vHkoY7plwls,3611
3
- web_search/config.py,sha256=DhLiERBJR2V5Boglf7Aq9Rbc4vsvLIh67CrLDIPeqA0,398
4
- web_search/web_searcher.py,sha256=HsXs5ctvBryupK4hX8qbJspx1-rWEPnMYfulEkC1WMw,9707
5
- web_search/example/roleplayWithWebSearch.yaml,sha256=C-dGy3z8gcRcmxzurssP-kPRLqMf1TYR-nnNUaJjISE,7468
6
- chatgpt_mirai_qq_bot_web_search-0.1.14.dist-info/LICENSE,sha256=ILBn-G3jdarm2w8oOrLmXeJNU3czuJvVhDLBASWdhM8,34522
7
- chatgpt_mirai_qq_bot_web_search-0.1.14.dist-info/METADATA,sha256=6UNFYoIQXPBsJ1efkuD7UtKr5R1BRJ_9XnGI-dqdRUE,1739
8
- chatgpt_mirai_qq_bot_web_search-0.1.14.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
9
- chatgpt_mirai_qq_bot_web_search-0.1.14.dist-info/entry_points.txt,sha256=o3kRDSdSmSdnCKlK6qS57aN0WpI4ab-Nxub2NwUrjf0,64
10
- chatgpt_mirai_qq_bot_web_search-0.1.14.dist-info/top_level.txt,sha256=PoNm8MJYw_y8RTMaNlY0ePLoNHxVUAE2IHDuL5fFubI,11
11
- chatgpt_mirai_qq_bot_web_search-0.1.14.dist-info/RECORD,,