cnks 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks/__init__.py +49 -49
- cnks/server.py +831 -831
- {cnks-0.2.2.dist-info → cnks-0.2.3.dist-info}/METADATA +2 -8
- cnks-0.2.3.dist-info/RECORD +8 -0
- cnks-0.2.2.dist-info/RECORD +0 -8
- {cnks-0.2.2.dist-info → cnks-0.2.3.dist-info}/WHEEL +0 -0
- {cnks-0.2.2.dist-info → cnks-0.2.3.dist-info}/entry_points.txt +0 -0
cnks/server.py
CHANGED
@@ -1,832 +1,832 @@
|
|
1
|
-
import asyncio
|
2
|
-
import json
|
3
|
-
import os
|
4
|
-
import platform
|
5
|
-
import re
|
6
|
-
import subprocess
|
7
|
-
import sys
|
8
|
-
import time
|
9
|
-
import logging
|
10
|
-
import webbrowser
|
11
|
-
import traceback
|
12
|
-
from pathlib import Path
|
13
|
-
from urllib.parse import quote
|
14
|
-
from typing import Dict, List, Any, Optional, Union
|
15
|
-
|
16
|
-
from mcp.server.models import InitializationOptions
|
17
|
-
import mcp.types as types
|
18
|
-
from mcp.server import NotificationOptions, Server
|
19
|
-
from pydantic import AnyUrl
|
20
|
-
import mcp.server.stdio
|
21
|
-
|
22
|
-
# 配置日志记录
|
23
|
-
logging.basicConfig(
|
24
|
-
level=logging.DEBUG,
|
25
|
-
filename="cnks.log",
|
26
|
-
filemode="a",
|
27
|
-
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
28
|
-
)
|
29
|
-
logger = logging.getLogger("cnks")
|
30
|
-
|
31
|
-
# 尝试导入playwright
|
32
|
-
try:
|
33
|
-
from playwright.async_api import async_playwright
|
34
|
-
PLAYWRIGHT_AVAILABLE = True
|
35
|
-
except ImportError:
|
36
|
-
PLAYWRIGHT_AVAILABLE = False
|
37
|
-
logger.warning("Playwright未安装,将使用传统方式打开Chrome")
|
38
|
-
|
39
|
-
# 存储当前页面内容和笔记
|
40
|
-
page_content = ""
|
41
|
-
current_url = ""
|
42
|
-
notes: dict[str, str] = {}
|
43
|
-
browser_instance = None
|
44
|
-
|
45
|
-
server = Server("cnks")
|
46
|
-
|
47
|
-
# 导入我们新创建的extractor模块
|
48
|
-
try:
|
49
|
-
from . import chrome_extractor as extractor
|
50
|
-
except ImportError:
|
51
|
-
try:
|
52
|
-
import chrome_extractor as extractor
|
53
|
-
except ImportError:
|
54
|
-
extractor = None
|
55
|
-
logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
|
56
|
-
|
57
|
-
def find_chrome_executable():
    """Locate a Chrome or Chromium executable on the local machine.

    The well-known install locations for the current operating system are
    probed first. If none of them exists, the ``CHROME_PATH`` environment
    variable is used as a fallback. The fallback applies on every platform,
    including ones that have no built-in candidate list.

    Returns:
        The first path that exists on disk, or ``None`` when nothing is found.
    """
    candidates_by_system = {
        "Windows": [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
            os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
        ],
        "Darwin": [  # macOS
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
        ],
        "Linux": [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ],
    }

    # An unrecognised platform has no default candidates; it must still fall
    # through to CHROME_PATH, because the error messages shown to the user
    # tell them to set exactly that variable.
    for candidate in candidates_by_system.get(platform.system(), []):
        if os.path.exists(candidate):
            return candidate

    chrome_env = os.environ.get("CHROME_PATH")
    if chrome_env and os.path.exists(chrome_env):
        return chrome_env

    return None
|
93
|
-
|
94
|
-
def open_chrome(url):
    """Spawn Chrome as a separate process pointed at ``url``.

    Args:
        url: Address passed to Chrome as its single command-line argument.

    Returns:
        ``True`` once the process has been started and a two-second pause has
        elapsed; otherwise a human-readable error string (Chrome could not be
        located, or starting it raised).
    """
    try:
        executable = find_chrome_executable()
        if executable:
            subprocess.Popen([executable, url])
            time.sleep(2)  # fixed pause to give the page time to load
            return True
        return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
    except Exception as e:
        return f"打开Chrome时出错: {str(e)}"
|
110
|
-
|
111
|
-
def _record_search_error(message):
    """Store a failed-search payload in ``page_content`` and return 0."""
    global page_content
    page_content = {"count": 0, "links": [], "error": message}
    return 0


async def _launch_system_chrome(playwright, chrome_path):
    """Launch a visible Chrome, trying the ``chrome`` channel before an explicit path."""
    try:
        logger.debug("[DEBUG] 尝试使用channel='chrome'启动浏览器")
        return await playwright.chromium.launch(headless=False, channel="chrome")
    except Exception as e:
        logger.debug(f"[DEBUG] channel='chrome'方式失败: {str(e)}")
        logger.debug("[DEBUG] 尝试使用executable_path启动浏览器")
        return await playwright.chromium.launch(
            headless=False,
            executable_path=chrome_path,
        )


async def _tick_cssci_filter(page):
    """Best-effort attempt to tick the CSSCI source filter; failures are only logged."""
    try:
        # Give the result list time to settle before looking for the filter.
        await asyncio.sleep(2)
        logger.debug("[DEBUG] 尝试勾选CSSCI选项")

        source_category = await page.query_selector('div.group-item:has-text("来源类别")')
        if source_category:
            logger.debug("[DEBUG] 找到来源类别区域")
            cssci_checkbox = await source_category.query_selector(
                'input[type="checkbox"]:near(:text("CSSCI"))'
            )
            if cssci_checkbox:
                await cssci_checkbox.click()
                logger.debug("[DEBUG] 成功勾选CSSCI选项")
                await page.wait_for_load_state("networkidle")
                return
            logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
            clicked_message = "[DEBUG] 通过文本找到并点击了CSSCI"
        else:
            logger.debug("[DEBUG] 未找到来源类别区域")
            clicked_message = "[DEBUG] 直接找到并点击了CSSCI"

        # Fallback: click the first element anywhere on the page whose text is CSSCI.
        cssci_text = await page.query_selector(':text("CSSCI")')
        if cssci_text:
            await cssci_text.click()
            logger.debug(clicked_message)
            await page.wait_for_load_state("networkidle")
    except Exception as e:
        logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")


async def _run_cnki_search(page, keywords):
    """Submit ``keywords`` on the loaded search page, switch to 50 per page, collect links."""
    search_input = await page.query_selector('input.search-input')
    if not search_input:
        return _record_search_error(f"未找到搜索框,无法搜索: {keywords}")

    # Clear the box first, then type the keywords.
    await search_input.fill("")
    await search_input.fill(keywords)
    logger.debug(f"[DEBUG] 已在搜索框中输入: {keywords}")

    # Short pause so a person watching the browser can see the input.
    await asyncio.sleep(1)

    search_button = await page.query_selector('.search-btn')
    if not search_button:
        return _record_search_error(f"已填写搜索关键词: {keywords},但未找到搜索按钮")

    await search_button.click()
    logger.debug("[DEBUG] 已点击搜索按钮")
    await page.wait_for_load_state("networkidle")

    try:
        await asyncio.sleep(2)

        logger.debug("[DEBUG] 尝试点击排序下拉框")
        sort_dropdown = await page.query_selector('div[class="sort"][id="perPageDiv"]')
        if not sort_dropdown:
            logger.debug("[DEBUG] 未找到排序下拉框")
            return _record_search_error("已搜索,但未找到排序下拉框")

        await sort_dropdown.click()
        logger.debug("[DEBUG] 成功点击排序下拉框")

        # Wait for the drop-down menu to appear.
        await asyncio.sleep(1)

        logger.debug("[DEBUG] 尝试点击'50'选项")
        option_50 = await page.query_selector('li[data-val="50"]')
        if not option_50:
            logger.debug("[DEBUG] 未找到'50'选项")
            return _record_search_error("已搜索并点击下拉框,但未找到'50'选项")

        await option_50.click()
        logger.debug("[DEBUG] 成功点击'50'选项")
        await page.wait_for_load_state("networkidle")

        # Links are collected whether or not the CSSCI filter could be applied.
        await _tick_cssci_filter(page)
        return await find_and_count_abstract_links(page)
    except Exception as e:
        logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
        return _record_search_error(f"已搜索,但在点击下拉框或选项时出错: {str(e)}")


async def search_with_playwright(keywords):
    """Search CNKI for ``keywords`` in a visible Chrome driven by Playwright.

    The outcome is published through the module-level ``page_content`` dict:
    ``{"count", "links"}`` on success, with an additional ``"error"`` entry on
    failure. ``page_content`` is reset at the start of every call so that a
    failed search can never leave an earlier search's links behind.

    Args:
        keywords: Text typed into the CNKI search box.

    Returns:
        The number of abstract links found, or 0 on failure. When Playwright
        is not installed or Chrome cannot be located, the explanatory message
        string is returned instead, as before.
    """
    global page_content, browser_instance

    # Start from a clean slate so callers never read stale links.
    page_content = {"count": 0, "links": []}

    if not PLAYWRIGHT_AVAILABLE:
        message = "需要安装playwright模块:uv add playwright"
        _record_search_error(message)
        return message

    try:
        chrome_path = find_chrome_executable()
        if not chrome_path:
            message = "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
            _record_search_error(message)
            return message

        logger.debug(f"[DEBUG] 使用Playwright搜索,Chrome路径: {chrome_path}")

        # NOTE(review): every call starts a new Playwright driver and browser and
        # nothing here stops the previous ones; browser_instance only keeps the
        # latest browser. Looks like a process leak across repeated searches —
        # confirm whether the extractor relies on the browser staying open.
        playwright = await async_playwright().start()
        browser = await _launch_system_chrome(playwright, chrome_path)

        # Keep a module-level reference to the browser (it is deliberately not closed).
        browser_instance = browser

        page = await browser.new_page()
        await page.goto("https://kns.cnki.net/kns8s/search")
        logger.debug("[DEBUG] 成功打开知网搜索页面")
        await page.wait_for_load_state("networkidle")

        try:
            return await _run_cnki_search(page, keywords)
        except Exception as e:
            logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
            return _record_search_error(f"自动搜索过程中出错: {str(e)}")
    except Exception as e:
        error_msg = str(e)
        logger.debug(f"[DEBUG] Playwright错误: {error_msg}")

        # Give clearer guidance when Playwright's own browser build is missing.
        if "Executable doesn't exist" in error_msg and "ms-playwright" in error_msg:
            error_message = f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome,请重新启动服务器。\n\n{error_msg}"
        else:
            error_message = f"使用Playwright启动Chrome失败: {error_msg}"

        return _record_search_error(error_message)
|
347
|
-
|
348
|
-
def search_with_direct_chrome(keywords):
    """Open a CNKI search URL in Chrome directly, without Playwright.

    Links cannot be harvested this way, so the returned payload always has
    ``count`` 0 and an empty ``links`` list.

    Args:
        keywords: Search text; it is URL-quoted into the ``q`` query parameter.

    Returns:
        The dict that was also stored in the module-level ``page_content``.
        It contains an ``"error"`` entry when Chrome could not be opened and a
        ``"message"`` entry otherwise.
    """
    global page_content

    logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")

    try:
        url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
        logger.debug(f"[DEBUG] 打开URL: {url}")

        result = open_chrome(url)

        # open_chrome returns True on success and an error string otherwise.
        # Testing for a substring of one particular message would let the
        # "Chrome executable not found" error be reported as success.
        if result is not True:
            logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")
            page_content = {
                "count": 0,
                "links": [],
                "error": f"直接打开Chrome搜索: {result}",
            }
        else:
            logger.debug("[DEBUG] 直接打开Chrome成功")
            page_content = {
                "count": 0,
                "links": [],
                "message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。",
            }

        return page_content
    except Exception as e:
        logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")

        page_content = {
            "count": 0,
            "links": [],
            "error": f"使用Chrome搜索时出错: {str(e)}",
        }

        return page_content
|
389
|
-
|
390
|
-
def get_page_content():
    """Describe the page tracked in the module-level ``current_url``.

    This is a simplified stand-in: no page is actually fetched, and the text
    is derived from the URL alone. The original author noted that a real
    implementation could use Selenium or a similar tool here.

    Returns:
        A short status string for the current URL, or a notice that no page
        has been opened yet.
    """
    if not current_url:
        return "尚未打开任何页面"

    if "cnki" not in current_url:
        return f"已打开页面: {current_url}"

    return f"中国知网搜索页面\n当前URL: {current_url}\n可使用搜索工具查询文献。"
|
401
|
-
|
402
|
-
@server.list_resources()
async def handle_list_resources() -> list[types.Resource]:
    """List the resources this server exposes.

    Returns:
        The two fixed web-page resources (current page, CNKI search page)
        followed by one ``note://internal/<name>`` resource per stored note.
    """
    fixed_resources = [
        # Whatever page is currently open.
        types.Resource(
            uri=AnyUrl("webpage://current"),
            name="当前网页",
            description="当前打开的网页内容",
            mimeType="text/plain",
        ),
        # The CNKI search page.
        types.Resource(
            uri=AnyUrl("webpage://cnki/search"),
            name="知网搜索页",
            description="中国知网搜索页面",
            mimeType="text/plain",
        ),
    ]

    note_resources = [
        types.Resource(
            uri=AnyUrl(f"note://internal/{note_name}"),
            name=f"笔记: {note_name}",
            description=f"笔记: {note_name}",
            mimeType="text/plain",
        )
        for note_name in notes
    ]

    return fixed_resources + note_resources
|
439
|
-
|
440
|
-
@server.read_resource()
async def handle_read_resource(uri: AnyUrl) -> str:
    """Read the content of one resource.

    Supported URIs:
        * ``webpage://current`` - description of the currently tracked page.
        * ``webpage://cnki/search`` - opens the CNKI search page in Chrome.
        * ``note://internal/<name>`` - text of a stored note.

    Args:
        uri: The resource URI requested by the client.

    Returns:
        The resource text, or the error string produced by ``open_chrome``
        when the CNKI page could not be opened.

    Raises:
        ValueError: If the note does not exist, or the URI scheme/target is
            not supported.
    """
    global current_url

    scheme = uri.scheme

    if scheme == "webpage":
        path = uri.path or ""
        host = uri.host or ""

        if host == "current":
            return get_page_content()
        if host == "cnki" and path == "/search":
            target_url = "https://kns.cnki.net/kns8s/search"
            # open_chrome spawns a process and then sleeps synchronously; run it
            # in a worker thread so the server's event loop is not blocked.
            result = await asyncio.to_thread(open_chrome, target_url)
            if result is True:
                # Only record the page as current once it has actually opened.
                current_url = target_url
                return "已打开中国知网搜索页面,可使用搜索工具查询文献。"
            return result
    elif scheme == "note":
        name = uri.path
        if name is not None:
            name = name.lstrip("/")
            if name in notes:
                return notes[name]
        raise ValueError(f"笔记未找到: {name}")

    raise ValueError(f"不支持的URI方案或资源未找到: {uri}")
|
470
|
-
|
471
|
-
@server.list_prompts()
async def handle_list_prompts() -> list[types.Prompt]:
    """List the prompt templates this server offers.

    Returns:
        Three prompts: ``search-literature``, ``advanced-search`` and
        ``summarize-notes``, each with its argument descriptors.
    """

    def optional(name: str, description: str) -> types.PromptArgument:
        # Every argument except search-literature's keywords is optional.
        return types.PromptArgument(name=name, description=description, required=False)

    search_literature = types.Prompt(
        name="search-literature",
        description="按主题搜索文献",
        arguments=[
            types.PromptArgument(
                name="keywords",
                description="搜索关键词",
                required=True,
            )
        ],
    )

    advanced_search = types.Prompt(
        name="advanced-search",
        description="高级文献搜索",
        arguments=[
            optional("title", "论文标题"),
            optional("author", "作者"),
            optional("keywords", "关键词"),
            optional("institution", "机构"),
        ],
    )

    summarize_notes = types.Prompt(
        name="summarize-notes",
        description="总结所有笔记",
        arguments=[optional("style", "摘要风格 (brief/detailed)")],
    )

    return [search_literature, advanced_search, summarize_notes]
|
524
|
-
|
525
|
-
@server.get_prompt()
async def handle_get_prompt(
    name: str, arguments: dict[str, str] | None
) -> types.GetPromptResult:
    """Render one of the prompt templates into a single user message.

    Args:
        name: Prompt identifier (``search-literature``, ``advanced-search``
            or ``summarize-notes``).
        arguments: Values for the prompt's arguments; ``None`` is treated as
            an empty mapping.

    Returns:
        A ``GetPromptResult`` holding one user-role text message.

    Raises:
        ValueError: If ``name`` is not a known prompt.
    """
    args = arguments or {}

    def as_user_prompt(description: str, text: str) -> types.GetPromptResult:
        # All three prompts share the same one-message envelope.
        return types.GetPromptResult(
            description=description,
            messages=[
                types.PromptMessage(
                    role="user",
                    content=types.TextContent(type="text", text=text),
                )
            ],
        )

    if name == "search-literature":
        keywords = args.get("keywords", "")
        return as_user_prompt(
            "按主题搜索文献",
            f"请在中国知网搜索关于\"{keywords}\"的文献,并分析主要研究趋势。",
        )

    if name == "advanced-search":
        title = args.get("title", "")
        author = args.get("author", "")
        keywords = args.get("keywords", "")
        institution = args.get("institution", "")

        # Only the criteria that were actually supplied appear in the prompt.
        search_terms = []
        if title:
            search_terms.append(f"标题包含\"{title}\"")
        if author:
            search_terms.append(f"作者为\"{author}\"")
        if keywords:
            search_terms.append(f"关键词包含\"{keywords}\"")
        if institution:
            search_terms.append(f"机构为\"{institution}\"")

        search_criteria = "、".join(search_terms)
        return as_user_prompt(
            "高级文献搜索",
            f"请在中国知网搜索{search_criteria}的文献,并总结相关研究成果。",
        )

    if name == "summarize-notes":
        style = args.get("style", "brief")
        if style == "detailed":
            detail_prompt = "请提供详细分析。"
        else:
            detail_prompt = ""

        note_lines = "\n".join(
            f"- {note_name}: {content}"
            for note_name, content in notes.items()
        )
        return as_user_prompt(
            "总结所有笔记",
            f"以下是需要总结的笔记:{detail_prompt}\n\n" + note_lines,
        )

    raise ValueError(f"未知提示: {name}")
|
596
|
-
|
597
|
-
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """List the tools this server offers.

    Returns:
        A one-element list with the combined search-and-extract tool when both
        the extractor module and Playwright are available; otherwise an empty
        list.
    """
    if extractor is None or not PLAYWRIGHT_AVAILABLE:
        return []

    # The combined search-and-extract tool is the only tool exposed.
    search_and_extract = types.Tool(
        name="mcp_cnks_search_and_extract",
        description="搜索知网关键词并提取所有论文的详细内容",
        inputSchema={
            "type": "object",
            "properties": {
                "keywords": {"type": "string", "description": "搜索关键词"},
            },
            "required": ["keywords"],
        },
    )
    return [search_and_extract]
|
619
|
-
|
620
|
-
def _as_json_text(payload):
    """Wrap ``payload`` as a single JSON-encoded ``TextContent`` item."""
    # NOTE(review): the serialisation call after ``text=`` was missing from this
    # view of the file; json.dumps with ensure_ascii=False (the payload contains
    # Chinese text) is a reconstruction - confirm against the released source.
    return [
        types.TextContent(
            type="text",
            text=json.dumps(payload, ensure_ascii=False),
        )
    ]


@server.call_tool()
async def handle_call_tool(
    name: str, arguments: dict | None
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """Execute a tool request.

    Only ``mcp_cnks_search_and_extract`` is supported, and only while both the
    extractor module and Playwright are available. The tool searches CNKI for
    the given keywords and then extracts the content behind every link found.

    Args:
        name: Tool identifier.
        arguments: Tool arguments; must contain a non-empty ``keywords`` entry.

    Returns:
        A single text item holding a JSON document with ``count`` and
        ``results`` (plus ``error`` on failure, or ``keywords``,
        ``success_count`` and ``error_count`` on success).

    Raises:
        ValueError: If the tool is unknown or unavailable, or if the arguments
            or keywords are missing.
    """
    global current_url, page_content

    tool_available = extractor is not None and PLAYWRIGHT_AVAILABLE
    if name != "mcp_cnks_search_and_extract" or not tool_available:
        raise ValueError(f"未知工具: {name}")

    if not arguments:
        raise ValueError("缺少参数")

    keywords = arguments.get("keywords")
    if not keywords:
        raise ValueError("缺少关键词")

    try:
        # Step 1: run the search; its outcome is published via page_content.
        logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
        await search_with_playwright(keywords)
        current_url = "https://kns.cnki.net/kns8s/search"

        links = page_content.get("links") if isinstance(page_content, dict) else None
        if not links:
            return _as_json_text({
                "error": "搜索未返回有效链接",
                "count": 0,
                "results": [],
            })

        urls = [link["url"] for link in links if "url" in link]
        if not urls:
            return _as_json_text({
                "error": "未找到有效链接",
                "count": 0,
                "results": [],
            })

        # Step 2: extract the content behind every link.
        logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
        results = await extractor.batch_extract_contents(urls)

        result_json = {
            "keywords": keywords,
            "count": len(results),
            "results": results,
            "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
            "error_count": sum(1 for r in results if "error" in r and r["error"]),
        }
        return _as_json_text(result_json)
    except Exception as e:
        logger.error(f"搜索并提取时出错: {str(e)}")
        logger.error(traceback.format_exc())
        return _as_json_text({
            "error": f"搜索并提取内容时出错: {str(e)}",
            "keywords": keywords,
            "count": 0,
            "results": [],
        })
|
704
|
-
|
705
|
-
async def find_and_count_abstract_links(page):
    """Collect every link on ``page`` whose href contains ``article/abstract?v=``.

    The result is published through the module-level ``page_content`` as
    ``{"count": int, "links": [{"index": int, "url": str}, ...]}``. On failure
    ``page_content`` is overwritten with an empty, error-carrying payload so
    that callers never act on links left over from an earlier search.

    Args:
        page: Playwright page showing the search results.

    Returns:
        The number of matching links, or 0 if the lookup raised.
    """
    global page_content

    try:
        logger.debug("[DEBUG] 开始查找所有包含article/abstract?v=的链接")

        # Fixed pause to let the page finish rendering.
        await asyncio.sleep(2)

        all_links = await page.query_selector_all('a[href*="article/abstract?v="]')
        links_count = len(all_links)

        logger.debug(f"[DEBUG] 找到{links_count}条包含article/abstract?v=的链接")

        # Record the 1-based position and href of every link.
        links = []
        for index, link in enumerate(all_links, start=1):
            href = await link.get_attribute('href')
            links.append({"index": index, "url": href})
            logger.debug(f"[DEBUG] 链接 {index}: {href}")

        # The search flow asks for 50 results per page; log how the count compares.
        if links_count == 50:
            logger.debug("[DEBUG] 链接数量正好是50条,符合预期")
        elif links_count < 50:
            logger.debug(f"[DEBUG] 链接数量为{links_count}条,少于预期的50条")
        else:
            logger.debug(f"[DEBUG] 链接数量为{links_count}条,多于预期的50条")

        page_content = {"count": links_count, "links": links}
        return links_count
    except Exception as e:
        logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
        # Replace whatever an earlier search left behind with an empty result.
        page_content = {
            "count": 0,
            "links": [],
            "error": f"查找链接时出错: {str(e)}",
        }
        return 0
|
752
|
-
|
753
|
-
async def main():
    """Run the ``cnks`` MCP server over stdin/stdout."""
    async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
        await server.run(
            read_stream,
            write_stream,
            InitializationOptions(
                server_name="cnks",
                # NOTE(review): the literal was cut off after "0.2." in this view
                # of the file; "0.2.2" is inferred from the package version of
                # this side of the diff - confirm against the released source.
                server_version="0.2.2",
                capabilities=server.get_capabilities(
                    notification_options=NotificationOptions(),
                    experimental_capabilities={},
                ),
            ),
        )
|
769
|
-
|
770
|
-
# 为符合README.md的要求,添加从FastMCP导出的接口
|
771
|
-
def create_fastmcp_server():
    """Build a FastMCP server exposing the search-and-extract tool.

    The tool is registered only when both the extractor module and Playwright
    are available; otherwise the server is returned without any tools.

    Returns:
        The ``FastMCP`` instance, or ``None`` when ``FastMCP`` cannot be
        imported.
    """
    try:
        from mcp.server.fastmcp import FastMCP
        fast_mcp = FastMCP("知网搜索")

        if extractor is not None and PLAYWRIGHT_AVAILABLE:
            # NOTE(review): FastMCP presumably uses the function docstring as the
            # tool description, so the docstring below is kept verbatim - confirm.
            @fast_mcp.tool()
            async def mcp_cnks_search_and_extract(keywords: str) -> dict:
                """搜索关键词并提取所有论文的详细内容"""
                logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")

                def failure(message: str) -> dict:
                    # Shared shape of every unsuccessful response.
                    return {
                        "error": message,
                        "keywords": keywords,
                        "count": 0,
                        "results": []
                    }

                try:
                    # Step 1: run the search; its outcome lands in page_content.
                    await search_with_playwright(keywords)

                    found = page_content.get("links") if isinstance(page_content, dict) else None
                    if not found:
                        return failure("搜索未返回有效链接")

                    urls = [entry["url"] for entry in found if "url" in entry]
                    if not urls:
                        return failure("未找到有效链接")

                    # Step 2: extract the content behind every link.
                    results = await extractor.batch_extract_contents(urls)

                    succeeded = sum(1 for r in results if "error" not in r or not r["error"])
                    failed = sum(1 for r in results if "error" in r and r["error"])
                    return {
                        "keywords": keywords,
                        "count": len(results),
                        "results": results,
                        "success_count": succeeded,
                        "error_count": failed
                    }
                except Exception as e:
                    logger.error(f"搜索并提取时出错: {str(e)}")
                    return failure(f"搜索并提取内容时出错: {str(e)}")

        return fast_mcp
    except ImportError:
        logger.warning("警告: 无法导入FastMCP,请确保已安装最新版本的MCP")
        return None
|
830
|
-
|
831
|
-
if __name__ == "__main__":
|
1
|
+
import asyncio
|
2
|
+
import json
|
3
|
+
import os
|
4
|
+
import platform
|
5
|
+
import re
|
6
|
+
import subprocess
|
7
|
+
import sys
|
8
|
+
import time
|
9
|
+
import logging
|
10
|
+
import webbrowser
|
11
|
+
import traceback
|
12
|
+
from pathlib import Path
|
13
|
+
from urllib.parse import quote
|
14
|
+
from typing import Dict, List, Any, Optional, Union
|
15
|
+
|
16
|
+
from mcp.server.models import InitializationOptions
|
17
|
+
import mcp.types as types
|
18
|
+
from mcp.server import NotificationOptions, Server
|
19
|
+
from pydantic import AnyUrl
|
20
|
+
import mcp.server.stdio
|
21
|
+
|
22
|
+
# 配置日志记录
|
23
|
+
logging.basicConfig(
|
24
|
+
level=logging.DEBUG,
|
25
|
+
filename="cnks.log",
|
26
|
+
filemode="a",
|
27
|
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
28
|
+
)
|
29
|
+
logger = logging.getLogger("cnks")
|
30
|
+
|
31
|
+
# 尝试导入playwright
|
32
|
+
try:
|
33
|
+
from playwright.async_api import async_playwright
|
34
|
+
PLAYWRIGHT_AVAILABLE = True
|
35
|
+
except ImportError:
|
36
|
+
PLAYWRIGHT_AVAILABLE = False
|
37
|
+
logger.warning("Playwright未安装,将使用传统方式打开Chrome")
|
38
|
+
|
39
|
+
# 存储当前页面内容和笔记
|
40
|
+
page_content = ""
|
41
|
+
current_url = ""
|
42
|
+
notes: dict[str, str] = {}
|
43
|
+
browser_instance = None
|
44
|
+
|
45
|
+
server = Server("cnks")
|
46
|
+
|
47
|
+
# 导入我们新创建的extractor模块
|
48
|
+
try:
|
49
|
+
from . import chrome_extractor as extractor
|
50
|
+
except ImportError:
|
51
|
+
try:
|
52
|
+
import chrome_extractor as extractor
|
53
|
+
except ImportError:
|
54
|
+
extractor = None
|
55
|
+
logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
|
56
|
+
|
57
|
+
def find_chrome_executable():
    """Locate a Chrome or Chromium executable on the local machine.

    The well-known install locations for the current operating system are
    probed first. If none of them exists, the ``CHROME_PATH`` environment
    variable is used as a fallback. The fallback applies on every platform,
    including ones that have no built-in candidate list.

    Returns:
        The first path that exists on disk, or ``None`` when nothing is found.
    """
    candidates_by_system = {
        "Windows": [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
            os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
        ],
        "Darwin": [  # macOS
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
        ],
        "Linux": [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ],
    }

    # An unrecognised platform has no default candidates; it must still fall
    # through to CHROME_PATH, because the error messages shown to the user
    # tell them to set exactly that variable.
    for candidate in candidates_by_system.get(platform.system(), []):
        if os.path.exists(candidate):
            return candidate

    chrome_env = os.environ.get("CHROME_PATH")
    if chrome_env and os.path.exists(chrome_env):
        return chrome_env

    return None
|
93
|
+
|
94
|
+
def open_chrome(url):
    """Spawn Chrome as a separate process pointed at ``url``.

    Args:
        url: Address passed to Chrome as its single command-line argument.

    Returns:
        ``True`` once the process has been started and a two-second pause has
        elapsed; otherwise a human-readable error string (Chrome could not be
        located, or starting it raised).
    """
    try:
        executable = find_chrome_executable()
        if executable:
            subprocess.Popen([executable, url])
            time.sleep(2)  # fixed pause to give the page time to load
            return True
        return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
    except Exception as e:
        return f"打开Chrome时出错: {str(e)}"
|
110
|
+
|
111
|
+
def _search_failure(error):
    """Build the ``page_content`` payload that reports a failed search."""
    return {"count": 0, "links": [], "error": error}


async def _submit_search(page, keywords):
    """Type ``keywords`` into the search box and click search; return an error text or None."""
    search_input = await page.query_selector('input.search-input')
    if not search_input:
        return f"未找到搜索框,无法搜索: {keywords}"

    # Clear any previous value before typing the new keywords.
    await search_input.fill("")
    await search_input.fill(keywords)
    logger.debug(f"[DEBUG] 已在搜索框中输入: {keywords}")

    # Short pause so the typed text is visible in the headed browser.
    await asyncio.sleep(1)

    search_button = await page.query_selector('.search-btn')
    if not search_button:
        return f"已填写搜索关键词: {keywords},但未找到搜索按钮"

    await search_button.click()
    logger.debug("[DEBUG] 已点击搜索按钮")
    await page.wait_for_load_state("networkidle")
    return None


async def _show_50_per_page(page):
    """Switch the result list to 50 entries per page; return an error text or None."""
    # Let the result page settle before looking for its controls.
    await asyncio.sleep(2)

    logger.debug("[DEBUG] 尝试点击排序下拉框")
    sort_dropdown = await page.query_selector('div[class="sort"][id="perPageDiv"]')
    if not sort_dropdown:
        logger.debug("[DEBUG] 未找到排序下拉框")
        return "已搜索,但未找到排序下拉框"

    await sort_dropdown.click()
    logger.debug("[DEBUG] 成功点击排序下拉框")

    # Wait for the dropdown menu to appear.
    await asyncio.sleep(1)

    logger.debug("[DEBUG] 尝试点击'50'选项")
    option_50 = await page.query_selector('li[data-val="50"]')
    if not option_50:
        logger.debug("[DEBUG] 未找到'50'选项")
        return "已搜索并点击下拉框,但未找到'50'选项"

    await option_50.click()
    logger.debug("[DEBUG] 成功点击'50'选项")
    await page.wait_for_load_state("networkidle")
    return None


async def _apply_cssci_filter(page):
    """Click the CSSCI source-category filter when the page offers one."""
    await asyncio.sleep(2)
    logger.debug("[DEBUG] 尝试勾选CSSCI选项")

    target = None
    clicked_message = "[DEBUG] 成功勾选CSSCI选项"

    source_category = await page.query_selector('div.group-item:has-text("来源类别")')
    if source_category:
        logger.debug("[DEBUG] 找到来源类别区域")
        target = await source_category.query_selector(
            'input[type="checkbox"]:near(:text("CSSCI"))'
        )
        if not target:
            logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
            clicked_message = "[DEBUG] 通过文本找到并点击了CSSCI"
    else:
        logger.debug("[DEBUG] 未找到来源类别区域")
        clicked_message = "[DEBUG] 直接找到并点击了CSSCI"

    # Fallback: click the first element anywhere on the page showing "CSSCI".
    if not target:
        target = await page.query_selector(':text("CSSCI")')

    if target:
        await target.click()
        logger.debug(clicked_message)
        await page.wait_for_load_state("networkidle")


async def search_with_playwright(keywords):
    """Search CNKI for ``keywords`` in a visible Chrome window driven by Playwright.

    Steps: open the CNKI search page, submit the keywords, switch the result
    list to 50 entries per page, try to enable the CSSCI filter (best effort),
    then collect the abstract links with ``find_and_count_abstract_links``.
    The browser is stored in the ``browser_instance`` global and left open.

    Every failure path stores ``{"count": 0, "links": [], "error": ...}`` in
    the ``page_content`` global, so callers never see links left over from an
    earlier search.

    Args:
        keywords: Text to type into the CNKI search box.

    Returns:
        The number of abstract links found, ``0`` when the search failed, or
        an error string when Playwright or Chrome is not available.
    """
    global page_content, browser_instance

    if not PLAYWRIGHT_AVAILABLE:
        message = "需要安装playwright模块:uv add playwright"
        page_content = _search_failure(message)
        return message

    try:
        chrome_path = find_chrome_executable()
        if not chrome_path:
            message = "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
            page_content = _search_failure(message)
            return message

        logger.debug(f"[DEBUG] 使用Playwright搜索,Chrome路径: {chrome_path}")

        playwright = await async_playwright().start()
        try:
            try:
                # Prefer the system Chrome channel.
                logger.debug("[DEBUG] 尝试使用channel='chrome'启动浏览器")
                browser = await playwright.chromium.launch(
                    headless=False,
                    channel="chrome",
                )
            except Exception as e:
                logger.debug(f"[DEBUG] channel='chrome'方式失败: {str(e)}")
                logger.debug("[DEBUG] 尝试使用executable_path启动浏览器")
                # Otherwise start the executable that was located on disk.
                browser = await playwright.chromium.launch(
                    headless=False,
                    executable_path=chrome_path,
                )
        except Exception:
            # No browser will ever use this driver; stop it instead of
            # leaking it, then report the launch error below.
            await playwright.stop()
            raise

        # Keep a module-level reference so the browser is not closed.
        browser_instance = browser

        page = await browser.new_page()
        await page.goto("https://kns.cnki.net/kns8s/search")
        logger.debug("[DEBUG] 成功打开知网搜索页面")
        await page.wait_for_load_state("networkidle")

        try:
            error = await _submit_search(page, keywords)
            if error is None:
                try:
                    error = await _show_50_per_page(page)
                    if error is None:
                        try:
                            await _apply_cssci_filter(page)
                        except Exception as e:
                            # The filter is optional: fall through and
                            # collect links from the unfiltered list.
                            logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
                        return await find_and_count_abstract_links(page)
                except Exception as e:
                    logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
                    error = f"已搜索,但在点击下拉框或选项时出错: {str(e)}"
        except Exception as e:
            logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
            error = f"自动搜索过程中出错: {str(e)}"

        # The browser is intentionally left open on failure.
        page_content = _search_failure(error)
        return 0
    except Exception as e:
        error_msg = str(e)
        logger.debug(f"[DEBUG] Playwright错误: {error_msg}")

        # Give clearer guidance when Playwright's own browser build is missing.
        if "Executable doesn't exist" in error_msg and "ms-playwright" in error_msg:
            error_message = f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome,请重新启动服务器。\n\n{error_msg}"
        else:
            error_message = f"使用Playwright启动Chrome失败: {error_msg}"

        page_content = _search_failure(error_message)
        return 0
|
347
|
+
|
348
|
+
def search_with_direct_chrome(keywords):
    """Open a CNKI search for ``keywords`` in Chrome without Playwright.

    No links can be collected this way; the function only opens the browser.

    Args:
        keywords: Search text; it is URL-quoted into the query string.

    Returns:
        The dictionary also stored in the ``page_content`` global: ``count``
        and ``links`` are always empty, with either an ``error`` or a
        ``message`` entry describing the outcome.
    """
    global page_content

    logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")

    try:
        url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
        logger.debug(f"[DEBUG] 打开URL: {url}")

        result = open_chrome(url)

        # open_chrome returns True on success and an error string otherwise.
        # Matching on a substring of the text misses the "Chrome executable
        # not found" message, so any non-True result is treated as a failure.
        if result is not True:
            logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")

            page_content = {
                "count": 0,
                "links": [],
                "error": f"直接打开Chrome搜索: {result}"
            }
        else:
            logger.debug("[DEBUG] 直接打开Chrome成功")

            page_content = {
                "count": 0,
                "links": [],
                "message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
            }

        return page_content
    except Exception as e:
        logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")

        page_content = {
            "count": 0,
            "links": [],
            "error": f"使用Chrome搜索时出错: {str(e)}"
        }

        return page_content
|
389
|
+
|
390
|
+
def get_page_content():
    """Return a short description of the page recorded in ``current_url``.

    This is a simplified stand-in: nothing is read from the browser, the text
    is derived from the stored URL only.
    """
    global page_content, current_url

    if current_url:
        if "cnki" not in current_url:
            return f"已打开页面: {current_url}"
        return f"中国知网搜索页面\n当前URL: {current_url}\n可使用搜索工具查询文献。"

    return "尚未打开任何页面"
|
401
|
+
|
402
|
+
@server.list_resources()
async def handle_list_resources() -> list[types.Resource]:
    """List the readable resources: two fixed web pages plus one entry per note."""

    def _plain_text(uri: str, name: str, description: str) -> types.Resource:
        # Every resource exposed by this server is plain text.
        return types.Resource(
            uri=AnyUrl(uri),
            name=name,
            description=description,
            mimeType="text/plain",
        )

    fixed_pages = [
        _plain_text("webpage://current", "当前网页", "当前打开的网页内容"),
        _plain_text("webpage://cnki/search", "知网搜索页", "中国知网搜索页面"),
    ]
    note_entries = [
        _plain_text(f"note://internal/{name}", f"笔记: {name}", f"笔记: {name}")
        for name in notes
    ]
    return fixed_pages + note_entries
|
439
|
+
|
440
|
+
@server.read_resource()
async def handle_read_resource(uri: AnyUrl) -> str:
    """Return the content of one resource.

    Supported URIs:
        ``webpage://current``      description of the current page
        ``webpage://cnki/search``  opens the CNKI search page in Chrome
        ``note://.../<name>``      text of the stored note ``<name>``

    Raises:
        ValueError: The note does not exist, or the URI is not supported.
    """
    global current_url

    scheme = uri.scheme

    if scheme == "webpage":
        path = uri.path or ""
        host = uri.host or ""

        if host == "current":
            return get_page_content()

        if host == "cnki" and path == "/search":
            current_url = "https://kns.cnki.net/kns8s/search"
            # open_chrome spawns a process and then sleeps; run it in a
            # worker thread so the event loop keeps serving other requests.
            result = await asyncio.to_thread(open_chrome, current_url)
            if result is True:
                return "已打开中国知网搜索页面,可使用搜索工具查询文献。"
            return result
    elif scheme == "note":
        name = uri.path
        if name is not None:
            name = name.lstrip("/")
            if name in notes:
                return notes[name]
        raise ValueError(f"笔记未找到: {name}")

    raise ValueError(f"不支持的URI方案或资源未找到: {uri}")
|
470
|
+
|
471
|
+
@server.list_prompts()
async def handle_list_prompts() -> list[types.Prompt]:
    """List the three prompt templates this server offers."""

    def _optional(name: str, description: str) -> types.PromptArgument:
        return types.PromptArgument(name=name, description=description, required=False)

    search_literature = types.Prompt(
        name="search-literature",
        description="按主题搜索文献",
        arguments=[
            types.PromptArgument(
                name="keywords",
                description="搜索关键词",
                required=True,
            )
        ],
    )

    advanced_fields = (
        ("title", "论文标题"),
        ("author", "作者"),
        ("keywords", "关键词"),
        ("institution", "机构"),
    )
    advanced_search = types.Prompt(
        name="advanced-search",
        description="高级文献搜索",
        arguments=[_optional(field, label) for field, label in advanced_fields],
    )

    summarize_notes = types.Prompt(
        name="summarize-notes",
        description="总结所有笔记",
        arguments=[_optional("style", "摘要风格 (brief/detailed)")],
    )

    return [search_literature, advanced_search, summarize_notes]
|
524
|
+
|
525
|
+
@server.get_prompt()
async def handle_get_prompt(
    name: str, arguments: dict[str, str] | None
) -> types.GetPromptResult:
    """Render the prompt called ``name`` with the supplied ``arguments``.

    Raises:
        ValueError: ``name`` is not one of the known prompts.
    """
    args = arguments or {}

    def _single_user_message(description: str, text: str) -> types.GetPromptResult:
        # Every prompt here consists of exactly one user text message.
        return types.GetPromptResult(
            description=description,
            messages=[
                types.PromptMessage(
                    role="user",
                    content=types.TextContent(type="text", text=text),
                )
            ],
        )

    if name == "search-literature":
        keywords = args.get("keywords", "")
        return _single_user_message(
            "按主题搜索文献",
            f"请在中国知网搜索关于\"{keywords}\"的文献,并分析主要研究趋势。",
        )

    if name == "advanced-search":
        # Field name -> phrase template, in the order the criteria are listed.
        templates = (
            ("title", '标题包含"{}"'),
            ("author", '作者为"{}"'),
            ("keywords", '关键词包含"{}"'),
            ("institution", '机构为"{}"'),
        )
        search_terms = [
            template.format(args.get(field, ""))
            for field, template in templates
            if args.get(field, "")
        ]
        search_criteria = "、".join(search_terms)
        return _single_user_message(
            "高级文献搜索",
            f"请在中国知网搜索{search_criteria}的文献,并总结相关研究成果。",
        )

    if name == "summarize-notes":
        style = args.get("style", "brief")
        detail_prompt = ""
        if style == "detailed":
            detail_prompt = "请提供详细分析。"

        note_lines = "\n".join(
            f"- {note_name}: {content}" for note_name, content in notes.items()
        )
        return _single_user_message(
            "总结所有笔记",
            f"以下是需要总结的笔记:{detail_prompt}\n\n" + note_lines,
        )

    raise ValueError(f"未知提示: {name}")
|
596
|
+
|
597
|
+
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """List the available tools: the combined search-and-extract tool, or none."""
    # The tool needs both the extractor module and Playwright.
    if extractor is None or not PLAYWRIGHT_AVAILABLE:
        return []

    search_and_extract = types.Tool(
        name="mcp_cnks_search_and_extract",
        description="搜索知网关键词并提取所有论文的详细内容",
        inputSchema={
            "type": "object",
            "properties": {
                "keywords": {"type": "string", "description": "搜索关键词"},
            },
            "required": ["keywords"],
        },
    )
    return [search_and_extract]
|
619
|
+
|
620
|
+
@server.call_tool()
async def handle_call_tool(
    name: str, arguments: dict | None
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """Run the ``mcp_cnks_search_and_extract`` tool.

    The tool searches CNKI for ``arguments["keywords"]`` and then extracts
    the content behind every result link.

    Returns:
        One ``TextContent`` item whose text is a JSON document. On success it
        has ``keywords``, ``count``, ``results``, ``success_count`` and
        ``error_count``; on failure it has ``error``, ``count`` and ``results``.

    Raises:
        ValueError: Unknown or unavailable tool, missing arguments, or
            missing keywords.
    """
    global current_url, page_content

    def _as_text(payload: dict) -> list[types.TextContent]:
        # TextContent.text must be a string, so the payload is serialised as
        # JSON. ensure_ascii=False keeps the Chinese text readable.
        return [
            types.TextContent(
                type="text",
                text=json.dumps(payload, ensure_ascii=False),
            )
        ]

    if name != "mcp_cnks_search_and_extract" or extractor is None or not PLAYWRIGHT_AVAILABLE:
        raise ValueError(f"未知工具: {name}")

    if not arguments:
        raise ValueError("缺少参数")

    keywords = arguments.get("keywords")
    if not keywords:
        raise ValueError("缺少关键词")

    try:
        # Step 1: run the search. Its result arrives through the page_content
        # global, so clear it first: a search that fails without updating it
        # must not hand back the previous search's links.
        logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
        page_content = ""
        await search_with_playwright(keywords)
        current_url = "https://kns.cnki.net/kns8s/search"

        if not isinstance(page_content, dict) or not page_content.get("links"):
            return _as_text({
                "error": "搜索未返回有效链接",
                "count": 0,
                "results": []
            })

        urls = [link["url"] for link in page_content["links"] if "url" in link]
        if not urls:
            return _as_text({
                "error": "未找到有效链接",
                "count": 0,
                "results": []
            })

        # Step 2: extract the content behind every link.
        logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
        results = await extractor.batch_extract_contents(urls)

        return _as_text({
            "keywords": keywords,
            "count": len(results),
            "results": results,
            "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
            "error_count": sum(1 for r in results if "error" in r and r["error"])
        })
    except Exception as e:
        logger.error(f"搜索并提取时出错: {str(e)}")
        logger.error(traceback.format_exc())
        return _as_text({
            "error": f"搜索并提取内容时出错: {str(e)}",
            "keywords": keywords,
            "count": 0,
            "results": []
        })
|
704
|
+
|
705
|
+
async def find_and_count_abstract_links(page, settle_seconds=2):
    """Collect every link on ``page`` whose href contains ``article/abstract?v=``.

    The outcome is stored in the ``page_content`` global as
    ``{"count": n, "links": [{"index": i, "url": href}, ...]}`` with 1-based
    indexes. On failure ``page_content`` becomes an empty result with an
    ``error`` entry, so links from an earlier search are not left behind.

    Args:
        page: Object providing an awaitable ``query_selector_all``.
        settle_seconds: Pause before querying, to let the page finish rendering.

    Returns:
        The number of links found, or ``0`` if an error occurred.
    """
    global page_content

    try:
        logger.debug("[DEBUG] 开始查找所有包含article/abstract?v=的链接")

        await asyncio.sleep(settle_seconds)

        anchors = await page.query_selector_all('a[href*="article/abstract?v="]')
        links_count = len(anchors)

        logger.debug(f"[DEBUG] 找到{links_count}条包含article/abstract?v=的链接")

        links = []
        for index, anchor in enumerate(anchors, start=1):
            href = await anchor.get_attribute('href')
            links.append({"index": index, "url": href})
            logger.debug(f"[DEBUG] 链接 {index}: {href}")

        # 50 is the page size requested by the search flow; log how the
        # actual count compares with it.
        if links_count == 50:
            logger.debug("[DEBUG] 链接数量正好是50条,符合预期")
        elif links_count < 50:
            logger.debug(f"[DEBUG] 链接数量为{links_count}条,少于预期的50条")
        else:
            logger.debug(f"[DEBUG] 链接数量为{links_count}条,多于预期的50条")

        page_content = {
            "count": links_count,
            "links": links
        }

        return links_count
    except Exception as e:
        logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
        page_content = {
            "count": 0,
            "links": [],
            "error": f"查找链接时出错: {str(e)}"
        }
        return 0
|
752
|
+
|
753
|
+
async def main():
    """Run the MCP server over the process's stdin/stdout streams."""
    async with mcp.server.stdio.stdio_server() as streams:
        read_stream, write_stream = streams

        capabilities = server.get_capabilities(
            notification_options=NotificationOptions(),
            experimental_capabilities={},
        )
        init_options = InitializationOptions(
            server_name="cnks",
            server_version="0.2.3",
            capabilities=capabilities,
        )

        await server.run(read_stream, write_stream, init_options)
|
769
|
+
|
770
|
+
# 为符合README.md的要求,添加从FastMCP导出的接口
|
771
|
+
def create_fastmcp_server():
    """Build a FastMCP server exposing the search-and-extract tool.

    The tool is registered only when both the extractor module and Playwright
    are available.

    Returns:
        The ``FastMCP`` instance, or ``None`` when FastMCP cannot be imported.
    """
    try:
        from mcp.server.fastmcp import FastMCP
        fast_mcp = FastMCP("知网搜索")

        tool_enabled = extractor is not None and PLAYWRIGHT_AVAILABLE
        if tool_enabled:
            @fast_mcp.tool()
            async def mcp_cnks_search_and_extract(keywords: str) -> dict:
                """搜索关键词并提取所有论文的详细内容"""
                logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")

                def _empty(error: str) -> dict:
                    # Shape shared by every failure response of this tool.
                    return {
                        "error": error,
                        "keywords": keywords,
                        "count": 0,
                        "results": []
                    }

                try:
                    # Step 1: search. The links arrive through the
                    # page_content global, not the return value.
                    await search_with_playwright(keywords)

                    has_links = (
                        isinstance(page_content, dict)
                        and "links" in page_content
                        and page_content["links"]
                    )
                    if not has_links:
                        return _empty("搜索未返回有效链接")

                    urls = [entry["url"] for entry in page_content["links"] if "url" in entry]
                    if not urls:
                        return _empty("未找到有效链接")

                    # Step 2: extract the content behind every link.
                    results = await extractor.batch_extract_contents(urls)

                    return {
                        "keywords": keywords,
                        "count": len(results),
                        "results": results,
                        "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
                        "error_count": sum(1 for r in results if "error" in r and r["error"])
                    }
                except Exception as e:
                    logger.error(f"搜索并提取时出错: {str(e)}")
                    return _empty(f"搜索并提取内容时出错: {str(e)}")

        return fast_mcp
    except ImportError:
        logger.warning("警告: 无法导入FastMCP,请确保已安装最新版本的MCP")
        return None
|
830
|
+
|
831
|
+
if __name__ == "__main__":
|
832
832
|
asyncio.run(main())
|