cnks 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cnks/server.py CHANGED
@@ -12,11 +12,12 @@ import traceback
12
12
  from pathlib import Path
13
13
  from urllib.parse import quote
14
14
  from typing import Dict, List, Any, Optional, Union
15
+ from datetime import datetime
16
+ from pydantic import BaseModel, AnyUrl
15
17
 
16
18
  from mcp.server.models import InitializationOptions
17
19
  import mcp.types as types
18
20
  from mcp.server import NotificationOptions, Server
19
- from pydantic import AnyUrl
20
21
  import mcp.server.stdio
21
22
 
22
23
  # 配置日志记录
@@ -36,27 +37,38 @@ except ImportError:
36
37
  PLAYWRIGHT_AVAILABLE = False
37
38
  logger.warning("Playwright未安装,将使用传统方式打开Chrome")
38
39
 
40
+ # 定义数据模型
41
class CNKIContent(BaseModel):
    """Data model holding the content of one CNKI paper.

    Every field defaults to an empty value (empty string or empty list).
    """

    # Text fields.
    title: str = ""
    abstract: str = ""
    cite_format: str = ""
    # URL field added to record where the content came from.
    url: str = ""

    # List-valued fields.
    authors: list[str] = []
    keywords: list[str] = []
49
+
39
50
  # 存储当前页面内容和笔记
40
51
  page_content = ""
41
52
  current_url = ""
42
53
  notes: dict[str, str] = {}
43
- browser_instance = None
44
54
 
45
55
  server = Server("cnks")
46
56
 
47
- # 导入我们新创建的extractor模块
48
- try:
49
- from . import chrome_extractor as extractor
50
- except ImportError:
51
- try:
52
- import chrome_extractor as extractor
53
- except ImportError:
54
- extractor = None
55
- logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
57
+ # 添加全局变量来跟踪playwright状态
58
+ playwright_instance = None
59
+ browser_instance = None
60
+ context = None
56
61
 
57
62
def find_chrome_executable():
    """Locate a Chrome (or other Chromium-based) browser executable.

    Lookup order:

    1. The ``CHROME_PATH`` environment variable, if it names an existing path.
    2. A list of well-known install locations for the current platform
       (Google Chrome, Chromium and Microsoft Edge).
    3. A search of ``PATH`` for the usual executable names.

    Returns:
        The path of the first browser found, or ``None`` when the platform is
        not Windows/macOS/Linux or no candidate exists.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import shutil

    # An explicit override from the environment always wins.
    chrome_env = os.environ.get("CHROME_PATH")
    if chrome_env and os.path.exists(chrome_env):
        logger.debug(f"[DEBUG] 从环境变量找到Chrome: {chrome_env}")
        return chrome_env

    system = platform.system()
    logger.debug(f"[DEBUG] 系统类型: {system}")

    # Well-known install locations plus the executable names to look up on
    # PATH, per platform.
    if system == "Windows":
        chrome_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
            os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
            # Edge is Chromium-based as well.
            r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
            r"C:\Program Files\Microsoft\Edge\Application\msedge.exe",
        ]
        path_names = ["chrome", "msedge"]
    elif system == "Darwin":  # macOS
        chrome_paths = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
            "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
        ]
        path_names = ["google-chrome", "chromium", "chromium-browser", "microsoft-edge"]
    elif system == "Linux":
        chrome_paths = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
            "/usr/bin/microsoft-edge",
        ]
        path_names = ["google-chrome", "chromium", "chromium-browser", "microsoft-edge"]
    else:
        logger.debug(f"[DEBUG] 未知系统类型: {system}")
        return None

    # Check the fixed install locations first.
    for path in chrome_paths:
        if os.path.exists(path):
            logger.debug(f"[DEBUG] 找到Chrome: {path}")
            return path

    # Fall back to a PATH search. shutil.which is portable, spawns no child
    # process, and returns None for a missing name instead of raising, so one
    # missing candidate cannot abort the search for the remaining ones.
    for browser in path_names:
        result = shutil.which(browser)
        if result:
            logger.debug(f"[DEBUG] 使用which命令找到浏览器: {result}")
            return result

    logger.debug("[DEBUG] 未找到Chrome或兼容的浏览器")
    return None
93
146
 
94
147
def open_chrome(url):
    """Open ``url`` in the system default browser.

    Despite its name, this function does not launch a Chrome executable
    itself; it hands the URL to ``webbrowser.open``, which uses whatever
    browser the system has registered as default.

    Args:
        url: The address to open.

    Returns:
        ``True`` when the browser reported that it was launched; otherwise a
        ``str`` describing the failure. Callers in this file distinguish the
        two cases with ``isinstance(result, str)``.
    """
    try:
        logger.debug(f"[DEBUG] open_chrome函数被调用,URL: {url}")

        logger.debug(f"[DEBUG] 尝试使用webbrowser.open打开URL: {url}")
        opened = webbrowser.open(url)
        logger.debug("[DEBUG] webbrowser.open调用完成")

        # webbrowser.open signals failure through its return value rather than
        # by raising, so the result must be checked explicitly.
        if not opened:
            return "打开Chrome时出错: 未能启动浏览器"

        # Give the page a moment to load before the caller continues.
        time.sleep(2)
        logger.debug("[DEBUG] open_chrome函数执行完毕")
        return True
    except Exception as e:
        logger.debug(f"[DEBUG] open_chrome函数出错: {str(e)}")
        return f"打开Chrome时出错: {str(e)}"
110
164
 
111
165
  async def search_with_playwright(keywords):
@@ -116,234 +170,337 @@ async def search_with_playwright(keywords):
116
170
  return "需要安装playwright模块:uv add playwright"
117
171
 
118
172
  try:
119
- chrome_path = find_chrome_executable()
120
- if not chrome_path:
121
- return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
173
+ logger.debug(f"[DEBUG] 使用搜索功能,关键词: {keywords}")
122
174
 
123
- logger.debug(f"[DEBUG] 使用Playwright搜索,Chrome路径: {chrome_path}")
175
+ # 先访问知网首页而不是直接访问搜索结果页
176
+ initial_url = "https://kns.cnki.net/"
177
+ search_url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
178
+ logger.debug(f"[DEBUG] 初始URL: {initial_url}")
124
179
 
125
- # 创建全局浏览器实例,避免执行完关闭
126
- global browser_instance
180
+ # 创建全局变量来跟踪playwright状态
181
+ global playwright_instance, browser_instance, context
127
182
 
128
- # 只打开一个playwright实例
129
- playwright = await async_playwright().start()
183
+ # 查找Chrome路径
184
+ chrome_path = find_chrome_executable()
185
+ if not chrome_path:
186
+ logger.warning("[WARNING] 未找到Chrome可执行文件,将使用默认浏览器")
187
+ # 使用webbrowser模块打开
188
+ webbrowser.open(search_url)
189
+ # 构造一个基本结果
190
+ page_content = {
191
+ "count": 1,
192
+ "links": [{
193
+ "index": 1,
194
+ "url": search_url,
195
+ "title": f"搜索: {keywords}"
196
+ }]
197
+ }
198
+ return 1
199
+
200
+ logger.debug(f"[DEBUG] 找到Chrome路径: {chrome_path}")
130
201
 
131
- # 尝试使用系统Chrome
132
- try:
133
- logger.debug("[DEBUG] 尝试使用channel='chrome'启动浏览器")
134
- browser = await playwright.chromium.launch(
135
- headless=False,
136
- channel="chrome"
137
- )
138
- except Exception as e:
139
- logger.debug(f"[DEBUG] channel='chrome'方式失败: {str(e)}")
140
- logger.debug("[DEBUG] 尝试使用executable_path启动浏览器")
141
- # 如果失败,尝试使用executable_path指定Chrome路径
142
- browser = await playwright.chromium.launch(
143
- headless=False,
144
- executable_path=chrome_path
202
+ # 检查playwright是否已经运行
203
+ if 'playwright_instance' not in globals() or playwright_instance is None:
204
+ logger.debug("[DEBUG] 初始化新的playwright实例")
205
+ # 第一次运行,初始化playwright
206
+ playwright_instance = await async_playwright().start()
207
+
208
+ # 设置启动选项
209
+ browser_args = []
210
+
211
+ # 使用系统已安装的Chrome
212
+ if chrome_path:
213
+ browser_args.extend([
214
+ '--no-sandbox', # 在某些环境中可能需要
215
+ '--start-maximized' # 最大化窗口
216
+ ])
217
+
218
+ # 启动浏览器 - 尝试使用系统Chrome
219
+ try:
220
+ # 首先尝试使用chrome_path启动
221
+ logger.debug(f"[DEBUG] 尝试使用系统Chrome启动: {chrome_path}")
222
+ browser_instance = await playwright_instance.chromium.launch(
223
+ headless=False, # 显示浏览器界面
224
+ executable_path=chrome_path,
225
+ args=browser_args
226
+ )
227
+ except Exception as e:
228
+ logger.warning(f"[WARNING] 使用系统Chrome启动失败: {str(e)},尝试使用默认浏览器")
229
+ # 如果失败,使用默认浏览器
230
+ browser_instance = await playwright_instance.chromium.launch(
231
+ headless=False # 显示浏览器界面
232
+ )
233
+
234
+ # 创建上下文
235
+ context = await browser_instance.new_context(
236
+ viewport=None # 不限制视窗大小,使用浏览器默认设置
145
237
  )
238
+
239
+ # 创建新页面
240
+ page = await context.new_page()
241
+
242
+ # 访问初始URL(知网首页)
243
+ logger.debug(f"[DEBUG] 导航到知网首页: {initial_url}")
244
+ await page.goto(initial_url)
245
+ logger.debug("[DEBUG] 已打开新的浏览器窗口并访问知网首页")
246
+ else:
247
+ logger.debug("[DEBUG] 在现有playwright实例中打开新标签页")
248
+ # playwright已经在运行,创建新标签页
249
+ page = await context.new_page()
250
+ # 访问初始URL(知网首页)
251
+ await page.goto(initial_url)
252
+ logger.debug("[DEBUG] 已在现有浏览器中打开新标签页并访问知网首页")
146
253
 
147
- # 保存浏览器实例以防止被关闭
148
- browser_instance = browser
149
-
150
- page = await browser.new_page()
151
-
152
- # 导航到知网搜索页面
153
- await page.goto("https://kns.cnki.net/kns8s/search")
154
- logger.debug("[DEBUG] 成功打开知网搜索页面")
254
+ # 等待页面加载完成
255
+ await page.wait_for_load_state('networkidle')
256
+ await asyncio.sleep(1)
155
257
 
156
- # 等待页面加载
157
- await page.wait_for_load_state("networkidle")
258
+ # 检查是否需要验证
259
+ await check_and_wait_for_verification(page)
158
260
 
159
- # 查找并填写搜索框
261
+ # 尝试执行搜索操作
160
262
  try:
161
- # 尝试定位搜索框
162
- search_input = await page.query_selector('input.search-input')
263
+ # 方法1: 尝试在首页搜索框输入关键词
264
+ logger.debug("[DEBUG] 尝试在首页查找搜索框")
265
+
266
+ # 查找搜索框
267
+ search_input_selectors = [
268
+ '#txt_search',
269
+ 'input[type="text"]',
270
+ '.search-input',
271
+ '.input-box input',
272
+ 'input.search-textbox',
273
+ 'input[placeholder*="搜索"]'
274
+ ]
275
+
276
+ search_input = None
277
+ for selector in search_input_selectors:
278
+ try:
279
+ logger.debug(f"[DEBUG] 尝试查找搜索框选择器: {selector}")
280
+ search_input = await page.query_selector(selector)
281
+ if search_input:
282
+ logger.debug(f"[DEBUG] 找到搜索框: {selector}")
283
+ break
284
+ except Exception as e:
285
+ logger.debug(f"[DEBUG] 查找选择器 {selector} 时出错: {str(e)}")
286
+
163
287
  if search_input:
164
288
  # 清空搜索框
165
289
  await search_input.fill("")
166
290
  # 输入关键词
167
- await search_input.fill(keywords)
168
- logger.debug(f"[DEBUG] 已在搜索框中输入: {keywords}")
291
+ await search_input.type(keywords, delay=100) # 添加延迟模拟真实输入
292
+ logger.debug(f"[DEBUG] 已在搜索框中输入关键词: {keywords}")
169
293
 
170
- # 增加短暂等待以确保用户可以看到输入过程
171
- await asyncio.sleep(1)
294
+ # 查找搜索按钮
295
+ search_button_selectors = [
296
+ 'button.search-btn',
297
+ 'button.search',
298
+ 'button[type="submit"]',
299
+ 'input[type="submit"]',
300
+ '.search-action',
301
+ 'a.search-btn'
302
+ ]
303
+
304
+ search_button = None
305
+ for selector in search_button_selectors:
306
+ try:
307
+ logger.debug(f"[DEBUG] 尝试查找搜索按钮选择器: {selector}")
308
+ search_button = await page.query_selector(selector)
309
+ if search_button:
310
+ logger.debug(f"[DEBUG] 找到搜索按钮: {selector}")
311
+ break
312
+ except Exception as e:
313
+ logger.debug(f"[DEBUG] 查找选择器 {selector} 时出错: {str(e)}")
172
314
 
173
- # 查找并点击搜索按钮
174
- search_button = await page.query_selector('.search-btn')
175
315
  if search_button:
316
+ # 点击搜索按钮
317
+ logger.debug("[DEBUG] 点击搜索按钮")
176
318
  await search_button.click()
177
- logger.debug("[DEBUG] 已点击搜索按钮")
178
- # 等待搜索结果加载
179
- await page.wait_for_load_state("networkidle")
180
319
 
181
- # 点击操作1:点击下拉框的三角形
182
- try:
183
- # 等待一下,确保页面元素都加载完成
184
- await asyncio.sleep(2)
185
-
186
- # 尝试点击排序下拉框
187
- logger.debug("[DEBUG] 尝试点击排序下拉框")
188
- # 根据提供的HTML,尝试定位下拉框的三角形
189
- sort_dropdown = await page.query_selector('div[class="sort"][id="perPageDiv"]')
190
- if sort_dropdown:
191
- await sort_dropdown.click()
192
- logger.debug("[DEBUG] 成功点击排序下拉框")
193
-
194
- # 等待下拉菜单出现
195
- await asyncio.sleep(1)
196
-
197
- # 点击操作2:点击数字50选项
198
- logger.debug("[DEBUG] 尝试点击'50'选项")
199
- # 尝试定位"50"选项
200
- option_50 = await page.query_selector('li[data-val="50"]')
201
- if option_50:
202
- await option_50.click()
203
- logger.debug("[DEBUG] 成功点击'50'选项")
204
- await page.wait_for_load_state("networkidle")
205
-
206
- # 勾选来源类别中的CSSCI选项
207
- try:
208
- # 等待一下确保页面完全加载
209
- await asyncio.sleep(2)
210
-
211
- logger.debug("[DEBUG] 尝试勾选CSSCI选项")
212
-
213
- # 首先尝试找到来源类别区域
214
- # 通常来源类别会有一个标题或者分组
215
- source_category = await page.query_selector('div.group-item:has-text("来源类别")')
216
-
217
- if source_category:
218
- logger.debug("[DEBUG] 找到来源类别区域")
219
-
220
- # 在来源类别区域内查找CSSCI选项
221
- cssci_checkbox = await source_category.query_selector('input[type="checkbox"]:near(:text("CSSCI"))')
222
-
223
- if cssci_checkbox:
224
- # 点击CSSCI复选框
225
- await cssci_checkbox.click()
226
- logger.debug("[DEBUG] 成功勾选CSSCI选项")
227
-
228
- # 等待页面刷新
229
- await page.wait_for_load_state("networkidle")
230
-
231
- # 查找所有包含"article/abstract?v="字样的链接
232
- links_count = await find_and_count_abstract_links(page)
233
-
234
- return links_count
235
- else:
236
- logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
237
-
238
- # 尝试另一种方式:直接在整个页面中查找CSSCI
239
- cssci_text = await page.query_selector(':text("CSSCI")')
240
- if cssci_text:
241
- # 尝试点击文本附近的复选框
242
- await cssci_text.click()
243
- logger.debug("[DEBUG] 通过文本找到并点击了CSSCI")
244
- await page.wait_for_load_state("networkidle")
245
-
246
- # 查找所有包含"article/abstract?v="字样的链接
247
- links_count = await find_and_count_abstract_links(page)
248
-
249
- return links_count
250
- else:
251
- # 查找所有包含"article/abstract?v="字样的链接
252
- links_count = await find_and_count_abstract_links(page)
253
- return links_count
254
- else:
255
- logger.debug("[DEBUG] 未找到来源类别区域")
256
-
257
- # 尝试直接在页面中查找CSSCI文本
258
- cssci_text = await page.query_selector(':text("CSSCI")')
259
- if cssci_text:
260
- # 尝试点击文本附近的复选框
261
- await cssci_text.click()
262
- logger.debug("[DEBUG] 直接找到并点击了CSSCI")
263
- await page.wait_for_load_state("networkidle")
264
-
265
- # 查找所有包含"article/abstract?v="字样的链接
266
- links_count = await find_and_count_abstract_links(page)
267
-
268
- return links_count
269
- else:
270
- # 查找所有包含"article/abstract?v="字样的链接
271
- links_count = await find_and_count_abstract_links(page)
272
- return links_count
273
- except Exception as e:
274
- logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
275
- # 查找所有包含"article/abstract?v="字样的链接
276
- links_count = await find_and_count_abstract_links(page)
277
- return links_count
278
-
279
- # 查找所有包含"article/abstract?v="字样的链接
280
- links_count = await find_and_count_abstract_links(page)
281
- return links_count
282
- else:
283
- logger.debug("[DEBUG] 未找到'50'选项")
284
- page_content = {
285
- "count": 0,
286
- "links": [],
287
- "error": "已搜索并点击下拉框,但未找到'50'选项"
288
- }
289
- return 0
290
- else:
291
- logger.debug("[DEBUG] 未找到排序下拉框")
292
- page_content = {
293
- "count": 0,
294
- "links": [],
295
- "error": "已搜索,但未找到排序下拉框"
296
- }
297
- return 0
298
- except Exception as e:
299
- logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
300
- page_content = {
301
- "count": 0,
302
- "links": [],
303
- "error": f"已搜索,但在点击下拉框或选项时出错: {str(e)}"
304
- }
305
- return 0
320
+ # 等待搜索结果加载
321
+ logger.debug("[DEBUG] 等待搜索结果加载")
322
+ await page.wait_for_load_state('networkidle')
323
+ await asyncio.sleep(2)
306
324
  else:
307
- # 不关闭浏览器
308
- page_content = {
309
- "count": 0,
310
- "links": [],
311
- "error": f"已填写搜索关键词: {keywords},但未找到搜索按钮"
312
- }
313
- return 0
325
+ # 如果找不到搜索按钮,尝试按回车
326
+ logger.debug("[DEBUG] 未找到搜索按钮,尝试按回车键")
327
+ await search_input.press("Enter")
328
+
329
+ # 等待搜索结果加载
330
+ logger.debug("[DEBUG] 等待搜索结果加载")
331
+ await page.wait_for_load_state('networkidle')
332
+ await asyncio.sleep(2)
314
333
  else:
315
- # 不关闭浏览器
316
- page_content = {
317
- "count": 0,
318
- "links": [],
319
- "error": f"未找到搜索框,无法搜索: {keywords}"
320
- }
321
- return 0
334
+ # 如果找不到搜索框,直接导航到搜索URL
335
+ logger.debug(f"[DEBUG] 未找到搜索框,直接导航到搜索URL: {search_url}")
336
+ await page.goto(search_url)
337
+ await page.wait_for_load_state('networkidle')
338
+ await asyncio.sleep(2)
322
339
  except Exception as e:
323
- logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
324
- # 不关闭浏览器
340
+ logger.debug(f"[DEBUG] 执行搜索操作时出错: {str(e)}")
341
+ logger.debug(traceback.format_exc())
342
+
343
+ # 如果交互失败,直接导航到搜索URL
344
+ logger.debug(f"[DEBUG] 导航到搜索URL: {search_url}")
345
+ await page.goto(search_url)
346
+ await page.wait_for_load_state('networkidle')
347
+ await asyncio.sleep(2)
348
+
349
+ # 在搜索结果页面再次检查是否需要验证
350
+ await check_and_wait_for_verification(page)
351
+
352
+ # 查找并计数链接
353
+ links_count = await find_and_count_abstract_links(page)
354
+
355
+ # 添加等待时间让用户可以查看结果
356
+ await asyncio.sleep(5)
357
+
358
+ logger.debug(f"[DEBUG] 搜索完成,找到 {links_count} 个链接")
359
+
360
+ # 如果找不到链接,使用基本信息构造结果
361
+ if links_count == 0:
362
+ # 获取当前URL
363
+ current_url = await page.url()
364
+ page_content = {
365
+ "count": 1,
366
+ "links": [{
367
+ "index": 1,
368
+ "url": current_url,
369
+ "title": f"搜索: {keywords}"
370
+ }]
371
+ }
372
+
373
+ return links_count
374
+ except Exception as e:
375
+ error_msg = str(e)
376
+ logger.debug(f"[DEBUG] 搜索错误: {error_msg}")
377
+ logger.debug(traceback.format_exc())
378
+
379
+ # 尝试直接使用webbrowser打开
380
+ try:
381
+ logger.debug("[DEBUG] 尝试使用webbrowser打开URL")
382
+ webbrowser.open(search_url)
383
+
384
+ # 构造一个基本结果
385
+ page_content = {
386
+ "count": 1,
387
+ "links": [{
388
+ "index": 1,
389
+ "url": search_url,
390
+ "title": f"搜索: {keywords}"
391
+ }]
392
+ }
393
+ return 1
394
+ except Exception as e2:
395
+ logger.debug(f"[DEBUG] 使用webbrowser打开URL失败: {str(e2)}")
396
+
325
397
  page_content = {
326
398
  "count": 0,
327
399
  "links": [],
328
- "error": f"自动搜索过程中出错: {str(e)}"
400
+ "error": f"搜索过程中出错: {error_msg}"
329
401
  }
330
402
  return 0
331
- except Exception as e:
332
- error_msg = str(e)
333
- logger.debug(f"[DEBUG] Playwright错误: {error_msg}")
403
+
404
async def _verification_element_present(page, selectors):
    """Return the first selector in ``selectors`` matching an element on ``page``, else ``None``."""
    for selector in selectors:
        try:
            if await page.query_selector(selector):
                return selector
        except Exception:
            # Best-effort probe: a selector that fails to evaluate counts as
            # "no match". Catching Exception (not a bare except) lets task
            # cancellation propagate.
            continue
    return None


async def check_and_wait_for_verification(page):
    """Detect a captcha/verification page and wait for the user to solve it.

    The page counts as a verification page when its HTML contains one of the
    indicator phrases below or when one of the known verification selectors
    matches an element. In that case the function polls once per second, for
    at most 120 seconds, until one of the following holds:

    * the page URL changed,
    * none of the verification selectors matches any more,
    * the page HTML no longer contains any indicator phrase.

    It then waits for the page to reach the ``networkidle`` load state.

    Args:
        page: The Playwright page to inspect.

    Returns:
        None. Any exception raised while checking is logged and swallowed so
        that the caller's flow continues.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import sys

    # Phrases whose presence in the page HTML suggests a verification page.
    verification_indicators = [
        '验证码',
        '人机验证',
        'captcha',
        'verify',
        '安全验证',
        '滑动验证',
        '拖动滑块',
        '请完成验证',
        '拼图验证'
    ]

    # Selectors of commonly used verification widgets.
    verification_selectors = [
        '.verify-wrap',
        '.captcha',
        '.verification',
        '#captcha',
        '.slidecode',
        '.verify-box',
        '.verify-img-panel',
        'iframe[src*="captcha"]',
        'iframe[src*="verify"]'
    ]

    max_wait_time = 120  # wait at most two minutes for the user

    try:
        page_text = await page.content()
        needs_verification = any(indicator in page_text for indicator in verification_indicators)

        matched_selector = await _verification_element_present(page, verification_selectors)
        if matched_selector:
            needs_verification = True
            logger.info(f"[INFO] 检测到验证元素: {matched_selector}")

        if not needs_verification:
            logger.debug("[DEBUG] 未检测到验证页面")
            return

        logger.info("[INFO] 检测到验证页面,等待用户手动验证...")
        # NOTE(review): this module imports mcp.server.stdio; if the server is
        # run over that transport, stdout carries protocol messages, so the
        # user-facing notices are written to stderr instead — confirm.
        print("\n*** 请注意 ***", file=sys.stderr)
        print("检测到需要验证码验证,请在浏览器中完成验证...", file=sys.stderr)
        print("验证完成后,程序将自动继续\n", file=sys.stderr)

        # Monotonic clock: elapsed time is unaffected by wall-clock changes.
        start_time = time.monotonic()
        # Page.url is a property in Playwright for Python, not a coroutine;
        # awaiting a call to it raises TypeError.
        current_url = page.url

        while time.monotonic() - start_time < max_wait_time:
            # Poll once per second.
            await asyncio.sleep(1)

            # A URL change may mean the verification succeeded.
            if page.url != current_url:
                logger.info("[INFO] 检测到URL变化,验证可能已完成")
                break

            # Check whether the verification widgets are gone.
            if not await _verification_element_present(page, verification_selectors):
                logger.info("[INFO] 验证元素已消失,验证可能已完成")
                break

            # Check whether the indicator phrases are gone from the HTML.
            page_text = await page.content()
            if not any(indicator in page_text for indicator in verification_indicators):
                logger.info("[INFO] 验证指示词已消失,验证可能已完成")
                break

        # Let the page settle before the caller continues.
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)

        logger.info("[INFO] 继续执行,可能已完成验证")
        print("继续执行操作...\n", file=sys.stderr)

    except Exception as e:
        logger.error(f"[ERROR] 检查验证页面时出错: {str(e)}")
        logger.error(traceback.format_exc())
347
504
 
348
505
  def search_with_direct_chrome(keywords):
349
506
  """直接使用Chrome搜索,不使用playwright"""
@@ -355,25 +512,26 @@ def search_with_direct_chrome(keywords):
355
512
  url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
356
513
  logger.debug(f"[DEBUG] 打开URL: {url}")
357
514
 
515
+ # 使用open_chrome函数打开URL
358
516
  result = open_chrome(url)
359
517
 
360
- if isinstance(result, str) and "打开Chrome" in result:
361
- logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")
362
-
363
- page_content = {
364
- "count": 0,
365
- "links": [],
366
- "error": f"直接打开Chrome搜索: {result}"
367
- }
368
-
369
- else:
370
- logger.debug("[DEBUG] 直接打开Chrome成功")
518
+ if isinstance(result, str) and "错误" in result:
519
+ logger.debug(f"[DEBUG] 打开Chrome失败: {result}")
371
520
 
372
521
  page_content = {
373
522
  "count": 0,
374
523
  "links": [],
375
- "message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
524
+ "error": f"打开Chrome搜索失败: {result}"
376
525
  }
526
+ return page_content
527
+
528
+ logger.debug("[DEBUG] 已尝试在已有Chrome窗口中打开新标签页")
529
+
530
+ page_content = {
531
+ "count": 0,
532
+ "links": [],
533
+ "message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
534
+ }
377
535
 
378
536
  return page_content
379
537
  except Exception as e:
@@ -600,10 +758,10 @@ async def handle_list_tools() -> list[types.Tool]:
600
758
  tools = []
601
759
 
602
760
  # 只添加搜索并提取的组合工具
603
- if extractor is not None and PLAYWRIGHT_AVAILABLE:
761
+ if PLAYWRIGHT_AVAILABLE:
604
762
  tools.append(
605
763
  types.Tool(
606
- name="mcp_cnks_search_and_extract",
764
+ name="search_and_extract",
607
765
  description="搜索知网关键词并提取所有论文的详细内容",
608
766
  inputSchema={
609
767
  "type": "object",
@@ -624,7 +782,7 @@ async def handle_call_tool(
624
782
  """处理工具执行请求"""
625
783
  global current_url, page_content
626
784
 
627
- if name == "mcp_cnks_search_and_extract" and extractor is not None and PLAYWRIGHT_AVAILABLE:
785
+ if name == "search_and_extract" and PLAYWRIGHT_AVAILABLE:
628
786
  if not arguments:
629
787
  raise ValueError("缺少参数")
630
788
 
@@ -635,39 +793,95 @@ async def handle_call_tool(
635
793
  try:
636
794
  # 第一步:执行搜索
637
795
  logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
638
- links_count = await search_with_playwright(keywords)
639
- current_url = "https://kns.cnki.net/kns8s/search"
796
+
797
+ # 构建URL
798
+ url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
799
+ current_url = url
800
+ logger.debug(f"[DEBUG] 搜索URL: {url}")
801
+
802
+ # 如果playwright可用,使用playwright搜索
803
+ if PLAYWRIGHT_AVAILABLE:
804
+ logger.debug("[DEBUG] 使用playwright搜索")
805
+ links_count = await search_with_playwright(keywords)
806
+ else:
807
+ # 否则直接用open_chrome打开URL
808
+ logger.debug("[DEBUG] 直接使用open_chrome打开URL")
809
+ result = open_chrome(url)
810
+
811
+ if isinstance(result, str):
812
+ # 如果是错误信息,返回错误
813
+ return [
814
+ types.TextContent(
815
+ type="text",
816
+ text=json.dumps({
817
+ "error": f"打开Chrome失败: {result}",
818
+ "keywords": keywords,
819
+ "count": 0,
820
+ "results": []
821
+ })
822
+ )
823
+ ]
824
+ else:
825
+ # 成功打开但无法获取链接
826
+ return [
827
+ types.TextContent(
828
+ type="text",
829
+ text=json.dumps({
830
+ "keywords": keywords,
831
+ "count": 0,
832
+ "message": "已直接在Chrome中打开搜索页面,但无法自动获取搜索结果。请安装playwright以获取完整功能。",
833
+ "results": []
834
+ })
835
+ )
836
+ ]
640
837
 
641
838
  # 检查搜索结果
642
839
  if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
840
+ # 如果没有找到链接,至少返回搜索页面作为结果
841
+ logger.debug("[DEBUG] 搜索未返回有效链接,返回搜索页面作为结果")
643
842
  return [
644
843
  types.TextContent(
645
844
  type="text",
646
- text={
647
- "error": "搜索未返回有效链接",
648
- "count": 0,
649
- "results": []
650
- }
845
+ text=json.dumps({
846
+ "keywords": keywords,
847
+ "count": 1,
848
+ "results": [{
849
+ "title": f"搜索结果: {keywords}",
850
+ "authors": [],
851
+ "abstract": "请在浏览器中查看搜索结果",
852
+ "keywords": [],
853
+ "cite_format": "",
854
+ "url": url
855
+ }]
856
+ })
651
857
  )
652
858
  ]
653
859
 
654
860
  # 提取链接
655
861
  urls = [link["url"] for link in page_content["links"] if "url" in link]
656
862
  if not urls:
863
+ logger.debug("[DEBUG] 没有找到有效链接,返回搜索页面")
657
864
  return [
658
865
  types.TextContent(
659
866
  type="text",
660
- text={
661
- "error": "未找到有效链接",
662
- "count": 0,
663
- "results": []
664
- }
867
+ text=json.dumps({
868
+ "keywords": keywords,
869
+ "count": 1,
870
+ "results": [{
871
+ "title": f"搜索结果: {keywords}",
872
+ "authors": [],
873
+ "abstract": "请在浏览器中查看搜索结果",
874
+ "keywords": [],
875
+ "cite_format": "",
876
+ "url": url
877
+ }]
878
+ })
665
879
  )
666
880
  ]
667
881
 
668
882
  # 第二步:执行提取
669
883
  logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
670
- results = await extractor.batch_extract_contents(urls)
884
+ results = await batch_extract_contents(urls)
671
885
 
672
886
  # 包装结果
673
887
  result_json = {
@@ -681,7 +895,7 @@ async def handle_call_tool(
681
895
  return [
682
896
  types.TextContent(
683
897
  type="text",
684
- text=result_json
898
+ text=json.dumps(result_json)
685
899
  )
686
900
  ]
687
901
  except Exception as e:
@@ -690,12 +904,12 @@ async def handle_call_tool(
690
904
  return [
691
905
  types.TextContent(
692
906
  type="text",
693
- text={
907
+ text=json.dumps({
694
908
  "error": f"搜索并提取内容时出错: {str(e)}",
695
909
  "keywords": keywords,
696
910
  "count": 0,
697
911
  "results": []
698
- }
912
+ })
699
913
  )
700
914
  ]
701
915
 
@@ -703,69 +917,853 @@ async def handle_call_tool(
703
917
  raise ValueError(f"未知工具: {name}")
704
918
 
705
919
# In-page script: open the "results per page" dropdown and pick the 50 option.
# Resolves to a human-readable status string (used only for logging).
_SET_PAGE_SIZE_JS = """() => {
    try {
        // Locate the page-size dropdown and open it
        const dropdowns = document.querySelectorAll('#perPageDiv, .perpage-content, .page-count, div[class*="perpage"]');
        if (dropdowns && dropdowns.length > 0) {
            console.log('找到下拉框元素:', dropdowns[0]);
            dropdowns[0].click();
            console.log('已点击下拉框');

            // Give the dropdown menu one second to render before looking for options
            return new Promise(resolve => {
                setTimeout(() => {
                    const options = document.querySelectorAll('a[data-v="50"], a[href*="50"], li[data-val="50"]');
                    console.log('找到的50选项数量:', options.length);

                    for (let option of options) {
                        if (option.textContent.includes('50')) {
                            option.click();
                            console.log('已点击50选项:', option);
                            resolve("点击了50选项:" + option.textContent);
                            return;
                        }
                    }

                    // No explicit 50 option: fall back to the last entry (usually the largest size)
                    const allOptions = document.querySelectorAll('.perpage-content a, .sort-list li');
                    if (allOptions && allOptions.length > 0) {
                        const lastOption = allOptions[allOptions.length - 1];
                        lastOption.click();
                        console.log('点击了最后一个选项:', lastOption.textContent);
                        resolve("点击了最后一个选项:" + lastOption.textContent);
                        return;
                    }

                    resolve("未找到50条/页选项");
                }, 1000);
            });
        }

        // No dropdown found: try clicking a visible link labelled "50" directly
        const directLinks = document.querySelectorAll('a:not([style*="display:none"]):not([style*="display: none"])');
        for (let link of directLinks) {
            if (link.textContent.trim() === '50' ||
                link.textContent.includes('50条') ||
                link.textContent.includes('50 条')) {
                link.click();
                return "直接点击了50条链接: " + link.textContent;
            }
        }

        return "未找到任何可点击的50条/页选项";
    } catch (e) {
        return "设置每页显示50条记录时出错: " + e.toString();
    }
}"""

# In-page script: collect anchors whose href contains "article/abstract?v=".
_ABSTRACT_LINKS_JS = """() => {
    const links = [];
    const abstractLinks = document.querySelectorAll('a[href*="article/abstract?v="]');

    console.log('找到包含article/abstract?v的链接数量:', abstractLinks.length);

    for (let i = 0; i < abstractLinks.length; i++) {
        const link = abstractLinks[i];
        const href = link.href;
        const text = link.textContent.trim();

        // Keep only anchors that have both a matching href and visible text
        if (href && href.includes('article/abstract?v=') && text) {
            links.push({
                index: links.length + 1,
                href: href,
                text: text
            });
        }
    }

    return links;
}"""

# In-page script: collect anchors that merely look like paper links (deduplicated by href).
_BACKUP_LINKS_JS = """() => {
    const links = [];
    const allLinks = document.querySelectorAll('a.fz14, a[href*="/kcms"], .result-table-list a');

    for (let i = 0; i < allLinks.length; i++) {
        const link = allLinks[i];
        const href = link.href;
        const text = link.textContent.trim();

        if (href && text && !links.some(l => l.href === href)) {
            links.push({
                index: links.length + 1,
                href: href,
                text: text
            });
        }
    }

    return links;
}"""

# Selectors tried one by one when both in-page scripts come back empty.
_LINK_FALLBACK_SELECTORS = (
    'a[href*="article/abstract?v="]',  # abstract links first
    'a[href*="/kcms"]',                # CNKI document links
    '.fz14',                           # title style class
    'a.pc-link',                       # search result links
    '.c_font a',                       # links under the content font class
    '.result-table-list a',            # links inside the result table
    'table tr td a',                   # any link inside a table cell
)


async def _collect_links_by_selectors(page):
    """Query each fallback selector and return de-duplicated {index, href, text} dicts."""
    collected = []
    seen_hrefs = set()

    for selector in _LINK_FALLBACK_SELECTORS:
        try:
            handles = await page.query_selector_all(selector)
            logger.debug(f"[DEBUG] 使用选择器 {selector} 找到 {len(handles)} 个链接")

            for handle in handles:
                try:
                    href = await handle.get_attribute('href')
                    text = await handle.text_content()

                    if not href:
                        continue

                    # get_attribute returns the raw attribute, so resolve relative URLs by hand
                    if href.startswith('/'):
                        href = f"https://kns.cnki.net{href}"
                    elif not href.startswith('http'):
                        href = f"https://kns.cnki.net/{href}"

                    if href in seen_hrefs:
                        continue
                    seen_hrefs.add(href)

                    collected.append({
                        'index': len(collected) + 1,
                        'href': href,
                        'text': text.strip() if text else ""
                    })

                    logger.debug(f"[DEBUG] 链接 {len(collected)}: {href}")
                except Exception as e:
                    logger.debug(f"[DEBUG] 处理链接时出错: {str(e)}")
        except Exception as e:
            logger.debug(f"[DEBUG] 使用选择器 {selector} 查找链接时出错: {str(e)}")

    return collected


async def _discover_result_links(page):
    """Return candidate paper links: abstract links, else look-alike links, else selector scan."""
    logger.debug("[DEBUG] 尝试查找包含 article/abstract?v 的链接")

    # `or []` guards against the script handing back null instead of an array
    abstract_links = await page.evaluate(_ABSTRACT_LINKS_JS) or []
    logger.debug(f"[DEBUG] 找到 {len(abstract_links)} 个包含article/abstract?v的链接")
    if abstract_links:
        return abstract_links

    logger.debug("[DEBUG] 未找到包含article/abstract?v的链接,尝试备用方法")
    backup_links = await page.evaluate(_BACKUP_LINKS_JS) or []
    if backup_links:
        logger.debug(f"[DEBUG] 使用备用方法找到 {len(backup_links)} 个可能的论文链接")
        return backup_links

    return await _collect_links_by_selectors(page)


async def find_and_count_abstract_links(page):
    """Collect paper links from a CNKI search-result page and publish them in ``page_content``.

    The function first tries to switch the result list to 50 entries per page
    and to tick the CSSCI source filter (both best-effort), then gathers links.
    Links containing ``article/abstract?v=`` are preferred; if none exist, every
    discovered link is kept.

    Side effect: the module-level ``page_content`` is replaced by
    ``{"count": int, "links": [{"index", "url", "title"}, ...]}``.

    Args:
        page: a Playwright page already showing the search results.

    Returns:
        The number of links stored in ``page_content``. If an unexpected error
        occurs, the current page URL is stored as the single link and 1 is
        returned; 0 is returned only if even that URL cannot be read.
    """
    global page_content

    try:
        logger.debug("[DEBUG] 开始查找知网搜索结果中的论文链接")

        # Give the page time to finish rendering
        await asyncio.sleep(3)

        # Best effort: show 50 results per page, then apply the CSSCI filter
        try:
            logger.debug("[DEBUG] 尝试设置每页显示50条记录")
            set_page_size_result = await page.evaluate(_SET_PAGE_SIZE_JS)
            logger.debug(f"[DEBUG] 设置每页显示50条记录结果: {set_page_size_result}")

            # The click may reload the list; wait for it to settle
            await page.wait_for_load_state('networkidle')
            await asyncio.sleep(2)

            await check_and_select_cssci(page)
        except Exception as e:
            logger.debug(f"[DEBUG] 设置每页显示50条记录时出错: {str(e)}")
            logger.debug(traceback.format_exc())

        # Wait (briefly) for the result container; a timeout is not fatal
        try:
            await page.wait_for_selector('.result-table-list', timeout=5000)
            logger.debug("[DEBUG] 已找到搜索结果容器")
        except Exception as e:
            logger.debug(f"[DEBUG] 等待搜索结果容器超时: {str(e)}")

        try:
            links_info = await _discover_result_links(page)
        except Exception as e:
            logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
            logger.debug(traceback.format_exc())
            links_info = []

        # Prefer abstract links; keep everything if the page layout offers none
        filtered_links = []
        for link in links_info:
            href = link['href']
            if 'article/abstract?v=' in href:
                filtered_links.append(link)
                logger.debug(f"[DEBUG] 保留包含article/abstract?v的链接: {href}")

        if not filtered_links:
            logger.debug("[DEBUG] 过滤后没有包含article/abstract?v的链接,使用原始链接")
            filtered_links = links_info

        links_count = len(filtered_links)
        logger.debug(f"[DEBUG] 最终过滤后找到 {links_count} 个链接")

        if links_count == 0:
            logger.debug("[DEBUG] 未找到链接")

        # Stored as a dict (not plain text) so callers can iterate the links
        page_content = {
            "count": links_count,
            "links": [
                {"index": link['index'], "url": link['href'], "title": link['text']}
                for link in filtered_links
            ]
        }

        return links_count
    except Exception as e:
        logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
        logger.debug(traceback.format_exc())

        # Fall back to the current page so the caller still has one usable link
        try:
            # Page.url is a property in Playwright for Python; calling/awaiting it raises
            fallback_url = page.url
            logger.debug(f"[DEBUG] 当前页面URL: {fallback_url}")

            page_content = {
                "count": 1,
                "links": [{"index": 1, "url": fallback_url, "title": "当前页面"}]
            }
            return 1
        except Exception:
            page_content = {
                "count": 0,
                "links": []
            }
            return 0
1185
+
1186
async def check_and_select_cssci(page):
    """Tick the CSSCI entry of the "来源类别" (source category) filter if the page has one.

    The DOM work happens in a single in-page script that reports what it did
    as a status string; that string is only logged. Afterwards the function
    waits for the network to go idle plus two seconds so a filter-triggered
    refresh can finish. Every error is logged and swallowed.

    Args:
        page: a Playwright page showing CNKI search results.
    """
    select_cssci_script = """() => {
        try {
            // Find the region that mentions "来源类别"
            const categoryContainer = Array.from(document.querySelectorAll('div')).find(div =>
                div.textContent.includes('来源类别')
            );

            if (categoryContainer) {
                // Look for a CSSCI checkbox inside that region
                const checkboxes = categoryContainer.querySelectorAll('input[type="checkbox"]');
                for (let checkbox of checkboxes) {
                    const parentText = checkbox.parentElement.textContent;
                    if (parentText.includes('CSSCI') ||
                        checkbox.value.includes('CSSCI') ||
                        checkbox.id.includes('cssci')) {

                        // Tick it unless it is already ticked
                        if (!checkbox.checked) {
                            checkbox.click();
                            return "已勾选CSSCI复选框";
                        } else {
                            return "CSSCI复选框已经被勾选";
                        }
                    }
                }

                // No checkbox: click a label/span that mentions CSSCI instead
                const cssciLabels = categoryContainer.querySelectorAll('label, span');
                for (let label of cssciLabels) {
                    if (label.textContent.includes('CSSCI')) {
                        label.click();
                        return "已点击CSSCI标签";
                    }
                }

                return "在来源类别区域未找到CSSCI选项";
            }

            return "未找到来源类别区域";
        } catch (e) {
            return "勾选CSSCI时出错: " + e.toString();
        }
    }"""

    try:
        logger.debug("[DEBUG] 尝试查找来源类别并勾选CSSCI")

        outcome = await page.evaluate(select_cssci_script)
        logger.debug(f"[DEBUG] CSSCI勾选结果: {outcome}")

        # Let a possible result refresh settle before the caller continues
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)
    except Exception as exc:
        logger.debug(f"[DEBUG] 勾选CSSCI时出错: {str(exc)}")
        logger.debug(traceback.format_exc())
1246
+
1247
# Delimiters between author names / keywords (ASCII and fullwidth punctuation, whitespace).
_LIST_DELIMITERS = re.compile(r'[,,;;、\s]+')

# Selector lists for the Python-side fallback, tried in order.
_TITLE_SELECTORS = (
    '.wx-tit h1', '.article-title', '.title', 'h1',
    '.articleTitle', 'div.brief h2', '.wxTitle', 'span.title',
)
_AUTHOR_SELECTORS = (
    '.wx-tit .author', '.author', '.writers', '.authorinfo',
    'div.brief p:first-child', 'span.author',
)
_ABSTRACT_SELECTORS = (
    '#ChDivSummary', '.abstract', '.summary', '.Abstract',
    'div.brief div.abstract', 'div.wxInfo span.abstract', 'div.wxInfo', 'span.abstract',
)
_KEYWORD_SELECTORS = (
    '.wx-tit-keys', '.keywords', '.Keyword', 'div.wxInfo span.keywords',
    'span.keywords', 'div.brief span.keywords', 'p.keywords',
)
_CITE_SELECTORS = (
    '.quote-info', '.citation', 'div.cite', 'div.quoted',
    'div.wxInfo div.quoted', '.refer-info',
)

# ':has-text()' is a Playwright selector extension: valid for page.query_selector,
# but NOT for document.querySelector inside page.evaluate.
_CITE_BUTTON_SELECTOR = 'button:has-text("引用"), [class*="cite"], [class*="quote"]'
_CITE_POPUP_SELECTOR = '.quote-r textarea.text, .quote-text, [class*="quote"] textarea'

# In-page script that reads title/authors/abstract/keywords/cite_format in one round trip.
# It uses only standard CSS selectors so document.querySelector never throws.
_EXTRACT_FIELDS_JS = """() => {
    try {
        // Title: skip headings that belong to the "系统检测" verification page
        const getTitle = () => {
            const selectors = ['h1.title', '.wx-tit h1', '.title', 'h1', '.article-title', 'div.brief h2', '.wxTitle', 'span.title'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    const text = element.textContent.trim();
                    if (!text.includes('系统检测')) {
                        return text;
                    }
                }
            }
            return "";
        };

        // Authors: split the first matching element's text on list delimiters
        const getAuthors = () => {
            const selectors = ['.wx-tit .author', '.author', '.writers', '.authorinfo', 'div.brief p:first-child', 'span.author'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    const text = element.textContent.trim();
                    return text.split(/[,,;;、\\s]+/).filter(a => a.trim());
                }
            }
            return [];
        };

        // Abstract: first matching element, else the first paragraph mentioning "摘要"
        const getAbstract = () => {
            const selectors = ['#ChDivSummary', '.abstract', '.summary', '.Abstract', 'div.brief div.abstract', 'div.wxInfo span.abstract', 'div.wxInfo', 'span.abstract'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    return element.textContent.trim().replace(/^摘要[::]/, '').trim();
                }
            }

            const paragraphs = document.querySelectorAll('p');
            for (const p of paragraphs) {
                if (p.textContent.includes('摘要')) {
                    return p.textContent.trim().replace(/^摘要[::]/, '').trim();
                }
            }

            return "";
        };

        // Keywords: first matching element, else the first paragraph mentioning "关键词"
        const getKeywords = () => {
            const selectors = ['.wx-tit-keys', '.keywords', '.Keyword', 'div.wxInfo span.keywords', 'span.keywords', 'div.brief span.keywords', 'p.keywords'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    const text = element.textContent.trim().replace(/^关键词[::]/, '').trim();
                    return text.split(/[;;,,、\\s]+/).filter(k => k.trim());
                }
            }

            const paragraphs = document.querySelectorAll('p');
            for (const p of paragraphs) {
                if (p.textContent.includes('关键词')) {
                    const keywordText = p.textContent.trim().split(/关键词[::]/)[1];
                    if (keywordText) {
                        return keywordText.split(/[;;,,、\\s]+/).filter(k => k.trim());
                    }
                }
            }

            return [];
        };

        // Citation text that is already present in the DOM (no clicking here)
        const getCiteFormat = () => {
            const selectors = ['.quote-info', '.citation', 'div.cite', 'div.quoted', 'div.wxInfo div.quoted', '.refer-info'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    return element.textContent.trim();
                }
            }
            return "";
        };

        return {
            title: getTitle(),
            authors: getAuthors(),
            abstract: getAbstract(),
            keywords: getKeywords(),
            cite_format: getCiteFormat()
        };
    } catch (e) {
        return {
            error: "提取内容时出错: " + e.toString(),
            title: "",
            authors: [],
            abstract: "",
            keywords: [],
            cite_format: ""
        };
    }
}"""


async def _first_matching_element(page, selectors, label):
    """Return the first element matched by ``selectors`` (in order), or None; logs the hit."""
    for selector in selectors:
        element = await page.query_selector(selector)
        if element:
            logger.debug(f"[DEBUG] 找到{label}元素: {selector}")
            return element
    return None


async def _element_text(element) -> str:
    """Return the element's stripped text content ('' when the node has no text)."""
    return (await element.text_content() or "").strip()


async def _read_cite_format_from_popup(page) -> str:
    """Click the cite button and return the citation text shown in the popup ('' if none)."""
    cite_button = await page.query_selector(_CITE_BUTTON_SELECTOR)
    if not cite_button:
        logger.debug("[DEBUG] 未找到引用按钮")
        return ""

    await cite_button.click()
    await asyncio.sleep(1)  # give the popup time to render

    field = await page.query_selector(_CITE_POPUP_SELECTOR)
    if not field:
        logger.debug("[DEBUG] 未从弹窗找到引用格式")
        return ""

    # get_property('value') would return a JSHandle, not a str; evaluate yields the
    # plain value and also copes with non-input nodes such as '.quote-text'.
    value = await field.evaluate("el => ('value' in el ? el.value : el.textContent) || ''")
    return value.strip()


async def _fill_fields_with_js(page, content) -> None:
    """Populate ``content`` from a single in-page script; failures are logged, not raised."""
    try:
        logger.debug("[DEBUG] 尝试使用JavaScript提取内容")
        result = await page.evaluate(_EXTRACT_FIELDS_JS)
    except Exception as e:
        logger.debug(f"[DEBUG] 使用JavaScript提取内容时出错: {str(e)}")
        logger.debug(traceback.format_exc())
        return

    if not result:
        logger.warning("[WARNING] JavaScript提取内容返回空结果")
        return
    if result.get("error"):
        logger.warning(f"[WARNING] JavaScript提取内容时出错: {result['error']}")
        return

    logger.debug("[DEBUG] JavaScript提取内容成功")

    if result.get("title"):
        content.title = result["title"]
        logger.debug(f"[DEBUG] 提取到标题: {content.title}")
    if result.get("authors"):
        content.authors = result["authors"]
        logger.debug(f"[DEBUG] 提取到作者: {content.authors}")
    if result.get("abstract"):
        content.abstract = result["abstract"]
        logger.debug(f"[DEBUG] 提取到摘要: {content.abstract[:100]}...")
    if result.get("keywords"):
        content.keywords = result["keywords"]
        logger.debug(f"[DEBUG] 提取到关键词: {content.keywords}")
    if result.get("cite_format"):
        content.cite_format = result["cite_format"]
        logger.debug(f"[DEBUG] 提取到引用格式: {content.cite_format[:100]}...")


async def _fill_missing_fields_with_selectors(page, content) -> None:
    """Fill every still-empty field of ``content`` via Playwright selectors (best effort)."""
    if not content.title:
        try:
            element = await _first_matching_element(page, _TITLE_SELECTORS, "标题")
            if element:
                content.title = await _element_text(element)
                logger.debug(f"[DEBUG] 提取到标题: {content.title}")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取标题时出错: {str(e)}")

    if not content.authors:
        try:
            element = await _first_matching_element(page, _AUTHOR_SELECTORS, "作者")
            if element:
                authors_text = await _element_text(element)
                authors = [a for a in _LIST_DELIMITERS.split(authors_text) if a]
                content.authors = authors
                logger.debug(f"[DEBUG] 提取到作者: {authors}")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取作者时出错: {str(e)}")

    if not content.abstract:
        try:
            element = await _first_matching_element(page, _ABSTRACT_SELECTORS, "摘要")
            if element:
                # Drop a leading "摘要:" label if the element includes it
                content.abstract = re.sub(r'^摘要[::]\s*', '', await _element_text(element))
                logger.debug(f"[DEBUG] 提取到摘要: {content.abstract[:100]}...")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取摘要时出错: {str(e)}")

    if not content.keywords:
        try:
            element = await _first_matching_element(page, _KEYWORD_SELECTORS, "关键词")
            if element:
                # Strip first so the '^' anchor sees the "关键词:" label
                keywords_text = re.sub(r'^关键词[::]\s*', '', await _element_text(element))
                keywords = [k for k in _LIST_DELIMITERS.split(keywords_text) if k]
                content.keywords = keywords
                logger.debug(f"[DEBUG] 提取到关键词: {keywords}")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取关键词时出错: {str(e)}")

    if not content.cite_format:
        try:
            element = await _first_matching_element(page, _CITE_SELECTORS, "引用格式")
            if element:
                content.cite_format = await _element_text(element)
                if content.cite_format:
                    logger.debug(f"[DEBUG] 提取到引用格式: {content.cite_format[:100]}...")

            # Nothing inline: open the citation popup exactly once
            if not content.cite_format:
                cite_text = await _read_cite_format_from_popup(page)
                if cite_text:
                    content.cite_format = cite_text
                    logger.debug(f"[DEBUG] 从弹窗提取到引用格式: {content.cite_format[:100]}...")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取引用格式时出错: {str(e)}")


async def extract_content_from_url(url: str, page = None) -> CNKIContent:
    """Extract title, authors, abstract, keywords and citation text from a CNKI paper page.

    Args:
        url: absolute URL, or a path relative to ``https://kns.cnki.net``.
        page: optional Playwright page to reuse. When omitted, a page is opened
            on the module-level ``context`` and closed again before returning.
            If Playwright has not been initialised, the URL is opened with
            ``webbrowser`` and a placeholder result is returned.

    Returns:
        A ``CNKIContent`` whose ``url`` is always set. The function does not
        raise: on failure ``title`` and ``abstract`` carry the error message.
    """
    if not url.startswith('http'):
        # Resolve relative URLs against the CNKI host
        if url.startswith('/'):
            url = f"https://kns.cnki.net{url}"
        else:
            url = f"https://kns.cnki.net/{url}"

    content = CNKIContent(url=url)
    owns_page = False  # True only for a page created here, which must be closed here

    try:
        logger.info(f"开始从URL提取内容: {url}")

        if page is None:
            if playwright_instance is None or browser_instance is None or context is None:
                # No browser automation available: hand the URL to the default browser
                logger.info(f"Playwright未初始化,使用webbrowser打开URL: {url}")
                webbrowser.open(url)

                content.title = "请在浏览器中手动获取内容"
                content.abstract = "系统已打开链接,请在浏览器中查看完整内容"
                return content

            logger.debug("[DEBUG] 使用现有的playwright实例创建新页面")
            page = await context.new_page()
            owns_page = True

        logger.debug(f"[DEBUG] 导航到URL: {url}")
        try:
            await page.goto(url, wait_until='networkidle', timeout=30000)
        except Exception as e:
            # A slow 'networkidle' often still leaves a usable DOM, so carry on
            logger.warning(f"导航超时,继续尝试提取: {str(e)}")

        await asyncio.sleep(2)

        # Pause here if CNKI shows its verification page
        await check_and_wait_for_verification(page)

        await _fill_fields_with_js(page, content)
        await _fill_missing_fields_with_selectors(page, content)

        return content
    except Exception as e:
        logger.error(f"从URL提取内容时出错: {str(e)}")
        logger.error(traceback.format_exc())

        content.title = f"提取失败: {str(e)}"
        content.abstract = f"从URL提取内容时出错: {str(e)}"
        return content
    finally:
        # Close only pages opened here; a close failure must not clobber the result
        if owns_page and page is not None:
            try:
                await page.close()
            except Exception as close_error:
                logger.debug(f"[DEBUG] 关闭页面时出错: {str(close_error)}")
1672
+
1673
async def batch_extract_contents(urls: List[str]) -> List[Dict]:
    """Extract paper details from several CNKI URLs, one page at a time.

    At most the first 50 URLs are processed. Playwright is started lazily
    (module-level ``playwright_instance`` / ``browser_instance`` / ``context``)
    if it is not running yet.

    Args:
        urls: paper URLs to visit; an empty list returns ``[]`` without
            starting a browser.

    Returns:
        One dict per processed URL (``CNKIContent`` fields; an ``error`` key is
        added when that URL failed). If the batch itself aborts, the first
        element is ``{"error": ...}`` followed by the results gathered so far.
    """
    global playwright_instance, browser_instance, context

    # Nothing to do: avoid launching a browser just to return an empty list
    if not urls:
        return []

    results = []
    max_urls = min(50, len(urls))  # hard cap on the batch size

    logger.info(f"开始批量提取内容,共 {max_urls} 个URL")

    try:
        if playwright_instance is None or browser_instance is None or context is None:
            logger.info("Playwright未初始化,创建新实例")
            playwright_instance = await async_playwright().start()
            browser_instance = await playwright_instance.chromium.launch(headless=False)
            context = await browser_instance.new_context()

        for i, url in enumerate(urls[:max_urls]):
            logger.info(f"处理URL {i+1}/{max_urls}: {url}")

            # A fresh page per URL keeps one broken paper page from affecting the next
            page = await context.new_page()

            try:
                result = await extract_content_from_url(url, page)
                results.append(result.dict())
                logger.info(f"成功处理URL: {url}")
            except Exception as e:
                logger.error(f"处理URL {url} 时出错: {str(e)}")
                results.append({
                    "url": url,
                    "error": str(e),
                    "title": "",
                    "authors": [],
                    "abstract": "",
                    "keywords": [],
                    "cite_format": ""
                })
            finally:
                # A page that refuses to close must not abort the remaining URLs
                try:
                    await page.close()
                except Exception as close_error:
                    logger.debug(f"[DEBUG] 关闭页面时出错: {str(close_error)}")

            # Short pause between requests to avoid being rate-limited
            await asyncio.sleep(1)

        logger.info(f"批量处理完成,共处理 {len(results)} 个URL")
        return results
    except Exception as e:
        logger.error(f"批量处理过程中出错: {str(e)}")
        logger.error(traceback.format_exc())
        return [{"error": f"批量处理过程中出错: {str(e)}"}] + results
1726
+
1727
+ # 添加关闭函数,在程序结束时清理资源
1728
async def cleanup_playwright():
    """Release the module-level Playwright context, browser and driver.

    Shutdown runs in that order (context, browser, driver). Each step is
    attempted even if an earlier one fails, and the three globals are reset to
    ``None`` up front so no stale handle survives a failed shutdown. Errors are
    logged instead of raised because this runs from ``finally`` blocks.
    """
    global playwright_instance, browser_instance, context

    # Detach the handles first; later code then sees "not initialised" whatever happens below
    ctx, browser, driver = context, browser_instance, playwright_instance
    context = None
    browser_instance = None
    playwright_instance = None

    shutdown_steps = (
        ("[DEBUG] 关闭playwright上下文", ctx.close if ctx else None),
        ("[DEBUG] 关闭浏览器实例", browser.close if browser else None),
        ("[DEBUG] 关闭playwright实例", driver.stop if driver else None),
    )

    for message, shutdown in shutdown_steps:
        if shutdown is None:
            continue
        logger.debug(message)
        try:
            await shutdown()
        except Exception as e:
            # Keep going: the remaining resources still need to be released
            logger.warning(f"清理playwright资源时出错: {str(e)}")
752
1746
 
753
1747
async def main():
    """Program entry point: serve MCP over stdin/stdout, then release Playwright.

    The Playwright cleanup sits in ``finally`` so it runs whether the server
    stops normally or with an error.
    """
    try:
        async with mcp.server.stdio.stdio_server() as streams:
            reader, writer = streams

            capabilities = server.get_capabilities(
                notification_options=NotificationOptions(),
                experimental_capabilities={},
            )
            init_options = InitializationOptions(
                server_name="cnks",
                server_version="0.3.0",
                capabilities=capabilities,
            )

            await server.run(reader, writer, init_options)
    finally:
        await cleanup_playwright()
769
1767
 
770
1768
  # 为符合README.md的要求,添加从FastMCP导出的接口
771
1769
  def create_fastmcp_server():
@@ -775,36 +1773,81 @@ def create_fastmcp_server():
775
1773
  fast_mcp = FastMCP("知网搜索")
776
1774
 
777
1775
  # 只添加搜索并提取的工具
778
- if extractor is not None and PLAYWRIGHT_AVAILABLE:
1776
+ if PLAYWRIGHT_AVAILABLE:
779
1777
  @fast_mcp.tool()
780
- async def mcp_cnks_search_and_extract(keywords: str) -> dict:
1778
+ async def search_and_extract(keywords: str) -> dict:
781
1779
  """搜索关键词并提取所有论文的详细内容"""
782
- logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")
1780
+ logger.debug("[DEBUG] 正在使用FastMCP的search_and_extract函数")
783
1781
  try:
784
1782
  # 第一步:执行搜索
785
- result_count = await search_with_playwright(keywords)
1783
+ logger.debug(f"[DEBUG] 开始搜索关键词: {keywords}")
1784
+
1785
+ # 构建URL
1786
+ url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
1787
+ logger.debug(f"[DEBUG] 搜索URL: {url}")
1788
+
1789
+ # 如果playwright可用,使用playwright搜索
1790
+ if PLAYWRIGHT_AVAILABLE:
1791
+ logger.debug("[DEBUG] 使用playwright搜索")
1792
+ result_count = await search_with_playwright(keywords)
1793
+ else:
1794
+ # 否则直接用open_chrome打开URL
1795
+ logger.debug("[DEBUG] 直接使用open_chrome打开URL")
1796
+ result = open_chrome(url)
1797
+
1798
+ if isinstance(result, str):
1799
+ # 如果是错误信息,返回错误
1800
+ return {
1801
+ "error": f"打开Chrome失败: {result}",
1802
+ "keywords": keywords,
1803
+ "count": 0,
1804
+ "results": []
1805
+ }
1806
+ else:
1807
+ # 成功打开但无法获取链接
1808
+ return {
1809
+ "keywords": keywords,
1810
+ "count": 0,
1811
+ "message": "已直接在Chrome中打开搜索页面,但无法自动获取搜索结果。请安装playwright以获取完整功能。",
1812
+ "results": []
1813
+ }
786
1814
 
787
1815
  # 检查搜索结果
788
1816
  if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
1817
+ # 如果没有找到链接,至少返回搜索页面作为结果
1818
+ logger.debug("[DEBUG] 搜索未返回有效链接,返回搜索页面作为结果")
789
1819
  return {
790
- "error": "搜索未返回有效链接",
791
1820
  "keywords": keywords,
792
- "count": 0,
793
- "results": []
1821
+ "count": 1,
1822
+ "results": [{
1823
+ "title": f"搜索结果: {keywords}",
1824
+ "authors": [],
1825
+ "abstract": "请在浏览器中查看搜索结果",
1826
+ "keywords": [],
1827
+ "cite_format": "",
1828
+ "url": url
1829
+ }]
794
1830
  }
795
1831
 
796
1832
  # 提取链接
797
1833
  urls = [link["url"] for link in page_content["links"] if "url" in link]
798
1834
  if not urls:
1835
+ logger.debug("[DEBUG] 没有找到有效链接,返回搜索页面")
799
1836
  return {
800
- "error": "未找到有效链接",
801
1837
  "keywords": keywords,
802
- "count": 0,
803
- "results": []
1838
+ "count": 1,
1839
+ "results": [{
1840
+ "title": f"搜索结果: {keywords}",
1841
+ "authors": [],
1842
+ "abstract": "请在浏览器中查看搜索结果",
1843
+ "keywords": [],
1844
+ "cite_format": "",
1845
+ "url": url
1846
+ }]
804
1847
  }
805
1848
 
806
1849
  # 第二步:执行提取
807
- results = await extractor.batch_extract_contents(urls)
1850
+ results = await batch_extract_contents(urls)
808
1851
 
809
1852
  # 包装结果
810
1853
  return {
@@ -816,6 +1859,7 @@ def create_fastmcp_server():
816
1859
  }
817
1860
  except Exception as e:
818
1861
  logger.error(f"搜索并提取时出错: {str(e)}")
1862
+ logger.error(traceback.format_exc())
819
1863
  return {
820
1864
  "error": f"搜索并提取内容时出错: {str(e)}",
821
1865
  "keywords": keywords,