cnks 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cnks/server.py CHANGED
@@ -1,832 +1,832 @@
1
- import asyncio
2
- import json
3
- import os
4
- import platform
5
- import re
6
- import subprocess
7
- import sys
8
- import time
9
- import logging
10
- import webbrowser
11
- import traceback
12
- from pathlib import Path
13
- from urllib.parse import quote
14
- from typing import Dict, List, Any, Optional, Union
15
-
16
- from mcp.server.models import InitializationOptions
17
- import mcp.types as types
18
- from mcp.server import NotificationOptions, Server
19
- from pydantic import AnyUrl
20
- import mcp.server.stdio
21
-
22
- # 配置日志记录
23
- logging.basicConfig(
24
- level=logging.DEBUG,
25
- filename="cnks.log",
26
- filemode="a",
27
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
28
- )
29
- logger = logging.getLogger("cnks")
30
-
31
- # 尝试导入playwright
32
- try:
33
- from playwright.async_api import async_playwright
34
- PLAYWRIGHT_AVAILABLE = True
35
- except ImportError:
36
- PLAYWRIGHT_AVAILABLE = False
37
- logger.warning("Playwright未安装,将使用传统方式打开Chrome")
38
-
39
- # 存储当前页面内容和笔记
40
- page_content = ""
41
- current_url = ""
42
- notes: dict[str, str] = {}
43
- browser_instance = None
44
-
45
- server = Server("cnks")
46
-
47
- # 导入我们新创建的extractor模块
48
- try:
49
- from . import chrome_extractor as extractor
50
- except ImportError:
51
- try:
52
- import chrome_extractor as extractor
53
- except ImportError:
54
- extractor = None
55
- logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
56
-
57
- def find_chrome_executable():
58
- """查找Chrome可执行文件路径"""
59
- system = platform.system()
60
-
61
- # 定义可能的Chrome位置
62
- if system == "Windows":
63
- chrome_paths = [
64
- r"C:\Program Files\Google\Chrome\Application\chrome.exe",
65
- r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
66
- os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
67
- ]
68
- elif system == "Darwin": # MacOS
69
- chrome_paths = [
70
- "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
71
- os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
72
- ]
73
- elif system == "Linux":
74
- chrome_paths = [
75
- "/usr/bin/google-chrome",
76
- "/usr/bin/chromium-browser",
77
- "/usr/bin/chromium",
78
- ]
79
- else:
80
- return None
81
-
82
- # 检查路径是否存在
83
- for path in chrome_paths:
84
- if os.path.exists(path):
85
- return path
86
-
87
- # 尝试从环境变量中查找
88
- chrome_env = os.environ.get("CHROME_PATH")
89
- if chrome_env and os.path.exists(chrome_env):
90
- return chrome_env
91
-
92
- return None
93
-
94
- def open_chrome(url):
95
- """打开Chrome浏览器并访问指定URL"""
96
- try:
97
- chrome_path = find_chrome_executable()
98
-
99
- if not chrome_path:
100
- return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
101
-
102
- subprocess.Popen([
103
- chrome_path,
104
- url
105
- ])
106
- time.sleep(2) # 等待页面加载
107
- return True
108
- except Exception as e:
109
- return f"打开Chrome时出错: {str(e)}"
110
-
111
- async def search_with_playwright(keywords):
112
- """使用playwright在知网搜索关键词"""
113
- global page_content
114
-
115
- if not PLAYWRIGHT_AVAILABLE:
116
- return "需要安装playwright模块:uv add playwright"
117
-
118
- try:
119
- chrome_path = find_chrome_executable()
120
- if not chrome_path:
121
- return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
122
-
123
- logger.debug(f"[DEBUG] 使用Playwright搜索,Chrome路径: {chrome_path}")
124
-
125
- # 创建全局浏览器实例,避免执行完关闭
126
- global browser_instance
127
-
128
- # 只打开一个playwright实例
129
- playwright = await async_playwright().start()
130
-
131
- # 尝试使用系统Chrome
132
- try:
133
- logger.debug("[DEBUG] 尝试使用channel='chrome'启动浏览器")
134
- browser = await playwright.chromium.launch(
135
- headless=False,
136
- channel="chrome"
137
- )
138
- except Exception as e:
139
- logger.debug(f"[DEBUG] channel='chrome'方式失败: {str(e)}")
140
- logger.debug("[DEBUG] 尝试使用executable_path启动浏览器")
141
- # 如果失败,尝试使用executable_path指定Chrome路径
142
- browser = await playwright.chromium.launch(
143
- headless=False,
144
- executable_path=chrome_path
145
- )
146
-
147
- # 保存浏览器实例以防止被关闭
148
- browser_instance = browser
149
-
150
- page = await browser.new_page()
151
-
152
- # 导航到知网搜索页面
153
- await page.goto("https://kns.cnki.net/kns8s/search")
154
- logger.debug("[DEBUG] 成功打开知网搜索页面")
155
-
156
- # 等待页面加载
157
- await page.wait_for_load_state("networkidle")
158
-
159
- # 查找并填写搜索框
160
- try:
161
- # 尝试定位搜索框
162
- search_input = await page.query_selector('input.search-input')
163
- if search_input:
164
- # 清空搜索框
165
- await search_input.fill("")
166
- # 输入关键词
167
- await search_input.fill(keywords)
168
- logger.debug(f"[DEBUG] 已在搜索框中输入: {keywords}")
169
-
170
- # 增加短暂等待以确保用户可以看到输入过程
171
- await asyncio.sleep(1)
172
-
173
- # 查找并点击搜索按钮
174
- search_button = await page.query_selector('.search-btn')
175
- if search_button:
176
- await search_button.click()
177
- logger.debug("[DEBUG] 已点击搜索按钮")
178
- # 等待搜索结果加载
179
- await page.wait_for_load_state("networkidle")
180
-
181
- # 点击操作1:点击下拉框的三角形
182
- try:
183
- # 等待一下,确保页面元素都加载完成
184
- await asyncio.sleep(2)
185
-
186
- # 尝试点击排序下拉框
187
- logger.debug("[DEBUG] 尝试点击排序下拉框")
188
- # 根据提供的HTML,尝试定位下拉框的三角形
189
- sort_dropdown = await page.query_selector('div[class="sort"][id="perPageDiv"]')
190
- if sort_dropdown:
191
- await sort_dropdown.click()
192
- logger.debug("[DEBUG] 成功点击排序下拉框")
193
-
194
- # 等待下拉菜单出现
195
- await asyncio.sleep(1)
196
-
197
- # 点击操作2:点击数字50选项
198
- logger.debug("[DEBUG] 尝试点击'50'选项")
199
- # 尝试定位"50"选项
200
- option_50 = await page.query_selector('li[data-val="50"]')
201
- if option_50:
202
- await option_50.click()
203
- logger.debug("[DEBUG] 成功点击'50'选项")
204
- await page.wait_for_load_state("networkidle")
205
-
206
- # 勾选来源类别中的CSSCI选项
207
- try:
208
- # 等待一下确保页面完全加载
209
- await asyncio.sleep(2)
210
-
211
- logger.debug("[DEBUG] 尝试勾选CSSCI选项")
212
-
213
- # 首先尝试找到来源类别区域
214
- # 通常来源类别会有一个标题或者分组
215
- source_category = await page.query_selector('div.group-item:has-text("来源类别")')
216
-
217
- if source_category:
218
- logger.debug("[DEBUG] 找到来源类别区域")
219
-
220
- # 在来源类别区域内查找CSSCI选项
221
- cssci_checkbox = await source_category.query_selector('input[type="checkbox"]:near(:text("CSSCI"))')
222
-
223
- if cssci_checkbox:
224
- # 点击CSSCI复选框
225
- await cssci_checkbox.click()
226
- logger.debug("[DEBUG] 成功勾选CSSCI选项")
227
-
228
- # 等待页面刷新
229
- await page.wait_for_load_state("networkidle")
230
-
231
- # 查找所有包含"article/abstract?v="字样的链接
232
- links_count = await find_and_count_abstract_links(page)
233
-
234
- return links_count
235
- else:
236
- logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
237
-
238
- # 尝试另一种方式:直接在整个页面中查找CSSCI
239
- cssci_text = await page.query_selector(':text("CSSCI")')
240
- if cssci_text:
241
- # 尝试点击文本附近的复选框
242
- await cssci_text.click()
243
- logger.debug("[DEBUG] 通过文本找到并点击了CSSCI")
244
- await page.wait_for_load_state("networkidle")
245
-
246
- # 查找所有包含"article/abstract?v="字样的链接
247
- links_count = await find_and_count_abstract_links(page)
248
-
249
- return links_count
250
- else:
251
- # 查找所有包含"article/abstract?v="字样的链接
252
- links_count = await find_and_count_abstract_links(page)
253
- return links_count
254
- else:
255
- logger.debug("[DEBUG] 未找到来源类别区域")
256
-
257
- # 尝试直接在页面中查找CSSCI文本
258
- cssci_text = await page.query_selector(':text("CSSCI")')
259
- if cssci_text:
260
- # 尝试点击文本附近的复选框
261
- await cssci_text.click()
262
- logger.debug("[DEBUG] 直接找到并点击了CSSCI")
263
- await page.wait_for_load_state("networkidle")
264
-
265
- # 查找所有包含"article/abstract?v="字样的链接
266
- links_count = await find_and_count_abstract_links(page)
267
-
268
- return links_count
269
- else:
270
- # 查找所有包含"article/abstract?v="字样的链接
271
- links_count = await find_and_count_abstract_links(page)
272
- return links_count
273
- except Exception as e:
274
- logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
275
- # 查找所有包含"article/abstract?v="字样的链接
276
- links_count = await find_and_count_abstract_links(page)
277
- return links_count
278
-
279
- # 查找所有包含"article/abstract?v="字样的链接
280
- links_count = await find_and_count_abstract_links(page)
281
- return links_count
282
- else:
283
- logger.debug("[DEBUG] 未找到'50'选项")
284
- page_content = {
285
- "count": 0,
286
- "links": [],
287
- "error": "已搜索并点击下拉框,但未找到'50'选项"
288
- }
289
- return 0
290
- else:
291
- logger.debug("[DEBUG] 未找到排序下拉框")
292
- page_content = {
293
- "count": 0,
294
- "links": [],
295
- "error": "已搜索,但未找到排序下拉框"
296
- }
297
- return 0
298
- except Exception as e:
299
- logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
300
- page_content = {
301
- "count": 0,
302
- "links": [],
303
- "error": f"已搜索,但在点击下拉框或选项时出错: {str(e)}"
304
- }
305
- return 0
306
- else:
307
- # 不关闭浏览器
308
- page_content = {
309
- "count": 0,
310
- "links": [],
311
- "error": f"已填写搜索关键词: {keywords},但未找到搜索按钮"
312
- }
313
- return 0
314
- else:
315
- # 不关闭浏览器
316
- page_content = {
317
- "count": 0,
318
- "links": [],
319
- "error": f"未找到搜索框,无法搜索: {keywords}"
320
- }
321
- return 0
322
- except Exception as e:
323
- logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
324
- # 不关闭浏览器
325
- page_content = {
326
- "count": 0,
327
- "links": [],
328
- "error": f"自动搜索过程中出错: {str(e)}"
329
- }
330
- return 0
331
- except Exception as e:
332
- error_msg = str(e)
333
- logger.debug(f"[DEBUG] Playwright错误: {error_msg}")
334
-
335
- # 如果是找不到Chrome的错误,提供更明确的指导
336
- if "Executable doesn't exist" in error_msg and "ms-playwright" in error_msg:
337
- error_message = f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome,请重新启动服务器。\n\n{error_msg}"
338
- else:
339
- error_message = f"使用Playwright启动Chrome失败: {error_msg}"
340
-
341
- page_content = {
342
- "count": 0,
343
- "links": [],
344
- "error": error_message
345
- }
346
- return 0
347
-
348
- def search_with_direct_chrome(keywords):
349
- """直接使用Chrome搜索,不使用playwright"""
350
- global page_content
351
-
352
- logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")
353
-
354
- try:
355
- url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
356
- logger.debug(f"[DEBUG] 打开URL: {url}")
357
-
358
- result = open_chrome(url)
359
-
360
- if isinstance(result, str) and "打开Chrome" in result:
361
- logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")
362
-
363
- page_content = {
364
- "count": 0,
365
- "links": [],
366
- "error": f"直接打开Chrome搜索: {result}"
367
- }
368
-
369
- else:
370
- logger.debug("[DEBUG] 直接打开Chrome成功")
371
-
372
- page_content = {
373
- "count": 0,
374
- "links": [],
375
- "message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
376
- }
377
-
378
- return page_content
379
- except Exception as e:
380
- logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")
381
-
382
- page_content = {
383
- "count": 0,
384
- "links": [],
385
- "error": f"使用Chrome搜索时出错: {str(e)}"
386
- }
387
-
388
- return page_content
389
-
390
- def get_page_content():
391
- """获取当前页面内容(简化模拟)"""
392
- global page_content, current_url
393
- if not current_url:
394
- return "尚未打开任何页面"
395
-
396
- # 实际应用中,这里可以使用Selenium或类似工具来获取实际页面内容
397
- # 此处为简化实现,返回模拟内容
398
- if "cnki" in current_url:
399
- return f"中国知网搜索页面\n当前URL: {current_url}\n可使用搜索工具查询文献。"
400
- return f"已打开页面: {current_url}"
401
-
402
- @server.list_resources()
403
- async def handle_list_resources() -> list[types.Resource]:
404
- """列出可用资源"""
405
- resources = []
406
-
407
- # 当前网页资源
408
- resources.append(
409
- types.Resource(
410
- uri=AnyUrl("webpage://current"),
411
- name="当前网页",
412
- description="当前打开的网页内容",
413
- mimeType="text/plain",
414
- )
415
- )
416
-
417
- # 知网搜索页资源
418
- resources.append(
419
- types.Resource(
420
- uri=AnyUrl("webpage://cnki/search"),
421
- name="知网搜索页",
422
- description="中国知网搜索页面",
423
- mimeType="text/plain",
424
- )
425
- )
426
-
427
- # 笔记资源
428
- for name in notes:
429
- resources.append(
430
- types.Resource(
431
- uri=AnyUrl(f"note://internal/{name}"),
432
- name=f"笔记: {name}",
433
- description=f"笔记: {name}",
434
- mimeType="text/plain",
435
- )
436
- )
437
-
438
- return resources
439
-
440
- @server.read_resource()
441
- async def handle_read_resource(uri: AnyUrl) -> str:
442
- """读取资源内容"""
443
- global current_url
444
-
445
- scheme = uri.scheme
446
-
447
- if scheme == "webpage":
448
- path = uri.path if uri.path else ""
449
- host = uri.host if uri.host else ""
450
-
451
- if host == "current":
452
- return get_page_content()
453
- elif host == "cnki" and path == "/search":
454
- # 打开知网搜索页
455
- current_url = "https://kns.cnki.net/kns8s/search"
456
- result = open_chrome(current_url)
457
- if result is True:
458
- return "已打开中国知网搜索页面,可使用搜索工具查询文献。"
459
- else:
460
- return result
461
- elif scheme == "note":
462
- name = uri.path
463
- if name is not None:
464
- name = name.lstrip("/")
465
- if name in notes:
466
- return notes[name]
467
- raise ValueError(f"笔记未找到: {name}")
468
-
469
- raise ValueError(f"不支持的URI方案或资源未找到: {uri}")
470
-
471
- @server.list_prompts()
472
- async def handle_list_prompts() -> list[types.Prompt]:
473
- """列出可用提示"""
474
- return [
475
- types.Prompt(
476
- name="search-literature",
477
- description="按主题搜索文献",
478
- arguments=[
479
- types.PromptArgument(
480
- name="keywords",
481
- description="搜索关键词",
482
- required=True,
483
- )
484
- ],
485
- ),
486
- types.Prompt(
487
- name="advanced-search",
488
- description="高级文献搜索",
489
- arguments=[
490
- types.PromptArgument(
491
- name="title",
492
- description="论文标题",
493
- required=False,
494
- ),
495
- types.PromptArgument(
496
- name="author",
497
- description="作者",
498
- required=False,
499
- ),
500
- types.PromptArgument(
501
- name="keywords",
502
- description="关键词",
503
- required=False,
504
- ),
505
- types.PromptArgument(
506
- name="institution",
507
- description="机构",
508
- required=False,
509
- ),
510
- ],
511
- ),
512
- types.Prompt(
513
- name="summarize-notes",
514
- description="总结所有笔记",
515
- arguments=[
516
- types.PromptArgument(
517
- name="style",
518
- description="摘要风格 (brief/detailed)",
519
- required=False,
520
- )
521
- ],
522
- )
523
- ]
524
-
525
- @server.get_prompt()
526
- async def handle_get_prompt(
527
- name: str, arguments: dict[str, str] | None
528
- ) -> types.GetPromptResult:
529
- """生成提示"""
530
- if name == "search-literature":
531
- keywords = (arguments or {}).get("keywords", "")
532
- return types.GetPromptResult(
533
- description="按主题搜索文献",
534
- messages=[
535
- types.PromptMessage(
536
- role="user",
537
- content=types.TextContent(
538
- type="text",
539
- text=f"请在中国知网搜索关于\"{keywords}\"的文献,并分析主要研究趋势。"
540
- ),
541
- )
542
- ],
543
- )
544
- elif name == "advanced-search":
545
- title = (arguments or {}).get("title", "")
546
- author = (arguments or {}).get("author", "")
547
- keywords = (arguments or {}).get("keywords", "")
548
- institution = (arguments or {}).get("institution", "")
549
-
550
- search_terms = []
551
- if title:
552
- search_terms.append(f"标题包含\"{title}\"")
553
- if author:
554
- search_terms.append(f"作者为\"{author}\"")
555
- if keywords:
556
- search_terms.append(f"关键词包含\"{keywords}\"")
557
- if institution:
558
- search_terms.append(f"机构为\"{institution}\"")
559
-
560
- search_criteria = "、".join(search_terms)
561
-
562
- return types.GetPromptResult(
563
- description="高级文献搜索",
564
- messages=[
565
- types.PromptMessage(
566
- role="user",
567
- content=types.TextContent(
568
- type="text",
569
- text=f"请在中国知网搜索{search_criteria}的文献,并总结相关研究成果。"
570
- ),
571
- )
572
- ],
573
- )
574
- elif name == "summarize-notes":
575
- style = (arguments or {}).get("style", "brief")
576
- detail_prompt = "请提供详细分析。" if style == "detailed" else ""
577
-
578
- return types.GetPromptResult(
579
- description="总结所有笔记",
580
- messages=[
581
- types.PromptMessage(
582
- role="user",
583
- content=types.TextContent(
584
- type="text",
585
- text=f"以下是需要总结的笔记:{detail_prompt}\n\n"
586
- + "\n".join(
587
- f"- {name}: {content}"
588
- for name, content in notes.items()
589
- ),
590
- ),
591
- )
592
- ],
593
- )
594
-
595
- raise ValueError(f"未知提示: {name}")
596
-
597
- @server.list_tools()
598
- async def handle_list_tools() -> list[types.Tool]:
599
- """列出可用工具"""
600
- tools = []
601
-
602
- # 只添加搜索并提取的组合工具
603
- if extractor is not None and PLAYWRIGHT_AVAILABLE:
604
- tools.append(
605
- types.Tool(
606
- name="mcp_cnks_search_and_extract",
607
- description="搜索知网关键词并提取所有论文的详细内容",
608
- inputSchema={
609
- "type": "object",
610
- "properties": {
611
- "keywords": {"type": "string", "description": "搜索关键词"},
612
- },
613
- "required": ["keywords"],
614
- },
615
- )
616
- )
617
-
618
- return tools
619
-
620
- @server.call_tool()
621
- async def handle_call_tool(
622
- name: str, arguments: dict | None
623
- ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
624
- """处理工具执行请求"""
625
- global current_url, page_content
626
-
627
- if name == "mcp_cnks_search_and_extract" and extractor is not None and PLAYWRIGHT_AVAILABLE:
628
- if not arguments:
629
- raise ValueError("缺少参数")
630
-
631
- keywords = arguments.get("keywords")
632
- if not keywords:
633
- raise ValueError("缺少关键词")
634
-
635
- try:
636
- # 第一步:执行搜索
637
- logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
638
- links_count = await search_with_playwright(keywords)
639
- current_url = "https://kns.cnki.net/kns8s/search"
640
-
641
- # 检查搜索结果
642
- if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
643
- return [
644
- types.TextContent(
645
- type="text",
646
- text=json.dumps({
647
- "error": "搜索未返回有效链接",
648
- "count": 0,
649
- "results": []
650
- }, ensure_ascii=False)
651
- )
652
- ]
653
-
654
- # 提取链接
655
- urls = [link["url"] for link in page_content["links"] if "url" in link]
656
- if not urls:
657
- return [
658
- types.TextContent(
659
- type="text",
660
- text=json.dumps({
661
- "error": "未找到有效链接",
662
- "count": 0,
663
- "results": []
664
- }, ensure_ascii=False)
665
- )
666
- ]
667
-
668
- # 第二步:执行提取
669
- logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
670
- results = await extractor.batch_extract_contents(urls)
671
-
672
- # 包装结果
673
- result_json = {
674
- "keywords": keywords,
675
- "count": len(results),
676
- "results": results,
677
- "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
678
- "error_count": sum(1 for r in results if "error" in r and r["error"])
679
- }
680
-
681
- return [
682
- types.TextContent(
683
- type="text",
684
- text=json.dumps(result_json, ensure_ascii=False)
685
- )
686
- ]
687
- except Exception as e:
688
- logger.error(f"搜索并提取时出错: {str(e)}")
689
- logger.error(traceback.format_exc())
690
- return [
691
- types.TextContent(
692
- type="text",
693
- text=json.dumps({
694
- "error": f"搜索并提取内容时出错: {str(e)}",
695
- "keywords": keywords,
696
- "count": 0,
697
- "results": []
698
- }, ensure_ascii=False)
699
- )
700
- ]
701
-
702
- else:
703
- raise ValueError(f"未知工具: {name}")
704
-
705
- async def find_and_count_abstract_links(page):
706
- """查找并统计包含article/abstract?v=的链接"""
707
- global page_content
708
-
709
- try:
710
- logger.debug("[DEBUG] 开始查找所有包含article/abstract?v=的链接")
711
-
712
- # 等待确保页面完全加载
713
- await asyncio.sleep(2)
714
-
715
- # 查找所有链接
716
- all_links = await page.query_selector_all('a[href*="article/abstract?v="]')
717
- links_count = len(all_links)
718
-
719
- logger.debug(f"[DEBUG] 找到{links_count}条包含article/abstract?v=的链接")
720
-
721
- # 提取并记录每个链接的URL和文本
722
- links_info = []
723
-
724
- for i, link in enumerate(all_links):
725
- href = await link.get_attribute('href')
726
-
727
- links_info.append({
728
- 'index': i + 1,
729
- 'href': href
730
- })
731
-
732
- logger.debug(f"[DEBUG] 链接 {i+1}: {href}")
733
-
734
- # 判断数量是否符合预期(50条)
735
- if links_count == 50:
736
- logger.debug("[DEBUG] 链接数量正好是50条,符合预期")
737
- elif links_count < 50:
738
- logger.debug(f"[DEBUG] 链接数量为{links_count}条,少于预期的50条")
739
- else:
740
- logger.debug(f"[DEBUG] 链接数量为{links_count}条,多于预期的50条")
741
-
742
- # 存储结果 - 使用字典结构而不是纯文本
743
- page_content = {
744
- "count": links_count,
745
- "links": [{"index": link['index'], "url": link['href']} for link in links_info]
746
- }
747
-
748
- return links_count
749
- except Exception as e:
750
- logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
751
- return 0
752
-
753
- async def main():
754
- """主程序入口"""
755
- # 使用stdin/stdout流运行服务器
756
- async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
757
- await server.run(
758
- read_stream,
759
- write_stream,
760
- InitializationOptions(
761
- server_name="cnks",
762
- server_version="0.2.2",
763
- capabilities=server.get_capabilities(
764
- notification_options=NotificationOptions(),
765
- experimental_capabilities={},
766
- ),
767
- ),
768
- )
769
-
770
- # 为符合README.md的要求,添加从FastMCP导出的接口
771
- def create_fastmcp_server():
772
- """创建FastMCP服务器接口,符合README中的示例"""
773
- try:
774
- from mcp.server.fastmcp import FastMCP
775
- fast_mcp = FastMCP("知网搜索")
776
-
777
- # 只添加搜索并提取的工具
778
- if extractor is not None and PLAYWRIGHT_AVAILABLE:
779
- @fast_mcp.tool()
780
- async def mcp_cnks_search_and_extract(keywords: str) -> dict:
781
- """搜索关键词并提取所有论文的详细内容"""
782
- logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")
783
- try:
784
- # 第一步:执行搜索
785
- result_count = await search_with_playwright(keywords)
786
-
787
- # 检查搜索结果
788
- if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
789
- return {
790
- "error": "搜索未返回有效链接",
791
- "keywords": keywords,
792
- "count": 0,
793
- "results": []
794
- }
795
-
796
- # 提取链接
797
- urls = [link["url"] for link in page_content["links"] if "url" in link]
798
- if not urls:
799
- return {
800
- "error": "未找到有效链接",
801
- "keywords": keywords,
802
- "count": 0,
803
- "results": []
804
- }
805
-
806
- # 第二步:执行提取
807
- results = await extractor.batch_extract_contents(urls)
808
-
809
- # 包装结果
810
- return {
811
- "keywords": keywords,
812
- "count": len(results),
813
- "results": results,
814
- "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
815
- "error_count": sum(1 for r in results if "error" in r and r["error"])
816
- }
817
- except Exception as e:
818
- logger.error(f"搜索并提取时出错: {str(e)}")
819
- return {
820
- "error": f"搜索并提取内容时出错: {str(e)}",
821
- "keywords": keywords,
822
- "count": 0,
823
- "results": []
824
- }
825
-
826
- return fast_mcp
827
- except ImportError:
828
- logger.warning("警告: 无法导入FastMCP,请确保已安装最新版本的MCP")
829
- return None
830
-
831
- if __name__ == "__main__":
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import platform
5
+ import re
6
+ import subprocess
7
+ import sys
8
+ import time
9
+ import logging
10
+ import webbrowser
11
+ import traceback
12
+ from pathlib import Path
13
+ from urllib.parse import quote
14
+ from typing import Dict, List, Any, Optional, Union
15
+
16
+ from mcp.server.models import InitializationOptions
17
+ import mcp.types as types
18
+ from mcp.server import NotificationOptions, Server
19
+ from pydantic import AnyUrl
20
+ import mcp.server.stdio
21
+
22
+ # 配置日志记录
23
+ logging.basicConfig(
24
+ level=logging.DEBUG,
25
+ filename="cnks.log",
26
+ filemode="a",
27
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
28
+ )
29
+ logger = logging.getLogger("cnks")
30
+
31
+ # 尝试导入playwright
32
+ try:
33
+ from playwright.async_api import async_playwright
34
+ PLAYWRIGHT_AVAILABLE = True
35
+ except ImportError:
36
+ PLAYWRIGHT_AVAILABLE = False
37
+ logger.warning("Playwright未安装,将使用传统方式打开Chrome")
38
+
39
+ # 存储当前页面内容和笔记
40
+ page_content = ""
41
+ current_url = ""
42
+ notes: dict[str, str] = {}
43
+ browser_instance = None
44
+
45
+ server = Server("cnks")
46
+
47
+ # 导入我们新创建的extractor模块
48
+ try:
49
+ from . import chrome_extractor as extractor
50
+ except ImportError:
51
+ try:
52
+ import chrome_extractor as extractor
53
+ except ImportError:
54
+ extractor = None
55
+ logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
56
+
57
+ def find_chrome_executable():
58
+ """查找Chrome可执行文件路径"""
59
+ system = platform.system()
60
+
61
+ # 定义可能的Chrome位置
62
+ if system == "Windows":
63
+ chrome_paths = [
64
+ r"C:\Program Files\Google\Chrome\Application\chrome.exe",
65
+ r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
66
+ os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
67
+ ]
68
+ elif system == "Darwin": # MacOS
69
+ chrome_paths = [
70
+ "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
71
+ os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
72
+ ]
73
+ elif system == "Linux":
74
+ chrome_paths = [
75
+ "/usr/bin/google-chrome",
76
+ "/usr/bin/chromium-browser",
77
+ "/usr/bin/chromium",
78
+ ]
79
+ else:
80
+ return None
81
+
82
+ # 检查路径是否存在
83
+ for path in chrome_paths:
84
+ if os.path.exists(path):
85
+ return path
86
+
87
+ # 尝试从环境变量中查找
88
+ chrome_env = os.environ.get("CHROME_PATH")
89
+ if chrome_env and os.path.exists(chrome_env):
90
+ return chrome_env
91
+
92
+ return None
93
+
94
+ def open_chrome(url):
95
+ """打开Chrome浏览器并访问指定URL"""
96
+ try:
97
+ chrome_path = find_chrome_executable()
98
+
99
+ if not chrome_path:
100
+ return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
101
+
102
+ subprocess.Popen([
103
+ chrome_path,
104
+ url
105
+ ])
106
+ time.sleep(2) # 等待页面加载
107
+ return True
108
+ except Exception as e:
109
+ return f"打开Chrome时出错: {str(e)}"
110
+
111
+ async def search_with_playwright(keywords):
112
+ """使用playwright在知网搜索关键词"""
113
+ global page_content
114
+
115
+ if not PLAYWRIGHT_AVAILABLE:
116
+ return "需要安装playwright模块:uv add playwright"
117
+
118
+ try:
119
+ chrome_path = find_chrome_executable()
120
+ if not chrome_path:
121
+ return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
122
+
123
+ logger.debug(f"[DEBUG] 使用Playwright搜索,Chrome路径: {chrome_path}")
124
+
125
+ # 创建全局浏览器实例,避免执行完关闭
126
+ global browser_instance
127
+
128
+ # 只打开一个playwright实例
129
+ playwright = await async_playwright().start()
130
+
131
+ # 尝试使用系统Chrome
132
+ try:
133
+ logger.debug("[DEBUG] 尝试使用channel='chrome'启动浏览器")
134
+ browser = await playwright.chromium.launch(
135
+ headless=False,
136
+ channel="chrome"
137
+ )
138
+ except Exception as e:
139
+ logger.debug(f"[DEBUG] channel='chrome'方式失败: {str(e)}")
140
+ logger.debug("[DEBUG] 尝试使用executable_path启动浏览器")
141
+ # 如果失败,尝试使用executable_path指定Chrome路径
142
+ browser = await playwright.chromium.launch(
143
+ headless=False,
144
+ executable_path=chrome_path
145
+ )
146
+
147
+ # 保存浏览器实例以防止被关闭
148
+ browser_instance = browser
149
+
150
+ page = await browser.new_page()
151
+
152
+ # 导航到知网搜索页面
153
+ await page.goto("https://kns.cnki.net/kns8s/search")
154
+ logger.debug("[DEBUG] 成功打开知网搜索页面")
155
+
156
+ # 等待页面加载
157
+ await page.wait_for_load_state("networkidle")
158
+
159
+ # 查找并填写搜索框
160
+ try:
161
+ # 尝试定位搜索框
162
+ search_input = await page.query_selector('input.search-input')
163
+ if search_input:
164
+ # 清空搜索框
165
+ await search_input.fill("")
166
+ # 输入关键词
167
+ await search_input.fill(keywords)
168
+ logger.debug(f"[DEBUG] 已在搜索框中输入: {keywords}")
169
+
170
+ # 增加短暂等待以确保用户可以看到输入过程
171
+ await asyncio.sleep(1)
172
+
173
+ # 查找并点击搜索按钮
174
+ search_button = await page.query_selector('.search-btn')
175
+ if search_button:
176
+ await search_button.click()
177
+ logger.debug("[DEBUG] 已点击搜索按钮")
178
+ # 等待搜索结果加载
179
+ await page.wait_for_load_state("networkidle")
180
+
181
+ # 点击操作1:点击下拉框的三角形
182
+ try:
183
+ # 等待一下,确保页面元素都加载完成
184
+ await asyncio.sleep(2)
185
+
186
+ # 尝试点击排序下拉框
187
+ logger.debug("[DEBUG] 尝试点击排序下拉框")
188
+ # 根据提供的HTML,尝试定位下拉框的三角形
189
+ sort_dropdown = await page.query_selector('div[class="sort"][id="perPageDiv"]')
190
+ if sort_dropdown:
191
+ await sort_dropdown.click()
192
+ logger.debug("[DEBUG] 成功点击排序下拉框")
193
+
194
+ # 等待下拉菜单出现
195
+ await asyncio.sleep(1)
196
+
197
+ # 点击操作2:点击数字50选项
198
+ logger.debug("[DEBUG] 尝试点击'50'选项")
199
+ # 尝试定位"50"选项
200
+ option_50 = await page.query_selector('li[data-val="50"]')
201
+ if option_50:
202
+ await option_50.click()
203
+ logger.debug("[DEBUG] 成功点击'50'选项")
204
+ await page.wait_for_load_state("networkidle")
205
+
206
+ # 勾选来源类别中的CSSCI选项
207
+ try:
208
+ # 等待一下确保页面完全加载
209
+ await asyncio.sleep(2)
210
+
211
+ logger.debug("[DEBUG] 尝试勾选CSSCI选项")
212
+
213
+ # 首先尝试找到来源类别区域
214
+ # 通常来源类别会有一个标题或者分组
215
+ source_category = await page.query_selector('div.group-item:has-text("来源类别")')
216
+
217
+ if source_category:
218
+ logger.debug("[DEBUG] 找到来源类别区域")
219
+
220
+ # 在来源类别区域内查找CSSCI选项
221
+ cssci_checkbox = await source_category.query_selector('input[type="checkbox"]:near(:text("CSSCI"))')
222
+
223
+ if cssci_checkbox:
224
+ # 点击CSSCI复选框
225
+ await cssci_checkbox.click()
226
+ logger.debug("[DEBUG] 成功勾选CSSCI选项")
227
+
228
+ # 等待页面刷新
229
+ await page.wait_for_load_state("networkidle")
230
+
231
+ # 查找所有包含"article/abstract?v="字样的链接
232
+ links_count = await find_and_count_abstract_links(page)
233
+
234
+ return links_count
235
+ else:
236
+ logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
237
+
238
+ # 尝试另一种方式:直接在整个页面中查找CSSCI
239
+ cssci_text = await page.query_selector(':text("CSSCI")')
240
+ if cssci_text:
241
+ # 尝试点击文本附近的复选框
242
+ await cssci_text.click()
243
+ logger.debug("[DEBUG] 通过文本找到并点击了CSSCI")
244
+ await page.wait_for_load_state("networkidle")
245
+
246
+ # 查找所有包含"article/abstract?v="字样的链接
247
+ links_count = await find_and_count_abstract_links(page)
248
+
249
+ return links_count
250
+ else:
251
+ # 查找所有包含"article/abstract?v="字样的链接
252
+ links_count = await find_and_count_abstract_links(page)
253
+ return links_count
254
+ else:
255
+ logger.debug("[DEBUG] 未找到来源类别区域")
256
+
257
+ # 尝试直接在页面中查找CSSCI文本
258
+ cssci_text = await page.query_selector(':text("CSSCI")')
259
+ if cssci_text:
260
+ # 尝试点击文本附近的复选框
261
+ await cssci_text.click()
262
+ logger.debug("[DEBUG] 直接找到并点击了CSSCI")
263
+ await page.wait_for_load_state("networkidle")
264
+
265
+ # 查找所有包含"article/abstract?v="字样的链接
266
+ links_count = await find_and_count_abstract_links(page)
267
+
268
+ return links_count
269
+ else:
270
+ # 查找所有包含"article/abstract?v="字样的链接
271
+ links_count = await find_and_count_abstract_links(page)
272
+ return links_count
273
+ except Exception as e:
274
+ logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
275
+ # 查找所有包含"article/abstract?v="字样的链接
276
+ links_count = await find_and_count_abstract_links(page)
277
+ return links_count
278
+
279
+ # 查找所有包含"article/abstract?v="字样的链接
280
+ links_count = await find_and_count_abstract_links(page)
281
+ return links_count
282
+ else:
283
+ logger.debug("[DEBUG] 未找到'50'选项")
284
+ page_content = {
285
+ "count": 0,
286
+ "links": [],
287
+ "error": "已搜索并点击下拉框,但未找到'50'选项"
288
+ }
289
+ return 0
290
+ else:
291
+ logger.debug("[DEBUG] 未找到排序下拉框")
292
+ page_content = {
293
+ "count": 0,
294
+ "links": [],
295
+ "error": "已搜索,但未找到排序下拉框"
296
+ }
297
+ return 0
298
+ except Exception as e:
299
+ logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
300
+ page_content = {
301
+ "count": 0,
302
+ "links": [],
303
+ "error": f"已搜索,但在点击下拉框或选项时出错: {str(e)}"
304
+ }
305
+ return 0
306
+ else:
307
+ # 不关闭浏览器
308
+ page_content = {
309
+ "count": 0,
310
+ "links": [],
311
+ "error": f"已填写搜索关键词: {keywords},但未找到搜索按钮"
312
+ }
313
+ return 0
314
+ else:
315
+ # 不关闭浏览器
316
+ page_content = {
317
+ "count": 0,
318
+ "links": [],
319
+ "error": f"未找到搜索框,无法搜索: {keywords}"
320
+ }
321
+ return 0
322
+ except Exception as e:
323
+ logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
324
+ # 不关闭浏览器
325
+ page_content = {
326
+ "count": 0,
327
+ "links": [],
328
+ "error": f"自动搜索过程中出错: {str(e)}"
329
+ }
330
+ return 0
331
+ except Exception as e:
332
+ error_msg = str(e)
333
+ logger.debug(f"[DEBUG] Playwright错误: {error_msg}")
334
+
335
+ # 如果是找不到Chrome的错误,提供更明确的指导
336
+ if "Executable doesn't exist" in error_msg and "ms-playwright" in error_msg:
337
+ error_message = f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome,请重新启动服务器。\n\n{error_msg}"
338
+ else:
339
+ error_message = f"使用Playwright启动Chrome失败: {error_msg}"
340
+
341
+ page_content = {
342
+ "count": 0,
343
+ "links": [],
344
+ "error": error_message
345
+ }
346
+ return 0
347
+
348
def search_with_direct_chrome(keywords):
    """Open the CNKI search URL for ``keywords`` in Chrome, without Playwright.

    The outcome is stored in the module-level ``page_content`` and returned.
    It is always a dict with ``count`` 0 and an empty ``links`` list, plus
    either an ``error`` entry (``open_chrome`` returned a string containing
    "打开Chrome", or an exception was raised) or a ``message`` entry (any
    other return value is treated as success).
    """
    global page_content

    logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")

    outcome = {"count": 0, "links": []}
    try:
        url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
        logger.debug(f"[DEBUG] 打开URL: {url}")

        result = open_chrome(url)
        reported_problem = isinstance(result, str) and "打开Chrome" in result

        if reported_problem:
            logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")
            outcome["error"] = f"直接打开Chrome搜索: {result}"
        else:
            logger.debug("[DEBUG] 直接打开Chrome成功")
            # Links cannot be collected without Playwright, so only a hint
            # for the user is recorded.
            outcome["message"] = "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
    except Exception as e:
        logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")
        outcome = {
            "count": 0,
            "links": [],
            "error": f"使用Chrome搜索时出错: {str(e)}",
        }

    page_content = outcome
    return page_content
389
+
390
def get_page_content():
    """Return a short textual description of the currently opened page.

    This is a simplified stand-in: it only inspects the module-level
    ``current_url`` and never reads real page content.
    """
    if current_url:
        # A real implementation could fetch the actual page with a browser
        # automation tool; here a canned description is returned instead.
        if "cnki" in current_url:
            return f"中国知网搜索页面\n当前URL: {current_url}\n可使用搜索工具查询文献。"
        return f"已打开页面: {current_url}"
    return "尚未打开任何页面"
401
+
402
@server.list_resources()
async def handle_list_resources() -> list[types.Resource]:
    """List the available resources: two fixed web pages plus one per note."""
    # Fixed entries: the current web page and the CNKI search page.
    fixed_pages = [
        types.Resource(
            uri=AnyUrl("webpage://current"),
            name="当前网页",
            description="当前打开的网页内容",
            mimeType="text/plain",
        ),
        types.Resource(
            uri=AnyUrl("webpage://cnki/search"),
            name="知网搜索页",
            description="中国知网搜索页面",
            mimeType="text/plain",
        ),
    ]

    # One resource per stored note, addressed as note://internal/<name>.
    note_entries = [
        types.Resource(
            uri=AnyUrl(f"note://internal/{name}"),
            name=f"笔记: {name}",
            description=f"笔记: {name}",
            mimeType="text/plain",
        )
        for name in notes
    ]

    return fixed_pages + note_entries
439
+
440
@server.read_resource()
async def handle_read_resource(uri: AnyUrl) -> str:
    """Read the content of a ``webpage://`` or ``note://`` resource.

    ``webpage://current`` returns the description from ``get_page_content``.
    ``webpage://cnki/search`` opens the CNKI search page in Chrome and
    returns a confirmation, or whatever ``open_chrome`` returned when that
    is not ``True``. ``note://.../<name>`` returns the stored note text.

    Raises:
        ValueError: if the note does not exist, or the URI matches no
            supported resource.
    """
    global current_url

    scheme = uri.scheme

    if scheme == "note":
        name = uri.path
        if name is not None:
            name = name.lstrip("/")
        if name in notes:
            return notes[name]
        raise ValueError(f"笔记未找到: {name}")

    if scheme == "webpage":
        host = uri.host or ""
        path = uri.path or ""

        if host == "current":
            return get_page_content()

        if host == "cnki" and path == "/search":
            # Open the CNKI search page and remember it as the current URL.
            current_url = "https://kns.cnki.net/kns8s/search"
            result = open_chrome(current_url)
            if result is not True:
                return result
            return "已打开中国知网搜索页面,可使用搜索工具查询文献。"

    raise ValueError(f"不支持的URI方案或资源未找到: {uri}")
470
+
471
@server.list_prompts()
async def handle_list_prompts() -> list[types.Prompt]:
    """List the prompts offered by this server."""

    def optional_arg(arg_name: str, text: str) -> types.PromptArgument:
        # Every argument except the basic search keywords is optional.
        return types.PromptArgument(
            name=arg_name,
            description=text,
            required=False,
        )

    search_literature = types.Prompt(
        name="search-literature",
        description="按主题搜索文献",
        arguments=[
            types.PromptArgument(
                name="keywords",
                description="搜索关键词",
                required=True,
            )
        ],
    )

    advanced_search = types.Prompt(
        name="advanced-search",
        description="高级文献搜索",
        arguments=[
            optional_arg("title", "论文标题"),
            optional_arg("author", "作者"),
            optional_arg("keywords", "关键词"),
            optional_arg("institution", "机构"),
        ],
    )

    summarize_notes = types.Prompt(
        name="summarize-notes",
        description="总结所有笔记",
        arguments=[optional_arg("style", "摘要风格 (brief/detailed)")],
    )

    return [search_literature, advanced_search, summarize_notes]
524
+
525
@server.get_prompt()
async def handle_get_prompt(
    name: str, arguments: dict[str, str] | None
) -> types.GetPromptResult:
    """Build the prompt called ``name`` from the supplied arguments.

    Supported names are "search-literature", "advanced-search" and
    "summarize-notes". Missing arguments default to an empty string
    ("brief" for the summary style).

    Raises:
        ValueError: if ``name`` is not a known prompt.
    """
    args = arguments or {}

    def as_user_prompt(description: str, text: str) -> types.GetPromptResult:
        # Every prompt here is a single user message containing plain text.
        return types.GetPromptResult(
            description=description,
            messages=[
                types.PromptMessage(
                    role="user",
                    content=types.TextContent(type="text", text=text),
                )
            ],
        )

    if name == "search-literature":
        keywords = args.get("keywords", "")
        return as_user_prompt(
            "按主题搜索文献",
            f"请在中国知网搜索关于\"{keywords}\"的文献,并分析主要研究趋势。",
        )

    if name == "advanced-search":
        # Pair each optional field with the phrase that introduces it; only
        # the fields that were actually supplied end up in the criteria.
        labelled_fields = [
            (args.get("title", ""), "标题包含"),
            (args.get("author", ""), "作者为"),
            (args.get("keywords", ""), "关键词包含"),
            (args.get("institution", ""), "机构为"),
        ]
        search_terms = [
            f'{prefix}"{value}"' for value, prefix in labelled_fields if value
        ]
        search_criteria = "、".join(search_terms)
        return as_user_prompt(
            "高级文献搜索",
            f"请在中国知网搜索{search_criteria}的文献,并总结相关研究成果。",
        )

    if name == "summarize-notes":
        style = args.get("style", "brief")
        detail_prompt = "请提供详细分析。" if style == "detailed" else ""
        note_lines = "\n".join(
            f"- {note_name}: {content}" for note_name, content in notes.items()
        )
        return as_user_prompt(
            "总结所有笔记",
            f"以下是需要总结的笔记:{detail_prompt}\n\n" + note_lines,
        )

    raise ValueError(f"未知提示: {name}")
596
+
597
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """List the available tools.

    The combined search-and-extract tool is offered only when both the
    extractor module and Playwright were imported successfully; otherwise
    the list is empty.
    """
    if extractor is None or not PLAYWRIGHT_AVAILABLE:
        return []

    search_and_extract = types.Tool(
        name="mcp_cnks_search_and_extract",
        description="搜索知网关键词并提取所有论文的详细内容",
        inputSchema={
            "type": "object",
            "properties": {
                "keywords": {"type": "string", "description": "搜索关键词"},
            },
            "required": ["keywords"],
        },
    )
    return [search_and_extract]
619
+
620
def _json_text_content(payload: dict) -> types.TextContent:
    """Serialize ``payload`` to JSON and wrap it in a text content item.

    ``TextContent.text`` is a string field, so a dict has to be serialized
    before it is handed over; passing the dict itself is rejected by the
    model validation.
    """
    # Function-scope import keeps this block self-contained.
    import json

    # ensure_ascii=False keeps Chinese text readable instead of \u escapes.
    return types.TextContent(
        type="text", text=json.dumps(payload, ensure_ascii=False)
    )


@server.call_tool()
async def handle_call_tool(
    name: str, arguments: dict | None
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """Handle a tool execution request.

    The only supported tool is "mcp_cnks_search_and_extract", and only when
    the extractor module and Playwright are both available. It searches CNKI
    for ``arguments["keywords"]`` and extracts the content behind every
    result link.

    Returns:
        A single-element list holding a text item whose text is a JSON
        object with ``count`` and ``results`` (plus ``keywords``,
        ``success_count`` and ``error_count`` on success, or ``error`` on
        failure).

    Raises:
        ValueError: if the tool is unknown or unavailable, or if the
            arguments or the keywords are missing.
    """
    global current_url, page_content

    tool_available = extractor is not None and PLAYWRIGHT_AVAILABLE
    if name != "mcp_cnks_search_and_extract" or not tool_available:
        raise ValueError(f"未知工具: {name}")

    if not arguments:
        raise ValueError("缺少参数")

    keywords = arguments.get("keywords")
    if not keywords:
        raise ValueError("缺少关键词")

    try:
        # Step 1: run the search. Clear the previous results first so that
        # a search which fails without updating page_content cannot cause
        # links from an earlier call to be extracted again.
        logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
        page_content = {"count": 0, "links": []}
        await search_with_playwright(keywords)
        current_url = "https://kns.cnki.net/kns8s/search"

        # The search reports its findings through the page_content global.
        if (
            not isinstance(page_content, dict)
            or "links" not in page_content
            or not page_content["links"]
        ):
            return [
                _json_text_content(
                    {
                        "error": "搜索未返回有效链接",
                        "count": 0,
                        "results": [],
                    }
                )
            ]

        urls = [link["url"] for link in page_content["links"] if "url" in link]
        if not urls:
            return [
                _json_text_content(
                    {
                        "error": "未找到有效链接",
                        "count": 0,
                        "results": [],
                    }
                )
            ]

        # Step 2: extract the content behind every link.
        logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
        results = await extractor.batch_extract_contents(urls)

        # An entry counts as failed when it carries a truthy "error" value;
        # every other entry counts as a success.
        error_count = sum(1 for r in results if "error" in r and r["error"])
        return [
            _json_text_content(
                {
                    "keywords": keywords,
                    "count": len(results),
                    "results": results,
                    "success_count": len(results) - error_count,
                    "error_count": error_count,
                }
            )
        ]
    except Exception as e:
        logger.error(f"搜索并提取时出错: {str(e)}")
        logger.error(traceback.format_exc())
        return [
            _json_text_content(
                {
                    "error": f"搜索并提取内容时出错: {str(e)}",
                    "keywords": keywords,
                    "count": 0,
                    "results": [],
                }
            )
        ]
704
+
705
async def find_and_count_abstract_links(page):
    """Collect every link on ``page`` whose href contains "article/abstract?v=".

    The result is stored in the module-level ``page_content`` as
    ``{"count": int, "links": [{"index": int, "url": str}, ...]}`` with
    1-based indexes.

    Args:
        page: object providing an awaitable ``query_selector_all``; the
            returned elements must provide an awaitable ``get_attribute``.

    Returns:
        The number of matching links, or 0 if anything went wrong. On
        failure ``page_content`` is replaced by an empty result carrying an
        ``error`` entry, so callers that read ``page_content`` never see
        links left over from an earlier search.
    """
    global page_content

    expected_links = 50  # the search is switched to 50 results per page

    try:
        logger.debug("[DEBUG] 开始查找所有包含article/abstract?v=的链接")

        # Give the page a moment to finish loading before querying it.
        await asyncio.sleep(2)

        all_links = await page.query_selector_all('a[href*="article/abstract?v="]')
        links_count = len(all_links)

        logger.debug(f"[DEBUG] 找到{links_count}条包含article/abstract?v=的链接")

        # Record the URL of every link together with its 1-based position.
        links = []
        for position, link in enumerate(all_links, start=1):
            href = await link.get_attribute('href')
            links.append({"index": position, "url": href})
            logger.debug(f"[DEBUG] 链接 {position}: {href}")

        # Purely informational: compare against the expected page size.
        if links_count == expected_links:
            logger.debug("[DEBUG] 链接数量正好是50条,符合预期")
        elif links_count < expected_links:
            logger.debug(f"[DEBUG] 链接数量为{links_count}条,少于预期的50条")
        else:
            logger.debug(f"[DEBUG] 链接数量为{links_count}条,多于预期的50条")

        # Store the result as a dict rather than plain text.
        page_content = {
            "count": links_count,
            "links": links,
        }

        return links_count
    except Exception as e:
        logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
        # Overwrite whatever an earlier search left behind; otherwise the
        # callers would go on to extract stale links.
        page_content = {
            "count": 0,
            "links": [],
            "error": f"查找链接时出错: {str(e)}",
        }
        return 0
752
+
753
async def main():
    """Program entry point: run the MCP server over stdin/stdout streams."""
    async with mcp.server.stdio.stdio_server() as streams:
        read_stream, write_stream = streams

        capabilities = server.get_capabilities(
            notification_options=NotificationOptions(),
            experimental_capabilities={},
        )
        init_options = InitializationOptions(
            server_name="cnks",
            server_version="0.2.3",
            capabilities=capabilities,
        )

        await server.run(read_stream, write_stream, init_options)
769
+
770
# Expose a FastMCP-based interface as well, as required by README.md.
def create_fastmcp_server():
    """Create the FastMCP server interface shown in the README example.

    Returns:
        A ``FastMCP`` instance, with the search-and-extract tool registered
        when both the extractor module and Playwright are available, or
        ``None`` if an ``ImportError`` is raised while setting it up.
    """
    try:
        from mcp.server.fastmcp import FastMCP
        fast_mcp = FastMCP("知网搜索")

        tool_available = extractor is not None and PLAYWRIGHT_AVAILABLE
        if tool_available:
            # The docstring below is left untranslated on purpose, in case
            # FastMCP uses it as the tool description.
            @fast_mcp.tool()
            async def mcp_cnks_search_and_extract(keywords: str) -> dict:
                """搜索关键词并提取所有论文的详细内容"""

                def failure(message: str) -> dict:
                    # Common shape of every unsuccessful answer.
                    return {
                        "error": message,
                        "keywords": keywords,
                        "count": 0,
                        "results": [],
                    }

                logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")
                try:
                    # Step 1: run the search; its findings are published
                    # through the module-level page_content.
                    await search_with_playwright(keywords)

                    has_links = (
                        isinstance(page_content, dict)
                        and "links" in page_content
                        and bool(page_content["links"])
                    )
                    if not has_links:
                        return failure("搜索未返回有效链接")

                    urls = [
                        entry["url"]
                        for entry in page_content["links"]
                        if "url" in entry
                    ]
                    if not urls:
                        return failure("未找到有效链接")

                    # Step 2: extract the content behind every link.
                    results = await extractor.batch_extract_contents(urls)

                    succeeded = sum(
                        1 for r in results if "error" not in r or not r["error"]
                    )
                    failed = sum(1 for r in results if "error" in r and r["error"])
                    return {
                        "keywords": keywords,
                        "count": len(results),
                        "results": results,
                        "success_count": succeeded,
                        "error_count": failed,
                    }
                except Exception as e:
                    logger.error(f"搜索并提取时出错: {str(e)}")
                    return failure(f"搜索并提取内容时出错: {str(e)}")

        return fast_mcp
    except ImportError:
        logger.warning("警告: 无法导入FastMCP,请确保已安装最新版本的MCP")
        return None
830
+
831
# Start the stdio server only when this module is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())