cnks 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cnks/server.py CHANGED
@@ -7,8 +7,11 @@ import subprocess
7
7
  import sys
8
8
  import time
9
9
  import logging
10
+ import webbrowser
11
+ import traceback
10
12
  from pathlib import Path
11
13
  from urllib.parse import quote
14
+ from typing import Dict, List, Any, Optional, Union
12
15
 
13
16
  from mcp.server.models import InitializationOptions
14
17
  import mcp.types as types
@@ -41,6 +44,16 @@ browser_instance = None
41
44
 
42
45
  server = Server("cnks")
43
46
 
47
+ # 导入我们新创建的extractor模块
48
+ try:
49
+ from . import chrome_extractor as extractor
50
+ except ImportError:
51
+ try:
52
+ import chrome_extractor as extractor
53
+ except ImportError:
54
+ extractor = None
55
+ logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
56
+
44
57
  def find_chrome_executable():
45
58
  """查找Chrome可执行文件路径"""
46
59
  system = platform.system()
@@ -97,6 +110,8 @@ def open_chrome(url):
97
110
 
98
111
  async def search_with_playwright(keywords):
99
112
  """使用playwright在知网搜索关键词"""
113
+ global page_content
114
+
100
115
  if not PLAYWRIGHT_AVAILABLE:
101
116
  return "需要安装playwright模块:uv add playwright"
102
117
 
@@ -216,7 +231,7 @@ async def search_with_playwright(keywords):
216
231
  # 查找所有包含"article/abstract?v="字样的链接
217
232
  links_count = await find_and_count_abstract_links(page)
218
233
 
219
- return f"已完成全部操作:搜索关键词、设置每页显示50条、勾选CSSCI来源类别。找到{links_count}条包含article/abstract?v=的链接。浏览器将保持打开状态。"
234
+ return links_count
220
235
  else:
221
236
  logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
222
237
 
@@ -231,9 +246,11 @@ async def search_with_playwright(keywords):
231
246
  # 查找所有包含"article/abstract?v="字样的链接
232
247
  links_count = await find_and_count_abstract_links(page)
233
248
 
234
- return f"已完成全部操作:搜索关键词、设置每页显示50条、勾选CSSCI来源类别。找到{links_count}条包含article/abstract?v=的链接。浏览器将保持打开状态。"
249
+ return links_count
235
250
  else:
236
- return "已完成搜索和设置每页显示50条,但未找到CSSCI选项。浏览器将保持打开状态。"
251
+ # 查找所有包含"article/abstract?v="字样的链接
252
+ links_count = await find_and_count_abstract_links(page)
253
+ return links_count
237
254
  else:
238
255
  logger.debug("[DEBUG] 未找到来源类别区域")
239
256
 
@@ -248,61 +265,127 @@ async def search_with_playwright(keywords):
248
265
  # 查找所有包含"article/abstract?v="字样的链接
249
266
  links_count = await find_and_count_abstract_links(page)
250
267
 
251
- return f"已完成全部操作:搜索关键词、设置每页显示50条、勾选CSSCI来源类别。找到{links_count}条包含article/abstract?v=的链接。浏览器将保持打开状态。"
268
+ return links_count
252
269
  else:
253
- return "已完成搜索和设置每页显示50条,但未找到来源类别区域或CSSCI选项。浏览器将保持打开状态。"
270
+ # 查找所有包含"article/abstract?v="字样的链接
271
+ links_count = await find_and_count_abstract_links(page)
272
+ return links_count
254
273
  except Exception as e:
255
274
  logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
256
- return f"已完成搜索和设置每页显示50条,但勾选CSSCI时出错: {str(e)}。浏览器将保持打开状态。"
275
+ # 查找所有包含"article/abstract?v="字样的链接
276
+ links_count = await find_and_count_abstract_links(page)
277
+ return links_count
257
278
 
258
- return "已完成全部操作:搜索关键词、点击排序下拉框、选择每页显示50条。浏览器将保持打开状态。"
279
+ # 查找所有包含"article/abstract?v="字样的链接
280
+ links_count = await find_and_count_abstract_links(page)
281
+ return links_count
259
282
  else:
260
283
  logger.debug("[DEBUG] 未找到'50'选项")
261
- return "已搜索并点击下拉框,但未找到'50'选项。浏览器将保持打开状态。"
284
+ page_content = {
285
+ "count": 0,
286
+ "links": [],
287
+ "error": "已搜索并点击下拉框,但未找到'50'选项"
288
+ }
289
+ return 0
262
290
  else:
263
291
  logger.debug("[DEBUG] 未找到排序下拉框")
264
- return "已搜索,但未找到排序下拉框。浏览器将保持打开状态。"
292
+ page_content = {
293
+ "count": 0,
294
+ "links": [],
295
+ "error": "已搜索,但未找到排序下拉框"
296
+ }
297
+ return 0
265
298
  except Exception as e:
266
299
  logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
267
- return f"已搜索,但在点击下拉框或选项时出错: {str(e)}。浏览器将保持打开状态。"
268
-
269
- # 不关闭浏览器,让它保持打开状态
270
- # 注意:不调用 browser.close() 和 playwright.stop()
300
+ page_content = {
301
+ "count": 0,
302
+ "links": [],
303
+ "error": f"已搜索,但在点击下拉框或选项时出错: {str(e)}"
304
+ }
305
+ return 0
271
306
  else:
272
307
  # 不关闭浏览器
273
- return f"已填写搜索关键词: {keywords},但未找到搜索按钮。请手动点击搜索。"
308
+ page_content = {
309
+ "count": 0,
310
+ "links": [],
311
+ "error": f"已填写搜索关键词: {keywords},但未找到搜索按钮"
312
+ }
313
+ return 0
274
314
  else:
275
315
  # 不关闭浏览器
276
- return f"未找到搜索框。已打开知网页面,请手动搜索: {keywords}"
316
+ page_content = {
317
+ "count": 0,
318
+ "links": [],
319
+ "error": f"未找到搜索框,无法搜索: {keywords}"
320
+ }
321
+ return 0
277
322
  except Exception as e:
278
323
  logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
279
324
  # 不关闭浏览器
280
- return f"自动搜索过程中出错,请手动在页面中搜索: {keywords}"
325
+ page_content = {
326
+ "count": 0,
327
+ "links": [],
328
+ "error": f"自动搜索过程中出错: {str(e)}"
329
+ }
330
+ return 0
281
331
  except Exception as e:
282
332
  error_msg = str(e)
283
333
  logger.debug(f"[DEBUG] Playwright错误: {error_msg}")
284
334
 
285
335
  # 如果是找不到Chrome的错误,提供更明确的指导
286
336
  if "Executable doesn't exist" in error_msg and "ms-playwright" in error_msg:
287
- return f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome,请重新启动服务器。\n\n{error_msg}"
288
-
289
- # 如果Playwright启动失败,使用传统方式打开Chrome
290
- return f"使用Playwright启动Chrome失败: {error_msg}。尝试使用传统方式打开浏览器。"
337
+ error_message = f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome,请重新启动服务器。\n\n{error_msg}"
338
+ else:
339
+ error_message = f"使用Playwright启动Chrome失败: {error_msg}"
340
+
341
+ page_content = {
342
+ "count": 0,
343
+ "links": [],
344
+ "error": error_message
345
+ }
346
+ return 0
291
347
 
292
348
  def search_with_direct_chrome(keywords):
293
349
  """直接使用Chrome搜索,不使用playwright"""
294
- logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")
295
-
296
- # 构建知网搜索URL - 知网不支持URL参数搜索,所以只能打开页面
297
- url = "https://kns.cnki.net/kns8s/search"
350
+ global page_content
298
351
 
299
- # 打开Chrome
300
- result = open_chrome(url)
352
+ logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")
301
353
 
302
- if result is True:
303
- return f"已打开知网页面。请在搜索框中输入并搜索: {keywords}"
304
- else:
305
- return f"打开Chrome浏览器失败: {result}"
354
+ try:
355
+ url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
356
+ logger.debug(f"[DEBUG] 打开URL: {url}")
357
+
358
+ result = open_chrome(url)
359
+
360
+ if isinstance(result, str) and "打开Chrome" in result:
361
+ logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")
362
+
363
+ page_content = {
364
+ "count": 0,
365
+ "links": [],
366
+ "error": f"直接打开Chrome搜索: {result}"
367
+ }
368
+
369
+ else:
370
+ logger.debug("[DEBUG] 直接打开Chrome成功")
371
+
372
+ page_content = {
373
+ "count": 0,
374
+ "links": [],
375
+ "message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
376
+ }
377
+
378
+ return page_content
379
+ except Exception as e:
380
+ logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")
381
+
382
+ page_content = {
383
+ "count": 0,
384
+ "links": [],
385
+ "error": f"使用Chrome搜索时出错: {str(e)}"
386
+ }
387
+
388
+ return page_content
306
389
 
307
390
  def get_page_content():
308
391
  """获取当前页面内容(简化模拟)"""
@@ -514,49 +597,25 @@ async def handle_get_prompt(
514
597
  @server.list_tools()
515
598
  async def handle_list_tools() -> list[types.Tool]:
516
599
  """列出可用工具"""
517
- return [
518
- types.Tool(
519
- name="open-cnki",
520
- description="打开中国知网搜索页面",
521
- inputSchema={
522
- "type": "object",
523
- "properties": {},
524
- "required": [],
525
- },
526
- ),
527
- types.Tool(
528
- name="search-keywords",
529
- description="在知网搜索关键词",
530
- inputSchema={
531
- "type": "object",
532
- "properties": {
533
- "keywords": {"type": "string", "description": "搜索关键词"},
534
- },
535
- "required": ["keywords"],
536
- },
537
- ),
538
- types.Tool(
539
- name="add-note",
540
- description="添加笔记",
541
- inputSchema={
542
- "type": "object",
543
- "properties": {
544
- "name": {"type": "string", "description": "笔记名称"},
545
- "content": {"type": "string", "description": "笔记内容"},
600
+ tools = []
601
+
602
+ # 只添加搜索并提取的组合工具
603
+ if extractor is not None and PLAYWRIGHT_AVAILABLE:
604
+ tools.append(
605
+ types.Tool(
606
+ name="mcp_cnks_search_and_extract",
607
+ description="搜索知网关键词并提取所有论文的详细内容",
608
+ inputSchema={
609
+ "type": "object",
610
+ "properties": {
611
+ "keywords": {"type": "string", "description": "搜索关键词"},
612
+ },
613
+ "required": ["keywords"],
546
614
  },
547
- "required": ["name", "content"],
548
- },
549
- ),
550
- types.Tool(
551
- name="get-abstract-links",
552
- description="获取最近一次搜索找到的论文摘要链接",
553
- inputSchema={
554
- "type": "object",
555
- "properties": {},
556
- "required": [],
557
- },
615
+ )
558
616
  )
559
- ]
617
+
618
+ return tools
560
619
 
561
620
  @server.call_tool()
562
621
  async def handle_call_tool(
@@ -565,25 +624,7 @@ async def handle_call_tool(
565
624
  """处理工具执行请求"""
566
625
  global current_url, page_content
567
626
 
568
- if name == "open-cnki":
569
- current_url = "https://kns.cnki.net/kns8s/search"
570
- result = open_chrome(current_url)
571
- if result is True:
572
- return [
573
- types.TextContent(
574
- type="text",
575
- text="已打开中国知网搜索页面。"
576
- )
577
- ]
578
- else:
579
- return [
580
- types.TextContent(
581
- type="text",
582
- text=f"打开中国知网时出错: {result}"
583
- )
584
- ]
585
-
586
- elif name == "search-keywords":
627
+ if name == "mcp_cnks_search_and_extract" and extractor is not None and PLAYWRIGHT_AVAILABLE:
587
628
  if not arguments:
588
629
  raise ValueError("缺少参数")
589
630
 
@@ -591,72 +632,80 @@ async def handle_call_tool(
591
632
  if not keywords:
592
633
  raise ValueError("缺少关键词")
593
634
 
594
- # 优先使用playwright进行搜索
595
- if PLAYWRIGHT_AVAILABLE:
596
- result = await search_with_playwright(keywords)
635
+ try:
636
+ # 第一步:执行搜索
637
+ logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
638
+ links_count = await search_with_playwright(keywords)
597
639
  current_url = "https://kns.cnki.net/kns8s/search"
598
640
 
599
- return [
600
- types.TextContent(
601
- type="text",
602
- text=result
603
- )
604
- ]
605
- else:
606
- # 如果没有playwright,回退到传统方式
607
- result = search_with_direct_chrome(keywords)
608
- current_url = "https://kns.cnki.net/kns8s/search"
641
+ # 检查搜索结果
642
+ if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
643
+ return [
644
+ types.TextContent(
645
+ type="text",
646
+ text=json.dumps({
647
+ "error": "搜索未返回有效链接",
648
+ "count": 0,
649
+ "results": []
650
+ }, ensure_ascii=False)
651
+ )
652
+ ]
653
+
654
+ # 提取链接
655
+ urls = [link["url"] for link in page_content["links"] if "url" in link]
656
+ if not urls:
657
+ return [
658
+ types.TextContent(
659
+ type="text",
660
+ text=json.dumps({
661
+ "error": "未找到有效链接",
662
+ "count": 0,
663
+ "results": []
664
+ }, ensure_ascii=False)
665
+ )
666
+ ]
667
+
668
+ # 第二步:执行提取
669
+ logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
670
+ results = await extractor.batch_extract_contents(urls)
671
+
672
+ # 包装结果
673
+ result_json = {
674
+ "keywords": keywords,
675
+ "count": len(results),
676
+ "results": results,
677
+ "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
678
+ "error_count": sum(1 for r in results if "error" in r and r["error"])
679
+ }
609
680
 
610
681
  return [
611
682
  types.TextContent(
612
683
  type="text",
613
- text=f"{result}。如需自动搜索功能,请安装: uv add playwright"
684
+ text=json.dumps(result_json, ensure_ascii=False)
614
685
  )
615
686
  ]
616
-
617
- elif name == "add-note":
618
- if not arguments:
619
- raise ValueError("缺少参数")
620
-
621
- note_name = arguments.get("name")
622
- content = arguments.get("content")
623
-
624
- if not note_name or not content:
625
- raise ValueError("缺少名称或内容")
626
-
627
- # 更新服务器状态
628
- notes[note_name] = content
629
-
630
- # 通知客户端资源已更改
631
- await server.request_context.session.send_resource_list_changed()
632
-
633
- return [
634
- types.TextContent(
635
- type="text",
636
- text=f"已添加笔记 '{note_name}': {content}"
637
- )
638
- ]
639
-
640
- elif name == "get-abstract-links":
641
- if not page_content or "找到" not in page_content:
687
+ except Exception as e:
688
+ logger.error(f"搜索并提取时出错: {str(e)}")
689
+ logger.error(traceback.format_exc())
642
690
  return [
643
691
  types.TextContent(
644
692
  type="text",
645
- text="尚未执行搜索或未找到链接。请先使用search-keywords工具搜索。"
693
+ text=json.dumps({
694
+ "error": f"搜索并提取内容时出错: {str(e)}",
695
+ "keywords": keywords,
696
+ "count": 0,
697
+ "results": []
698
+ }, ensure_ascii=False)
646
699
  )
647
700
  ]
648
-
649
- return [
650
- types.TextContent(
651
- type="text",
652
- text=page_content
653
- )
654
- ]
655
701
 
656
- raise ValueError(f"未知工具: {name}")
702
+ else:
703
+ raise ValueError(f"未知工具: {name}")
657
704
 
658
705
  async def find_and_count_abstract_links(page):
659
706
  """查找并统计包含article/abstract?v=的链接"""
707
+ global page_content
708
+
660
709
  try:
661
710
  logger.debug("[DEBUG] 开始查找所有包含article/abstract?v=的链接")
662
711
 
@@ -690,11 +739,11 @@ async def find_and_count_abstract_links(page):
690
739
  else:
691
740
  logger.debug(f"[DEBUG] 链接数量为{links_count}条,多于预期的50条")
692
741
 
693
- # 存储结果 - 只包含编号和链接,不包含标题和连字符
694
- global page_content
695
- page_content = f"找到{links_count}条包含article/abstract?v=的链接\n\n" + "\n".join([
696
- f"{link['index']}. {link['href']}" for link in links_info
697
- ])
742
+ # 存储结果 - 使用字典结构而不是纯文本
743
+ page_content = {
744
+ "count": links_count,
745
+ "links": [{"index": link['index'], "url": link['href']} for link in links_info]
746
+ }
698
747
 
699
748
  return links_count
700
749
  except Exception as e:
@@ -710,7 +759,7 @@ async def main():
710
759
  write_stream,
711
760
  InitializationOptions(
712
761
  server_name="cnks",
713
- server_version="0.1.0",
762
+ server_version="0.2.1",
714
763
  capabilities=server.get_capabilities(
715
764
  notification_options=NotificationOptions(),
716
765
  experimental_capabilities={},
@@ -725,33 +774,54 @@ def create_fastmcp_server():
725
774
  from mcp.server.fastmcp import FastMCP
726
775
  fast_mcp = FastMCP("知网搜索")
727
776
 
728
- @fast_mcp.tool()
729
- def open_cnki_search():
730
- """打开中国知网搜索页面"""
731
- return open_chrome("https://kns.cnki.net/kns8s/search")
732
-
733
- @fast_mcp.tool()
734
- async def search_keywords(keywords: str) -> str:
735
- """在知网搜索关键词"""
736
- logger.debug("[DEBUG] 正在使用FastMCP的search_keywords函数")
737
- if PLAYWRIGHT_AVAILABLE:
738
- result = await search_with_playwright(keywords)
739
- return result
740
- else:
741
- result = search_with_direct_chrome(keywords)
742
- return f"{result}。如需自动搜索功能,请安装: uv add playwright"
743
-
744
- @fast_mcp.tool()
745
- def get_abstract_links() -> str:
746
- """获取最近一次搜索找到的论文摘要链接"""
747
- if not page_content or "找到" not in page_content:
748
- return "尚未执行搜索或未找到链接。请先使用search_keywords工具搜索。"
749
- return page_content
750
-
751
- @fast_mcp.resource("webpage://current")
752
- def get_current_webpage() -> str:
753
- """获取当前网页内容"""
754
- return get_page_content()
777
+ # 只添加搜索并提取的工具
778
+ if extractor is not None and PLAYWRIGHT_AVAILABLE:
779
+ @fast_mcp.tool()
780
+ async def mcp_cnks_search_and_extract(keywords: str) -> dict:
781
+ """搜索关键词并提取所有论文的详细内容"""
782
+ logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")
783
+ try:
784
+ # 第一步:执行搜索
785
+ result_count = await search_with_playwright(keywords)
786
+
787
+ # 检查搜索结果
788
+ if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
789
+ return {
790
+ "error": "搜索未返回有效链接",
791
+ "keywords": keywords,
792
+ "count": 0,
793
+ "results": []
794
+ }
795
+
796
+ # 提取链接
797
+ urls = [link["url"] for link in page_content["links"] if "url" in link]
798
+ if not urls:
799
+ return {
800
+ "error": "未找到有效链接",
801
+ "keywords": keywords,
802
+ "count": 0,
803
+ "results": []
804
+ }
805
+
806
+ # 第二步:执行提取
807
+ results = await extractor.batch_extract_contents(urls)
808
+
809
+ # 包装结果
810
+ return {
811
+ "keywords": keywords,
812
+ "count": len(results),
813
+ "results": results,
814
+ "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
815
+ "error_count": sum(1 for r in results if "error" in r and r["error"])
816
+ }
817
+ except Exception as e:
818
+ logger.error(f"搜索并提取时出错: {str(e)}")
819
+ return {
820
+ "error": f"搜索并提取内容时出错: {str(e)}",
821
+ "keywords": keywords,
822
+ "count": 0,
823
+ "results": []
824
+ }
755
825
 
756
826
  return fast_mcp
757
827
  except ImportError: