cnks 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks/__init__.py +17 -6
- cnks/chrome_extractor.py +413 -0
- cnks/extractor.py +250 -0
- cnks/server.py +242 -172
- cnks-0.2.1.dist-info/METADATA +181 -0
- cnks-0.2.1.dist-info/RECORD +8 -0
- cnks-0.1.0.dist-info/METADATA +0 -841
- cnks-0.1.0.dist-info/RECORD +0 -6
- {cnks-0.1.0.dist-info → cnks-0.2.1.dist-info}/WHEEL +0 -0
- {cnks-0.1.0.dist-info → cnks-0.2.1.dist-info}/entry_points.txt +0 -0
cnks/server.py
CHANGED
@@ -7,8 +7,11 @@ import subprocess
|
|
7
7
|
import sys
|
8
8
|
import time
|
9
9
|
import logging
|
10
|
+
import webbrowser
|
11
|
+
import traceback
|
10
12
|
from pathlib import Path
|
11
13
|
from urllib.parse import quote
|
14
|
+
from typing import Dict, List, Any, Optional, Union
|
12
15
|
|
13
16
|
from mcp.server.models import InitializationOptions
|
14
17
|
import mcp.types as types
|
@@ -41,6 +44,16 @@ browser_instance = None
|
|
41
44
|
|
42
45
|
server = Server("cnks")
|
43
46
|
|
47
|
+
# 导入我们新创建的extractor模块
|
48
|
+
try:
|
49
|
+
from . import chrome_extractor as extractor
|
50
|
+
except ImportError:
|
51
|
+
try:
|
52
|
+
import chrome_extractor as extractor
|
53
|
+
except ImportError:
|
54
|
+
extractor = None
|
55
|
+
logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
|
56
|
+
|
44
57
|
def find_chrome_executable():
|
45
58
|
"""查找Chrome可执行文件路径"""
|
46
59
|
system = platform.system()
|
@@ -97,6 +110,8 @@ def open_chrome(url):
|
|
97
110
|
|
98
111
|
async def search_with_playwright(keywords):
|
99
112
|
"""使用playwright在知网搜索关键词"""
|
113
|
+
global page_content
|
114
|
+
|
100
115
|
if not PLAYWRIGHT_AVAILABLE:
|
101
116
|
return "需要安装playwright模块:uv add playwright"
|
102
117
|
|
@@ -216,7 +231,7 @@ async def search_with_playwright(keywords):
|
|
216
231
|
# 查找所有包含"article/abstract?v="字样的链接
|
217
232
|
links_count = await find_and_count_abstract_links(page)
|
218
233
|
|
219
|
-
return
|
234
|
+
return links_count
|
220
235
|
else:
|
221
236
|
logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
|
222
237
|
|
@@ -231,9 +246,11 @@ async def search_with_playwright(keywords):
|
|
231
246
|
# 查找所有包含"article/abstract?v="字样的链接
|
232
247
|
links_count = await find_and_count_abstract_links(page)
|
233
248
|
|
234
|
-
return
|
249
|
+
return links_count
|
235
250
|
else:
|
236
|
-
|
251
|
+
# 查找所有包含"article/abstract?v="字样的链接
|
252
|
+
links_count = await find_and_count_abstract_links(page)
|
253
|
+
return links_count
|
237
254
|
else:
|
238
255
|
logger.debug("[DEBUG] 未找到来源类别区域")
|
239
256
|
|
@@ -248,61 +265,127 @@ async def search_with_playwright(keywords):
|
|
248
265
|
# 查找所有包含"article/abstract?v="字样的链接
|
249
266
|
links_count = await find_and_count_abstract_links(page)
|
250
267
|
|
251
|
-
return
|
268
|
+
return links_count
|
252
269
|
else:
|
253
|
-
|
270
|
+
# 查找所有包含"article/abstract?v="字样的链接
|
271
|
+
links_count = await find_and_count_abstract_links(page)
|
272
|
+
return links_count
|
254
273
|
except Exception as e:
|
255
274
|
logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
|
256
|
-
|
275
|
+
# 查找所有包含"article/abstract?v="字样的链接
|
276
|
+
links_count = await find_and_count_abstract_links(page)
|
277
|
+
return links_count
|
257
278
|
|
258
|
-
|
279
|
+
# 查找所有包含"article/abstract?v="字样的链接
|
280
|
+
links_count = await find_and_count_abstract_links(page)
|
281
|
+
return links_count
|
259
282
|
else:
|
260
283
|
logger.debug("[DEBUG] 未找到'50'选项")
|
261
|
-
|
284
|
+
page_content = {
|
285
|
+
"count": 0,
|
286
|
+
"links": [],
|
287
|
+
"error": "已搜索并点击下拉框,但未找到'50'选项"
|
288
|
+
}
|
289
|
+
return 0
|
262
290
|
else:
|
263
291
|
logger.debug("[DEBUG] 未找到排序下拉框")
|
264
|
-
|
292
|
+
page_content = {
|
293
|
+
"count": 0,
|
294
|
+
"links": [],
|
295
|
+
"error": "已搜索,但未找到排序下拉框"
|
296
|
+
}
|
297
|
+
return 0
|
265
298
|
except Exception as e:
|
266
299
|
logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
300
|
+
page_content = {
|
301
|
+
"count": 0,
|
302
|
+
"links": [],
|
303
|
+
"error": f"已搜索,但在点击下拉框或选项时出错: {str(e)}"
|
304
|
+
}
|
305
|
+
return 0
|
271
306
|
else:
|
272
307
|
# 不关闭浏览器
|
273
|
-
|
308
|
+
page_content = {
|
309
|
+
"count": 0,
|
310
|
+
"links": [],
|
311
|
+
"error": f"已填写搜索关键词: {keywords},但未找到搜索按钮"
|
312
|
+
}
|
313
|
+
return 0
|
274
314
|
else:
|
275
315
|
# 不关闭浏览器
|
276
|
-
|
316
|
+
page_content = {
|
317
|
+
"count": 0,
|
318
|
+
"links": [],
|
319
|
+
"error": f"未找到搜索框,无法搜索: {keywords}"
|
320
|
+
}
|
321
|
+
return 0
|
277
322
|
except Exception as e:
|
278
323
|
logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
|
279
324
|
# 不关闭浏览器
|
280
|
-
|
325
|
+
page_content = {
|
326
|
+
"count": 0,
|
327
|
+
"links": [],
|
328
|
+
"error": f"自动搜索过程中出错: {str(e)}"
|
329
|
+
}
|
330
|
+
return 0
|
281
331
|
except Exception as e:
|
282
332
|
error_msg = str(e)
|
283
333
|
logger.debug(f"[DEBUG] Playwright错误: {error_msg}")
|
284
334
|
|
285
335
|
# 如果是找不到Chrome的错误,提供更明确的指导
|
286
336
|
if "Executable doesn't exist" in error_msg and "ms-playwright" in error_msg:
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
337
|
+
error_message = f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome,请重新启动服务器。\n\n{error_msg}"
|
338
|
+
else:
|
339
|
+
error_message = f"使用Playwright启动Chrome失败: {error_msg}"
|
340
|
+
|
341
|
+
page_content = {
|
342
|
+
"count": 0,
|
343
|
+
"links": [],
|
344
|
+
"error": error_message
|
345
|
+
}
|
346
|
+
return 0
|
291
347
|
|
292
348
|
def search_with_direct_chrome(keywords):
|
293
349
|
"""直接使用Chrome搜索,不使用playwright"""
|
294
|
-
|
295
|
-
|
296
|
-
# 构建知网搜索URL - 知网不支持URL参数搜索,所以只能打开页面
|
297
|
-
url = "https://kns.cnki.net/kns8s/search"
|
350
|
+
global page_content
|
298
351
|
|
299
|
-
|
300
|
-
result = open_chrome(url)
|
352
|
+
logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")
|
301
353
|
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
354
|
+
try:
|
355
|
+
url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
|
356
|
+
logger.debug(f"[DEBUG] 打开URL: {url}")
|
357
|
+
|
358
|
+
result = open_chrome(url)
|
359
|
+
|
360
|
+
if isinstance(result, str) and "打开Chrome" in result:
|
361
|
+
logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")
|
362
|
+
|
363
|
+
page_content = {
|
364
|
+
"count": 0,
|
365
|
+
"links": [],
|
366
|
+
"error": f"直接打开Chrome搜索: {result}"
|
367
|
+
}
|
368
|
+
|
369
|
+
else:
|
370
|
+
logger.debug("[DEBUG] 直接打开Chrome成功")
|
371
|
+
|
372
|
+
page_content = {
|
373
|
+
"count": 0,
|
374
|
+
"links": [],
|
375
|
+
"message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
|
376
|
+
}
|
377
|
+
|
378
|
+
return page_content
|
379
|
+
except Exception as e:
|
380
|
+
logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")
|
381
|
+
|
382
|
+
page_content = {
|
383
|
+
"count": 0,
|
384
|
+
"links": [],
|
385
|
+
"error": f"使用Chrome搜索时出错: {str(e)}"
|
386
|
+
}
|
387
|
+
|
388
|
+
return page_content
|
306
389
|
|
307
390
|
def get_page_content():
|
308
391
|
"""获取当前页面内容(简化模拟)"""
|
@@ -514,49 +597,25 @@ async def handle_get_prompt(
|
|
514
597
|
@server.list_tools()
|
515
598
|
async def handle_list_tools() -> list[types.Tool]:
|
516
599
|
"""列出可用工具"""
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
"
|
524
|
-
"
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
"type": "object",
|
532
|
-
"properties": {
|
533
|
-
"keywords": {"type": "string", "description": "搜索关键词"},
|
534
|
-
},
|
535
|
-
"required": ["keywords"],
|
536
|
-
},
|
537
|
-
),
|
538
|
-
types.Tool(
|
539
|
-
name="add-note",
|
540
|
-
description="添加笔记",
|
541
|
-
inputSchema={
|
542
|
-
"type": "object",
|
543
|
-
"properties": {
|
544
|
-
"name": {"type": "string", "description": "笔记名称"},
|
545
|
-
"content": {"type": "string", "description": "笔记内容"},
|
600
|
+
tools = []
|
601
|
+
|
602
|
+
# 只添加搜索并提取的组合工具
|
603
|
+
if extractor is not None and PLAYWRIGHT_AVAILABLE:
|
604
|
+
tools.append(
|
605
|
+
types.Tool(
|
606
|
+
name="mcp_cnks_search_and_extract",
|
607
|
+
description="搜索知网关键词并提取所有论文的详细内容",
|
608
|
+
inputSchema={
|
609
|
+
"type": "object",
|
610
|
+
"properties": {
|
611
|
+
"keywords": {"type": "string", "description": "搜索关键词"},
|
612
|
+
},
|
613
|
+
"required": ["keywords"],
|
546
614
|
},
|
547
|
-
|
548
|
-
},
|
549
|
-
),
|
550
|
-
types.Tool(
|
551
|
-
name="get-abstract-links",
|
552
|
-
description="获取最近一次搜索找到的论文摘要链接",
|
553
|
-
inputSchema={
|
554
|
-
"type": "object",
|
555
|
-
"properties": {},
|
556
|
-
"required": [],
|
557
|
-
},
|
615
|
+
)
|
558
616
|
)
|
559
|
-
|
617
|
+
|
618
|
+
return tools
|
560
619
|
|
561
620
|
@server.call_tool()
|
562
621
|
async def handle_call_tool(
|
@@ -565,25 +624,7 @@ async def handle_call_tool(
|
|
565
624
|
"""处理工具执行请求"""
|
566
625
|
global current_url, page_content
|
567
626
|
|
568
|
-
if name == "
|
569
|
-
current_url = "https://kns.cnki.net/kns8s/search"
|
570
|
-
result = open_chrome(current_url)
|
571
|
-
if result is True:
|
572
|
-
return [
|
573
|
-
types.TextContent(
|
574
|
-
type="text",
|
575
|
-
text="已打开中国知网搜索页面。"
|
576
|
-
)
|
577
|
-
]
|
578
|
-
else:
|
579
|
-
return [
|
580
|
-
types.TextContent(
|
581
|
-
type="text",
|
582
|
-
text=f"打开中国知网时出错: {result}"
|
583
|
-
)
|
584
|
-
]
|
585
|
-
|
586
|
-
elif name == "search-keywords":
|
627
|
+
if name == "mcp_cnks_search_and_extract" and extractor is not None and PLAYWRIGHT_AVAILABLE:
|
587
628
|
if not arguments:
|
588
629
|
raise ValueError("缺少参数")
|
589
630
|
|
@@ -591,72 +632,80 @@ async def handle_call_tool(
|
|
591
632
|
if not keywords:
|
592
633
|
raise ValueError("缺少关键词")
|
593
634
|
|
594
|
-
|
595
|
-
|
596
|
-
|
635
|
+
try:
|
636
|
+
# 第一步:执行搜索
|
637
|
+
logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
|
638
|
+
links_count = await search_with_playwright(keywords)
|
597
639
|
current_url = "https://kns.cnki.net/kns8s/search"
|
598
640
|
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
641
|
+
# 检查搜索结果
|
642
|
+
if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
|
643
|
+
return [
|
644
|
+
types.TextContent(
|
645
|
+
type="text",
|
646
|
+
text=json.dumps({
|
647
|
+
"error": "搜索未返回有效链接",
|
648
|
+
"count": 0,
|
649
|
+
"results": []
|
650
|
+
}, ensure_ascii=False)
|
651
|
+
)
|
652
|
+
]
|
653
|
+
|
654
|
+
# 提取链接
|
655
|
+
urls = [link["url"] for link in page_content["links"] if "url" in link]
|
656
|
+
if not urls:
|
657
|
+
return [
|
658
|
+
types.TextContent(
|
659
|
+
type="text",
|
660
|
+
text=json.dumps({
|
661
|
+
"error": "未找到有效链接",
|
662
|
+
"count": 0,
|
663
|
+
"results": []
|
664
|
+
}, ensure_ascii=False)
|
665
|
+
)
|
666
|
+
]
|
667
|
+
|
668
|
+
# 第二步:执行提取
|
669
|
+
logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
|
670
|
+
results = await extractor.batch_extract_contents(urls)
|
671
|
+
|
672
|
+
# 包装结果
|
673
|
+
result_json = {
|
674
|
+
"keywords": keywords,
|
675
|
+
"count": len(results),
|
676
|
+
"results": results,
|
677
|
+
"success_count": sum(1 for r in results if "error" not in r or not r["error"]),
|
678
|
+
"error_count": sum(1 for r in results if "error" in r and r["error"])
|
679
|
+
}
|
609
680
|
|
610
681
|
return [
|
611
682
|
types.TextContent(
|
612
683
|
type="text",
|
613
|
-
text=
|
684
|
+
text=json.dumps(result_json, ensure_ascii=False)
|
614
685
|
)
|
615
686
|
]
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
raise ValueError("缺少参数")
|
620
|
-
|
621
|
-
note_name = arguments.get("name")
|
622
|
-
content = arguments.get("content")
|
623
|
-
|
624
|
-
if not note_name or not content:
|
625
|
-
raise ValueError("缺少名称或内容")
|
626
|
-
|
627
|
-
# 更新服务器状态
|
628
|
-
notes[note_name] = content
|
629
|
-
|
630
|
-
# 通知客户端资源已更改
|
631
|
-
await server.request_context.session.send_resource_list_changed()
|
632
|
-
|
633
|
-
return [
|
634
|
-
types.TextContent(
|
635
|
-
type="text",
|
636
|
-
text=f"已添加笔记 '{note_name}': {content}"
|
637
|
-
)
|
638
|
-
]
|
639
|
-
|
640
|
-
elif name == "get-abstract-links":
|
641
|
-
if not page_content or "找到" not in page_content:
|
687
|
+
except Exception as e:
|
688
|
+
logger.error(f"搜索并提取时出错: {str(e)}")
|
689
|
+
logger.error(traceback.format_exc())
|
642
690
|
return [
|
643
691
|
types.TextContent(
|
644
692
|
type="text",
|
645
|
-
text=
|
693
|
+
text=json.dumps({
|
694
|
+
"error": f"搜索并提取内容时出错: {str(e)}",
|
695
|
+
"keywords": keywords,
|
696
|
+
"count": 0,
|
697
|
+
"results": []
|
698
|
+
}, ensure_ascii=False)
|
646
699
|
)
|
647
700
|
]
|
648
|
-
|
649
|
-
return [
|
650
|
-
types.TextContent(
|
651
|
-
type="text",
|
652
|
-
text=page_content
|
653
|
-
)
|
654
|
-
]
|
655
701
|
|
656
|
-
|
702
|
+
else:
|
703
|
+
raise ValueError(f"未知工具: {name}")
|
657
704
|
|
658
705
|
async def find_and_count_abstract_links(page):
|
659
706
|
"""查找并统计包含article/abstract?v=的链接"""
|
707
|
+
global page_content
|
708
|
+
|
660
709
|
try:
|
661
710
|
logger.debug("[DEBUG] 开始查找所有包含article/abstract?v=的链接")
|
662
711
|
|
@@ -690,11 +739,11 @@ async def find_and_count_abstract_links(page):
|
|
690
739
|
else:
|
691
740
|
logger.debug(f"[DEBUG] 链接数量为{links_count}条,多于预期的50条")
|
692
741
|
|
693
|
-
# 存储结果 -
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
742
|
+
# 存储结果 - 使用字典结构而不是纯文本
|
743
|
+
page_content = {
|
744
|
+
"count": links_count,
|
745
|
+
"links": [{"index": link['index'], "url": link['href']} for link in links_info]
|
746
|
+
}
|
698
747
|
|
699
748
|
return links_count
|
700
749
|
except Exception as e:
|
@@ -710,7 +759,7 @@ async def main():
|
|
710
759
|
write_stream,
|
711
760
|
InitializationOptions(
|
712
761
|
server_name="cnks",
|
713
|
-
server_version="0.1
|
762
|
+
server_version="0.2.1",
|
714
763
|
capabilities=server.get_capabilities(
|
715
764
|
notification_options=NotificationOptions(),
|
716
765
|
experimental_capabilities={},
|
@@ -725,33 +774,54 @@ def create_fastmcp_server():
|
|
725
774
|
from mcp.server.fastmcp import FastMCP
|
726
775
|
fast_mcp = FastMCP("知网搜索")
|
727
776
|
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
777
|
+
# 只添加搜索并提取的工具
|
778
|
+
if extractor is not None and PLAYWRIGHT_AVAILABLE:
|
779
|
+
@fast_mcp.tool()
|
780
|
+
async def mcp_cnks_search_and_extract(keywords: str) -> dict:
|
781
|
+
"""搜索关键词并提取所有论文的详细内容"""
|
782
|
+
logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")
|
783
|
+
try:
|
784
|
+
# 第一步:执行搜索
|
785
|
+
result_count = await search_with_playwright(keywords)
|
786
|
+
|
787
|
+
# 检查搜索结果
|
788
|
+
if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
|
789
|
+
return {
|
790
|
+
"error": "搜索未返回有效链接",
|
791
|
+
"keywords": keywords,
|
792
|
+
"count": 0,
|
793
|
+
"results": []
|
794
|
+
}
|
795
|
+
|
796
|
+
# 提取链接
|
797
|
+
urls = [link["url"] for link in page_content["links"] if "url" in link]
|
798
|
+
if not urls:
|
799
|
+
return {
|
800
|
+
"error": "未找到有效链接",
|
801
|
+
"keywords": keywords,
|
802
|
+
"count": 0,
|
803
|
+
"results": []
|
804
|
+
}
|
805
|
+
|
806
|
+
# 第二步:执行提取
|
807
|
+
results = await extractor.batch_extract_contents(urls)
|
808
|
+
|
809
|
+
# 包装结果
|
810
|
+
return {
|
811
|
+
"keywords": keywords,
|
812
|
+
"count": len(results),
|
813
|
+
"results": results,
|
814
|
+
"success_count": sum(1 for r in results if "error" not in r or not r["error"]),
|
815
|
+
"error_count": sum(1 for r in results if "error" in r and r["error"])
|
816
|
+
}
|
817
|
+
except Exception as e:
|
818
|
+
logger.error(f"搜索并提取时出错: {str(e)}")
|
819
|
+
return {
|
820
|
+
"error": f"搜索并提取内容时出错: {str(e)}",
|
821
|
+
"keywords": keywords,
|
822
|
+
"count": 0,
|
823
|
+
"results": []
|
824
|
+
}
|
755
825
|
|
756
826
|
return fast_mcp
|
757
827
|
except ImportError:
|