PyPI - cnks - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

cnks 0.1.0-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

cnks/__init__.py +17 -6
cnks/chrome_extractor.py +413 -0
cnks/extractor.py +250 -0
cnks/server.py +242 -172
cnks-0.2.1.dist-info/METADATA +181 -0
cnks-0.2.1.dist-info/RECORD +8 -0
cnks-0.1.0.dist-info/METADATA +0 -841
cnks-0.1.0.dist-info/RECORD +0 -6
{cnks-0.1.0.dist-info → cnks-0.2.1.dist-info}/WHEEL +0 -0
{cnks-0.1.0.dist-info → cnks-0.2.1.dist-info}/entry_points.txt +0 -0

cnks/server.py CHANGED Viewed

@@ -7,8 +7,11 @@ import subprocess
 import sys
 import time
 import logging
+import webbrowser
+import traceback
 from pathlib import Path
 from urllib.parse import quote
+from typing import Dict, List, Any, Optional, Union
 from mcp.server.models import InitializationOptions
 import mcp.types as types
@@ -41,6 +44,16 @@ browser_instance = None
 server = Server("cnks")
+# 导入我们新创建的extractor模块
+try:
+    from . import chrome_extractor as extractor
+except ImportError:
+    try:
+        import chrome_extractor as extractor
+    except ImportError:
+        extractor = None
+        logger.warning("无法导入chrome_extractor模块，批量提取功能将不可用")
 def find_chrome_executable():
     """查找Chrome可执行文件路径"""
     system = platform.system()
@@ -97,6 +110,8 @@ def open_chrome(url):
 async def search_with_playwright(keywords):
     """使用playwright在知网搜索关键词"""
+    global page_content
     if not PLAYWRIGHT_AVAILABLE:
         return "需要安装playwright模块：uv add playwright"
@@ -216,7 +231,7 @@ async def search_with_playwright(keywords):
                                             # 查找所有包含"article/abstract?v="字样的链接
                                             links_count = await find_and_count_abstract_links(page)
-                                            return f"已完成全部操作：搜索关键词、设置每页显示50条、勾选CSSCI来源类别。找到{links_count}条包含article/abstract?v=的链接。浏览器将保持打开状态。"
+                                            return links_count
                                         else:
                                             logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
@@ -231,9 +246,11 @@ async def search_with_playwright(keywords):
                                                 # 查找所有包含"article/abstract?v="字样的链接
                                                 links_count = await find_and_count_abstract_links(page)
-                                                return f"已完成全部操作：搜索关键词、设置每页显示50条、勾选CSSCI来源类别。找到{links_count}条包含article/abstract?v=的链接。浏览器将保持打开状态。"
+                                                return links_count
                                             else:
-                                                return "已完成搜索和设置每页显示50条，但未找到CSSCI选项。浏览器将保持打开状态。"
+                                                # 查找所有包含"article/abstract?v="字样的链接
+                                                links_count = await find_and_count_abstract_links(page)
+                                                return links_count
                                     else:
                                         logger.debug("[DEBUG] 未找到来源类别区域")
@@ -248,61 +265,127 @@ async def search_with_playwright(keywords):
                                             # 查找所有包含"article/abstract?v="字样的链接
                                             links_count = await find_and_count_abstract_links(page)
-                                            return f"已完成全部操作：搜索关键词、设置每页显示50条、勾选CSSCI来源类别。找到{links_count}条包含article/abstract?v=的链接。浏览器将保持打开状态。"
+                                            return links_count
                                         else:
-                                            return "已完成搜索和设置每页显示50条，但未找到来源类别区域或CSSCI选项。浏览器将保持打开状态。"
+                                            # 查找所有包含"article/abstract?v="字样的链接
+                                            links_count = await find_and_count_abstract_links(page)
+                                            return links_count
                                 except Exception as e:
                                     logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
-                                    return f"已完成搜索和设置每页显示50条，但勾选CSSCI时出错: {str(e)}。浏览器将保持打开状态。"
+                                    # 查找所有包含"article/abstract?v="字样的链接
+                                    links_count = await find_and_count_abstract_links(page)
+                                    return links_count
-                                return "已完成全部操作：搜索关键词、点击排序下拉框、选择每页显示50条。浏览器将保持打开状态。"
+                                # 查找所有包含"article/abstract?v="字样的链接
+                                links_count = await find_and_count_abstract_links(page)
+                                return links_count
                             else:
                                 logger.debug("[DEBUG] 未找到'50'选项")
-                                return "已搜索并点击下拉框，但未找到'50'选项。浏览器将保持打开状态。"
+                                page_content = {
+                                    "count": 0,
+                                    "links": [],
+                                    "error": "已搜索并点击下拉框，但未找到'50'选项"
+                                }
+                                return 0
                         else:
                             logger.debug("[DEBUG] 未找到排序下拉框")
-                            return "已搜索，但未找到排序下拉框。浏览器将保持打开状态。"
+                            page_content = {
+                                "count": 0,
+                                "links": [],
+                                "error": "已搜索，但未找到排序下拉框"
+                            }
+                            return 0
                     except Exception as e:
                         logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
-                        return f"已搜索，但在点击下拉框或选项时出错: {str(e)}。浏览器将保持打开状态。"
-                    # 不关闭浏览器，让它保持打开状态
-                    # 注意：不调用 browser.close() 和 playwright.stop()
+                        page_content = {
+                            "count": 0,
+                            "links": [],
+                            "error": f"已搜索，但在点击下拉框或选项时出错: {str(e)}"
+                        }
+                        return 0
                 else:
                     # 不关闭浏览器
-                    return f"已填写搜索关键词: {keywords}，但未找到搜索按钮。请手动点击搜索。"
+                    page_content = {
+                        "count": 0,
+                        "links": [],
+                        "error": f"已填写搜索关键词: {keywords}，但未找到搜索按钮"
+                    }
+                    return 0
             else:
                 # 不关闭浏览器
-                return f"未找到搜索框。已打开知网页面，请手动搜索: {keywords}"
+                page_content = {
+                    "count": 0,
+                    "links": [],
+                    "error": f"未找到搜索框，无法搜索: {keywords}"
+                }
+                return 0
         except Exception as e:
             logger.debug(f"[DEBUG] 填写搜索框或点击搜索按钮时出错: {str(e)}")
             # 不关闭浏览器
-            return f"自动搜索过程中出错，请手动在页面中搜索: {keywords}"
+            page_content = {
+                "count": 0,
+                "links": [],
+                "error": f"自动搜索过程中出错: {str(e)}"
+            }
+            return 0
     except Exception as e:
         error_msg = str(e)
         logger.debug(f"[DEBUG] Playwright错误: {error_msg}")
         # 如果是找不到Chrome的错误，提供更明确的指导
         if "Executable doesn't exist" in error_msg and "ms-playwright" in error_msg:
-            return f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome，请重新启动服务器。\n\n{error_msg}"
-        # 如果Playwright启动失败，使用传统方式打开Chrome
-        return f"使用Playwright启动Chrome失败: {error_msg}。尝试使用传统方式打开浏览器。"
+            error_message = f"需要安装Playwright的浏览器: playwright install\n如果您想使用系统Chrome，请重新启动服务器。\n\n{error_msg}"
+        else:
+            error_message = f"使用Playwright启动Chrome失败: {error_msg}"
+        page_content = {
+            "count": 0,
+            "links": [],
+            "error": error_message
+        }
+        return 0
 def search_with_direct_chrome(keywords):
     """直接使用Chrome搜索，不使用playwright"""
-    logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")
-    # 构建知网搜索URL - 知网不支持URL参数搜索，所以只能打开页面
-    url = "https://kns.cnki.net/kns8s/search"
+    global page_content
-    # 打开Chrome
-    result = open_chrome(url)
+    logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")
-    if result is True:
-        return f"已打开知网页面。请在搜索框中输入并搜索: {keywords}"
-    else:
-        return f"打开Chrome浏览器失败: {result}"
+    try:
+        url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
+        logger.debug(f"[DEBUG] 打开URL: {url}")
+        result = open_chrome(url)
+        if isinstance(result, str) and "打开Chrome" in result:
+            logger.debug(f"[DEBUG] 直接打开Chrome结果: {result}")
+            page_content = {
+                "count": 0,
+                "links": [],
+                "error": f"直接打开Chrome搜索: {result}"
+            }
+        else:
+            logger.debug("[DEBUG] 直接打开Chrome成功")
+            page_content = {
+                "count": 0,
+                "links": [],
+                "message": "已打开Chrome并搜索关键词，但无法自动获取链接。请安装playwright以获取完整功能。"
+            }
+        return page_content
+    except Exception as e:
+        logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")
+        page_content = {
+            "count": 0,
+            "links": [],
+            "error": f"使用Chrome搜索时出错: {str(e)}"
+        }
+        return page_content
 def get_page_content():
     """获取当前页面内容（简化模拟）"""
@@ -514,49 +597,25 @@ async def handle_get_prompt(
 @server.list_tools()
 async def handle_list_tools() -> list[types.Tool]:
     """列出可用工具"""
-    return [
-        types.Tool(
-            name="open-cnki",
-            description="打开中国知网搜索页面",
-            inputSchema={
-                "type": "object",
-                "properties": {},
-                "required": [],
-            },
-        ),
-        types.Tool(
-            name="search-keywords",
-            description="在知网搜索关键词",
-            inputSchema={
-                "type": "object",
-                "properties": {
-                    "keywords": {"type": "string", "description": "搜索关键词"},
-                },
-                "required": ["keywords"],
-            },
-        ),
-        types.Tool(
-            name="add-note",
-            description="添加笔记",
-            inputSchema={
-                "type": "object",
-                "properties": {
-                    "name": {"type": "string", "description": "笔记名称"},
-                    "content": {"type": "string", "description": "笔记内容"},
+    tools = []
+    # 只添加搜索并提取的组合工具
+    if extractor is not None and PLAYWRIGHT_AVAILABLE:
+        tools.append(
+            types.Tool(
+                name="mcp_cnks_search_and_extract",
+                description="搜索知网关键词并提取所有论文的详细内容",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "keywords": {"type": "string", "description": "搜索关键词"},
+                    },
+                    "required": ["keywords"],
                 },
-                "required": ["name", "content"],
-            },
-        ),
-        types.Tool(
-            name="get-abstract-links",
-            description="获取最近一次搜索找到的论文摘要链接",
-            inputSchema={
-                "type": "object",
-                "properties": {},
-                "required": [],
-            },
+            )
         )
-    ]
+    return tools
 @server.call_tool()
 async def handle_call_tool(
@@ -565,25 +624,7 @@ async def handle_call_tool(
     """处理工具执行请求"""
     global current_url, page_content
-    if name == "open-cnki":
-        current_url = "https://kns.cnki.net/kns8s/search"
-        result = open_chrome(current_url)
-        if result is True:
-            return [
-                types.TextContent(
-                    type="text",
-                    text="已打开中国知网搜索页面。"
-                )
-            ]
-        else:
-            return [
-                types.TextContent(
-                    type="text",
-                    text=f"打开中国知网时出错: {result}"
-                )
-            ]
-    elif name == "search-keywords":
+    if name == "mcp_cnks_search_and_extract" and extractor is not None and PLAYWRIGHT_AVAILABLE:
         if not arguments:
             raise ValueError("缺少参数")
@@ -591,72 +632,80 @@ async def handle_call_tool(
         if not keywords:
             raise ValueError("缺少关键词")
-        # 优先使用playwright进行搜索
-        if PLAYWRIGHT_AVAILABLE:
-            result = await search_with_playwright(keywords)
+        try:
+            # 第一步：执行搜索
+            logger.info(f"开始执行搜索并提取：关键词 '{keywords}'")
+            links_count = await search_with_playwright(keywords)
             current_url = "https://kns.cnki.net/kns8s/search"
-            return [
-                types.TextContent(
-                    type="text",
-                    text=result
-                )
-            ]
-        else:
-            # 如果没有playwright，回退到传统方式
-            result = search_with_direct_chrome(keywords)
-            current_url = "https://kns.cnki.net/kns8s/search"
+            # 检查搜索结果
+            if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
+                return [
+                    types.TextContent(
+                        type="text",
+                        text=json.dumps({
+                            "error": "搜索未返回有效链接",
+                            "count": 0,
+                            "results": []
+                        }, ensure_ascii=False)
+                    )
+                ]
+            # 提取链接
+            urls = [link["url"] for link in page_content["links"] if "url" in link]
+            if not urls:
+                return [
+                    types.TextContent(
+                        type="text",
+                        text=json.dumps({
+                            "error": "未找到有效链接",
+                            "count": 0,
+                            "results": []
+                        }, ensure_ascii=False)
+                    )
+                ]
+            # 第二步：执行提取
+            logger.info(f"搜索成功，找到 {len(urls)} 个链接，开始提取内容")
+            results = await extractor.batch_extract_contents(urls)
+            # 包装结果
+            result_json = {
+                "keywords": keywords,
+                "count": len(results),
+                "results": results,
+                "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
+                "error_count": sum(1 for r in results if "error" in r and r["error"])
+            }
             return [
                 types.TextContent(
                     type="text",
-                    text=f"{result}。如需自动搜索功能，请安装: uv add playwright"
+                    text=json.dumps(result_json, ensure_ascii=False)
                 )
             ]
-    elif name == "add-note":
-        if not arguments:
-            raise ValueError("缺少参数")
-        note_name = arguments.get("name")
-        content = arguments.get("content")
-        if not note_name or not content:
-            raise ValueError("缺少名称或内容")
-        # 更新服务器状态
-        notes[note_name] = content
-        # 通知客户端资源已更改
-        await server.request_context.session.send_resource_list_changed()
-        return [
-            types.TextContent(
-                type="text",
-                text=f"已添加笔记 '{note_name}': {content}"
-            )
-        ]
-    elif name == "get-abstract-links":
-        if not page_content or "找到" not in page_content:
+        except Exception as e:
+            logger.error(f"搜索并提取时出错: {str(e)}")
+            logger.error(traceback.format_exc())
             return [
                 types.TextContent(
                     type="text",
-                    text="尚未执行搜索或未找到链接。请先使用search-keywords工具搜索。"
+                    text=json.dumps({
+                        "error": f"搜索并提取内容时出错: {str(e)}",
+                        "keywords": keywords,
+                        "count": 0,
+                        "results": []
+                    }, ensure_ascii=False)
                 )
             ]
-        return [
-            types.TextContent(
-                type="text",
-                text=page_content
-            )
-        ]
-    raise ValueError(f"未知工具: {name}")
+    else:
+        raise ValueError(f"未知工具: {name}")
 async def find_and_count_abstract_links(page):
     """查找并统计包含article/abstract?v=的链接"""
+    global page_content
     try:
         logger.debug("[DEBUG] 开始查找所有包含article/abstract?v=的链接")
@@ -690,11 +739,11 @@ async def find_and_count_abstract_links(page):
         else:
             logger.debug(f"[DEBUG] 链接数量为{links_count}条，多于预期的50条")
-        # 存储结果 - 只包含编号和链接，不包含标题和连字符
-        global page_content
-        page_content = f"找到{links_count}条包含article/abstract?v=的链接\n\n" + "\n".join([
-            f"{link['index']}. {link['href']}" for link in links_info
-        ])
+        # 存储结果 - 使用字典结构而不是纯文本
+        page_content = {
+            "count": links_count,
+            "links": [{"index": link['index'], "url": link['href']} for link in links_info]
+        }
         return links_count
     except Exception as e:
@@ -710,7 +759,7 @@ async def main():
             write_stream,
             InitializationOptions(
                 server_name="cnks",
-                server_version="0.1.0",
+                server_version="0.2.1",
                 capabilities=server.get_capabilities(
                     notification_options=NotificationOptions(),
                     experimental_capabilities={},
@@ -725,33 +774,54 @@ def create_fastmcp_server():
         from mcp.server.fastmcp import FastMCP
         fast_mcp = FastMCP("知网搜索")
-        @fast_mcp.tool()
-        def open_cnki_search():
-            """打开中国知网搜索页面"""
-            return open_chrome("https://kns.cnki.net/kns8s/search")
-        @fast_mcp.tool()
-        async def search_keywords(keywords: str) -> str:
-            """在知网搜索关键词"""
-            logger.debug("[DEBUG] 正在使用FastMCP的search_keywords函数")
-            if PLAYWRIGHT_AVAILABLE:
-                result = await search_with_playwright(keywords)
-                return result
-            else:
-                result = search_with_direct_chrome(keywords)
-                return f"{result}。如需自动搜索功能，请安装: uv add playwright"
-        @fast_mcp.tool()
-        def get_abstract_links() -> str:
-            """获取最近一次搜索找到的论文摘要链接"""
-            if not page_content or "找到" not in page_content:
-                return "尚未执行搜索或未找到链接。请先使用search_keywords工具搜索。"
-            return page_content
-        @fast_mcp.resource("webpage://current")
-        def get_current_webpage() -> str:
-            """获取当前网页内容"""
-            return get_page_content()
+        # 只添加搜索并提取的工具
+        if extractor is not None and PLAYWRIGHT_AVAILABLE:
+            @fast_mcp.tool()
+            async def mcp_cnks_search_and_extract(keywords: str) -> dict:
+                """搜索关键词并提取所有论文的详细内容"""
+                logger.debug("[DEBUG] 正在使用FastMCP的mcp_cnks_search_and_extract函数")
+                try:
+                    # 第一步：执行搜索
+                    result_count = await search_with_playwright(keywords)
+                    # 检查搜索结果
+                    if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
+                        return {
+                            "error": "搜索未返回有效链接",
+                            "keywords": keywords,
+                            "count": 0,
+                            "results": []
+                        }
+                    # 提取链接
+                    urls = [link["url"] for link in page_content["links"] if "url" in link]
+                    if not urls:
+                        return {
+                            "error": "未找到有效链接",
+                            "keywords": keywords,
+                            "count": 0,
+                            "results": []
+                        }
+                    # 第二步：执行提取
+                    results = await extractor.batch_extract_contents(urls)
+                    # 包装结果
+                    return {
+                        "keywords": keywords,
+                        "count": len(results),
+                        "results": results,
+                        "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
+                        "error_count": sum(1 for r in results if "error" in r and r["error"])
+                    }
+                except Exception as e:
+                    logger.error(f"搜索并提取时出错: {str(e)}")
+                    return {
+                        "error": f"搜索并提取内容时出错: {str(e)}",
+                        "keywords": keywords,
+                        "count": 0,
+                        "results": []
+                    }
         return fast_mcp
     except ImportError:

cnks 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

cnks 0.1.0-py3-none-any.whl → 0.2.1-py3-none-any.whl