cnks 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks-0.3.1.dist-info/METADATA +101 -0
- cnks-0.3.1.dist-info/RECORD +17 -0
- cnks-0.3.1.dist-info/entry_points.txt +5 -0
- src/ThisIsAServerSample.py +377 -0
- src/__init__.py +7 -0
- src/cache.py +451 -0
- src/citzer.py +868 -0
- src/click50.py +527 -0
- src/client.py +135 -0
- src/cssci.py +267 -0
- src/extractlink.py +262 -0
- src/ifverify.py +134 -0
- src/main.py +70 -0
- src/searcher.py +767 -0
- src/server.py +487 -0
- src/worker.py +219 -0
- cnks/__init__.py +0 -50
- cnks/server.py +0 -1876
- cnks-0.2.5.dist-info/METADATA +0 -181
- cnks-0.2.5.dist-info/RECORD +0 -6
- cnks-0.2.5.dist-info/entry_points.txt +0 -2
- {cnks-0.2.5.dist-info → cnks-0.3.1.dist-info}/WHEEL +0 -0
cnks/server.py
DELETED
@@ -1,1876 +0,0 @@
|
|
1
|
-
import asyncio
|
2
|
-
import json
|
3
|
-
import os
|
4
|
-
import platform
|
5
|
-
import re
|
6
|
-
import subprocess
|
7
|
-
import sys
|
8
|
-
import time
|
9
|
-
import logging
|
10
|
-
import webbrowser
|
11
|
-
import traceback
|
12
|
-
from pathlib import Path
|
13
|
-
from urllib.parse import quote
|
14
|
-
from typing import Dict, List, Any, Optional, Union
|
15
|
-
from datetime import datetime
|
16
|
-
from pydantic import BaseModel, AnyUrl
|
17
|
-
|
18
|
-
from mcp.server.models import InitializationOptions
|
19
|
-
import mcp.types as types
|
20
|
-
from mcp.server import NotificationOptions, Server
|
21
|
-
import mcp.server.stdio
|
22
|
-
|
23
|
-
# Logging setup: records of level DEBUG and above are appended to cnks.log
# in the current working directory.
logging.basicConfig(
    filename="cnks.log",
    filemode="a",
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Module-wide logger used by every function in this file.
logger = logging.getLogger("cnks")
|
31
|
-
|
32
|
-
# Playwright is an optional dependency. PLAYWRIGHT_AVAILABLE records whether
# the import succeeded so the rest of the module can check it before use.
try:
    from playwright.async_api import async_playwright
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    logger.warning("Playwright未安装,将使用传统方式打开Chrome")
else:
    PLAYWRIGHT_AVAILABLE = True
|
39
|
-
|
40
|
-
# 定义数据模型
|
41
|
-
class CNKIContent(BaseModel):
    """Content model for a single CNKI paper."""

    # Paper title.
    title: str = ""
    # Author names.
    authors: List[str] = []
    # Abstract text.
    abstract: str = ""
    # Keywords attached to the paper.
    keywords: List[str] = []
    # Citation string.
    cite_format: str = ""
    # URL the record came from, kept to record its source.
    url: str = ""
|
49
|
-
|
50
|
-
# Module-level state shared by the handlers below.
# page_content starts as an empty string; the search functions later rebind
# it to a result dict.
page_content = ""
# URL most recently opened or searched.
current_url = ""
# Notes keyed by name.
notes: dict[str, str] = {}

# The MCP server instance that the handler decorators register against.
server = Server("cnks")

# Playwright handles, created on first use and then reused.
playwright_instance = None
browser_instance = None
context = None
|
61
|
-
|
62
|
-
def find_chrome_executable():
    """Locate a Chrome (or other Chromium-based) browser executable.

    Lookup order:
      1. The ``CHROME_PATH`` environment variable, when it names an
         existing path.
      2. Well-known install locations for the current platform
         (Windows, macOS, Linux), including Microsoft Edge.
      3. A ``PATH`` search for common browser command names.

    Returns:
        The path of the first browser found, or ``None`` when the platform
        is not recognised or no browser can be located.
    """
    # Imported locally so this function does not need a new file-level import.
    import shutil

    # An explicit override always wins.
    chrome_env = os.environ.get("CHROME_PATH")
    if chrome_env and os.path.exists(chrome_env):
        logger.debug(f"[DEBUG] 从环境变量找到Chrome: {chrome_env}")
        return chrome_env

    system = platform.system()
    logger.debug(f"[DEBUG] 系统类型: {system}")

    # Candidate install locations and PATH command names per platform.
    unix_commands = ["google-chrome", "chromium", "chromium-browser", "microsoft-edge"]
    if system == "Windows":
        chrome_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
            os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
            # Edge is Chromium-based as well.
            r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
            r"C:\Program Files\Microsoft\Edge\Application\msedge.exe",
        ]
        command_names = ["chrome", "msedge"]
    elif system == "Darwin":  # macOS
        chrome_paths = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
            "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
        ]
        command_names = unix_commands
    elif system == "Linux":
        chrome_paths = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
            "/usr/bin/microsoft-edge",
        ]
        command_names = unix_commands
    else:
        logger.debug(f"[DEBUG] 未知系统类型: {system}")
        return None

    for path in chrome_paths:
        if os.path.exists(path):
            logger.debug(f"[DEBUG] 找到Chrome: {path}")
            return path

    # Fall back to PATH. shutil.which works on every platform and does not
    # depend on an external `which`/`where` binary being installed, so one
    # missing tool can no longer abort the whole search.
    for command in command_names:
        found = shutil.which(command)
        if found:
            logger.debug(f"[DEBUG] 在PATH中找到浏览器: {found}")
            return found

    logger.debug("[DEBUG] 未找到Chrome或兼容的浏览器")
    return None
|
146
|
-
|
147
|
-
def open_chrome(url):
    """Open *url* in the system default browser.

    Despite the name, this goes through ``webbrowser.open``, so whichever
    browser the system treats as default is used.

    Args:
        url: The address to open.

    Returns:
        ``True`` when a browser was launched, otherwise a string describing
        the failure. Callers distinguish the two with ``result is True``.
    """
    try:
        logger.debug(f"[DEBUG] open_chrome函数被调用,URL: {url}")

        logger.debug(f"[DEBUG] 尝试使用webbrowser.open打开URL: {url}")
        launched = webbrowser.open(url)
        logger.debug("[DEBUG] webbrowser.open调用完成")

        # webbrowser.open reports failure through its return value rather
        # than an exception; surface that instead of claiming success.
        if not launched:
            return "打开Chrome时出错: 未能启动浏览器"

        # Give the page a moment to load.
        time.sleep(2)
        logger.debug("[DEBUG] open_chrome函数执行完毕")
        return True
    except Exception as e:
        logger.debug(f"[DEBUG] open_chrome函数出错: {str(e)}")
        return f"打开Chrome时出错: {str(e)}"
|
164
|
-
|
165
|
-
def _single_link_result(url, keywords):
    """Build the placeholder result dict that points at one search URL."""
    return {
        "count": 1,
        "links": [{
            "index": 1,
            "url": url,
            "title": f"搜索: {keywords}"
        }]
    }


async def _first_matching_element(page, selectors, label):
    """Return the first element matched by any of *selectors*, else None.

    *label* is the noun inserted into the debug log lines
    (for example "搜索框" or "搜索按钮").
    """
    for selector in selectors:
        try:
            logger.debug(f"[DEBUG] 尝试查找{label}选择器: {selector}")
            element = await page.query_selector(selector)
            if element:
                logger.debug(f"[DEBUG] 找到{label}: {selector}")
                return element
        except Exception as e:
            logger.debug(f"[DEBUG] 查找选择器 {selector} 时出错: {str(e)}")
    return None


async def _ensure_browser_context(chrome_path):
    """Create the shared Playwright, browser and context handles as needed.

    Each module-level handle is created only while it is still ``None``, so
    a launch that failed half-way is completed on the next call instead of
    leaving ``context`` unusable. Returns the shared browser context.
    """
    global playwright_instance, browser_instance, context

    if playwright_instance is None:
        logger.debug("[DEBUG] 初始化新的playwright实例")
        playwright_instance = await async_playwright().start()

    if browser_instance is None:
        browser_args = [
            '--no-sandbox',       # may be required in some environments
            '--start-maximized'   # open the window maximised
        ]
        try:
            # Prefer the system-installed browser.
            logger.debug(f"[DEBUG] 尝试使用系统Chrome启动: {chrome_path}")
            browser_instance = await playwright_instance.chromium.launch(
                headless=False,  # show the browser window
                executable_path=chrome_path,
                args=browser_args
            )
        except Exception as e:
            logger.warning(f"[WARNING] 使用系统Chrome启动失败: {str(e)},尝试使用默认浏览器")
            # Fall back to the browser bundled with Playwright.
            browser_instance = await playwright_instance.chromium.launch(
                headless=False
            )

    if context is None:
        # NOTE(review): viewport=None is kept from the original; the original
        # comment says it is meant to lift the viewport size limit — confirm
        # whether Playwright's `no_viewport=True` is what is actually wanted.
        context = await browser_instance.new_context(
            viewport=None
        )

    return context


async def _submit_search(page, keywords, search_url):
    """Run the search through the page's search box, else via *search_url*.

    Any error during the interactive attempt is logged and answered by
    navigating straight to *search_url*.
    """
    search_input_selectors = [
        '#txt_search',
        'input[type="text"]',
        '.search-input',
        '.input-box input',
        'input.search-textbox',
        'input[placeholder*="搜索"]'
    ]
    search_button_selectors = [
        'button.search-btn',
        'button.search',
        'button[type="submit"]',
        'input[type="submit"]',
        '.search-action',
        'a.search-btn'
    ]

    try:
        logger.debug("[DEBUG] 尝试在首页查找搜索框")
        search_input = await _first_matching_element(
            page, search_input_selectors, "搜索框"
        )

        if search_input:
            # Clear the box, then type with a delay between keystrokes.
            await search_input.fill("")
            await search_input.type(keywords, delay=100)
            logger.debug(f"[DEBUG] 已在搜索框中输入关键词: {keywords}")

            search_button = await _first_matching_element(
                page, search_button_selectors, "搜索按钮"
            )
            if search_button:
                logger.debug("[DEBUG] 点击搜索按钮")
                await search_button.click()
            else:
                # No button found: submit with the Enter key instead.
                logger.debug("[DEBUG] 未找到搜索按钮,尝试按回车键")
                await search_input.press("Enter")

            logger.debug("[DEBUG] 等待搜索结果加载")
        else:
            logger.debug(f"[DEBUG] 未找到搜索框,直接导航到搜索URL: {search_url}")
            await page.goto(search_url)

        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)
    except Exception as e:
        logger.debug(f"[DEBUG] 执行搜索操作时出错: {str(e)}")
        logger.debug(traceback.format_exc())

        # The interactive attempt failed; go to the search URL directly.
        logger.debug(f"[DEBUG] 导航到搜索URL: {search_url}")
        await page.goto(search_url)
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)


async def search_with_playwright(keywords):
    """Search CNKI for *keywords* in a Playwright-driven browser.

    Opens the CNKI home page, waits for any manual verification, submits
    the search, and then calls ``find_and_count_abstract_links`` on the
    result page. The module-level ``page_content`` is updated with the
    outcome.

    Args:
        keywords: The search string.

    Returns:
        An install hint string when Playwright is not available; otherwise
        an int: the number of links found, ``1`` when only the search page
        could be opened through ``webbrowser``, or ``0`` on failure.
    """
    global page_content

    if not PLAYWRIGHT_AVAILABLE:
        return "需要安装playwright模块:uv add playwright"

    try:
        logger.debug(f"[DEBUG] 使用搜索功能,关键词: {keywords}")

        # Start from the home page rather than the result page.
        initial_url = "https://kns.cnki.net/"
        search_url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
        logger.debug(f"[DEBUG] 初始URL: {initial_url}")

        chrome_path = find_chrome_executable()
        if not chrome_path:
            logger.warning("[WARNING] 未找到Chrome可执行文件,将使用默认浏览器")
            webbrowser.open(search_url)
            page_content = _single_link_result(search_url, keywords)
            return 1

        logger.debug(f"[DEBUG] 找到Chrome路径: {chrome_path}")

        # Remember whether a browser context already existed, for logging only.
        first_launch = context is None
        if not first_launch:
            logger.debug("[DEBUG] 在现有playwright实例中打开新标签页")
        browser_context = await _ensure_browser_context(chrome_path)

        page = await browser_context.new_page()
        logger.debug(f"[DEBUG] 导航到知网首页: {initial_url}")
        await page.goto(initial_url)
        if first_launch:
            logger.debug("[DEBUG] 已打开新的浏览器窗口并访问知网首页")
        else:
            logger.debug("[DEBUG] 已在现有浏览器中打开新标签页并访问知网首页")

        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(1)

        # The home page may ask for verification.
        await check_and_wait_for_verification(page)

        await _submit_search(page, keywords, search_url)

        # The result page may ask for verification as well.
        await check_and_wait_for_verification(page)

        links_count = await find_and_count_abstract_links(page)

        # Leave the results on screen for a moment.
        await asyncio.sleep(5)

        logger.debug(f"[DEBUG] 搜索完成,找到 {links_count} 个链接")

        if links_count == 0:
            # `page.url` is a plain property in Playwright's Python API;
            # calling or awaiting it raises TypeError.
            page_content = _single_link_result(page.url, keywords)

        return links_count
    except Exception as e:
        error_msg = str(e)
        logger.debug(f"[DEBUG] 搜索错误: {error_msg}")
        logger.debug(traceback.format_exc())

        # Last resort: hand the search URL to the default browser.
        try:
            logger.debug("[DEBUG] 尝试使用webbrowser打开URL")
            webbrowser.open(search_url)
            page_content = _single_link_result(search_url, keywords)
            return 1
        except Exception as e2:
            logger.debug(f"[DEBUG] 使用webbrowser打开URL失败: {str(e2)}")

        page_content = {
            "count": 0,
            "links": [],
            "error": f"搜索过程中出错: {error_msg}"
        }
        return 0
|
403
|
-
|
404
|
-
async def _first_present_selector(page, selectors):
    """Return the first selector in *selectors* that matches on *page*, else None."""
    for selector in selectors:
        try:
            if await page.query_selector(selector):
                return selector
        except Exception:
            # A selector that cannot be queried counts as "not present".
            # Catching Exception (not a bare except) lets task cancellation
            # propagate.
            continue
    return None


async def check_and_wait_for_verification(page):
    """Detect a verification (captcha) page and wait for the user to solve it.

    The page counts as a verification page when its HTML contains one of
    the indicator words or one of the known verification elements is
    present. In that case the function polls once per second, for at most
    two minutes, until the URL changes, the verification elements are gone,
    or the indicator words are gone.

    Errors are logged and swallowed; nothing is returned.

    Args:
        page: The Playwright page to inspect.
    """
    # Words whose presence in the page HTML suggests a verification page.
    verification_indicators = [
        '验证码',
        '人机验证',
        'captcha',
        'verify',
        '安全验证',
        '滑动验证',
        '拖动滑块',
        '请完成验证',
        '拼图验证'
    ]
    # Selectors for common verification widgets.
    verification_selectors = [
        '.verify-wrap',
        '.captcha',
        '.verification',
        '#captcha',
        '.slidecode',
        '.verify-box',
        '.verify-img-panel',
        'iframe[src*="captcha"]',
        'iframe[src*="verify"]'
    ]

    try:
        page_text = await page.content()
        needs_verification = any(
            indicator in page_text for indicator in verification_indicators
        )

        matched_selector = await _first_present_selector(page, verification_selectors)
        if matched_selector:
            needs_verification = True
            logger.info(f"[INFO] 检测到验证元素: {matched_selector}")

        if not needs_verification:
            logger.debug("[DEBUG] 未检测到验证页面")
            return

        logger.info("[INFO] 检测到验证页面,等待用户手动验证...")
        # User prompts go to stderr. NOTE(review): this file imports
        # mcp.server.stdio; if the server runs over that transport, stdout
        # carries protocol messages — confirm against the entry point.
        print("\n*** 请注意 ***", file=sys.stderr)
        print("检测到需要验证码验证,请在浏览器中完成验证...", file=sys.stderr)
        print("验证完成后,程序将自动继续\n", file=sys.stderr)

        max_wait_time = 120  # wait at most two minutes
        deadline = time.monotonic() + max_wait_time
        # `page.url` is a property in Playwright's Python API; calling it
        # raised TypeError and skipped the whole wait loop.
        current_url = page.url

        while time.monotonic() < deadline:
            await asyncio.sleep(1)

            # A URL change may mean verification succeeded.
            if page.url != current_url:
                logger.info("[INFO] 检测到URL变化,验证可能已完成")
                break

            # So may the verification widgets disappearing.
            if await _first_present_selector(page, verification_selectors) is None:
                logger.info("[INFO] 验证元素已消失,验证可能已完成")
                break

            # Or the indicator words no longer appearing in the HTML.
            page_text = await page.content()
            if not any(indicator in page_text for indicator in verification_indicators):
                logger.info("[INFO] 验证指示词已消失,验证可能已完成")
                break

        # Let the page settle before continuing.
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)

        logger.info("[INFO] 继续执行,可能已完成验证")
        print("继续执行操作...\n", file=sys.stderr)
    except Exception as e:
        logger.error(f"[ERROR] 检查验证页面时出错: {str(e)}")
        logger.error(traceback.format_exc())
|
504
|
-
|
505
|
-
def search_with_direct_chrome(keywords):
    """Open the CNKI search page for *keywords* without using Playwright.

    The result page is only opened in the browser; no links can be read
    back. The module-level ``page_content`` is set to the returned dict.

    Args:
        keywords: The search string.

    Returns:
        A dict with ``count`` (always 0), ``links`` (always empty) and
        either a ``message`` on success or an ``error`` on failure.
    """
    global page_content

    logger.debug("[DEBUG] 正在使用search_with_direct_chrome函数")

    try:
        url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
        logger.debug(f"[DEBUG] 打开URL: {url}")

        result = open_chrome(url)

        # open_chrome returns True on success and an error string otherwise.
        # The earlier check looked for the word "错误" in the string, but
        # open_chrome's message says "出错", so failures were reported as
        # success.
        if result is not True:
            logger.debug(f"[DEBUG] 打开Chrome失败: {result}")
            page_content = {
                "count": 0,
                "links": [],
                "error": f"打开Chrome搜索失败: {result}"
            }
            return page_content

        logger.debug("[DEBUG] 已尝试在已有Chrome窗口中打开新标签页")

        page_content = {
            "count": 0,
            "links": [],
            "message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
        }
        return page_content
    except Exception as e:
        logger.debug(f"[DEBUG] search_with_direct_chrome出错: {str(e)}")
        page_content = {
            "count": 0,
            "links": [],
            "error": f"使用Chrome搜索时出错: {str(e)}"
        }
        return page_content
|
547
|
-
|
548
|
-
def get_page_content():
    """Return a short text description of the currently opened page.

    This is a simplified stand-in: the text is derived from the
    module-level ``current_url`` only, not from the real page.
    """
    global page_content, current_url

    url = current_url
    if url:
        if "cnki" in url:
            return f"中国知网搜索页面\n当前URL: {url}\n可使用搜索工具查询文献。"
        return f"已打开页面: {url}"
    return "尚未打开任何页面"
|
559
|
-
|
560
|
-
@server.list_resources()
async def handle_list_resources() -> list[types.Resource]:
    """List the available resources.

    Two fixed web-page resources come first, followed by one resource per
    stored note.
    """
    fixed_resources = [
        # The currently opened web page.
        types.Resource(
            uri=AnyUrl("webpage://current"),
            name="当前网页",
            description="当前打开的网页内容",
            mimeType="text/plain",
        ),
        # The CNKI search page.
        types.Resource(
            uri=AnyUrl("webpage://cnki/search"),
            name="知网搜索页",
            description="中国知网搜索页面",
            mimeType="text/plain",
        ),
    ]

    note_resources = [
        types.Resource(
            uri=AnyUrl(f"note://internal/{name}"),
            name=f"笔记: {name}",
            description=f"笔记: {name}",
            mimeType="text/plain",
        )
        for name in notes
    ]

    return fixed_resources + note_resources
|
597
|
-
|
598
|
-
@server.read_resource()
async def handle_read_resource(uri: AnyUrl) -> str:
    """Read the content of the resource identified by *uri*.

    Supported URIs:
      * ``webpage://current`` - description of the current page.
      * ``webpage://cnki/search`` - opens the CNKI search page in the
        browser and returns a status message.
      * ``note://.../<name>`` - the stored note called ``<name>``.

    Raises:
        ValueError: when the note does not exist, or the scheme or
            resource is not supported.
    """
    global current_url

    scheme = uri.scheme

    if scheme == "webpage":
        host = uri.host or ""
        path = uri.path or ""

        if host == "current":
            return get_page_content()

        if host == "cnki" and path == "/search":
            current_url = "https://kns.cnki.net/kns8s/search"
            # open_chrome sleeps synchronously; run it in a worker thread
            # so the event loop keeps serving other requests.
            result = await asyncio.to_thread(open_chrome, current_url)
            if result is True:
                return "已打开中国知网搜索页面,可使用搜索工具查询文献。"
            # Anything else is an error message string.
            return result
    elif scheme == "note":
        name = uri.path
        if name is not None:
            name = name.lstrip("/")
            if name in notes:
                return notes[name]
        raise ValueError(f"笔记未找到: {name}")

    raise ValueError(f"不支持的URI方案或资源未找到: {uri}")
|
628
|
-
|
629
|
-
@server.list_prompts()
async def handle_list_prompts() -> list[types.Prompt]:
    """List the available prompts: a topic search, an advanced search and a
    summary of the stored notes."""

    def argument(name, description, required=False):
        # Shorthand for one prompt argument; optional unless stated.
        return types.PromptArgument(
            name=name,
            description=description,
            required=required,
        )

    search_literature = types.Prompt(
        name="search-literature",
        description="按主题搜索文献",
        arguments=[argument("keywords", "搜索关键词", required=True)],
    )

    advanced_search = types.Prompt(
        name="advanced-search",
        description="高级文献搜索",
        arguments=[
            argument("title", "论文标题"),
            argument("author", "作者"),
            argument("keywords", "关键词"),
            argument("institution", "机构"),
        ],
    )

    summarize_notes = types.Prompt(
        name="summarize-notes",
        description="总结所有笔记",
        arguments=[argument("style", "摘要风格 (brief/detailed)")],
    )

    return [search_literature, advanced_search, summarize_notes]
|
682
|
-
|
683
|
-
@server.get_prompt()
async def handle_get_prompt(
    name: str, arguments: dict[str, str] | None
) -> types.GetPromptResult:
    """Build the prompt called *name* from *arguments*.

    Each prompt consists of a single user message.

    Raises:
        ValueError: when *name* is not a known prompt.
    """
    args = arguments or {}

    def single_user_message(description, text):
        # Wrap one text message from the user in a prompt result.
        message = types.PromptMessage(
            role="user",
            content=types.TextContent(type="text", text=text),
        )
        return types.GetPromptResult(description=description, messages=[message])

    if name == "search-literature":
        keywords = args.get("keywords", "")
        return single_user_message(
            "按主题搜索文献",
            f"请在中国知网搜索关于\"{keywords}\"的文献,并分析主要研究趋势。",
        )

    if name == "advanced-search":
        # (argument name, phrase placed before the quoted value), in the
        # order the criteria appear in the prompt text.
        criteria = (
            ("title", "标题包含"),
            ("author", "作者为"),
            ("keywords", "关键词包含"),
            ("institution", "机构为"),
        )
        search_terms = []
        for field, phrase in criteria:
            value = args.get(field, "")
            if value:
                search_terms.append(f'{phrase}"{value}"')

        search_criteria = "、".join(search_terms)
        return single_user_message(
            "高级文献搜索",
            f"请在中国知网搜索{search_criteria}的文献,并总结相关研究成果。",
        )

    if name == "summarize-notes":
        style = args.get("style", "brief")
        detail_prompt = ""
        if style == "detailed":
            detail_prompt = "请提供详细分析。"

        note_lines = [
            f"- {note_name}: {content}" for note_name, content in notes.items()
        ]
        return single_user_message(
            "总结所有笔记",
            f"以下是需要总结的笔记:{detail_prompt}\n\n" + "\n".join(note_lines),
        )

    raise ValueError(f"未知提示: {name}")
|
754
|
-
|
755
|
-
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """List the available tools.

    Only the combined search-and-extract tool is offered, and only when
    Playwright could be imported; otherwise the list is empty.
    """
    if not PLAYWRIGHT_AVAILABLE:
        return []

    input_schema = {
        "type": "object",
        "properties": {
            "keywords": {"type": "string", "description": "搜索关键词"},
        },
        "required": ["keywords"],
    }
    search_and_extract = types.Tool(
        name="search_and_extract",
        description="搜索知网关键词并提取所有论文的详细内容",
        inputSchema=input_schema,
    )
    return [search_and_extract]
|
777
|
-
|
778
|
-
def _json_text_content(payload):
    """Serialise *payload* as JSON and wrap it in a one-element TextContent list."""
    return [types.TextContent(type="text", text=json.dumps(payload))]


def _search_page_placeholder(keywords, url):
    """Build the payload returned when a search produced no usable links.

    The single result points at the search page itself.
    """
    return {
        "keywords": keywords,
        "count": 1,
        "results": [{
            "title": f"搜索结果: {keywords}",
            "authors": [],
            "abstract": "请在浏览器中查看搜索结果",
            "keywords": [],
            "cite_format": "",
            "url": url
        }]
    }


@server.call_tool()
async def handle_call_tool(
    name: str, arguments: dict | None
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """Handle a tool execution request.

    The only tool is ``search_and_extract``, available when Playwright is
    installed. It searches CNKI for ``arguments["keywords"]``, then passes
    every link found to ``batch_extract_contents``.

    Returns:
        A one-element list holding a JSON document with ``keywords``,
        ``count`` and ``results``, plus ``success_count``/``error_count``
        on a full run or ``error`` when the run failed.

    Raises:
        ValueError: for an unknown tool, missing arguments or missing
            keywords.
    """
    global current_url, page_content

    if name != "search_and_extract" or not PLAYWRIGHT_AVAILABLE:
        raise ValueError(f"未知工具: {name}")

    if not arguments:
        raise ValueError("缺少参数")

    keywords = arguments.get("keywords")
    if not keywords:
        raise ValueError("缺少关键词")

    try:
        # Step 1: run the search.
        logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")

        url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
        current_url = url
        logger.debug(f"[DEBUG] 搜索URL: {url}")

        # Playwright is known to be available here (checked above), so the
        # former open_chrome fallback branch could never run and is gone.
        logger.debug("[DEBUG] 使用playwright搜索")
        await search_with_playwright(keywords)

        # The search leaves its outcome in the module-level page_content.
        has_links = isinstance(page_content, dict) and page_content.get("links")
        if not has_links:
            logger.debug("[DEBUG] 搜索未返回有效链接,返回搜索页面作为结果")
            return _json_text_content(_search_page_placeholder(keywords, url))

        urls = [link["url"] for link in page_content["links"] if "url" in link]
        if not urls:
            logger.debug("[DEBUG] 没有找到有效链接,返回搜索页面")
            return _json_text_content(_search_page_placeholder(keywords, url))

        # Step 2: extract the content behind every link.
        logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
        results = await batch_extract_contents(urls)

        return _json_text_content({
            "keywords": keywords,
            "count": len(results),
            "results": results,
            "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
            "error_count": sum(1 for r in results if "error" in r and r["error"])
        })
    except Exception as e:
        logger.error(f"搜索并提取时出错: {str(e)}")
        logger.error(traceback.format_exc())
        return _json_text_content({
            "error": f"搜索并提取内容时出错: {str(e)}",
            "keywords": keywords,
            "count": 0,
            "results": []
        })
|
918
|
-
|
919
|
-
async def _set_results_per_page(page):
    """Best-effort: ask the result list for 50 rows per page, then tick CSSCI.

    The click sequence runs inside the page via ``page.evaluate``.  Afterwards
    the coroutine waits for the network to go idle, sleeps two seconds and
    calls ``check_and_select_cssci``.  Any exception is logged at debug level
    and swallowed so the caller can still scan whatever the page shows.
    """
    try:
        logger.debug("[DEBUG] 尝试设置每页显示50条记录")

        # The script returns a status string; when it opens the dropdown it
        # returns a Promise that resolves after one second (time for the menu
        # to render), which page.evaluate awaits.
        set_page_size_result = await page.evaluate("""() => {
            try {
                // 更精确地找到下拉框并点击
                const dropdowns = document.querySelectorAll('#perPageDiv, .perpage-content, .page-count, div[class*="perpage"]');
                if (dropdowns && dropdowns.length > 0) {
                    // 记录找到的下拉框
                    console.log('找到下拉框元素:', dropdowns[0]);
                    // 点击下拉框
                    dropdowns[0].click();
                    console.log('已点击下拉框');

                    // 直接等待而不使用setTimeout,确保下拉菜单显示
                    return new Promise(resolve => {
                        setTimeout(() => {
                            // 查找并点击50选项
                            const options = document.querySelectorAll('a[data-v="50"], a[href*="50"], li[data-val="50"]');
                            console.log('找到的50选项数量:', options.length);

                            for (let option of options) {
                                if (option.textContent.includes('50')) {
                                    option.click();
                                    console.log('已点击50选项:', option);
                                    resolve("点击了50选项:" + option.textContent);
                                    return;
                                }
                            }

                            // 如果没有找到特定的50选项,尝试点击最后一个选项(通常是最大数值)
                            const allOptions = document.querySelectorAll('.perpage-content a, .sort-list li');
                            if (allOptions && allOptions.length > 0) {
                                const lastOption = allOptions[allOptions.length - 1];
                                lastOption.click();
                                console.log('点击了最后一个选项:', lastOption.textContent);
                                resolve("点击了最后一个选项:" + lastOption.textContent);
                                return;
                            }

                            resolve("未找到50条/页选项");
                        }, 1000); // 等待一秒确保下拉菜单显示
                    });
                }

                // 尝试另一种方式 - 直接点击带有"50"的链接
                const directLinks = document.querySelectorAll('a:not([style*="display:none"]):not([style*="display: none"])');
                for (let link of directLinks) {
                    if (link.textContent.trim() === '50' ||
                        link.textContent.includes('50条') ||
                        link.textContent.includes('50 条')) {
                        link.click();
                        return "直接点击了50条链接: " + link.textContent;
                    }
                }

                return "未找到任何可点击的50条/页选项";
            } catch (e) {
                return "设置每页显示50条记录时出错: " + e.toString();
            }
        }""")

        logger.debug(f"[DEBUG] 设置每页显示50条记录结果: {set_page_size_result}")

        # Give the page time to reload with the new page size.
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)

        # Then try to restrict the source category to CSSCI.
        await check_and_select_cssci(page)

    except Exception as e:
        logger.debug(f"[DEBUG] 设置每页显示50条记录时出错: {str(e)}")
        logger.debug(traceback.format_exc())


async def _links_from_selectors(page):
    """Last-resort scan: walk a list of selectors and collect unique hrefs.

    Returns a list of ``{'index', 'href', 'text'}`` dicts.  Relative hrefs are
    prefixed with ``https://kns.cnki.net``.  Errors for a single selector or a
    single element are logged and skipped.
    """
    selectors = [
        'a[href*="article/abstract?v="]',  # abstract links first
        'a[href*="/kcms"]',                # CNKI document links
        '.fz14',                           # title style class
        'a.pc-link',                       # search-result links
        '.c_font a',                       # links under the content font class
        '.result-table-list a',            # links inside the result table
        'table tr td a'                    # any link inside a table cell
    ]

    links_info = []
    seen_hrefs = set()  # O(1) duplicate check instead of rescanning links_info

    for selector in selectors:
        try:
            all_links = await page.query_selector_all(selector)
            logger.debug(f"[DEBUG] 使用选择器 {selector} 找到 {len(all_links)} 个链接")

            for link in all_links:
                try:
                    href = await link.get_attribute('href')
                    text = await link.text_content()

                    if not href:
                        continue

                    # Turn relative URLs into absolute ones.
                    if href.startswith('/'):
                        href = f"https://kns.cnki.net{href}"
                    elif not href.startswith('http'):
                        href = f"https://kns.cnki.net/{href}"

                    # Never record the same link twice.
                    if href in seen_hrefs:
                        continue
                    seen_hrefs.add(href)

                    links_info.append({
                        'index': len(links_info) + 1,
                        'href': href,
                        'text': text.strip() if text else ""
                    })

                    logger.debug(f"[DEBUG] 链接 {len(links_info)}: {href}")
                except Exception as e:
                    logger.debug(f"[DEBUG] 处理链接时出错: {str(e)}")
        except Exception as e:
            logger.debug(f"[DEBUG] 使用选择器 {selector} 查找链接时出错: {str(e)}")

    return links_info


async def _find_result_links(page):
    """Return candidate paper links, trying three strategies in order.

    1. Anchors whose href contains ``article/abstract?v=`` (in-page script).
    2. Anchors matching a few known result-list selectors (in-page script).
    3. ``_links_from_selectors`` as the final fallback.

    Each entry is a dict with ``index``, ``href`` and ``text``.  Exceptions
    propagate to the caller.
    """
    logger.debug("[DEBUG] 尝试查找包含 article/abstract?v 的链接")

    abstract_links = await page.evaluate("""() => {
        const links = [];
        // 严格查找包含article/abstract?v的链接
        const abstractLinks = document.querySelectorAll('a[href*="article/abstract?v="]');

        console.log('找到包含article/abstract?v的链接数量:', abstractLinks.length);

        for (let i = 0; i < abstractLinks.length; i++) {
            const link = abstractLinks[i];
            const href = link.href;
            const text = link.textContent.trim();

            // 确保链接有效且包含必要的字段
            if (href && href.includes('article/abstract?v=') && text) {
                links.push({
                    index: links.length + 1,
                    href: href,
                    text: text
                });
            }
        }

        return links;
    }""")

    logger.debug(f"[DEBUG] 找到 {len(abstract_links)} 个包含article/abstract?v的链接")

    if abstract_links:
        return abstract_links

    logger.debug("[DEBUG] 未找到包含article/abstract?v的链接,尝试备用方法")

    backup_links = await page.evaluate("""() => {
        const links = [];
        // 查找可能是论文链接的a标签
        const allLinks = document.querySelectorAll('a.fz14, a[href*="/kcms"], .result-table-list a');

        for (let i = 0; i < allLinks.length; i++) {
            const link = allLinks[i];
            const href = link.href;
            const text = link.textContent.trim();

            if (href && text && !links.some(l => l.href === href)) {
                links.push({
                    index: links.length + 1,
                    href: href,
                    text: text
                });
            }
        }

        return links;
    }""")

    if backup_links:
        logger.debug(f"[DEBUG] 使用备用方法找到 {len(backup_links)} 个可能的论文链接")
        return backup_links

    return await _links_from_selectors(page)


async def find_and_count_abstract_links(page):
    """Collect paper links from a CNKI search-result page.

    The result is stored in the module-level ``page_content`` as
    ``{"count": int, "links": [{"index", "url", "title"}, ...]}``.

    Steps: wait three seconds, try to switch to 50 results per page (which
    also tries to tick CSSCI), wait up to five seconds for
    ``.result-table-list``, gather links, and keep only those containing
    ``article/abstract?v=``.  If that filter leaves nothing, the unfiltered
    list is used instead.

    Args:
        page: Playwright page already showing the search results.

    Returns:
        The number of links stored in ``page_content``.  If an unexpected
        error occurs, the current page URL is stored as the single link and 1
        is returned; if even that fails, an empty result is stored and 0 is
        returned.
    """
    global page_content

    try:
        logger.debug("[DEBUG] 开始查找知网搜索结果中的论文链接")

        # Let the page finish loading before interacting with it.
        await asyncio.sleep(3)

        await _set_results_per_page(page)

        try:
            await page.wait_for_selector('.result-table-list', timeout=5000)
            logger.debug("[DEBUG] 已找到搜索结果容器")
        except Exception as e:
            logger.debug(f"[DEBUG] 等待搜索结果容器超时: {str(e)}")

        try:
            links_info = await _find_result_links(page)
        except Exception as e:
            logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
            logger.debug(traceback.format_exc())
            links_info = []

        # Keep only abstract-page links.
        filtered_links = []
        for link in links_info:
            href = link['href']
            if 'article/abstract?v=' in href:
                filtered_links.append(link)
                logger.debug(f"[DEBUG] 保留包含article/abstract?v的链接: {href}")

        # The result markup may have changed; fall back to the raw list.
        if not filtered_links:
            logger.debug("[DEBUG] 过滤后没有包含article/abstract?v的链接,使用原始链接")
            filtered_links = links_info

        links_count = len(filtered_links)
        logger.debug(f"[DEBUG] 最终过滤后找到 {links_count} 个链接")

        if links_count == 0:
            logger.debug("[DEBUG] 未找到链接")

        # Publish as a dict rather than plain text.
        page_content = {
            "count": links_count,
            "links": [{"index": link['index'], "url": link['href'], "title": link['text']} for link in filtered_links]
        }

        return links_count
    except Exception as e:
        logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
        logger.debug(traceback.format_exc())

        try:
            # Playwright exposes the URL as a property; awaiting page.url()
            # raised and made this fallback unreachable.
            current_url = page.url
            logger.debug(f"[DEBUG] 当前页面URL: {current_url}")

            # At least hand back the current page as a link.
            page_content = {
                "count": 1,
                "links": [{"index": 1, "url": current_url, "title": "当前页面"}]
            }
            return 1
        except Exception:
            page_content = {
                "count": 0,
                "links": []
            }
            return 0
|
1185
|
-
|
1186
|
-
async def check_and_select_cssci(page):
    """Try to tick the CSSCI option of the page's "来源类别" filter.

    An in-page script takes the first ``div`` whose text contains "来源类别",
    looks inside it for a checkbox whose parent text, value or id mentions
    CSSCI, and clicks it if it is unchecked.  If no such checkbox exists it
    clicks the first ``label``/``span`` containing "CSSCI".  The script's
    status string is logged.  The coroutine then waits for network idle and
    two more seconds.

    Args:
        page: Playwright page showing the search results.

    Returns:
        None.  Failures are logged at debug level and swallowed.
    """
    select_cssci_script = """() => {
        try {
            // 查找包含"来源类别"的区域
            const categoryContainer = Array.from(document.querySelectorAll('div')).find(div =>
                div.textContent.includes('来源类别')
            );

            if (categoryContainer) {
                // 在来源类别容器中查找CSSCI复选框
                const checkboxes = categoryContainer.querySelectorAll('input[type="checkbox"]');
                for (let checkbox of checkboxes) {
                    // 查找CSSCI相关的复选框
                    const parentText = checkbox.parentElement.textContent;
                    if (parentText.includes('CSSCI') ||
                        checkbox.value.includes('CSSCI') ||
                        checkbox.id.includes('cssci')) {

                        // 勾选复选框
                        if (!checkbox.checked) {
                            checkbox.click();
                            return "已勾选CSSCI复选框";
                        } else {
                            return "CSSCI复选框已经被勾选";
                        }
                    }
                }

                // 如果没有找到复选框但找到了CSSCI的标签
                const cssciLabels = categoryContainer.querySelectorAll('label, span');
                for (let label of cssciLabels) {
                    if (label.textContent.includes('CSSCI')) {
                        label.click();
                        return "已点击CSSCI标签";
                    }
                }

                return "在来源类别区域未找到CSSCI选项";
            }

            return "未找到来源类别区域";
        } catch (e) {
            return "勾选CSSCI时出错: " + e.toString();
        }
    }"""

    try:
        logger.debug("[DEBUG] 尝试查找来源类别并勾选CSSCI")

        # All DOM work happens inside the page; only a status string returns.
        outcome = await page.evaluate(select_cssci_script)
        logger.debug(f"[DEBUG] CSSCI勾选结果: {outcome}")

        # Let the result list refresh after the filter change.
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)
    except Exception as err:
        logger.debug(f"[DEBUG] 勾选CSSCI时出错: {err}")
        logger.debug(traceback.format_exc())
|
1246
|
-
|
1247
|
-
async def _first_element_text(page, selectors, label):
    """Return the text of the first element matching any selector, else None.

    ``label`` only appears in the debug message naming the selector that hit.
    """
    for selector in selectors:
        element = await page.query_selector(selector)
        if element:
            logger.debug(f"[DEBUG] 找到{label}元素: {selector}")
            return (await element.text_content()) or ""
    return None


async def _read_citation_popup(page):
    """Click the page's cite button and return the popup's citation text.

    Returns "" when no button is found or the popup holds no text.  The button
    lookup goes through Playwright's selector engine, which understands
    ``:has-text()``.
    """
    cite_button = await page.query_selector('button:has-text("引用"), [class*="cite"], [class*="quote"]')
    if not cite_button:
        logger.debug("[DEBUG] 未找到引用按钮")
        return ""

    await cite_button.click()
    await asyncio.sleep(1)  # give the popup time to appear

    # Read the value inside the page: ElementHandle.get_property() returns a
    # JSHandle, not a str.  textContent covers a non-input `.quote-text` node.
    cite_text = await page.evaluate("""() => {
        const box = document.querySelector('.quote-r textarea.text, .quote-text, [class*="quote"] textarea');
        if (box) {
            return (box.value || box.textContent || "").trim();
        }
        return "";
    }""")

    if not cite_text:
        logger.debug("[DEBUG] 未从弹窗找到引用格式")
    return cite_text or ""


async def _extract_fields_with_js(page, content):
    """Fill ``content`` from a single in-page script; only non-empty values are copied.

    The script uses standard CSS selectors only.  The previous version passed
    Playwright's ``:has-text()`` to ``document.querySelector``, which throws
    in the browser, so every run ended in the script's own ``catch`` with
    empty fields.
    """
    # Non-raw string on purpose: "\\s" reaches the browser as the regex "\s".
    # The separator classes list both the ASCII and the full-width forms.
    content_result = await page.evaluate("""() => {
        try {
            // 提取标题
            const getTitle = () => {
                const selectors = ['h1.title', '.wx-tit h1', '.title', 'h1', '.article-title', 'div.brief h2', '.wxTitle', 'span.title'];
                for (const selector of selectors) {
                    const element = document.querySelector(selector);
                    if (element) {
                        const text = element.textContent.trim();
                        if (!text.includes('系统检测')) {
                            return text;
                        }
                    }
                }
                return "";
            };

            // 提取作者
            const getAuthors = () => {
                const selectors = ['.wx-tit .author', '.author', '.writers', '.authorinfo', 'div.brief p:first-child', 'span.author'];
                for (const selector of selectors) {
                    const element = document.querySelector(selector);
                    if (element) {
                        const text = element.textContent.trim();
                        return text.split(/[,,;;、\\s]+/).filter(a => a.trim());
                    }
                }
                return [];
            };

            // 提取摘要
            const getAbstract = () => {
                const selectors = ['#ChDivSummary', '.abstract', '.summary', '.Abstract', 'div.brief div.abstract', 'div.wxInfo span.abstract', 'div.wxInfo', 'span.abstract'];
                for (const selector of selectors) {
                    const element = document.querySelector(selector);
                    if (element) {
                        let text = element.textContent.trim();
                        // 移除可能的"摘要:"前缀
                        text = text.replace(/^摘要[::]/g, '').trim();
                        return text;
                    }
                }

                // 查找含有"摘要"的段落
                const paragraphs = document.querySelectorAll('p');
                for (const p of paragraphs) {
                    if (p.textContent.includes('摘要')) {
                        let text = p.textContent.trim();
                        text = text.replace(/^摘要[::]/g, '').trim();
                        return text;
                    }
                }

                return "";
            };

            // 提取关键词
            const getKeywords = () => {
                const selectors = ['.wx-tit-keys', '.keywords', '.Keyword', 'div.wxInfo span.keywords', 'span.keywords', 'div.brief span.keywords', 'p.keywords'];
                for (const selector of selectors) {
                    const element = document.querySelector(selector);
                    if (element) {
                        let text = element.textContent.trim();
                        // 移除"关键词:"前缀
                        text = text.replace(/^关键词[::]/g, '').trim();
                        return text.split(/[;;,,、\\s]+/).filter(k => k.trim());
                    }
                }

                // 查找含有"关键词"的段落
                const paragraphs = document.querySelectorAll('p');
                for (const p of paragraphs) {
                    if (p.textContent.includes('关键词')) {
                        let text = p.textContent.trim();
                        const keywordText = text.split(/关键词[::]/)[1];
                        if (keywordText) {
                            return keywordText.split(/[;;,,、\\s]+/).filter(k => k.trim());
                        }
                    }
                }

                return [];
            };

            // Citation text that is already present in the page. The cite
            // button is handled from Python, never clicked here.
            const getCiteFormat = () => {
                const selectors = ['.quote-info', '.citation', 'div.cite', 'div.quoted', 'div.wxInfo div.quoted', '.refer-info'];
                for (const selector of selectors) {
                    const element = document.querySelector(selector);
                    if (element) {
                        return element.textContent.trim();
                    }
                }

                return "";
            };

            // 收集结果
            return {
                title: getTitle(),
                authors: getAuthors(),
                abstract: getAbstract(),
                keywords: getKeywords(),
                cite_format: getCiteFormat()
            };
        } catch (e) {
            return {
                error: "提取内容时出错: " + e.toString(),
                title: "",
                authors: [],
                abstract: "",
                keywords: [],
                cite_format: ""
            };
        }
    }""")

    if not content_result:
        logger.warning("[WARNING] JavaScript提取内容返回空结果")
        return
    if content_result.get("error"):
        logger.warning(f"[WARNING] JavaScript提取内容时出错: {content_result['error']}")
        return

    logger.debug("[DEBUG] JavaScript提取内容成功")

    if content_result.get("title"):
        content.title = content_result["title"]
        logger.debug(f"[DEBUG] 提取到标题: {content.title}")

    if content_result.get("authors"):
        content.authors = content_result["authors"]
        logger.debug(f"[DEBUG] 提取到作者: {content.authors}")

    if content_result.get("abstract"):
        content.abstract = content_result["abstract"]
        logger.debug(f"[DEBUG] 提取到摘要: {content.abstract[:100]}...")

    if content_result.get("keywords"):
        content.keywords = content_result["keywords"]
        logger.debug(f"[DEBUG] 提取到关键词: {content.keywords}")

    if content_result.get("cite_format"):
        content.cite_format = content_result["cite_format"]
        logger.debug(f"[DEBUG] 提取到引用格式: {content.cite_format[:100]}...")


async def _extract_missing_fields(page, content):
    """Fill fields that are still empty using Playwright element lookups.

    Each field is attempted independently; a failure is logged and leaves the
    other fields untouched.
    """
    if not content.title:
        try:
            text = await _first_element_text(page, [
                '.wx-tit h1', '.article-title', '.title', 'h1',
                '.articleTitle', 'div.brief h2', '.wxTitle', 'span.title',
            ], "标题")
            if text is not None:
                content.title = text.strip()
                logger.debug(f"[DEBUG] 提取到标题: {content.title}")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取标题时出错: {str(e)}")

    if not content.authors:
        try:
            text = await _first_element_text(page, [
                '.wx-tit .author', '.author', '.writers', '.authorinfo',
                'div.brief p:first-child', 'span.author',
            ], "作者")
            if text is not None:
                # Split on ASCII and full-width separators plus whitespace.
                authors = [a.strip() for a in re.split(r'[,,;;、\s]+', text) if a.strip()]
                content.authors = authors
                logger.debug(f"[DEBUG] 提取到作者: {authors}")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取作者时出错: {str(e)}")

    if not content.abstract:
        try:
            text = await _first_element_text(page, [
                '#ChDivSummary', '.abstract', '.summary', '.Abstract',
                'div.brief div.abstract', 'div.wxInfo span.abstract',
                'div.wxInfo', 'span.abstract',
            ], "摘要")
            if text is not None:
                # Drop a leading "摘要:" label if present.
                content.abstract = re.sub(r'^摘要[::]\s*', '', text.strip())
                logger.debug(f"[DEBUG] 提取到摘要: {content.abstract[:100]}...")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取摘要时出错: {str(e)}")

    if not content.keywords:
        try:
            text = await _first_element_text(page, [
                '.wx-tit-keys', '.keywords', '.Keyword',
                'div.wxInfo span.keywords', 'span.keywords',
                'div.brief span.keywords', 'p.keywords',
            ], "关键词")
            if text is not None:
                # Drop a leading "关键词:" label, then split.
                text = re.sub(r'^关键词[::]\s*', '', text)
                keywords = [k.strip() for k in re.split(r'[;;,,、\s]+', text) if k.strip()]
                content.keywords = keywords
                logger.debug(f"[DEBUG] 提取到关键词: {keywords}")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取关键词时出错: {str(e)}")

    if not content.cite_format:
        try:
            text = await _first_element_text(page, [
                '.quote-info', '.citation', 'div.cite', 'div.quoted',
                'div.wxInfo div.quoted', '.refer-info',
            ], "引用格式")
            if text is not None:
                content.cite_format = text.strip()
                logger.debug(f"[DEBUG] 提取到引用格式: {content.cite_format[:100]}...")
        except Exception as e:
            logger.debug(f"[DEBUG] 提取引用格式时出错: {str(e)}")


async def extract_content_from_url(url: str, page = None) -> CNKIContent:
    """Extract title, authors, abstract, keywords and citation from a CNKI page.

    Args:
        url: Paper URL.  Values not starting with ``http`` are treated as
            relative to ``https://kns.cnki.net``.
        page: Optional Playwright page to reuse; the caller keeps ownership
            and it is not closed here.  When omitted, a page is opened on the
            module-level ``context`` and always closed before returning.  If
            Playwright is not initialised, the URL is opened with
            ``webbrowser`` and a placeholder result is returned.

    Returns:
        A ``CNKIContent`` for ``url``.  This function does not raise for
        extraction failures: on an unexpected error the title and abstract
        carry the error message.

    Extraction order: one in-page script, then per-field selector lookups for
    anything still empty, then (only if no citation was found) a single click
    on the cite button to read the popup.
    """
    global playwright_instance, browser_instance, context

    if not url.startswith('http'):
        # Resolve relative URLs against the CNKI host.
        url = f"https://kns.cnki.net{url}" if url.startswith('/') else f"https://kns.cnki.net/{url}"

    content = CNKIContent(url=url)
    should_close_page = False  # True only for a page opened by this call

    try:
        logger.info(f"开始从URL提取内容: {url}")

        if page is None:
            if playwright_instance is None or browser_instance is None or context is None:
                # No browser automation available: hand the URL to the user.
                logger.info(f"Playwright未初始化,使用webbrowser打开URL: {url}")
                webbrowser.open(url)

                content.title = "请在浏览器中手动获取内容"
                content.abstract = "系统已打开链接,请在浏览器中查看完整内容"
                return content

            logger.debug("[DEBUG] 使用现有的playwright实例创建新页面")
            page = await context.new_page()
            should_close_page = True

        logger.debug(f"[DEBUG] 导航到URL: {url}")

        try:
            await page.goto(url, wait_until='networkidle', timeout=30000)
        except Exception as e:
            # Best effort: the page may still be usable after a slow load.
            logger.warning(f"导航超时,继续尝试提取: {str(e)}")

        await asyncio.sleep(2)

        # Sibling helper defined elsewhere in this module; presumably handles
        # a verification challenge -- confirm against its definition.
        await check_and_wait_for_verification(page)

        try:
            logger.debug("[DEBUG] 尝试使用JavaScript提取内容")
            await _extract_fields_with_js(page, content)
        except Exception as e:
            logger.debug(f"[DEBUG] 使用JavaScript提取内容时出错: {str(e)}")
            logger.debug(traceback.format_exc())

        await _extract_missing_fields(page, content)

        if not content.cite_format:
            # Click the cite button at most once, and only as a last resort.
            try:
                logger.debug("[DEBUG] 尝试点击引用按钮获取引用格式")
                cite_text = await _read_citation_popup(page)
                if cite_text:
                    content.cite_format = cite_text
                    logger.debug(f"[DEBUG] 从弹窗提取到引用格式: {content.cite_format[:100]}...")
            except Exception as e:
                logger.debug(f"[DEBUG] 点击引用按钮时出错: {str(e)}")

        return content
    except Exception as e:
        logger.error(f"从URL提取内容时出错: {str(e)}")
        logger.error(traceback.format_exc())

        content.title = f"提取失败: {str(e)}"
        content.abstract = f"从URL提取内容时出错: {str(e)}"
        return content
    finally:
        # Close a page we opened on every path; a close failure must not
        # overwrite an otherwise successful result.
        if should_close_page:
            try:
                await page.close()
            except Exception as close_err:
                logger.debug(f"[DEBUG] 关闭页面时出错: {str(close_err)}")
|
1672
|
-
|
1673
|
-
async def batch_extract_contents(urls: List[str]) -> List[Dict]:
    """Extract paper details for up to 50 URLs, one page at a time.

    If Playwright is not initialised, a driver, a headed Chromium browser and
    a context are started and stored in the module-level globals.  For each
    URL a fresh page is opened, passed to ``extract_content_from_url`` and
    closed again, followed by a one-second pause.

    Args:
        urls: Candidate URLs; only the first 50 are processed.

    Returns:
        One dict per processed URL: either the extracted content's ``dict()``
        or an error record with empty fields.  If the batch itself fails, an
        ``{"error": ...}`` entry is placed in front of whatever was collected.
    """
    global playwright_instance, browser_instance, context

    collected = []
    limit = min(50, len(urls))  # never process more than 50 URLs

    logger.info(f"开始批量提取内容,共 {limit} 个URL")

    try:
        handles = (playwright_instance, browser_instance, context)
        if any(handle is None for handle in handles):
            logger.info("Playwright未初始化,创建新实例")
            playwright_instance = await async_playwright().start()
            browser_instance = await playwright_instance.chromium.launch(headless=False)
            context = await browser_instance.new_context()

        for position, target in enumerate(urls[:limit], start=1):
            logger.info(f"处理URL {position}/{limit}: {target}")

            tab = await context.new_page()
            try:
                extracted = await extract_content_from_url(target, tab)
                collected.append(extracted.dict())
                logger.info(f"成功处理URL: {target}")
            except Exception as err:
                logger.error(f"处理URL {target} 时出错: {err}")
                failure = {
                    "url": target,
                    "error": str(err),
                    "title": "",
                    "authors": [],
                    "abstract": "",
                    "keywords": [],
                    "cite_format": ""
                }
                collected.append(failure)
            finally:
                await tab.close()

            # Short pause between requests to avoid being blocked.
            await asyncio.sleep(1)

        logger.info(f"批量处理完成,共处理 {len(collected)} 个URL")
        return collected
    except Exception as err:
        logger.error(f"批量处理过程中出错: {err}")
        logger.error(traceback.format_exc())
        batch_failure = {"error": f"批量处理过程中出错: {err}"}
        return [batch_failure] + collected
|
1726
|
-
|
1727
|
-
# 添加关闭函数,在程序结束时清理资源
|
1728
|
-
async def cleanup_playwright():
    """Release the module-level Playwright context, browser and driver.

    Resources are closed in that order.  Each step is isolated: a failure is
    logged as a warning and the remaining resources are still closed.  Every
    global that was set is reset to ``None``, so a later call is a no-op for
    it.  Previously one failing ``close()`` aborted the routine and left the
    browser and driver running.
    """
    global playwright_instance, browser_instance, context

    if context:
        logger.debug("[DEBUG] 关闭playwright上下文")
        try:
            await context.close()
        except Exception as e:
            logger.warning(f"关闭playwright上下文时出错: {str(e)}")
        finally:
            context = None

    if browser_instance:
        logger.debug("[DEBUG] 关闭浏览器实例")
        try:
            await browser_instance.close()
        except Exception as e:
            logger.warning(f"关闭浏览器实例时出错: {str(e)}")
        finally:
            browser_instance = None

    if playwright_instance:
        logger.debug("[DEBUG] 关闭playwright实例")
        try:
            await playwright_instance.stop()
        except Exception as e:
            logger.warning(f"关闭playwright实例时出错: {str(e)}")
        finally:
            playwright_instance = None
|
1746
|
-
|
1747
|
-
async def main():
    """Serve the MCP server over stdin/stdout, then release Playwright.

    ``cleanup_playwright`` runs in a ``finally`` clause, so it is called
    whether the server stops normally or with an exception.
    """
    try:
        async with mcp.server.stdio.stdio_server() as streams:
            read_stream, write_stream = streams

            capabilities = server.get_capabilities(
                notification_options=NotificationOptions(),
                experimental_capabilities={},
            )
            init_options = InitializationOptions(
                server_name="cnks",
                server_version="0.3.0",
                capabilities=capabilities,
            )

            await server.run(read_stream, write_stream, init_options)
    finally:
        await cleanup_playwright()
|
1767
|
-
|
1768
|
-
# 为符合README.md的要求,添加从FastMCP导出的接口
|
1769
|
-
def create_fastmcp_server():
    """Build the FastMCP server named "知网搜索".

    The ``search_and_extract`` tool is registered only when
    ``PLAYWRIGHT_AVAILABLE`` is truthy.

    Returns:
        The ``FastMCP`` instance, or ``None`` (after logging a warning) when
        ``mcp.server.fastmcp`` cannot be imported.
    """
    try:
        from mcp.server.fastmcp import FastMCP

        fast_mcp = FastMCP("知网搜索")

        if PLAYWRIGHT_AVAILABLE:
            # Tool name, parameter name and docstring are kept verbatim:
            # FastMCP presumably publishes them to clients -- confirm before
            # translating or renaming.
            @fast_mcp.tool()
            async def search_and_extract(keywords: str) -> dict:
                """搜索关键词并提取所有论文的详细内容"""
                logger.debug("[DEBUG] 正在使用FastMCP的search_and_extract函数")
                try:
                    # Step 1: run the search.
                    logger.debug(f"[DEBUG] 开始搜索关键词: {keywords}")

                    url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
                    logger.debug(f"[DEBUG] 搜索URL: {url}")

                    # NOTE(review): this tool is only registered when
                    # PLAYWRIGHT_AVAILABLE is truthy, so this branch looks
                    # unreachable unless the flag changes at runtime.
                    if not PLAYWRIGHT_AVAILABLE:
                        logger.debug("[DEBUG] 直接使用open_chrome打开URL")
                        opened = open_chrome(url)

                        if isinstance(opened, str):
                            # A string result is an error message.
                            return {
                                "error": f"打开Chrome失败: {opened}",
                                "keywords": keywords,
                                "count": 0,
                                "results": []
                            }
                        # Opened, but links cannot be collected without Playwright.
                        return {
                            "keywords": keywords,
                            "count": 0,
                            "message": "已直接在Chrome中打开搜索页面,但无法自动获取搜索结果。请安装playwright以获取完整功能。",
                            "results": []
                        }

                    logger.debug("[DEBUG] 使用playwright搜索")
                    await search_with_playwright(keywords)

                    def search_page_only():
                        # Payload pointing at the search page itself.
                        return {
                            "keywords": keywords,
                            "count": 1,
                            "results": [{
                                "title": f"搜索结果: {keywords}",
                                "authors": [],
                                "abstract": "请在浏览器中查看搜索结果",
                                "keywords": [],
                                "cite_format": "",
                                "url": url
                            }]
                        }

                    # The links are read from the module-level page_content.
                    has_links = (
                        isinstance(page_content, dict)
                        and "links" in page_content
                        and page_content["links"]
                    )
                    if not has_links:
                        logger.debug("[DEBUG] 搜索未返回有效链接,返回搜索页面作为结果")
                        return search_page_only()

                    urls = [entry["url"] for entry in page_content["links"] if "url" in entry]
                    if not urls:
                        logger.debug("[DEBUG] 没有找到有效链接,返回搜索页面")
                        return search_page_only()

                    # Step 2: extract every paper.
                    results = await batch_extract_contents(urls)

                    failed = sum(1 for item in results if "error" in item and item["error"])
                    return {
                        "keywords": keywords,
                        "count": len(results),
                        "results": results,
                        "success_count": len(results) - failed,
                        "error_count": failed
                    }
                except Exception as err:
                    logger.error(f"搜索并提取时出错: {err}")
                    logger.error(traceback.format_exc())
                    return {
                        "error": f"搜索并提取内容时出错: {err}",
                        "keywords": keywords,
                        "count": 0,
                        "results": []
                    }

        return fast_mcp
    except ImportError:
        logger.warning("警告: 无法导入FastMCP,请确保已安装最新版本的MCP")
        return None
|
1874
|
-
|
1875
|
-
# Script entry point: run the stdio MCP server until it exits.
if __name__ == "__main__":
    asyncio.run(main())
|