cnks 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks/server.py +1348 -304
- {cnks-0.2.3.dist-info → cnks-0.2.4.dist-info}/METADATA +4 -4
- cnks-0.2.4.dist-info/RECORD +6 -0
- cnks/chrome_extractor.py +0 -413
- cnks/extractor.py +0 -250
- cnks-0.2.3.dist-info/RECORD +0 -8
- {cnks-0.2.3.dist-info → cnks-0.2.4.dist-info}/WHEEL +0 -0
- {cnks-0.2.3.dist-info → cnks-0.2.4.dist-info}/entry_points.txt +0 -0
cnks/server.py
CHANGED
@@ -12,11 +12,12 @@ import traceback
|
|
12
12
|
from pathlib import Path
|
13
13
|
from urllib.parse import quote
|
14
14
|
from typing import Dict, List, Any, Optional, Union
|
15
|
+
from datetime import datetime
|
16
|
+
from pydantic import BaseModel, AnyUrl
|
15
17
|
|
16
18
|
from mcp.server.models import InitializationOptions
|
17
19
|
import mcp.types as types
|
18
20
|
from mcp.server import NotificationOptions, Server
|
19
|
-
from pydantic import AnyUrl
|
20
21
|
import mcp.server.stdio
|
21
22
|
|
22
23
|
# 配置日志记录
|
@@ -36,27 +37,38 @@ except ImportError:
|
|
36
37
|
PLAYWRIGHT_AVAILABLE = False
|
37
38
|
logger.warning("Playwright未安装,将使用传统方式打开Chrome")
|
38
39
|
|
40
|
+
# 定义数据模型
|
41
|
+
class CNKIContent(BaseModel):
    """Content model for a single CNKI paper.

    Every field defaults to an empty value (empty string or empty list).
    """

    title: str = ""
    authors: List[str] = []
    abstract: str = ""
    keywords: List[str] = []
    cite_format: str = ""
    # URL field, added so the source of the content is recorded.
    url: str = ""
|
49
|
+
|
39
50
|
# 存储当前页面内容和笔记
|
40
51
|
page_content = ""
|
41
52
|
current_url = ""
|
42
53
|
notes: dict[str, str] = {}
|
43
|
-
browser_instance = None
|
44
54
|
|
45
55
|
server = Server("cnks")
|
46
56
|
|
47
|
-
#
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
try:
|
52
|
-
import chrome_extractor as extractor
|
53
|
-
except ImportError:
|
54
|
-
extractor = None
|
55
|
-
logger.warning("无法导入chrome_extractor模块,批量提取功能将不可用")
|
57
|
+
# 添加全局变量来跟踪playwright状态
|
58
|
+
playwright_instance = None
|
59
|
+
browser_instance = None
|
60
|
+
context = None
|
56
61
|
|
57
62
|
def find_chrome_executable():
|
58
63
|
"""查找Chrome可执行文件路径"""
|
64
|
+
# 首先检查环境变量
|
65
|
+
chrome_env = os.environ.get("CHROME_PATH")
|
66
|
+
if chrome_env and os.path.exists(chrome_env):
|
67
|
+
logger.debug(f"[DEBUG] 从环境变量找到Chrome: {chrome_env}")
|
68
|
+
return chrome_env
|
69
|
+
|
59
70
|
system = platform.system()
|
71
|
+
logger.debug(f"[DEBUG] 系统类型: {system}")
|
60
72
|
|
61
73
|
# 定义可能的Chrome位置
|
62
74
|
if system == "Windows":
|
@@ -64,48 +76,90 @@ def find_chrome_executable():
|
|
64
76
|
r"C:\Program Files\Google\Chrome\Application\chrome.exe",
|
65
77
|
r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
|
66
78
|
os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
|
79
|
+
# Edge浏览器也是基于Chromium的
|
80
|
+
r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
|
81
|
+
r"C:\Program Files\Microsoft\Edge\Application\msedge.exe",
|
67
82
|
]
|
68
83
|
elif system == "Darwin": # MacOS
|
69
84
|
chrome_paths = [
|
70
85
|
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
71
86
|
os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
|
87
|
+
"/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
|
72
88
|
]
|
73
89
|
elif system == "Linux":
|
74
90
|
chrome_paths = [
|
75
91
|
"/usr/bin/google-chrome",
|
76
92
|
"/usr/bin/chromium-browser",
|
77
93
|
"/usr/bin/chromium",
|
94
|
+
"/usr/bin/microsoft-edge",
|
78
95
|
]
|
79
96
|
else:
|
97
|
+
logger.debug(f"[DEBUG] 未知系统类型: {system}")
|
80
98
|
return None
|
81
99
|
|
82
100
|
# 检查路径是否存在
|
83
101
|
for path in chrome_paths:
|
84
102
|
if os.path.exists(path):
|
103
|
+
logger.debug(f"[DEBUG] 找到Chrome: {path}")
|
85
104
|
return path
|
86
105
|
|
87
|
-
#
|
88
|
-
|
89
|
-
|
90
|
-
|
106
|
+
# 如果上述路径都不存在,尝试使用which命令查找
|
107
|
+
try:
|
108
|
+
if system != "Windows":
|
109
|
+
# 在Unix系统上尝试使用which命令
|
110
|
+
for browser in ["google-chrome", "chromium", "chromium-browser", "microsoft-edge"]:
|
111
|
+
try:
|
112
|
+
result = subprocess.check_output(["which", browser], universal_newlines=True).strip()
|
113
|
+
if result and os.path.exists(result):
|
114
|
+
logger.debug(f"[DEBUG] 使用which命令找到浏览器: {result}")
|
115
|
+
return result
|
116
|
+
except subprocess.CalledProcessError:
|
117
|
+
pass
|
118
|
+
else:
|
119
|
+
# 在Windows上尝试使用where命令
|
120
|
+
try:
|
121
|
+
result = subprocess.check_output(["where", "chrome"], universal_newlines=True).strip()
|
122
|
+
if result:
|
123
|
+
# where可能返回多行,取第一行
|
124
|
+
first_path = result.split('\n')[0].strip()
|
125
|
+
if os.path.exists(first_path):
|
126
|
+
logger.debug(f"[DEBUG] 使用where命令找到Chrome: {first_path}")
|
127
|
+
return first_path
|
128
|
+
except subprocess.CalledProcessError:
|
129
|
+
pass
|
130
|
+
|
131
|
+
# 尝试查找Edge
|
132
|
+
try:
|
133
|
+
result = subprocess.check_output(["where", "msedge"], universal_newlines=True).strip()
|
134
|
+
if result:
|
135
|
+
first_path = result.split('\n')[0].strip()
|
136
|
+
if os.path.exists(first_path):
|
137
|
+
logger.debug(f"[DEBUG] 使用where命令找到Edge: {first_path}")
|
138
|
+
return first_path
|
139
|
+
except subprocess.CalledProcessError:
|
140
|
+
pass
|
141
|
+
except Exception as e:
|
142
|
+
logger.debug(f"[DEBUG] 使用命令行查找浏览器时出错: {str(e)}")
|
91
143
|
|
144
|
+
logger.debug("[DEBUG] 未找到Chrome或兼容的浏览器")
|
92
145
|
return None
|
93
146
|
|
94
147
|
def open_chrome(url):
    """Open ``url`` in the system default browser and pause briefly.

    Despite the name, no Chrome binary is started directly: the URL is
    handed to ``webbrowser.open``, which uses whichever browser the system
    has registered as the default.

    Args:
        url: Address to open.

    Returns:
        ``True`` when the browser was launched, otherwise a ``str`` holding
        an error message. Callers tell the two apart with
        ``isinstance(result, str)``.
    """
    try:
        logger.debug(f"[DEBUG] open_chrome函数被调用,URL: {url}")

        logger.debug(f"[DEBUG] 尝试使用webbrowser.open打开URL: {url}")
        launched = webbrowser.open(url)
        logger.debug("[DEBUG] webbrowser.open调用完成")

        # webbrowser.open signals failure through its return value instead
        # of raising, so ignoring it would report success even when no
        # browser could be started.
        if not launched:
            logger.debug("[DEBUG] open_chrome函数出错: webbrowser.open返回False")
            return "打开Chrome时出错: 无法启动浏览器"

        # Give the page a moment to load before returning.
        time.sleep(2)
        logger.debug("[DEBUG] open_chrome函数执行完毕")
        return True
    except Exception as e:
        logger.debug(f"[DEBUG] open_chrome函数出错: {str(e)}")
        return f"打开Chrome时出错: {str(e)}"
|
110
164
|
|
111
165
|
async def search_with_playwright(keywords):
|
@@ -116,234 +170,337 @@ async def search_with_playwright(keywords):
|
|
116
170
|
return "需要安装playwright模块:uv add playwright"
|
117
171
|
|
118
172
|
try:
|
119
|
-
|
120
|
-
if not chrome_path:
|
121
|
-
return "未找到Chrome可执行文件。请设置CHROME_PATH环境变量指向Chrome位置。"
|
173
|
+
logger.debug(f"[DEBUG] 使用搜索功能,关键词: {keywords}")
|
122
174
|
|
123
|
-
|
175
|
+
# 先访问知网首页而不是直接访问搜索结果页
|
176
|
+
initial_url = "https://kns.cnki.net/"
|
177
|
+
search_url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
|
178
|
+
logger.debug(f"[DEBUG] 初始URL: {initial_url}")
|
124
179
|
|
125
|
-
#
|
126
|
-
global browser_instance
|
180
|
+
# 创建全局变量来跟踪playwright状态
|
181
|
+
global playwright_instance, browser_instance, context
|
127
182
|
|
128
|
-
#
|
129
|
-
|
183
|
+
# 查找Chrome路径
|
184
|
+
chrome_path = find_chrome_executable()
|
185
|
+
if not chrome_path:
|
186
|
+
logger.warning("[WARNING] 未找到Chrome可执行文件,将使用默认浏览器")
|
187
|
+
# 使用webbrowser模块打开
|
188
|
+
webbrowser.open(search_url)
|
189
|
+
# 构造一个基本结果
|
190
|
+
page_content = {
|
191
|
+
"count": 1,
|
192
|
+
"links": [{
|
193
|
+
"index": 1,
|
194
|
+
"url": search_url,
|
195
|
+
"title": f"搜索: {keywords}"
|
196
|
+
}]
|
197
|
+
}
|
198
|
+
return 1
|
199
|
+
|
200
|
+
logger.debug(f"[DEBUG] 找到Chrome路径: {chrome_path}")
|
130
201
|
|
131
|
-
#
|
132
|
-
|
133
|
-
logger.debug("[DEBUG]
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
202
|
+
# 检查playwright是否已经运行
|
203
|
+
if 'playwright_instance' not in globals() or playwright_instance is None:
|
204
|
+
logger.debug("[DEBUG] 初始化新的playwright实例")
|
205
|
+
# 第一次运行,初始化playwright
|
206
|
+
playwright_instance = await async_playwright().start()
|
207
|
+
|
208
|
+
# 设置启动选项
|
209
|
+
browser_args = []
|
210
|
+
|
211
|
+
# 使用系统已安装的Chrome
|
212
|
+
if chrome_path:
|
213
|
+
browser_args.extend([
|
214
|
+
'--no-sandbox', # 在某些环境中可能需要
|
215
|
+
'--start-maximized' # 最大化窗口
|
216
|
+
])
|
217
|
+
|
218
|
+
# 启动浏览器 - 尝试使用系统Chrome
|
219
|
+
try:
|
220
|
+
# 首先尝试使用chrome_path启动
|
221
|
+
logger.debug(f"[DEBUG] 尝试使用系统Chrome启动: {chrome_path}")
|
222
|
+
browser_instance = await playwright_instance.chromium.launch(
|
223
|
+
headless=False, # 显示浏览器界面
|
224
|
+
executable_path=chrome_path,
|
225
|
+
args=browser_args
|
226
|
+
)
|
227
|
+
except Exception as e:
|
228
|
+
logger.warning(f"[WARNING] 使用系统Chrome启动失败: {str(e)},尝试使用默认浏览器")
|
229
|
+
# 如果失败,使用默认浏览器
|
230
|
+
browser_instance = await playwright_instance.chromium.launch(
|
231
|
+
headless=False # 显示浏览器界面
|
232
|
+
)
|
233
|
+
|
234
|
+
# 创建上下文
|
235
|
+
context = await browser_instance.new_context(
|
236
|
+
viewport=None # 不限制视窗大小,使用浏览器默认设置
|
145
237
|
)
|
238
|
+
|
239
|
+
# 创建新页面
|
240
|
+
page = await context.new_page()
|
241
|
+
|
242
|
+
# 访问初始URL(知网首页)
|
243
|
+
logger.debug(f"[DEBUG] 导航到知网首页: {initial_url}")
|
244
|
+
await page.goto(initial_url)
|
245
|
+
logger.debug("[DEBUG] 已打开新的浏览器窗口并访问知网首页")
|
246
|
+
else:
|
247
|
+
logger.debug("[DEBUG] 在现有playwright实例中打开新标签页")
|
248
|
+
# playwright已经在运行,创建新标签页
|
249
|
+
page = await context.new_page()
|
250
|
+
# 访问初始URL(知网首页)
|
251
|
+
await page.goto(initial_url)
|
252
|
+
logger.debug("[DEBUG] 已在现有浏览器中打开新标签页并访问知网首页")
|
146
253
|
|
147
|
-
#
|
148
|
-
|
149
|
-
|
150
|
-
page = await browser.new_page()
|
151
|
-
|
152
|
-
# 导航到知网搜索页面
|
153
|
-
await page.goto("https://kns.cnki.net/kns8s/search")
|
154
|
-
logger.debug("[DEBUG] 成功打开知网搜索页面")
|
254
|
+
# 等待页面加载完成
|
255
|
+
await page.wait_for_load_state('networkidle')
|
256
|
+
await asyncio.sleep(1)
|
155
257
|
|
156
|
-
#
|
157
|
-
await page
|
258
|
+
# 检查是否需要验证
|
259
|
+
await check_and_wait_for_verification(page)
|
158
260
|
|
159
|
-
#
|
261
|
+
# 尝试执行搜索操作
|
160
262
|
try:
|
161
|
-
#
|
162
|
-
|
263
|
+
# 方法1: 尝试在首页搜索框输入关键词
|
264
|
+
logger.debug("[DEBUG] 尝试在首页查找搜索框")
|
265
|
+
|
266
|
+
# 查找搜索框
|
267
|
+
search_input_selectors = [
|
268
|
+
'#txt_search',
|
269
|
+
'input[type="text"]',
|
270
|
+
'.search-input',
|
271
|
+
'.input-box input',
|
272
|
+
'input.search-textbox',
|
273
|
+
'input[placeholder*="搜索"]'
|
274
|
+
]
|
275
|
+
|
276
|
+
search_input = None
|
277
|
+
for selector in search_input_selectors:
|
278
|
+
try:
|
279
|
+
logger.debug(f"[DEBUG] 尝试查找搜索框选择器: {selector}")
|
280
|
+
search_input = await page.query_selector(selector)
|
281
|
+
if search_input:
|
282
|
+
logger.debug(f"[DEBUG] 找到搜索框: {selector}")
|
283
|
+
break
|
284
|
+
except Exception as e:
|
285
|
+
logger.debug(f"[DEBUG] 查找选择器 {selector} 时出错: {str(e)}")
|
286
|
+
|
163
287
|
if search_input:
|
164
288
|
# 清空搜索框
|
165
289
|
await search_input.fill("")
|
166
290
|
# 输入关键词
|
167
|
-
await search_input.
|
168
|
-
logger.debug(f"[DEBUG]
|
291
|
+
await search_input.type(keywords, delay=100) # 添加延迟模拟真实输入
|
292
|
+
logger.debug(f"[DEBUG] 已在搜索框中输入关键词: {keywords}")
|
169
293
|
|
170
|
-
#
|
171
|
-
|
294
|
+
# 查找搜索按钮
|
295
|
+
search_button_selectors = [
|
296
|
+
'button.search-btn',
|
297
|
+
'button.search',
|
298
|
+
'button[type="submit"]',
|
299
|
+
'input[type="submit"]',
|
300
|
+
'.search-action',
|
301
|
+
'a.search-btn'
|
302
|
+
]
|
303
|
+
|
304
|
+
search_button = None
|
305
|
+
for selector in search_button_selectors:
|
306
|
+
try:
|
307
|
+
logger.debug(f"[DEBUG] 尝试查找搜索按钮选择器: {selector}")
|
308
|
+
search_button = await page.query_selector(selector)
|
309
|
+
if search_button:
|
310
|
+
logger.debug(f"[DEBUG] 找到搜索按钮: {selector}")
|
311
|
+
break
|
312
|
+
except Exception as e:
|
313
|
+
logger.debug(f"[DEBUG] 查找选择器 {selector} 时出错: {str(e)}")
|
172
314
|
|
173
|
-
# 查找并点击搜索按钮
|
174
|
-
search_button = await page.query_selector('.search-btn')
|
175
315
|
if search_button:
|
316
|
+
# 点击搜索按钮
|
317
|
+
logger.debug("[DEBUG] 点击搜索按钮")
|
176
318
|
await search_button.click()
|
177
|
-
logger.debug("[DEBUG] 已点击搜索按钮")
|
178
|
-
# 等待搜索结果加载
|
179
|
-
await page.wait_for_load_state("networkidle")
|
180
319
|
|
181
|
-
#
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
# 尝试点击排序下拉框
|
187
|
-
logger.debug("[DEBUG] 尝试点击排序下拉框")
|
188
|
-
# 根据提供的HTML,尝试定位下拉框的三角形
|
189
|
-
sort_dropdown = await page.query_selector('div[class="sort"][id="perPageDiv"]')
|
190
|
-
if sort_dropdown:
|
191
|
-
await sort_dropdown.click()
|
192
|
-
logger.debug("[DEBUG] 成功点击排序下拉框")
|
193
|
-
|
194
|
-
# 等待下拉菜单出现
|
195
|
-
await asyncio.sleep(1)
|
196
|
-
|
197
|
-
# 点击操作2:点击数字50选项
|
198
|
-
logger.debug("[DEBUG] 尝试点击'50'选项")
|
199
|
-
# 尝试定位"50"选项
|
200
|
-
option_50 = await page.query_selector('li[data-val="50"]')
|
201
|
-
if option_50:
|
202
|
-
await option_50.click()
|
203
|
-
logger.debug("[DEBUG] 成功点击'50'选项")
|
204
|
-
await page.wait_for_load_state("networkidle")
|
205
|
-
|
206
|
-
# 勾选来源类别中的CSSCI选项
|
207
|
-
try:
|
208
|
-
# 等待一下确保页面完全加载
|
209
|
-
await asyncio.sleep(2)
|
210
|
-
|
211
|
-
logger.debug("[DEBUG] 尝试勾选CSSCI选项")
|
212
|
-
|
213
|
-
# 首先尝试找到来源类别区域
|
214
|
-
# 通常来源类别会有一个标题或者分组
|
215
|
-
source_category = await page.query_selector('div.group-item:has-text("来源类别")')
|
216
|
-
|
217
|
-
if source_category:
|
218
|
-
logger.debug("[DEBUG] 找到来源类别区域")
|
219
|
-
|
220
|
-
# 在来源类别区域内查找CSSCI选项
|
221
|
-
cssci_checkbox = await source_category.query_selector('input[type="checkbox"]:near(:text("CSSCI"))')
|
222
|
-
|
223
|
-
if cssci_checkbox:
|
224
|
-
# 点击CSSCI复选框
|
225
|
-
await cssci_checkbox.click()
|
226
|
-
logger.debug("[DEBUG] 成功勾选CSSCI选项")
|
227
|
-
|
228
|
-
# 等待页面刷新
|
229
|
-
await page.wait_for_load_state("networkidle")
|
230
|
-
|
231
|
-
# 查找所有包含"article/abstract?v="字样的链接
|
232
|
-
links_count = await find_and_count_abstract_links(page)
|
233
|
-
|
234
|
-
return links_count
|
235
|
-
else:
|
236
|
-
logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")
|
237
|
-
|
238
|
-
# 尝试另一种方式:直接在整个页面中查找CSSCI
|
239
|
-
cssci_text = await page.query_selector(':text("CSSCI")')
|
240
|
-
if cssci_text:
|
241
|
-
# 尝试点击文本附近的复选框
|
242
|
-
await cssci_text.click()
|
243
|
-
logger.debug("[DEBUG] 通过文本找到并点击了CSSCI")
|
244
|
-
await page.wait_for_load_state("networkidle")
|
245
|
-
|
246
|
-
# 查找所有包含"article/abstract?v="字样的链接
|
247
|
-
links_count = await find_and_count_abstract_links(page)
|
248
|
-
|
249
|
-
return links_count
|
250
|
-
else:
|
251
|
-
# 查找所有包含"article/abstract?v="字样的链接
|
252
|
-
links_count = await find_and_count_abstract_links(page)
|
253
|
-
return links_count
|
254
|
-
else:
|
255
|
-
logger.debug("[DEBUG] 未找到来源类别区域")
|
256
|
-
|
257
|
-
# 尝试直接在页面中查找CSSCI文本
|
258
|
-
cssci_text = await page.query_selector(':text("CSSCI")')
|
259
|
-
if cssci_text:
|
260
|
-
# 尝试点击文本附近的复选框
|
261
|
-
await cssci_text.click()
|
262
|
-
logger.debug("[DEBUG] 直接找到并点击了CSSCI")
|
263
|
-
await page.wait_for_load_state("networkidle")
|
264
|
-
|
265
|
-
# 查找所有包含"article/abstract?v="字样的链接
|
266
|
-
links_count = await find_and_count_abstract_links(page)
|
267
|
-
|
268
|
-
return links_count
|
269
|
-
else:
|
270
|
-
# 查找所有包含"article/abstract?v="字样的链接
|
271
|
-
links_count = await find_and_count_abstract_links(page)
|
272
|
-
return links_count
|
273
|
-
except Exception as e:
|
274
|
-
logger.debug(f"[DEBUG] 勾选CSSCI选项时出错: {str(e)}")
|
275
|
-
# 查找所有包含"article/abstract?v="字样的链接
|
276
|
-
links_count = await find_and_count_abstract_links(page)
|
277
|
-
return links_count
|
278
|
-
|
279
|
-
# 查找所有包含"article/abstract?v="字样的链接
|
280
|
-
links_count = await find_and_count_abstract_links(page)
|
281
|
-
return links_count
|
282
|
-
else:
|
283
|
-
logger.debug("[DEBUG] 未找到'50'选项")
|
284
|
-
page_content = {
|
285
|
-
"count": 0,
|
286
|
-
"links": [],
|
287
|
-
"error": "已搜索并点击下拉框,但未找到'50'选项"
|
288
|
-
}
|
289
|
-
return 0
|
290
|
-
else:
|
291
|
-
logger.debug("[DEBUG] 未找到排序下拉框")
|
292
|
-
page_content = {
|
293
|
-
"count": 0,
|
294
|
-
"links": [],
|
295
|
-
"error": "已搜索,但未找到排序下拉框"
|
296
|
-
}
|
297
|
-
return 0
|
298
|
-
except Exception as e:
|
299
|
-
logger.debug(f"[DEBUG] 点击下拉框或选项时出错: {str(e)}")
|
300
|
-
page_content = {
|
301
|
-
"count": 0,
|
302
|
-
"links": [],
|
303
|
-
"error": f"已搜索,但在点击下拉框或选项时出错: {str(e)}"
|
304
|
-
}
|
305
|
-
return 0
|
320
|
+
# 等待搜索结果加载
|
321
|
+
logger.debug("[DEBUG] 等待搜索结果加载")
|
322
|
+
await page.wait_for_load_state('networkidle')
|
323
|
+
await asyncio.sleep(2)
|
306
324
|
else:
|
307
|
-
#
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
325
|
+
# 如果找不到搜索按钮,尝试按回车
|
326
|
+
logger.debug("[DEBUG] 未找到搜索按钮,尝试按回车键")
|
327
|
+
await search_input.press("Enter")
|
328
|
+
|
329
|
+
# 等待搜索结果加载
|
330
|
+
logger.debug("[DEBUG] 等待搜索结果加载")
|
331
|
+
await page.wait_for_load_state('networkidle')
|
332
|
+
await asyncio.sleep(2)
|
314
333
|
else:
|
315
|
-
#
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
}
|
321
|
-
return 0
|
334
|
+
# 如果找不到搜索框,直接导航到搜索URL
|
335
|
+
logger.debug(f"[DEBUG] 未找到搜索框,直接导航到搜索URL: {search_url}")
|
336
|
+
await page.goto(search_url)
|
337
|
+
await page.wait_for_load_state('networkidle')
|
338
|
+
await asyncio.sleep(2)
|
322
339
|
except Exception as e:
|
323
|
-
logger.debug(f"[DEBUG]
|
324
|
-
|
340
|
+
logger.debug(f"[DEBUG] 执行搜索操作时出错: {str(e)}")
|
341
|
+
logger.debug(traceback.format_exc())
|
342
|
+
|
343
|
+
# 如果交互失败,直接导航到搜索URL
|
344
|
+
logger.debug(f"[DEBUG] 导航到搜索URL: {search_url}")
|
345
|
+
await page.goto(search_url)
|
346
|
+
await page.wait_for_load_state('networkidle')
|
347
|
+
await asyncio.sleep(2)
|
348
|
+
|
349
|
+
# 在搜索结果页面再次检查是否需要验证
|
350
|
+
await check_and_wait_for_verification(page)
|
351
|
+
|
352
|
+
# 查找并计数链接
|
353
|
+
links_count = await find_and_count_abstract_links(page)
|
354
|
+
|
355
|
+
# 添加等待时间让用户可以查看结果
|
356
|
+
await asyncio.sleep(5)
|
357
|
+
|
358
|
+
logger.debug(f"[DEBUG] 搜索完成,找到 {links_count} 个链接")
|
359
|
+
|
360
|
+
# 如果找不到链接,使用基本信息构造结果
|
361
|
+
if links_count == 0:
|
362
|
+
# 获取当前URL
|
363
|
+
current_url = await page.url()
|
364
|
+
page_content = {
|
365
|
+
"count": 1,
|
366
|
+
"links": [{
|
367
|
+
"index": 1,
|
368
|
+
"url": current_url,
|
369
|
+
"title": f"搜索: {keywords}"
|
370
|
+
}]
|
371
|
+
}
|
372
|
+
|
373
|
+
return links_count
|
374
|
+
except Exception as e:
|
375
|
+
error_msg = str(e)
|
376
|
+
logger.debug(f"[DEBUG] 搜索错误: {error_msg}")
|
377
|
+
logger.debug(traceback.format_exc())
|
378
|
+
|
379
|
+
# 尝试直接使用webbrowser打开
|
380
|
+
try:
|
381
|
+
logger.debug("[DEBUG] 尝试使用webbrowser打开URL")
|
382
|
+
webbrowser.open(search_url)
|
383
|
+
|
384
|
+
# 构造一个基本结果
|
385
|
+
page_content = {
|
386
|
+
"count": 1,
|
387
|
+
"links": [{
|
388
|
+
"index": 1,
|
389
|
+
"url": search_url,
|
390
|
+
"title": f"搜索: {keywords}"
|
391
|
+
}]
|
392
|
+
}
|
393
|
+
return 1
|
394
|
+
except Exception as e2:
|
395
|
+
logger.debug(f"[DEBUG] 使用webbrowser打开URL失败: {str(e2)}")
|
396
|
+
|
325
397
|
page_content = {
|
326
398
|
"count": 0,
|
327
399
|
"links": [],
|
328
|
-
"error": f"
|
400
|
+
"error": f"搜索过程中出错: {error_msg}"
|
329
401
|
}
|
330
402
|
return 0
|
331
|
-
|
332
|
-
|
333
|
-
|
403
|
+
|
404
|
+
async def check_and_wait_for_verification(page):
    """Detect a verification (CAPTCHA) page and wait for the user to solve it.

    The page counts as a verification page when its HTML contains one of
    the indicator words below or when one of the known verification
    selectors matches an element. In that case the function polls once per
    second, for at most two minutes, until the URL changes, no verification
    selector matches any more, or the indicator words are gone; it then
    waits for the network to go idle.

    Args:
        page: Playwright page object to inspect (async API).

    Returns:
        None. Errors are logged and swallowed so that a failed check never
        aborts the surrounding search.
    """
    # Local import: only needed for the stderr prompts below.
    import sys

    # Words whose presence in the page HTML suggests a verification page.
    verification_indicators = [
        '验证码',
        '人机验证',
        'captcha',
        'verify',
        '安全验证',
        '滑动验证',
        '拖动滑块',
        '请完成验证',
        '拼图验证'
    ]

    # Selectors of common verification widgets.
    verification_selectors = [
        '.verify-wrap',
        '.captcha',
        '.verification',
        '#captcha',
        '.slidecode',
        '.verify-box',
        '.verify-img-panel',
        'iframe[src*="captcha"]',
        'iframe[src*="verify"]'
    ]

    try:
        page_text = await page.content()
        needs_verification = any(
            indicator in page_text for indicator in verification_indicators
        )

        matched_selector = await _find_verification_selector(page, verification_selectors)
        if matched_selector is not None:
            needs_verification = True
            logger.info(f"[INFO] 检测到验证元素: {matched_selector}")

        if not needs_verification:
            logger.debug("[DEBUG] 未检测到验证页面")
            return

        logger.info("[INFO] 检测到验证页面,等待用户手动验证...")
        # Prompts go to stderr. This module imports mcp.server.stdio, so the
        # server presumably runs over the stdio transport, where stdout must
        # carry protocol messages only -- TODO confirm against the entry point.
        print("\n*** 请注意 ***", file=sys.stderr)
        print("检测到需要验证码验证,请在浏览器中完成验证...", file=sys.stderr)
        print("验证完成后,程序将自动继续\n", file=sys.stderr)

        max_wait_time = 120  # wait at most two minutes
        start_time = time.monotonic()
        # page.url is a property in Playwright's Python API; calling or
        # awaiting it raises TypeError.
        current_url = page.url

        while time.monotonic() - start_time < max_wait_time:
            # Poll once per second.
            await asyncio.sleep(1)

            # A changed URL probably means the verification succeeded.
            if page.url != current_url:
                logger.info("[INFO] 检测到URL变化,验证可能已完成")
                break

            # NOTE(review): when detection came only from the indicator
            # words (no selector matched), this check ends the wait on the
            # first poll. Kept as in the original.
            if await _find_verification_selector(page, verification_selectors) is None:
                logger.info("[INFO] 验证元素已消失,验证可能已完成")
                break

            page_text = await page.content()
            if not any(indicator in page_text for indicator in verification_indicators):
                logger.info("[INFO] 验证指示词已消失,验证可能已完成")
                break

        # Let the page settle before the caller continues.
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)

        logger.info("[INFO] 继续执行,可能已完成验证")
        print("继续执行操作...\n", file=sys.stderr)

    except Exception as e:
        logger.error(f"[ERROR] 检查验证页面时出错: {str(e)}")
        logger.error(traceback.format_exc())


async def _find_verification_selector(page, selectors):
    """Return the first selector in ``selectors`` matching an element on ``page``, else ``None``."""
    for selector in selectors:
        try:
            if await page.query_selector(selector):
                return selector
        except Exception:
            # Best-effort probe: a selector that cannot be queried counts as
            # absent. Catching Exception (not a bare except) lets task
            # cancellation propagate.
            continue
    return None
|
347
504
|
|
348
505
|
def search_with_direct_chrome(keywords):
|
349
506
|
"""直接使用Chrome搜索,不使用playwright"""
|
@@ -355,25 +512,26 @@ def search_with_direct_chrome(keywords):
|
|
355
512
|
url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
|
356
513
|
logger.debug(f"[DEBUG] 打开URL: {url}")
|
357
514
|
|
515
|
+
# 使用open_chrome函数打开URL
|
358
516
|
result = open_chrome(url)
|
359
517
|
|
360
|
-
if isinstance(result, str) and "
|
361
|
-
logger.debug(f"[DEBUG]
|
362
|
-
|
363
|
-
page_content = {
|
364
|
-
"count": 0,
|
365
|
-
"links": [],
|
366
|
-
"error": f"直接打开Chrome搜索: {result}"
|
367
|
-
}
|
368
|
-
|
369
|
-
else:
|
370
|
-
logger.debug("[DEBUG] 直接打开Chrome成功")
|
518
|
+
if isinstance(result, str) and "错误" in result:
|
519
|
+
logger.debug(f"[DEBUG] 打开Chrome失败: {result}")
|
371
520
|
|
372
521
|
page_content = {
|
373
522
|
"count": 0,
|
374
523
|
"links": [],
|
375
|
-
"
|
524
|
+
"error": f"打开Chrome搜索失败: {result}"
|
376
525
|
}
|
526
|
+
return page_content
|
527
|
+
|
528
|
+
logger.debug("[DEBUG] 已尝试在已有Chrome窗口中打开新标签页")
|
529
|
+
|
530
|
+
page_content = {
|
531
|
+
"count": 0,
|
532
|
+
"links": [],
|
533
|
+
"message": "已打开Chrome并搜索关键词,但无法自动获取链接。请安装playwright以获取完整功能。"
|
534
|
+
}
|
377
535
|
|
378
536
|
return page_content
|
379
537
|
except Exception as e:
|
@@ -600,10 +758,10 @@ async def handle_list_tools() -> list[types.Tool]:
|
|
600
758
|
tools = []
|
601
759
|
|
602
760
|
# 只添加搜索并提取的组合工具
|
603
|
-
if
|
761
|
+
if PLAYWRIGHT_AVAILABLE:
|
604
762
|
tools.append(
|
605
763
|
types.Tool(
|
606
|
-
name="
|
764
|
+
name="search_and_extract",
|
607
765
|
description="搜索知网关键词并提取所有论文的详细内容",
|
608
766
|
inputSchema={
|
609
767
|
"type": "object",
|
@@ -624,7 +782,7 @@ async def handle_call_tool(
|
|
624
782
|
"""处理工具执行请求"""
|
625
783
|
global current_url, page_content
|
626
784
|
|
627
|
-
if name == "
|
785
|
+
if name == "search_and_extract" and PLAYWRIGHT_AVAILABLE:
|
628
786
|
if not arguments:
|
629
787
|
raise ValueError("缺少参数")
|
630
788
|
|
@@ -635,39 +793,95 @@ async def handle_call_tool(
|
|
635
793
|
try:
|
636
794
|
# 第一步:执行搜索
|
637
795
|
logger.info(f"开始执行搜索并提取:关键词 '{keywords}'")
|
638
|
-
|
639
|
-
|
796
|
+
|
797
|
+
# 构建URL
|
798
|
+
url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
|
799
|
+
current_url = url
|
800
|
+
logger.debug(f"[DEBUG] 搜索URL: {url}")
|
801
|
+
|
802
|
+
# 如果playwright可用,使用playwright搜索
|
803
|
+
if PLAYWRIGHT_AVAILABLE:
|
804
|
+
logger.debug("[DEBUG] 使用playwright搜索")
|
805
|
+
links_count = await search_with_playwright(keywords)
|
806
|
+
else:
|
807
|
+
# 否则直接用open_chrome打开URL
|
808
|
+
logger.debug("[DEBUG] 直接使用open_chrome打开URL")
|
809
|
+
result = open_chrome(url)
|
810
|
+
|
811
|
+
if isinstance(result, str):
|
812
|
+
# 如果是错误信息,返回错误
|
813
|
+
return [
|
814
|
+
types.TextContent(
|
815
|
+
type="text",
|
816
|
+
text=json.dumps({
|
817
|
+
"error": f"打开Chrome失败: {result}",
|
818
|
+
"keywords": keywords,
|
819
|
+
"count": 0,
|
820
|
+
"results": []
|
821
|
+
})
|
822
|
+
)
|
823
|
+
]
|
824
|
+
else:
|
825
|
+
# 成功打开但无法获取链接
|
826
|
+
return [
|
827
|
+
types.TextContent(
|
828
|
+
type="text",
|
829
|
+
text=json.dumps({
|
830
|
+
"keywords": keywords,
|
831
|
+
"count": 0,
|
832
|
+
"message": "已直接在Chrome中打开搜索页面,但无法自动获取搜索结果。请安装playwright以获取完整功能。",
|
833
|
+
"results": []
|
834
|
+
})
|
835
|
+
)
|
836
|
+
]
|
640
837
|
|
641
838
|
# 检查搜索结果
|
642
839
|
if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
|
840
|
+
# 如果没有找到链接,至少返回搜索页面作为结果
|
841
|
+
logger.debug("[DEBUG] 搜索未返回有效链接,返回搜索页面作为结果")
|
643
842
|
return [
|
644
843
|
types.TextContent(
|
645
844
|
type="text",
|
646
|
-
text={
|
647
|
-
"
|
648
|
-
"count":
|
649
|
-
"results": [
|
650
|
-
|
845
|
+
text=json.dumps({
|
846
|
+
"keywords": keywords,
|
847
|
+
"count": 1,
|
848
|
+
"results": [{
|
849
|
+
"title": f"搜索结果: {keywords}",
|
850
|
+
"authors": [],
|
851
|
+
"abstract": "请在浏览器中查看搜索结果",
|
852
|
+
"keywords": [],
|
853
|
+
"cite_format": "",
|
854
|
+
"url": url
|
855
|
+
}]
|
856
|
+
})
|
651
857
|
)
|
652
858
|
]
|
653
859
|
|
654
860
|
# 提取链接
|
655
861
|
urls = [link["url"] for link in page_content["links"] if "url" in link]
|
656
862
|
if not urls:
|
863
|
+
logger.debug("[DEBUG] 没有找到有效链接,返回搜索页面")
|
657
864
|
return [
|
658
865
|
types.TextContent(
|
659
866
|
type="text",
|
660
|
-
text={
|
661
|
-
"
|
662
|
-
"count":
|
663
|
-
"results": [
|
664
|
-
|
867
|
+
text=json.dumps({
|
868
|
+
"keywords": keywords,
|
869
|
+
"count": 1,
|
870
|
+
"results": [{
|
871
|
+
"title": f"搜索结果: {keywords}",
|
872
|
+
"authors": [],
|
873
|
+
"abstract": "请在浏览器中查看搜索结果",
|
874
|
+
"keywords": [],
|
875
|
+
"cite_format": "",
|
876
|
+
"url": url
|
877
|
+
}]
|
878
|
+
})
|
665
879
|
)
|
666
880
|
]
|
667
881
|
|
668
882
|
# 第二步:执行提取
|
669
883
|
logger.info(f"搜索成功,找到 {len(urls)} 个链接,开始提取内容")
|
670
|
-
results = await
|
884
|
+
results = await batch_extract_contents(urls)
|
671
885
|
|
672
886
|
# 包装结果
|
673
887
|
result_json = {
|
@@ -681,7 +895,7 @@ async def handle_call_tool(
|
|
681
895
|
return [
|
682
896
|
types.TextContent(
|
683
897
|
type="text",
|
684
|
-
text=result_json
|
898
|
+
text=json.dumps(result_json)
|
685
899
|
)
|
686
900
|
]
|
687
901
|
except Exception as e:
|
@@ -690,12 +904,12 @@ async def handle_call_tool(
|
|
690
904
|
return [
|
691
905
|
types.TextContent(
|
692
906
|
type="text",
|
693
|
-
text={
|
907
|
+
text=json.dumps({
|
694
908
|
"error": f"搜索并提取内容时出错: {str(e)}",
|
695
909
|
"keywords": keywords,
|
696
910
|
"count": 0,
|
697
911
|
"results": []
|
698
|
-
}
|
912
|
+
})
|
699
913
|
)
|
700
914
|
]
|
701
915
|
|
@@ -703,69 +917,853 @@ async def handle_call_tool(
|
|
703
917
|
raise ValueError(f"未知工具: {name}")
|
704
918
|
|
705
919
|
async def find_and_count_abstract_links(page):
|
706
|
-
"""
|
920
|
+
"""查找并统计知网搜索结果页面中的论文链接"""
|
707
921
|
global page_content
|
708
922
|
|
709
923
|
try:
|
710
|
-
logger.debug("[DEBUG]
|
924
|
+
logger.debug("[DEBUG] 开始查找知网搜索结果中的论文链接")
|
711
925
|
|
712
926
|
# 等待确保页面完全加载
|
713
|
-
await asyncio.sleep(
|
927
|
+
await asyncio.sleep(3)
|
714
928
|
|
715
|
-
#
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
929
|
+
# 首先尝试设置每页显示50条记录
|
930
|
+
try:
|
931
|
+
logger.debug("[DEBUG] 尝试设置每页显示50条记录")
|
932
|
+
|
933
|
+
# 使用更直接的JavaScript方法点击50条
|
934
|
+
set_page_size_result = await page.evaluate("""() => {
|
935
|
+
try {
|
936
|
+
// 更精确地找到下拉框并点击
|
937
|
+
const dropdowns = document.querySelectorAll('#perPageDiv, .perpage-content, .page-count, div[class*="perpage"]');
|
938
|
+
if (dropdowns && dropdowns.length > 0) {
|
939
|
+
// 记录找到的下拉框
|
940
|
+
console.log('找到下拉框元素:', dropdowns[0]);
|
941
|
+
// 点击下拉框
|
942
|
+
dropdowns[0].click();
|
943
|
+
console.log('已点击下拉框');
|
944
|
+
|
945
|
+
// 直接等待而不使用setTimeout,确保下拉菜单显示
|
946
|
+
return new Promise(resolve => {
|
947
|
+
setTimeout(() => {
|
948
|
+
// 查找并点击50选项
|
949
|
+
const options = document.querySelectorAll('a[data-v="50"], a[href*="50"], li[data-val="50"]');
|
950
|
+
console.log('找到的50选项数量:', options.length);
|
951
|
+
|
952
|
+
for (let option of options) {
|
953
|
+
if (option.textContent.includes('50')) {
|
954
|
+
option.click();
|
955
|
+
console.log('已点击50选项:', option);
|
956
|
+
resolve("点击了50选项:" + option.textContent);
|
957
|
+
return;
|
958
|
+
}
|
959
|
+
}
|
960
|
+
|
961
|
+
// 如果没有找到特定的50选项,尝试点击最后一个选项(通常是最大数值)
|
962
|
+
const allOptions = document.querySelectorAll('.perpage-content a, .sort-list li');
|
963
|
+
if (allOptions && allOptions.length > 0) {
|
964
|
+
const lastOption = allOptions[allOptions.length - 1];
|
965
|
+
lastOption.click();
|
966
|
+
console.log('点击了最后一个选项:', lastOption.textContent);
|
967
|
+
resolve("点击了最后一个选项:" + lastOption.textContent);
|
968
|
+
return;
|
969
|
+
}
|
970
|
+
|
971
|
+
resolve("未找到50条/页选项");
|
972
|
+
}, 1000); // 等待一秒确保下拉菜单显示
|
973
|
+
});
|
974
|
+
}
|
975
|
+
|
976
|
+
// 尝试另一种方式 - 直接点击带有"50"的链接
|
977
|
+
const directLinks = document.querySelectorAll('a:not([style*="display:none"]):not([style*="display: none"])');
|
978
|
+
for (let link of directLinks) {
|
979
|
+
if (link.textContent.trim() === '50' ||
|
980
|
+
link.textContent.includes('50条') ||
|
981
|
+
link.textContent.includes('50 条')) {
|
982
|
+
link.click();
|
983
|
+
return "直接点击了50条链接: " + link.textContent;
|
984
|
+
}
|
985
|
+
}
|
986
|
+
|
987
|
+
return "未找到任何可点击的50条/页选项";
|
988
|
+
} catch (e) {
|
989
|
+
return "设置每页显示50条记录时出错: " + e.toString();
|
990
|
+
}
|
991
|
+
}""")
|
992
|
+
|
993
|
+
logger.debug(f"[DEBUG] 设置每页显示50条记录结果: {set_page_size_result}")
|
994
|
+
|
995
|
+
# 等待页面刷新
|
996
|
+
await page.wait_for_load_state('networkidle')
|
997
|
+
await asyncio.sleep(2)
|
998
|
+
|
999
|
+
# 检查是否有来源类别选项,并尝试勾选CSSCI
|
1000
|
+
await check_and_select_cssci(page)
|
1001
|
+
|
1002
|
+
except Exception as e:
|
1003
|
+
logger.debug(f"[DEBUG] 设置每页显示50条记录时出错: {str(e)}")
|
1004
|
+
logger.debug(traceback.format_exc())
|
720
1005
|
|
721
|
-
#
|
722
|
-
|
1006
|
+
# 尝试等待搜索结果加载
|
1007
|
+
try:
|
1008
|
+
await page.wait_for_selector('.result-table-list', timeout=5000)
|
1009
|
+
logger.debug("[DEBUG] 已找到搜索结果容器")
|
1010
|
+
except Exception as e:
|
1011
|
+
logger.debug(f"[DEBUG] 等待搜索结果容器超时: {str(e)}")
|
723
1012
|
|
724
|
-
|
725
|
-
|
1013
|
+
# 优先查找带有article/abstract?v的链接
|
1014
|
+
try:
|
1015
|
+
logger.debug("[DEBUG] 尝试查找包含 article/abstract?v 的链接")
|
1016
|
+
|
1017
|
+
abstract_links = await page.evaluate("""() => {
|
1018
|
+
const links = [];
|
1019
|
+
// 严格查找包含article/abstract?v的链接
|
1020
|
+
const abstractLinks = document.querySelectorAll('a[href*="article/abstract?v="]');
|
1021
|
+
|
1022
|
+
console.log('找到包含article/abstract?v的链接数量:', abstractLinks.length);
|
1023
|
+
|
1024
|
+
for (let i = 0; i < abstractLinks.length; i++) {
|
1025
|
+
const link = abstractLinks[i];
|
1026
|
+
const href = link.href;
|
1027
|
+
const text = link.textContent.trim();
|
1028
|
+
|
1029
|
+
// 确保链接有效且包含必要的字段
|
1030
|
+
if (href && href.includes('article/abstract?v=') && text) {
|
1031
|
+
links.push({
|
1032
|
+
index: links.length + 1,
|
1033
|
+
href: href,
|
1034
|
+
text: text
|
1035
|
+
});
|
1036
|
+
}
|
1037
|
+
}
|
1038
|
+
|
1039
|
+
return links;
|
1040
|
+
}""")
|
726
1041
|
|
727
|
-
|
728
|
-
'index': i + 1,
|
729
|
-
'href': href
|
730
|
-
})
|
1042
|
+
logger.debug(f"[DEBUG] 找到 {len(abstract_links)} 个包含article/abstract?v的链接")
|
731
1043
|
|
732
|
-
|
1044
|
+
if abstract_links and len(abstract_links) > 0:
|
1045
|
+
# 找到有效的摘要链接
|
1046
|
+
links_info = abstract_links
|
1047
|
+
links_count = len(abstract_links)
|
1048
|
+
else:
|
1049
|
+
# 没有找到摘要链接,尝试备用方法
|
1050
|
+
logger.debug("[DEBUG] 未找到包含article/abstract?v的链接,尝试备用方法")
|
1051
|
+
|
1052
|
+
# 尝试查找可能的论文链接
|
1053
|
+
backup_links = await page.evaluate("""() => {
|
1054
|
+
const links = [];
|
1055
|
+
// 查找可能是论文链接的a标签
|
1056
|
+
const allLinks = document.querySelectorAll('a.fz14, a[href*="/kcms"], .result-table-list a');
|
1057
|
+
|
1058
|
+
for (let i = 0; i < allLinks.length; i++) {
|
1059
|
+
const link = allLinks[i];
|
1060
|
+
const href = link.href;
|
1061
|
+
const text = link.textContent.trim();
|
1062
|
+
|
1063
|
+
if (href && text && !links.some(l => l.href === href)) {
|
1064
|
+
links.push({
|
1065
|
+
index: links.length + 1,
|
1066
|
+
href: href,
|
1067
|
+
text: text
|
1068
|
+
});
|
1069
|
+
}
|
1070
|
+
}
|
1071
|
+
|
1072
|
+
return links;
|
1073
|
+
}""")
|
1074
|
+
|
1075
|
+
if backup_links and len(backup_links) > 0:
|
1076
|
+
logger.debug(f"[DEBUG] 使用备用方法找到 {len(backup_links)} 个可能的论文链接")
|
1077
|
+
links_info = backup_links
|
1078
|
+
links_count = len(backup_links)
|
1079
|
+
else:
|
1080
|
+
# 回退到常规方法
|
1081
|
+
links_info = []
|
1082
|
+
links_count = 0
|
1083
|
+
|
1084
|
+
# 尝试多种可能的选择器
|
1085
|
+
selectors = [
|
1086
|
+
'a[href*="article/abstract?v="]', # 优先查找摘要链接
|
1087
|
+
'a[href*="/kcms"]', # 知网文献链接
|
1088
|
+
'.fz14', # 标题样式类
|
1089
|
+
'a.pc-link', # 搜索结果链接
|
1090
|
+
'.c_font a', # 内容字体下的链接
|
1091
|
+
'.result-table-list a', # 结果表下的链接
|
1092
|
+
'table tr td a' # 表格中的链接
|
1093
|
+
]
|
1094
|
+
|
1095
|
+
for selector in selectors:
|
1096
|
+
try:
|
1097
|
+
all_links = await page.query_selector_all(selector)
|
1098
|
+
logger.debug(f"[DEBUG] 使用选择器 {selector} 找到 {len(all_links)} 个链接")
|
1099
|
+
|
1100
|
+
for i, link in enumerate(all_links):
|
1101
|
+
try:
|
1102
|
+
href = await link.get_attribute('href')
|
1103
|
+
text = await link.text_content()
|
1104
|
+
|
1105
|
+
# 确保链接包含论文相关URL,如果没有指定URL则使用当前页面URL
|
1106
|
+
if not href:
|
1107
|
+
continue
|
1108
|
+
|
1109
|
+
# 处理相对URL
|
1110
|
+
if href.startswith('/'):
|
1111
|
+
href = f"https://kns.cnki.net{href}"
|
1112
|
+
elif not href.startswith('http'):
|
1113
|
+
href = f"https://kns.cnki.net/{href}"
|
1114
|
+
|
1115
|
+
# 防止重复添加同一链接
|
1116
|
+
if any(link_info['href'] == href for link_info in links_info):
|
1117
|
+
continue
|
1118
|
+
|
1119
|
+
links_info.append({
|
1120
|
+
'index': len(links_info) + 1,
|
1121
|
+
'href': href,
|
1122
|
+
'text': text.strip() if text else ""
|
1123
|
+
})
|
1124
|
+
|
1125
|
+
logger.debug(f"[DEBUG] 链接 {len(links_info)}: {href}")
|
1126
|
+
except Exception as e:
|
1127
|
+
logger.debug(f"[DEBUG] 处理链接时出错: {str(e)}")
|
1128
|
+
except Exception as e:
|
1129
|
+
logger.debug(f"[DEBUG] 使用选择器 {selector} 查找链接时出错: {str(e)}")
|
1130
|
+
except Exception as e:
|
1131
|
+
logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
|
1132
|
+
logger.debug(traceback.format_exc())
|
1133
|
+
links_info = []
|
1134
|
+
links_count = 0
|
733
1135
|
|
734
|
-
#
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
1136
|
+
# 过滤链接,只保留包含article/abstract?v的链接
|
1137
|
+
filtered_links = []
|
1138
|
+
for link in links_info:
|
1139
|
+
href = link['href']
|
1140
|
+
if 'article/abstract?v=' in href:
|
1141
|
+
filtered_links.append(link)
|
1142
|
+
logger.debug(f"[DEBUG] 保留包含article/abstract?v的链接: {href}")
|
1143
|
+
|
1144
|
+
# 如果过滤后没有链接,可能是知网搜索结果的格式变化,使用原始链接
|
1145
|
+
if not filtered_links:
|
1146
|
+
logger.debug("[DEBUG] 过滤后没有包含article/abstract?v的链接,使用原始链接")
|
1147
|
+
filtered_links = links_info
|
1148
|
+
|
1149
|
+
# 最终链接数量
|
1150
|
+
links_count = len(filtered_links)
|
1151
|
+
logger.debug(f"[DEBUG] 最终过滤后找到 {links_count} 个链接")
|
1152
|
+
|
1153
|
+
# 如果没有找到链接,不再进行截图
|
1154
|
+
if links_count == 0:
|
1155
|
+
logger.debug("[DEBUG] 未找到链接")
|
741
1156
|
|
742
1157
|
# 存储结果 - 使用字典结构而不是纯文本
|
743
1158
|
page_content = {
|
744
1159
|
"count": links_count,
|
745
|
-
"links": [{"index": link['index'], "url": link['href']} for link in
|
1160
|
+
"links": [{"index": link['index'], "url": link['href'], "title": link['text']} for link in filtered_links]
|
746
1161
|
}
|
747
1162
|
|
748
1163
|
return links_count
|
749
1164
|
except Exception as e:
|
750
1165
|
logger.debug(f"[DEBUG] 查找链接时出错: {str(e)}")
|
751
|
-
|
1166
|
+
logger.debug(traceback.format_exc())
|
1167
|
+
|
1168
|
+
# 发生错误时,尝试获取当前页面URL
|
1169
|
+
try:
|
1170
|
+
current_url = await page.url()
|
1171
|
+
logger.debug(f"[DEBUG] 当前页面URL: {current_url}")
|
1172
|
+
|
1173
|
+
# 至少返回当前页面作为链接
|
1174
|
+
page_content = {
|
1175
|
+
"count": 1,
|
1176
|
+
"links": [{"index": 1, "url": current_url, "title": "当前页面"}]
|
1177
|
+
}
|
1178
|
+
return 1
|
1179
|
+
except:
|
1180
|
+
page_content = {
|
1181
|
+
"count": 0,
|
1182
|
+
"links": []
|
1183
|
+
}
|
1184
|
+
return 0
|
1185
|
+
|
1186
|
+
async def check_and_select_cssci(page):
    """Try to tick the CSSCI option in the "来源类别" (source category) filter.

    Runs a script inside ``page`` that looks for a ``div`` whose text contains
    "来源类别", then clicks the first CSSCI checkbox (or, failing that, the first
    CSSCI ``label``/``span``) found inside it. Afterwards it waits for the page
    to become network-idle plus two seconds. Every failure is logged at debug
    level and swallowed; nothing is returned.
    """
    # In-page script; it returns a human-readable status string for the log.
    script = """() => {
        try {
            // 查找包含"来源类别"的区域
            const categoryContainer = Array.from(document.querySelectorAll('div')).find(div =>
                div.textContent.includes('来源类别')
            );

            if (categoryContainer) {
                // 在来源类别容器中查找CSSCI复选框
                const checkboxes = categoryContainer.querySelectorAll('input[type="checkbox"]');
                for (let checkbox of checkboxes) {
                    // 查找CSSCI相关的复选框
                    const parentText = checkbox.parentElement.textContent;
                    if (parentText.includes('CSSCI') ||
                        checkbox.value.includes('CSSCI') ||
                        checkbox.id.includes('cssci')) {

                        // 勾选复选框
                        if (!checkbox.checked) {
                            checkbox.click();
                            return "已勾选CSSCI复选框";
                        } else {
                            return "CSSCI复选框已经被勾选";
                        }
                    }
                }

                // 如果没有找到复选框但找到了CSSCI的标签
                const cssciLabels = categoryContainer.querySelectorAll('label, span');
                for (let label of cssciLabels) {
                    if (label.textContent.includes('CSSCI')) {
                        label.click();
                        return "已点击CSSCI标签";
                    }
                }

                return "在来源类别区域未找到CSSCI选项";
            }

            return "未找到来源类别区域";
        } catch (e) {
            return "勾选CSSCI时出错: " + e.toString();
        }
    }"""

    try:
        logger.debug("[DEBUG] 尝试查找来源类别并勾选CSSCI")

        outcome = await page.evaluate(script)
        logger.debug(f"[DEBUG] CSSCI勾选结果: {outcome}")

        # Give the result list time to reload after the filter changes.
        await page.wait_for_load_state('networkidle')
        await asyncio.sleep(2)
    except Exception as exc:
        logger.debug(f"[DEBUG] 勾选CSSCI时出错: {exc}")
        logger.debug(traceback.format_exc())
|
1246
|
+
|
1247
|
+
# Separators between author names / keywords. The ASCII and full-width forms of
# the comma and semicolon are both listed on purpose.
_TERM_SEPARATOR_RE = re.compile(r'[,,;;、\s]+')

# Selector lists used by the Playwright-side fallback, in priority order.
_TITLE_SELECTORS = (
    '.wx-tit h1', '.article-title', '.title', 'h1',
    '.articleTitle', 'div.brief h2', '.wxTitle', 'span.title',
)
_AUTHOR_SELECTORS = (
    '.wx-tit .author', '.author', '.writers', '.authorinfo',
    'div.brief p:first-child', 'span.author',
)
_ABSTRACT_SELECTORS = (
    '#ChDivSummary', '.abstract', '.summary', '.Abstract',
    'div.brief div.abstract', 'div.wxInfo span.abstract', 'div.wxInfo', 'span.abstract',
)
_KEYWORD_SELECTORS = (
    '.wx-tit-keys', '.keywords', '.Keyword', 'div.wxInfo span.keywords',
    'span.keywords', 'div.brief span.keywords', 'p.keywords',
)
_CITE_SELECTORS = (
    '.quote-info', '.citation', 'div.cite', 'div.quoted',
    'div.wxInfo div.quoted', '.refer-info',
)

# ':has-text()' is a Playwright selector extension. It is valid in
# page.query_selector() but NOT in document.querySelector(), so it must never
# be used inside the in-page script below.
_CITE_BUTTON_SELECTOR = 'button:has-text("引用"), [class*="cite"], [class*="quote"]'
_CITE_POPUP_SELECTOR = '.quote-r textarea.text, .quote-text, [class*="quote"] textarea'

# In-page script: collects title / authors / abstract / keywords / citation in a
# single round trip. It uses plain CSS selectors only, and reports its own
# failures through an "error" key instead of throwing.
_EXTRACT_CONTENT_JS = """() => {
    try {
        // Title: first matching element that is not the anti-bot notice.
        const getTitle = () => {
            const selectors = ['h1.title', '.wx-tit h1', '.title', 'h1', '.article-title', 'div.brief h2', '.wxTitle', 'span.title'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    const text = element.textContent.trim();
                    if (!text.includes('系统检测')) {
                        return text;
                    }
                }
            }
            return "";
        };

        // Authors: text of the first matching element, split on separators.
        const getAuthors = () => {
            const selectors = ['.wx-tit .author', '.author', '.writers', '.authorinfo', 'div.brief p:first-child', 'span.author'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    const text = element.textContent.trim();
                    return text.split(/[,,;;、\\s]+/).filter(a => a.trim());
                }
            }
            return [];
        };

        // Abstract: known containers first, then any paragraph mentioning it.
        const getAbstract = () => {
            const selectors = ['#ChDivSummary', '.abstract', '.summary', '.Abstract', 'div.brief div.abstract', 'div.wxInfo span.abstract', 'div.wxInfo', 'span.abstract'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    let text = element.textContent.trim();
                    text = text.replace(/^摘要[::]/g, '').trim();
                    return text;
                }
            }

            const paragraphs = document.querySelectorAll('p');
            for (const p of paragraphs) {
                if (p.textContent.includes('摘要')) {
                    let text = p.textContent.trim();
                    text = text.replace(/^摘要[::]/g, '').trim();
                    return text;
                }
            }

            return "";
        };

        // Keywords: known containers first, then any paragraph mentioning them.
        const getKeywords = () => {
            const selectors = ['.wx-tit-keys', '.keywords', '.Keyword', 'div.wxInfo span.keywords', 'span.keywords', 'div.brief span.keywords', 'p.keywords'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    let text = element.textContent.trim();
                    text = text.replace(/^关键词[::]/g, '').trim();
                    return text.split(/[;;,,、\\s]+/).filter(k => k.trim());
                }
            }

            const paragraphs = document.querySelectorAll('p');
            for (const p of paragraphs) {
                if (p.textContent.includes('关键词')) {
                    const text = p.textContent.trim();
                    const keywordText = text.split(/关键词[::]/)[1];
                    if (keywordText) {
                        return keywordText.split(/[;;,,、\\s]+/).filter(k => k.trim());
                    }
                }
            }

            return [];
        };

        // Citation: only read text already on the page. Clicking the cite
        // button is left to the Python side so it happens at most once.
        const getCiteFormat = () => {
            const selectors = ['.quote-info', '.citation', 'div.cite', 'div.quoted', 'div.wxInfo div.quoted', '.refer-info'];
            for (const selector of selectors) {
                const element = document.querySelector(selector);
                if (element) {
                    return element.textContent.trim();
                }
            }
            return "";
        };

        return {
            title: getTitle(),
            authors: getAuthors(),
            abstract: getAbstract(),
            keywords: getKeywords(),
            cite_format: getCiteFormat()
        };
    } catch (e) {
        return {
            error: "提取内容时出错: " + e.toString(),
            title: "",
            authors: [],
            abstract: "",
            keywords: [],
            cite_format: ""
        };
    }
}"""


def _split_terms(text: str) -> List[str]:
    """Split an author/keyword string on separators, dropping empty pieces."""
    return [part for part in _TERM_SEPARATOR_RE.split(text or "") if part]


async def _first_matching_text(page, selectors, label: str) -> str:
    """Return the stripped text of the first element matching any selector.

    ``label`` is the field name used in the debug log messages. Returns ``""``
    when nothing matches, when the element has no text, or when the lookup
    itself fails (the failure is logged at debug level).
    """
    try:
        for selector in selectors:
            element = await page.query_selector(selector)
            if element:
                logger.debug(f"[DEBUG] 找到{label}元素: {selector}")
                # text_content() may return None, so guard before stripping.
                return (await element.text_content() or "").strip()
    except Exception as exc:
        logger.debug(f"[DEBUG] 提取{label}时出错: {exc}")
    return ""


async def _citation_from_popup(page) -> str:
    """Click the cite button and return the citation text shown in the popup."""
    try:
        cite_button = await page.query_selector(_CITE_BUTTON_SELECTOR)
        if not cite_button:
            logger.debug("[DEBUG] 未找到引用按钮")
            return ""

        await cite_button.click()
        await asyncio.sleep(1)  # let the popup render

        field = await page.query_selector(_CITE_POPUP_SELECTOR)
        if not field:
            logger.debug("[DEBUG] 未从弹窗找到引用格式")
            return ""

        # Read the value inside the page: get_property() would return a
        # JSHandle rather than a string, and not every match is a <textarea>.
        value = await field.evaluate("el => el.value || el.textContent || ''")
        return (value or "").strip()
    except Exception as exc:
        logger.debug(f"[DEBUG] 点击引用按钮时出错: {exc}")
        return ""


async def _apply_js_extraction(page, content: "CNKIContent") -> None:
    """Fill ``content`` from the single-shot in-page script; never raises."""
    try:
        logger.debug("[DEBUG] 尝试使用JavaScript提取内容")
        result = await page.evaluate(_EXTRACT_CONTENT_JS)
    except Exception as exc:
        logger.debug(f"[DEBUG] 使用JavaScript提取内容时出错: {exc}")
        logger.debug(traceback.format_exc())
        return

    if not result:
        logger.warning("[WARNING] JavaScript提取内容返回空结果")
        return
    if result.get("error"):
        logger.warning(f"[WARNING] JavaScript提取内容时出错: {result['error']}")
        return

    logger.debug("[DEBUG] JavaScript提取内容成功")

    if result.get("title"):
        content.title = result["title"]
        logger.debug(f"[DEBUG] 提取到标题: {content.title}")
    if result.get("authors"):
        content.authors = result["authors"]
        logger.debug(f"[DEBUG] 提取到作者: {content.authors}")
    if result.get("abstract"):
        content.abstract = result["abstract"]
        logger.debug(f"[DEBUG] 提取到摘要: {content.abstract[:100]}...")
    if result.get("keywords"):
        content.keywords = result["keywords"]
        logger.debug(f"[DEBUG] 提取到关键词: {content.keywords}")
    if result.get("cite_format"):
        content.cite_format = result["cite_format"]
        logger.debug(f"[DEBUG] 提取到引用格式: {content.cite_format[:100]}...")


async def _fill_missing_fields(page, content: "CNKIContent") -> None:
    """Fill any still-empty field of ``content`` through Playwright selectors."""
    if not content.title:
        title = await _first_matching_text(page, _TITLE_SELECTORS, "标题")
        if title:
            content.title = title
            logger.debug(f"[DEBUG] 提取到标题: {content.title}")

    if not content.authors:
        authors = _split_terms(await _first_matching_text(page, _AUTHOR_SELECTORS, "作者"))
        if authors:
            content.authors = authors
            logger.debug(f"[DEBUG] 提取到作者: {authors}")

    if not content.abstract:
        abstract = await _first_matching_text(page, _ABSTRACT_SELECTORS, "摘要")
        abstract = re.sub(r'^摘要[::]\s*', '', abstract)
        if abstract:
            content.abstract = abstract
            logger.debug(f"[DEBUG] 提取到摘要: {content.abstract[:100]}...")

    if not content.keywords:
        keywords_text = await _first_matching_text(page, _KEYWORD_SELECTORS, "关键词")
        keywords = _split_terms(re.sub(r'^关键词[::]\s*', '', keywords_text))
        if keywords:
            content.keywords = keywords
            logger.debug(f"[DEBUG] 提取到关键词: {keywords}")

    if not content.cite_format:
        citation = await _first_matching_text(page, _CITE_SELECTORS, "引用格式")
        if citation:
            content.cite_format = citation
            logger.debug(f"[DEBUG] 提取到引用格式: {content.cite_format[:100]}...")
        else:
            # Nothing inline on the page: open the citation popup instead.
            logger.debug("[DEBUG] 尝试点击引用按钮获取引用格式")
            citation = await _citation_from_popup(page)
            if citation:
                content.cite_format = citation
                logger.debug(f"[DEBUG] 从弹窗提取到引用格式: {content.cite_format[:100]}...")


async def extract_content_from_url(url: str, page = None) -> CNKIContent:
    """Extract title, authors, abstract, keywords and citation from a CNKI page.

    Args:
        url: Article URL. A value that does not start with ``http`` is treated
            as relative to ``https://kns.cnki.net``.
        page: Optional Playwright page to reuse; the caller keeps ownership of
            it. When omitted, a page is opened on the module-level ``context``
            and closed again before returning. If Playwright has not been
            initialised, the URL is opened with ``webbrowser`` instead and a
            placeholder result is returned.

    Returns:
        A ``CNKIContent`` whose ``url`` is always set. This function does not
        raise: on failure ``title`` and ``abstract`` carry the error message.
    """
    if not url.startswith('http'):
        # Resolve a relative URL against the CNKI host.
        url = f"https://kns.cnki.net{url}" if url.startswith('/') else f"https://kns.cnki.net/{url}"

    content = CNKIContent(url=url)
    owns_page = False  # True only when the page was created here

    try:
        logger.info(f"开始从URL提取内容: {url}")

        if page is None:
            if playwright_instance is None or browser_instance is None or context is None:
                # No browser automation available: hand the URL to the user.
                logger.info(f"Playwright未初始化,使用webbrowser打开URL: {url}")
                webbrowser.open(url)
                content.title = "请在浏览器中手动获取内容"
                content.abstract = "系统已打开链接,请在浏览器中查看完整内容"
                return content

            logger.debug("[DEBUG] 使用现有的playwright实例创建新页面")
            page = await context.new_page()
            owns_page = True

        logger.debug(f"[DEBUG] 导航到URL: {url}")
        try:
            await page.goto(url, wait_until='networkidle', timeout=30000)
        except Exception as exc:
            # A page that never goes idle may still be usable; keep going.
            logger.warning(f"导航超时,继续尝试提取: {exc}")

        await asyncio.sleep(2)
        await check_and_wait_for_verification(page)

        # Fast path first, then per-field selector fallbacks for the gaps.
        await _apply_js_extraction(page, content)
        await _fill_missing_fields(page, content)
        return content
    except Exception as exc:
        logger.error(f"从URL提取内容时出错: {exc}")
        logger.error(traceback.format_exc())
        content.title = f"提取失败: {exc}"
        content.abstract = f"从URL提取内容时出错: {exc}"
        return content
    finally:
        # Close only a page opened here. A failing close must not discard the
        # data that was already extracted.
        if owns_page:
            try:
                await page.close()
            except Exception as exc:
                logger.debug(f"[DEBUG] 关闭页面时出错: {exc}")
|
1672
|
+
|
1673
|
+
async def batch_extract_contents(urls: List[str], max_urls: int = 50, delay_seconds: float = 1.0) -> List[Dict]:
    """Extract the content of several CNKI URLs one after another.

    Args:
        urls: Article URLs; only the first ``max_urls`` are processed.
        max_urls: Upper bound on the number of URLs handled (default 50).
        delay_seconds: Pause between two consecutive URLs, to avoid hammering
            the site (default 1 second).

    Returns:
        One dict per processed URL, in input order. A URL that fails yields a
        dict with an ``"error"`` key and empty fields. If the batch itself
        fails, a leading ``{"error": ...}`` dict is followed by whatever was
        collected so far.

    Side effects:
        Starts Playwright, a visible Chromium and a browser context and stores
        them in the module-level globals when they are not initialised yet.
    """
    global playwright_instance, browser_instance, context

    results: List[Dict] = []
    selected = urls[:max(0, max_urls)]
    total = len(selected)

    logger.info(f"开始批量提取内容,共 {total} 个URL")

    try:
        if playwright_instance is None or browser_instance is None or context is None:
            logger.info("Playwright未初始化,创建新实例")
            playwright_instance = await async_playwright().start()
            browser_instance = await playwright_instance.chromium.launch(headless=False)
            context = await browser_instance.new_context()

        for position, url in enumerate(selected, start=1):
            logger.info(f"处理URL {position}/{total}: {url}")

            # Each URL gets its own page so state cannot leak between articles.
            page = await context.new_page()
            try:
                result = await extract_content_from_url(url, page)
                results.append(result.dict())
                logger.info(f"成功处理URL: {url}")
            except Exception as exc:
                logger.error(f"处理URL {url} 时出错: {exc}")
                results.append({
                    "url": url,
                    "error": str(exc),
                    "title": "",
                    "authors": [],
                    "abstract": "",
                    "keywords": [],
                    "cite_format": "",
                })
            finally:
                # A page that refuses to close must not abort the whole batch.
                try:
                    await page.close()
                except Exception as close_exc:
                    logger.debug(f"[DEBUG] 关闭页面时出错: {close_exc}")

            # Throttle between requests; there is nothing to wait for after
            # the last one.
            if position < total:
                await asyncio.sleep(delay_seconds)

        logger.info(f"批量处理完成,共处理 {len(results)} 个URL")
        return results
    except Exception as exc:
        logger.error(f"批量处理过程中出错: {exc}")
        logger.error(traceback.format_exc())
        return [{"error": f"批量处理过程中出错: {exc}"}] + results
|
1726
|
+
|
1727
|
+
# 添加关闭函数,在程序结束时清理资源
|
1728
|
+
async def cleanup_playwright():
    """Release the module-level Playwright context, browser and driver.

    The three globals are reset to ``None`` up front, and each resource is
    closed independently. A failure while closing one is logged and does not
    stop the others from being closed. This matters because the function runs
    from a ``finally`` block, where a raised exception would mask the error
    that caused the shutdown.
    """
    global playwright_instance, browser_instance, context

    # Detach the globals first so a failed close never leaves stale handles.
    ctx, browser, driver = context, browser_instance, playwright_instance
    context = None
    browser_instance = None
    playwright_instance = None

    # Close order: context, then browser, then the Playwright driver.
    steps = (
        (ctx, "[DEBUG] 关闭playwright上下文", "close"),
        (browser, "[DEBUG] 关闭浏览器实例", "close"),
        (driver, "[DEBUG] 关闭playwright实例", "stop"),
    )
    for resource, message, method_name in steps:
        if not resource:
            continue
        logger.debug(message)
        try:
            await getattr(resource, method_name)()
        except Exception as exc:
            logger.warning(f"清理playwright资源时出错: {exc}")
|
752
1746
|
|
753
1747
|
async def main():
    """Program entry point: serve MCP over stdin/stdout.

    Playwright resources are released via ``cleanup_playwright`` when the
    server stops, whether it stopped normally or through an exception.
    """
    try:
        async with mcp.server.stdio.stdio_server() as streams:
            read_stream, write_stream = streams

            capabilities = server.get_capabilities(
                notification_options=NotificationOptions(),
                experimental_capabilities={},
            )
            init_options = InitializationOptions(
                server_name="cnks",
                server_version="0.3.0",
                capabilities=capabilities,
            )

            await server.run(read_stream, write_stream, init_options)
    finally:
        # Always release the browser, even if the server crashed.
        await cleanup_playwright()
|
769
1767
|
|
770
1768
|
# 为符合README.md的要求,添加从FastMCP导出的接口
|
771
1769
|
def create_fastmcp_server():
|
@@ -775,36 +1773,81 @@ def create_fastmcp_server():
|
|
775
1773
|
fast_mcp = FastMCP("知网搜索")
|
776
1774
|
|
777
1775
|
# 只添加搜索并提取的工具
|
778
|
-
if
|
1776
|
+
if PLAYWRIGHT_AVAILABLE:
|
779
1777
|
@fast_mcp.tool()
|
780
|
-
async def
|
1778
|
+
async def search_and_extract(keywords: str) -> dict:
|
781
1779
|
"""搜索关键词并提取所有论文的详细内容"""
|
782
|
-
logger.debug("[DEBUG] 正在使用FastMCP的
|
1780
|
+
logger.debug("[DEBUG] 正在使用FastMCP的search_and_extract函数")
|
783
1781
|
try:
|
784
1782
|
# 第一步:执行搜索
|
785
|
-
|
1783
|
+
logger.debug(f"[DEBUG] 开始搜索关键词: {keywords}")
|
1784
|
+
|
1785
|
+
# 构建URL
|
1786
|
+
url = f"https://kns.cnki.net/kns8s/search?q={quote(keywords)}"
|
1787
|
+
logger.debug(f"[DEBUG] 搜索URL: {url}")
|
1788
|
+
|
1789
|
+
# 如果playwright可用,使用playwright搜索
|
1790
|
+
if PLAYWRIGHT_AVAILABLE:
|
1791
|
+
logger.debug("[DEBUG] 使用playwright搜索")
|
1792
|
+
result_count = await search_with_playwright(keywords)
|
1793
|
+
else:
|
1794
|
+
# 否则直接用open_chrome打开URL
|
1795
|
+
logger.debug("[DEBUG] 直接使用open_chrome打开URL")
|
1796
|
+
result = open_chrome(url)
|
1797
|
+
|
1798
|
+
if isinstance(result, str):
|
1799
|
+
# 如果是错误信息,返回错误
|
1800
|
+
return {
|
1801
|
+
"error": f"打开Chrome失败: {result}",
|
1802
|
+
"keywords": keywords,
|
1803
|
+
"count": 0,
|
1804
|
+
"results": []
|
1805
|
+
}
|
1806
|
+
else:
|
1807
|
+
# 成功打开但无法获取链接
|
1808
|
+
return {
|
1809
|
+
"keywords": keywords,
|
1810
|
+
"count": 0,
|
1811
|
+
"message": "已直接在Chrome中打开搜索页面,但无法自动获取搜索结果。请安装playwright以获取完整功能。",
|
1812
|
+
"results": []
|
1813
|
+
}
|
786
1814
|
|
787
1815
|
# 检查搜索结果
|
788
1816
|
if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
|
1817
|
+
# 如果没有找到链接,至少返回搜索页面作为结果
|
1818
|
+
logger.debug("[DEBUG] 搜索未返回有效链接,返回搜索页面作为结果")
|
789
1819
|
return {
|
790
|
-
"error": "搜索未返回有效链接",
|
791
1820
|
"keywords": keywords,
|
792
|
-
"count":
|
793
|
-
"results": [
|
1821
|
+
"count": 1,
|
1822
|
+
"results": [{
|
1823
|
+
"title": f"搜索结果: {keywords}",
|
1824
|
+
"authors": [],
|
1825
|
+
"abstract": "请在浏览器中查看搜索结果",
|
1826
|
+
"keywords": [],
|
1827
|
+
"cite_format": "",
|
1828
|
+
"url": url
|
1829
|
+
}]
|
794
1830
|
}
|
795
1831
|
|
796
1832
|
# 提取链接
|
797
1833
|
urls = [link["url"] for link in page_content["links"] if "url" in link]
|
798
1834
|
if not urls:
|
1835
|
+
logger.debug("[DEBUG] 没有找到有效链接,返回搜索页面")
|
799
1836
|
return {
|
800
|
-
"error": "未找到有效链接",
|
801
1837
|
"keywords": keywords,
|
802
|
-
"count":
|
803
|
-
"results": [
|
1838
|
+
"count": 1,
|
1839
|
+
"results": [{
|
1840
|
+
"title": f"搜索结果: {keywords}",
|
1841
|
+
"authors": [],
|
1842
|
+
"abstract": "请在浏览器中查看搜索结果",
|
1843
|
+
"keywords": [],
|
1844
|
+
"cite_format": "",
|
1845
|
+
"url": url
|
1846
|
+
}]
|
804
1847
|
}
|
805
1848
|
|
806
1849
|
# 第二步:执行提取
|
807
|
-
results = await
|
1850
|
+
results = await batch_extract_contents(urls)
|
808
1851
|
|
809
1852
|
# 包装结果
|
810
1853
|
return {
|
@@ -816,6 +1859,7 @@ def create_fastmcp_server():
|
|
816
1859
|
}
|
817
1860
|
except Exception as e:
|
818
1861
|
logger.error(f"搜索并提取时出错: {str(e)}")
|
1862
|
+
logger.error(traceback.format_exc())
|
819
1863
|
return {
|
820
1864
|
"error": f"搜索并提取内容时出错: {str(e)}",
|
821
1865
|
"keywords": keywords,
|