cnks-0.2.2-py3-none-any.whl → cnks-0.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks/__init__.py +49 -49
- cnks/server.py +1875 -831
- {cnks-0.2.2.dist-info → cnks-0.2.4.dist-info}/METADATA +5 -11
- cnks-0.2.4.dist-info/RECORD +6 -0
- cnks/chrome_extractor.py +0 -413
- cnks/extractor.py +0 -250
- cnks-0.2.2.dist-info/RECORD +0 -8
- {cnks-0.2.2.dist-info → cnks-0.2.4.dist-info}/WHEEL +0 -0
- {cnks-0.2.2.dist-info → cnks-0.2.4.dist-info}/entry_points.txt +0 -0
{cnks-0.2.2.dist-info → cnks-0.2.4.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cnks
-Version: 0.2.2
+Version: 0.2.4
 Summary: CNKI Search tool with Chrome browser integration
 Author-email: bai-z-l <b@iziliang.com>
 Requires-Python: >=3.10
@@ -17,22 +17,16 @@ Description-Content-Type: text/markdown
 
 - Opens the Chrome browser and visits CNKI
 - Provides a keyword search tool
--
+- **Automatically extracts abstracts** (added in v0.2.0)
 - **Converts search results into structured JSON data** (added in v0.2.0)
-- Lets users save notes
 
-## Version updates
 
-
-- Optimized local Chrome invocation; no longer tries to download Chromium
-- Improved error handling and logging
-- Lowered the Python requirement to 3.10 for better compatibility
-- Fixed several bugs and stability issues
+## Version updates
 
 ### v0.2.0
 - Simplified the tool interface into the single `mcp_cnks_search_and_extract` tool
 - One-step search and content extraction
-- Supports extracting 50 from the search results
+- Supports extracting the abstracts of 50 papers from the search results
 - Results include title, authors, abstract, keywords, citation format, and more
 
 ### v0.1.1
@@ -42,7 +36,7 @@ Description-Content-Type: text/markdown
 
 ## Installation
 
-Make sure you have installed Python 3.
+Make sure you have installed Python 3.13 or later, then run the following command to install:
 
 ```bash
 uv add "mcp[cli]"
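
The changelog above collapses the old multi-tool interface into the single `mcp_cnks_search_and_extract` tool. For orientation only, here is a minimal sketch of driving that tool from the official MCP Python SDK over stdio; the `cnks` launch command and the `keywords` argument name are assumptions not confirmed by this diff:

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Assumption: the wheel's entry point installs a `cnks` console script
    # (entry_points.txt exists in the wheel but its contents are not shown here).
    params = StdioServerParameters(command="cnks", args=[])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Tool name taken from the changelog; the argument name is a guess.
            result = await session.call_tool(
                "mcp_cnks_search_and_extract", {"keywords": "knowledge graph"}
            )
            print(result)


asyncio.run(main())
```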
cnks-0.2.4.dist-info/RECORD
NEW

@@ -0,0 +1,6 @@
+cnks/__init__.py,sha256=jAOpGR_mQLsDctjb9B5E2J7d6pEgdoijSOQowFfX2es,1850
+cnks/server.py,sha256=rDANj6wZZ_tPu3XoHPhyM0BUXAi7sv4zDUVgoqFlVog,79819
+cnks-0.2.4.dist-info/METADATA,sha256=Qejt1Y4pYqHww8WIoJwwcDx7ttj4B9B1tBzdK0mgryM,3959
+cnks-0.2.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+cnks-0.2.4.dist-info/entry_points.txt,sha256=OkIiD7Ctn1Fn5B5zY09ltFFjIA8mJd3lw5V20AGtyYg,35
+cnks-0.2.4.dist-info/RECORD,,
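
Each RECORD row has the form `path,sha256=<digest>,size`, where the digest is the urlsafe-base64 SHA-256 of the file with `=` padding stripped, per the wheel spec. A small sketch for checking one entry against a file on disk; the path used is illustrative:

```python
import base64
import hashlib
from pathlib import Path


def record_digest(path: Path) -> str:
    """Return the RECORD-style digest: urlsafe base64 of SHA-256, no '=' padding."""
    raw = hashlib.sha256(path.read_bytes()).digest()
    return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")


# For the 0.2.4 wheel's cnks/server.py (path relative to an unpacked wheel),
# this should print rDANj6wZZ_tPu3XoHPhyM0BUXAi7sv4zDUVgoqFlVog per the RECORD above.
print(record_digest(Path("cnks/server.py")))
```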
cnks/chrome_extractor.py
DELETED (413 lines)

from typing import Dict, List, Optional, Union
import logging
import traceback
import asyncio
import os
import subprocess
import time
from datetime import datetime
from pydantic import BaseModel
import platform
import json
from contextlib import asynccontextmanager

from playwright.async_api import async_playwright, Browser, BrowserContext, Page

# Logging configuration
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename="chrome_extractor.log",
    filemode="a"
)
logger = logging.getLogger("chrome_extractor")

# Data model
class CNKIContent(BaseModel):
    """Model for the content of a CNKI paper."""
    title: str = ""
    authors: List[str] = []
    abstract: str = ""
    keywords: List[str] = []
    cite_format: str = ""
    url: str = ""  # URL field recording the source

def find_chrome_executable():
    """Find the path of the Chrome executable."""
    # Determine the operating system
    system = platform.system()

    # Candidate Chrome locations per platform
    if system == "Windows":
        chrome_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
            os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
        ]
    elif system == "Darwin":  # macOS
        chrome_paths = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            os.path.expanduser("~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
        ]
    elif system == "Linux":
        chrome_paths = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ]
    else:
        logger.error(f"Unsupported operating system: {system}")
        return None

    # Return the first candidate path that exists
    for path in chrome_paths:
        if os.path.exists(path):
            logger.info(f"Found Chrome: {path}")
            return path

    # Fall back to the CHROME_PATH environment variable
    chrome_env = os.environ.get("CHROME_PATH")
    if chrome_env and os.path.exists(chrome_env):
        logger.info(f"Found Chrome via environment variable: {chrome_env}")
        return chrome_env

    logger.error("Chrome browser not found")
    return None

@asynccontextmanager
async def get_browser():
    """Yield a browser instance; the context manager guarantees resource cleanup."""
    playwright = None
    browser = None

    try:
        chrome_path = find_chrome_executable()
        if not chrome_path:
            raise ValueError("Chrome not found; set the CHROME_PATH environment variable to point at your Chrome install")

        logger.info(f"Launching Chrome: {chrome_path}")

        # Start Playwright
        playwright = await async_playwright().start()

        # Tell Playwright not to download its own browser
        os.environ["PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD"] = "1"

        # Try several launch strategies
        try:
            logger.info("Trying launch with channel='chrome'")
            browser = await playwright.chromium.launch(
                headless=False,
                channel="chrome",
                executable_path=None  # let Playwright locate Chrome itself
            )
        except Exception as e1:
            logger.info(f"channel='chrome' launch failed: {str(e1)}")
            try:
                logger.info(f"Trying launch with executable_path: {chrome_path}")
                browser = await playwright.chromium.launch(
                    headless=False,
                    executable_path=chrome_path
                )
            except Exception as e2:
                logger.error(f"Both launch strategies failed: {str(e2)}")
                # Last resort: launch headed with relaxed sandbox flags
                browser = await playwright.chromium.launch(
                    headless=False,
                    channel=None,
                    executable_path=chrome_path,
                    args=[
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-gpu',
                        '--start-maximized'
                    ]
                )

        # Create a fresh browser context
        context = await browser.new_context(viewport={'width': 1920, 'height': 1080})

        try:
            yield browser, context
        finally:
            logger.info("Closing browser and context")
            await context.close()
            await browser.close()
    except Exception as e:
        logger.error(f"Error acquiring browser: {str(e)}")
        logger.error(traceback.format_exc())
        raise
    finally:
        if playwright:
            logger.info("Stopping Playwright")
            await playwright.stop()

async def extract_content_from_url(url: str) -> CNKIContent:
    """Extract paper content from a CNKI page."""
    if not url.startswith('https://kns.cnki.net/'):
        raise ValueError('URL must be a CNKI (kns.cnki.net) link')

    content = CNKIContent(url=url)

    async with get_browser() as (browser, context):
        # Open a new page
        page = await context.new_page()
        logger.info(f"Visiting page: {url}")

        try:
            # Navigate to the page
            await page.goto(url, wait_until='networkidle', timeout=60000)
            logger.info("Page loaded")

            # Check the login state ("登录" = "log in")
            login_text = await page.evaluate('() => document.querySelector(".login-btn")?.textContent || ""')
            if "登录" in login_text:
                logger.info("Login required; please log in manually...")
                # Wait for the user to finish logging in
                await asyncio.sleep(15)  # give the user time to log in

            # Extract the title
            content.title = await page.evaluate('''
                () => {
                    const selectors = ['h1.title', '.wx-tit h1', '.title', 'h1'];
                    for (const selector of selectors) {
                        const element = document.querySelector(selector);
                        if (element) {
                            const text = element.textContent.trim();
                            // skip the "系统检测" (system check) interstitial
                            if (!text.includes('系统检测')) {
                                return text.split(/\\s+/)[0];
                            }
                        }
                    }
                    return "";
                }
            ''')
            logger.info(f"Extracted title: {content.title}")

            # Extract the citation format and the authors ("引用" = "cite")
            try:
                cite_button = await page.wait_for_selector(
                    'button:has-text("引用"), [class*="cite"], [class*="quote"]',
                    timeout=15000
                )
                if cite_button:
                    await cite_button.click()
                    logger.info("Fetching citation format")
                    await asyncio.sleep(2)  # wait for the citation dialog to load

                    cite_result = await page.evaluate('''
                        () => {
                            const textarea = document.querySelector('.quote-r textarea.text');
                            if (textarea) {
                                const text = textarea.value.trim();
                                const cite_text = text.replace(/^\\[1\\]/, '').trim();

                                const match = cite_text.match(/^([^\\.]+)\\./);
                                const authors = match ? match[1].split(',').map(a => a.trim()) : [];

                                const titleMatch = cite_text.match(/\\.([^\\.]+?)\\[/);
                                const title = titleMatch ? titleMatch[1].trim() : '';

                                return {
                                    cite_format: cite_text,
                                    authors: authors,
                                    title: title
                                };
                            }
                            return null;
                        }
                    ''')

                    if cite_result:
                        content.cite_format = cite_result["cite_format"]
                        content.authors = cite_result["authors"]
                        if cite_result["title"]:
                            content.title = cite_result["title"]
                        logger.info(f"Extracted authors: {content.authors}")
            except Exception as e:
                logger.error(f"Error extracting citation format: {str(e)}")

            # Extract the abstract
            content.abstract = await page.evaluate('''
                () => {
                    const abstract = document.querySelector('.abstract-text, .abstract, .wx-tit + p');
                    return abstract ? abstract.textContent.trim() : "";
                }
            ''')
            logger.info(f"Extracted abstract length: {len(content.abstract)} characters")

            # Extract the keywords ("关键词" = "keywords")
            content.keywords = await page.evaluate('''
                () => {
                    const keywordElements = Array.from(document.querySelectorAll('.keywords a, .keywords-text, .keyword'));
                    if (keywordElements.length > 0) {
                        return keywordElements.map(k => k.textContent.trim());
                    }

                    const paragraphs = Array.from(document.querySelectorAll('p'));
                    for (const p of paragraphs) {
                        if (p.textContent.includes('关键词')) {
                            const text = p.textContent.trim();
                            const keywordText = text.split(/关键词[::]/)[1];
                            if (keywordText) {
                                return keywordText.split(/[,,;;]/)
                                    .map(k => k.trim())
                                    .filter(k => k);
                            }
                        }
                    }
                    return [];
                }
            ''')
            logger.info(f"Extracted keywords: {content.keywords}")

            # Make sure the page is closed
            await page.close()
            return content

        except Exception as e:
            logger.error(f"Error extracting content: {str(e)}")
            try:
                # Try to save a screenshot for debugging
                screenshot_path = f'extraction_error_{datetime.now().strftime("%Y%m%d%H%M%S")}.png'
                await page.screenshot(path=screenshot_path)
                logger.info(f"Error screenshot saved to: {screenshot_path}")
            except Exception:
                pass

            # Make sure the page is closed
            await page.close()
            raise Exception(f"Content extraction failed: {str(e)}")

async def batch_extract_contents(urls: List[str]) -> List[Dict]:
    """Process multiple URLs, extract their content, and return it in JSON form."""
    results = []
    max_concurrent = 1  # limit concurrency to avoid excessive resource use

    # Process the URLs in batches
    for i in range(0, len(urls), max_concurrent):
        batch_urls = urls[i:i+max_concurrent]
        tasks = []

        for url in batch_urls:
            logger.info(f"Queueing task: process URL {url}")
            tasks.append(extract_content_from_url(url))

        # Run the current batch concurrently
        batch_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle the results
        for j, result in enumerate(batch_results):
            url_index = i + j
            url = urls[url_index] if url_index < len(urls) else "unknown"

            try:
                if isinstance(result, Exception):
                    # Failure case: record the error with empty fields
                    logger.error(f"Error processing URL {url}: {str(result)}")
                    results.append({
                        "url": url,
                        "error": str(result),
                        "title": "",
                        "authors": [],
                        "abstract": "",
                        "keywords": [],
                        "cite_format": ""
                    })
                else:
                    # Success case: serialize the pydantic model
                    results.append(result.dict())
                    logger.info(f"Successfully processed URL: {url}")
            except Exception as e:
                logger.error(f"Error handling result: {str(e)}")
                results.append({
                    "url": url,
                    "error": f"Error handling result: {str(e)}",
                    "title": "",
                    "authors": [],
                    "abstract": "",
                    "keywords": [],
                    "cite_format": ""
                })

        # Brief delay so rapid requests do not get the client blocked
        await asyncio.sleep(2)

    logger.info(f"Batch processing finished; {len(results)} URLs processed")
    return results

# Search and extract content in one step
async def search_and_extract(keywords: str) -> Dict:
    """Search for the keywords and extract the details of every paper found."""
    from . import server  # use the search functionality from the server module

    try:
        logger.info(f"Starting search for keywords: {keywords}")
        # Run the search provided by server
        links_count = await server.search_with_playwright(keywords)

        # Fetch the search results
        page_content = server.page_content

        # Validate the search results
        if not isinstance(page_content, dict) or "links" not in page_content or not page_content["links"]:
            logger.error("Search returned no valid links")
            return {
                "error": "Search returned no valid links",
                "keywords": keywords,
                "count": 0,
                "results": []
            }

        # Collect the links
        urls = [link["url"] for link in page_content["links"] if "url" in link]
        if not urls:
            logger.error("No valid links in the search results")
            return {
                "error": "No valid links found",
                "keywords": keywords,
                "count": 0,
                "results": []
            }

        logger.info(f"Search succeeded; found {len(urls)} links, starting extraction")

        # Extract the contents
        results = await batch_extract_contents(urls)

        # Wrap the results
        result_dict = {
            "keywords": keywords,
            "count": len(results),
            "results": results,
            "success_count": sum(1 for r in results if "error" not in r or not r["error"]),
            "error_count": sum(1 for r in results if "error" in r and r["error"])
        }

        return result_dict

    except Exception as e:
        logger.error(f"Error during search and extraction: {str(e)}")
        logger.error(traceback.format_exc())
        return {
            "error": f"Error during search and extraction: {str(e)}",
            "keywords": keywords,
            "count": 0,
            "results": []
        }

# Unit test
async def test_extractor():
    """Test the extractor."""
    test_url = "https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKibYlV5Vjs7ioT0BO4yQ4m_wBGfVyh51O4GSy-IA63-FppCj3oNSHEUNzY35qnIKlFKtN6Av&uniplatform=NZKPT"
    try:
        content = await extract_content_from_url(test_url)
        print(f"Extraction succeeded:\n{content.json(indent=2, ensure_ascii=False)}")
        return True
    except Exception as e:
        print(f"Test failed: {str(e)}")
        return False

if __name__ == "__main__":
    # Run the test
    asyncio.run(test_extractor())
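
Beyond the `test_extractor` hook above, the deleted module exposed two public coroutines. A minimal sketch of calling them directly against the 0.2.2 package (assumes Playwright, pydantic, and a local Chrome install; the URL is a placeholder, as any `https://kns.cnki.net/` detail link passes the module's prefix check):

```python
import asyncio

from cnks.chrome_extractor import batch_extract_contents, extract_content_from_url


async def main() -> None:
    url = "https://kns.cnki.net/kcms2/article/abstract?v=..."  # placeholder detail-page URL

    # Single page: returns a CNKIContent pydantic model.
    content = await extract_content_from_url(url)
    print(content.json(indent=2, ensure_ascii=False))  # pydantic v1 serialization

    # Batched variant: returns plain dicts, one per URL, with errors recorded inline.
    results = await batch_extract_contents([url])
    print(len(results))


asyncio.run(main())
```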