cnks 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks/__init__.py +49 -49
- cnks/server.py +1875 -831
- {cnks-0.2.2.dist-info → cnks-0.2.4.dist-info}/METADATA +5 -11
- cnks-0.2.4.dist-info/RECORD +6 -0
- cnks/chrome_extractor.py +0 -413
- cnks/extractor.py +0 -250
- cnks-0.2.2.dist-info/RECORD +0 -8
- {cnks-0.2.2.dist-info → cnks-0.2.4.dist-info}/WHEEL +0 -0
- {cnks-0.2.2.dist-info → cnks-0.2.4.dist-info}/entry_points.txt +0 -0
cnks/extractor.py
DELETED
@@ -1,250 +0,0 @@
|
|
1
|
-
from typing import Dict, List, Optional, Union
|
2
|
-
import logging
|
3
|
-
import traceback
|
4
|
-
import asyncio
|
5
|
-
from contextlib import asynccontextmanager
|
6
|
-
from datetime import datetime
|
7
|
-
from pydantic import BaseModel
|
8
|
-
|
9
|
-
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
10
|
-
|
11
|
-
# Logging configuration: DEBUG-level records appended to a local log file
# (note: basicConfig here affects the root logger for the whole process).
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename="cnki_extractor.log",
    filemode="a"
)
# Module-level logger used by all extractor functions below.
logger = logging.getLogger("cnki_extractor")
|
19
|
-
|
20
|
-
# Data model definitions
class CNKIContent(BaseModel):
    """CNKI paper content model.

    Holds the fields scraped from a single CNKI article page. All fields
    default to empty so a partially-extracted record is still valid.
    """
    title: str = ""
    authors: List[str] = []      # parsed from the citation text; may be empty
    abstract: str = ""
    keywords: List[str] = []
    cite_format: str = ""        # full citation string as shown by CNKI
    url: str = ""  # source URL the content was extracted from
|
29
|
-
|
30
|
-
async def get_browser():
    """Start Playwright and launch a local (headed) Chrome instance.

    Returns:
        tuple: ``(playwright, browser, context)`` — the started Playwright
        driver, the launched browser, and a fresh 1920x1080 browser context.
        The caller owns all three and must close/stop them.

    Raises:
        ValueError: if no local Chrome executable can be located.
    """
    from . import server  # server module provides the Chrome discovery helper

    playwright = await async_playwright().start()
    try:
        # Locate the local Chrome binary (server-side helper; may consult
        # the CHROME_PATH environment variable).
        chrome_path = server.find_chrome_executable()
        if not chrome_path:
            raise ValueError('未找到Chrome可执行文件,请设置CHROME_PATH环境变量指向Chrome位置')

        logger.info(f"使用本地Chrome: {chrome_path}")

        try:
            # Prefer channel='chrome' so Playwright drives the system Chrome.
            browser = await playwright.chromium.launch(
                headless=False,
                channel="chrome"
            )
        except Exception as e:
            logger.info(f"使用channel='chrome'失败: {str(e)},尝试使用executable_path")
            # Fall back to pointing Playwright at the executable directly.
            browser = await playwright.chromium.launch(
                headless=False,
                executable_path=chrome_path
            )

        context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
        return playwright, browser, context
    except BaseException:
        # Bug fix: the original leaked the Playwright driver process when
        # Chrome could not be found or both launch attempts failed — stop the
        # driver before re-raising so no orphan process is left behind.
        await playwright.stop()
        raise
|
61
|
-
|
62
|
-
async def extract_content_from_url(url: str) -> CNKIContent:
    """Extract paper content (title, authors, abstract, keywords, citation)
    from a single CNKI article page.

    Args:
        url: article URL; must start with ``https://kns.cnki.net/``.

    Returns:
        A populated ``CNKIContent`` (fields left empty when not found).

    Raises:
        ValueError: if the URL is not a CNKI link.
        Exception: wraps any in-page extraction failure (a screenshot is
            saved first when a page object exists).
    """
    if not url.startswith('https://kns.cnki.net/'):
        raise ValueError('URL必须是CNKI知网的链接')

    content = CNKIContent(url=url)
    # Pre-declare resources so the finally block can close whichever were created.
    playwright = None
    browser = None
    context = None
    page = None

    try:
        # Initialize the browser (Playwright driver + Chrome + context).
        playwright, browser, context = await get_browser()

        # Create a new page
        page = await context.new_page()
        logger.info(f"正在访问页面: {url}")

        try:
            # Navigate and wait for the network to go idle (up to 60 s).
            await page.goto(url, wait_until='networkidle', timeout=60000)
            logger.info("页面加载完成")

            # Check for a login prompt; if present, give the user time to
            # log in manually (browser runs headed).
            login_text = await page.evaluate('() => document.querySelector(".login-btn")?.textContent || ""')
            if "登录" in login_text:
                logger.info("需要登录,请手动登录...")
                # Wait for the user to finish logging in
                await asyncio.sleep(10)  # give the user some time to log in

            # Extract the title: try several selectors, skip bot-detection
            # placeholder text, keep only the first whitespace-separated token.
            content.title = await page.evaluate('''
                () => {
                    const selectors = ['h1.title', '.wx-tit h1', '.title', 'h1'];
                    for (const selector of selectors) {
                        const element = document.querySelector(selector);
                        if (element) {
                            const text = element.textContent.trim();
                            if (!text.includes('系统检测')) {
                                return text.split(/\\s+/)[0];
                            }
                        }
                    }
                    return "";
                }
            ''')
            logger.info(f"提取到标题: {content.title}")

            # Extract the citation format and authors by clicking the site's
            # "cite" button and parsing the citation textarea. Failures here
            # are logged but deliberately non-fatal.
            try:
                cite_button = await page.wait_for_selector(
                    'button:has-text("引用"), [class*="cite"], [class*="quote"]',
                    timeout=15000
                )
                if cite_button:
                    await cite_button.click()
                    logger.info("获取引用格式")

                    # Parse "[1] Authors. Title[...]" out of the citation text.
                    cite_result = await page.evaluate('''
                        () => {
                            const textarea = document.querySelector('.quote-r textarea.text');
                            if (textarea) {
                                const text = textarea.value.trim();
                                const cite_text = text.replace(/^\\[1\\]/, '').trim();

                                const match = cite_text.match(/^([^\\.]+)\\./);
                                const authors = match ? match[1].split(',').map(a => a.trim()) : [];

                                const titleMatch = cite_text.match(/\\.([^\\.]+?)\\[/);
                                const title = titleMatch ? titleMatch[1].trim() : '';

                                return {
                                    cite_format: cite_text,
                                    authors: authors,
                                    title: title
                                };
                            }
                            return null;
                        }
                    ''')

                    if cite_result:
                        content.cite_format = cite_result["cite_format"]
                        content.authors = cite_result["authors"]
                        # Prefer the citation's title over the page heading
                        # when available (less likely to be truncated).
                        if cite_result["title"]:
                            content.title = cite_result["title"]
                        logger.info(f"提取到作者: {content.authors}")
            except Exception as e:
                logger.error(f"提取引用格式时出错: {str(e)}")

            # Extract the abstract (first matching selector wins).
            content.abstract = await page.evaluate('''
                () => {
                    const abstract = document.querySelector('.abstract-text, .abstract, .wx-tit + p');
                    return abstract ? abstract.textContent.trim() : "";
                }
            ''')
            logger.info(f"提取到摘要长度: {len(content.abstract)} 字符")

            # Extract keywords: dedicated keyword elements first, falling back
            # to splitting a paragraph that contains the "关键词" label.
            content.keywords = await page.evaluate('''
                () => {
                    const keywordElements = Array.from(document.querySelectorAll('.keywords a, .keywords-text, .keyword'));
                    if (keywordElements.length > 0) {
                        return keywordElements.map(k => k.textContent.trim());
                    }

                    const paragraphs = Array.from(document.querySelectorAll('p'));
                    for (const p of paragraphs) {
                        if (p.textContent.includes('关键词')) {
                            const text = p.textContent.trim();
                            const keywordText = text.split(/关键词[::]/)[1];
                            if (keywordText) {
                                return keywordText.split(/[,,;;]/)
                                    .map(k => k.trim())
                                    .filter(k => k);
                            }
                        }
                    }
                    return [];
                }
            ''')
            logger.info(f"提取到关键词: {content.keywords}")

            return content

        except Exception as e:
            logger.error(f"提取内容时出错: {str(e)}")
            # Save a timestamped screenshot for debugging before re-raising.
            if page:
                await page.screenshot(path=f'extraction_error_{datetime.now().strftime("%Y%m%d%H%M%S")}.png')
            raise Exception(f"提取内容失败: {str(e)}")

    except Exception as e:
        logger.error(f"处理请求时出错: {str(e)}")
        logger.error(f"错误堆栈: {traceback.format_exc()}")
        raise

    finally:
        # Close resources in reverse order of creation.
        if page:
            await page.close()
        if context:
            await context.close()
        if browser:
            await browser.close()
        if playwright:
            await playwright.stop()
|
210
|
-
|
211
|
-
async def batch_extract_contents(urls: List[str]) -> List[Dict]:
    """Extract content from several CNKI URLs in sequence.

    Returns one dict per input URL, in order. A failed URL is not skipped:
    it yields a placeholder dict carrying the error message, so callers can
    tell exactly which URLs failed.
    """
    collected: List[Dict] = []

    for i, url in enumerate(urls):
        try:
            logger.info(f"正在处理第 {i+1}/{len(urls)} 个URL: {url}")
            paper = await extract_content_from_url(url)
        except Exception as e:
            logger.error(f"处理URL {url} 时出错: {str(e)}")
            # Record the failure instead of dropping the URL.
            collected.append({
                "url": url,
                "error": str(e),
                "title": "",
                "authors": [],
                "abstract": "",
                "keywords": [],
                "cite_format": ""
            })
            continue

        collected.append(paper.dict())
        logger.info(f"成功提取第 {i+1} 个URL的内容")
        # Brief pause between requests to avoid being throttled or banned.
        await asyncio.sleep(2)

    return collected
|
237
|
-
|
238
|
-
# Manual smoke test
async def test_extractor():
    """Run the extractor against one known CNKI article URL and print the result."""
    test_url = (
        "https://kns.cnki.net/kcms2/article/abstract"
        "?v=3uoqIhG8C44YLTlOAiTRKibYlV5Vjs7ioT0BO4yQ4m_wBGfVyh51O4GSy-IA63-FppCj3oNSHEUNzY35qnIKlFKtN6Av"
        "&uniplatform=NZKPT"
    )
    try:
        content = await extract_content_from_url(test_url)
    except Exception as e:
        print(f"测试失败: {str(e)}")
    else:
        print(f"提取成功:\n{content.json(indent=2, ensure_ascii=False)}")
|
247
|
-
|
248
|
-
if __name__ == "__main__":
    # Run the smoke test when executed as a script.
    asyncio.run(test_extractor())
|
cnks-0.2.2.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
cnks/__init__.py,sha256=wG3EMbmNepa8s90dSq7c_A4RnvsC--pzlbnxCBpdwpU,1899
|
2
|
-
cnks/chrome_extractor.py,sha256=upTO6DswZEqdY7XuWgRiw_xv-X20ldq0sxKA1MK7z-g,16262
|
3
|
-
cnks/extractor.py,sha256=VbRJ7flI6huWnXAjCUxufZBUfETxvc8TCw7NQMLGU4o,9998
|
4
|
-
cnks/server.py,sha256=L1ugCuCyKUpbwtWG2cKtjSnGAljzl9PLQEZBapoyb1g,34002
|
5
|
-
cnks-0.2.2.dist-info/METADATA,sha256=xbSj8F-Y5GtsW678mLEgv1RKvg-Qv0JxIWDBSMpGCBU,4198
|
6
|
-
cnks-0.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
7
|
-
cnks-0.2.2.dist-info/entry_points.txt,sha256=OkIiD7Ctn1Fn5B5zY09ltFFjIA8mJd3lw5V20AGtyYg,35
|
8
|
-
cnks-0.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|