cnks 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cnks-0.3.1 → cnks-0.3.2}/PKG-INFO +1 -1
- {cnks-0.3.1 → cnks-0.3.2}/pyproject.toml +1 -1
- {cnks-0.3.1 → cnks-0.3.2}/src/citzer.py +45 -136
- {cnks-0.3.1 → cnks-0.3.2}/src/searcher.py +42 -155
- {cnks-0.3.1 → cnks-0.3.2}/src/server.py +20 -170
- {cnks-0.3.1 → cnks-0.3.2}/src/worker.py +29 -65
- cnks-0.3.1/cache.json +0 -37
- cnks-0.3.1/cnks_cache.log +0 -4
- cnks-0.3.1/cnks_citzer.log +0 -4
- cnks-0.3.1/cnks_main.log +0 -28
- cnks-0.3.1/cnks_searcher.log +0 -4
- cnks-0.3.1/cnks_server.log +0 -6
- cnks-0.3.1/cnks_worker.log +0 -4
- cnks-0.3.1/uv.lock +0 -432
- cnks-0.3.1/新建文本文档.txt +0 -30
- {cnks-0.3.1 → cnks-0.3.2}/.env +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/README.md +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/ThisIsAServerSample.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/__init__.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/cache.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/click50.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/client.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/cssci.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/extractlink.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/ifverify.py +0 -0
- {cnks-0.3.1 → cnks-0.3.2}/src/main.py +0 -0
@@ -13,7 +13,6 @@
|
|
13
13
|
3. 处理验证页面和特殊情况
|
14
14
|
4. 返回结构化引文数据
|
15
15
|
"""
|
16
|
-
import logging
|
17
16
|
import os
|
18
17
|
import platform
|
19
18
|
import traceback
|
@@ -21,45 +20,36 @@ import time
|
|
21
20
|
import re
|
22
21
|
from typing import Dict, List, Any, Optional, Union
|
23
22
|
|
24
|
-
#
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
23
|
+
# 禁用日志记录
|
24
|
+
class DummyLogger:
|
25
|
+
"""空日志记录器,用于禁用日志输出"""
|
26
|
+
def __init__(self, *args, **kwargs):
|
27
|
+
pass
|
28
|
+
|
29
|
+
def debug(self, *args, **kwargs):
|
30
|
+
pass
|
29
31
|
|
30
|
-
|
31
|
-
|
32
|
-
console_handler = logging.StreamHandler()
|
32
|
+
def info(self, *args, **kwargs):
|
33
|
+
pass
|
33
34
|
|
34
|
-
|
35
|
-
|
36
|
-
file_handler.setFormatter(formatter)
|
37
|
-
console_handler.setFormatter(formatter)
|
35
|
+
def warning(self, *args, **kwargs):
|
36
|
+
pass
|
38
37
|
|
39
|
-
|
40
|
-
|
41
|
-
logger.setLevel(logging.DEBUG)
|
38
|
+
def error(self, *args, **kwargs):
|
39
|
+
pass
|
42
40
|
|
43
|
-
|
44
|
-
|
45
|
-
for handler in logger.handlers:
|
46
|
-
logger.removeHandler(handler)
|
41
|
+
def critical(self, *args, **kwargs):
|
42
|
+
pass
|
47
43
|
|
48
|
-
|
49
|
-
|
44
|
+
def addHandler(self, *args, **kwargs):
|
45
|
+
pass
|
50
46
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
level=logging.DEBUG,
|
58
|
-
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
59
|
-
)
|
60
|
-
logger = logging.getLogger("cnks.citzer")
|
61
|
-
logger.error(f"Failed to set up file logging: {str(e)}")
|
62
|
-
print(f"Error setting up citzer file logging: {str(e)}")
|
47
|
+
def setLevel(self, *args, **kwargs):
|
48
|
+
pass
|
49
|
+
|
50
|
+
# 使用空日志记录器
|
51
|
+
logger = DummyLogger()
|
52
|
+
print = lambda *args, **kwargs: None # 禁用print函数
|
63
53
|
|
64
54
|
# 导入必要的模块
|
65
55
|
try:
|
@@ -67,7 +57,6 @@ try:
|
|
67
57
|
PLAYWRIGHT_AVAILABLE = True
|
68
58
|
except ImportError:
|
69
59
|
PLAYWRIGHT_AVAILABLE = False
|
70
|
-
logger.error("Playwright not available. Install with: pip install playwright")
|
71
60
|
|
72
61
|
# 尝试导入其他模块
|
73
62
|
try:
|
@@ -76,7 +65,7 @@ except ImportError:
|
|
76
65
|
try:
|
77
66
|
from ifverify import check_verification_needed, handle_verification
|
78
67
|
except ImportError:
|
79
|
-
|
68
|
+
pass
|
80
69
|
|
81
70
|
class Citzer:
|
82
71
|
"""引文分析器类,负责浏览器管理和引文内容提取"""
|
@@ -88,11 +77,8 @@ class Citzer:
|
|
88
77
|
self.context = None
|
89
78
|
self.browser_started = False
|
90
79
|
|
91
|
-
#
|
92
|
-
self.debug_dir =
|
93
|
-
os.makedirs(self.debug_dir, exist_ok=True)
|
94
|
-
|
95
|
-
logger.info("Citzer初始化完成")
|
80
|
+
# 不创建调试截图目录
|
81
|
+
self.debug_dir = "/dev/null" if platform.system() != "Windows" else "NUL"
|
96
82
|
|
97
83
|
async def start_browser(self) -> bool:
|
98
84
|
"""
|
@@ -102,24 +88,18 @@ class Citzer:
|
|
102
88
|
bool: 浏览器是否成功启动
|
103
89
|
"""
|
104
90
|
if self.browser_started:
|
105
|
-
logger.info("浏览器已启动,重用现有实例")
|
106
91
|
return True
|
107
92
|
|
108
93
|
if not PLAYWRIGHT_AVAILABLE:
|
109
|
-
logger.error("Playwright未安装,无法启动浏览器")
|
110
94
|
return False
|
111
95
|
|
112
96
|
try:
|
113
|
-
logger.info("使用持久上下文启动浏览器")
|
114
|
-
|
115
97
|
# 创建Playwright实例
|
116
98
|
self.playwright = await async_playwright().start()
|
117
|
-
logger.info("Playwright已启动")
|
118
99
|
|
119
100
|
# 创建用户数据目录(如果不存在)
|
120
101
|
user_data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chrome_data")
|
121
102
|
os.makedirs(user_data_dir, exist_ok=True)
|
122
|
-
logger.info(f"使用Chrome用户数据目录: {user_data_dir}")
|
123
103
|
|
124
104
|
# 设置Chrome参数
|
125
105
|
browser_args = [
|
@@ -129,8 +109,6 @@ class Citzer:
|
|
129
109
|
|
130
110
|
# 查找Chrome可执行文件
|
131
111
|
chrome_path = self._find_chrome_executable()
|
132
|
-
if chrome_path:
|
133
|
-
logger.info(f"使用Chrome路径: {chrome_path}")
|
134
112
|
|
135
113
|
# 使用持久上下文启动浏览器
|
136
114
|
self.context = await self.playwright.chromium.launch_persistent_context(
|
@@ -139,22 +117,17 @@ class Citzer:
|
|
139
117
|
headless=False,
|
140
118
|
args=browser_args
|
141
119
|
)
|
142
|
-
logger.info("使用持久上下文启动浏览器成功")
|
143
120
|
|
144
121
|
# 创建一个初始页面确保上下文已激活
|
145
122
|
init_page = await self.context.new_page()
|
146
123
|
await init_page.goto("about:blank")
|
147
124
|
await init_page.close()
|
148
|
-
logger.info("使用空白页初始化浏览器")
|
149
125
|
|
150
126
|
# 标记浏览器已启动
|
151
127
|
self.browser_started = True
|
152
128
|
return True
|
153
129
|
|
154
130
|
except Exception as e:
|
155
|
-
logger.error(f"启动浏览器时出错: {str(e)}")
|
156
|
-
logger.error(traceback.format_exc())
|
157
|
-
|
158
131
|
# 清理资源
|
159
132
|
await self.close_browser()
|
160
133
|
return False
|
@@ -166,29 +139,22 @@ class Citzer:
|
|
166
139
|
Returns:
|
167
140
|
bool: 是否成功关闭
|
168
141
|
"""
|
169
|
-
logger.info("关闭浏览器资源")
|
170
|
-
|
171
142
|
try:
|
172
143
|
# 关闭浏览器上下文
|
173
144
|
if self.context:
|
174
145
|
await self.context.close()
|
175
146
|
self.context = None
|
176
|
-
logger.info("浏览器上下文已关闭")
|
177
147
|
|
178
148
|
# 停止Playwright
|
179
149
|
if self.playwright:
|
180
150
|
await self.playwright.stop()
|
181
151
|
self.playwright = None
|
182
|
-
logger.info("Playwright已停止")
|
183
152
|
|
184
153
|
# 重置浏览器状态
|
185
154
|
self.browser_started = False
|
186
|
-
logger.info("浏览器资源已成功关闭")
|
187
155
|
return True
|
188
156
|
|
189
|
-
except Exception
|
190
|
-
logger.error(f"关闭浏览器时出错: {str(e)}")
|
191
|
-
logger.error(traceback.format_exc())
|
157
|
+
except Exception:
|
192
158
|
self.browser_started = False
|
193
159
|
return False
|
194
160
|
|
@@ -225,10 +191,8 @@ class Citzer:
|
|
225
191
|
for path in candidates:
|
226
192
|
expanded_path = os.path.expanduser(path)
|
227
193
|
if os.path.exists(expanded_path) and os.access(expanded_path, os.X_OK):
|
228
|
-
logger.info(f"找到Chrome路径: {expanded_path}")
|
229
194
|
return expanded_path
|
230
195
|
|
231
|
-
logger.warning("未找到Chrome可执行文件")
|
232
196
|
return None
|
233
197
|
|
234
198
|
async def open_page(self, url: str) -> Optional[Page]:
|
@@ -245,13 +209,11 @@ class Citzer:
|
|
245
209
|
if self.context is None and not self.browser_started:
|
246
210
|
success = await self.start_browser()
|
247
211
|
if not success:
|
248
|
-
logger.error("无法启动浏览器,放弃打开页面")
|
249
212
|
return None
|
250
213
|
|
251
214
|
try:
|
252
215
|
# 创建新标签页
|
253
216
|
page = await self.context.new_page()
|
254
|
-
logger.info(f"已创建新标签页,正在导航到URL: {url}")
|
255
217
|
|
256
218
|
# 导航到指定URL
|
257
219
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
@@ -259,19 +221,14 @@ class Citzer:
|
|
259
221
|
# 检查是否需要验证
|
260
222
|
verification_needed = await check_verification_needed(page)
|
261
223
|
if verification_needed:
|
262
|
-
logger.info("检测到需要人工验证,等待10秒钟...")
|
263
224
|
await page.wait_for_timeout(10000) # 等待10秒
|
264
|
-
else:
|
265
|
-
logger.info("未检测到验证页面,继续执行")
|
266
225
|
|
267
226
|
# 等待页面完全加载
|
268
227
|
await page.wait_for_load_state("networkidle", timeout=30000)
|
269
228
|
|
270
229
|
return page
|
271
230
|
|
272
|
-
except Exception
|
273
|
-
logger.error(f"打开页面时出错: {str(e)}")
|
274
|
-
logger.error(traceback.format_exc())
|
231
|
+
except Exception:
|
275
232
|
return None
|
276
233
|
|
277
234
|
async def process_link(self, link: str) -> Dict[str, Any]:
|
@@ -286,31 +243,22 @@ class Citzer:
|
|
286
243
|
"""
|
287
244
|
page = None
|
288
245
|
try:
|
289
|
-
logger.info(f"处理链接: {link}")
|
290
|
-
|
291
246
|
# 检查是否已经有浏览器上下文(可能是共享的)
|
292
247
|
if self.context is None and not self.browser_started:
|
293
|
-
logger.info("尝试启动浏览器")
|
294
248
|
await self.start_browser()
|
295
249
|
|
296
250
|
# 打开链接
|
297
251
|
page = await self.open_page(link)
|
298
252
|
if not page:
|
299
|
-
logger.error(f"无法打开链接: {link}")
|
300
253
|
return {}
|
301
254
|
|
302
|
-
#
|
303
|
-
screenshot_path = os.path.join(self.debug_dir, f"link_page_{int(time.time())}.png")
|
304
|
-
await page.screenshot(path=screenshot_path)
|
305
|
-
logger.info(f"已保存页面截图: {screenshot_path}")
|
255
|
+
# 不进行截图操作
|
306
256
|
|
307
257
|
# 提取摘要
|
308
258
|
abstract = await self._extract_abstract(page)
|
309
|
-
logger.info(f"提取的摘要: {abstract[:100]}..." if abstract else "未提取到摘要")
|
310
259
|
|
311
260
|
# 提取引用信息
|
312
261
|
citation = await self.extract_citation_from_button(page)
|
313
|
-
logger.info(f"提取的引用信息: {citation}" if citation else "未提取到引用信息")
|
314
262
|
|
315
263
|
# 如果提取失败,尝试从页面标题获取基本信息
|
316
264
|
if not citation:
|
@@ -319,7 +267,6 @@ class Citzer:
|
|
319
267
|
"title": title,
|
320
268
|
"raw_extracted": True
|
321
269
|
}
|
322
|
-
logger.info(f"使用页面标题作为基本信息: {title}")
|
323
270
|
|
324
271
|
# 组合结果
|
325
272
|
result = {
|
@@ -333,24 +280,18 @@ class Citzer:
|
|
333
280
|
if not result["title"] and result["cite_format"]:
|
334
281
|
result["title"] = self.extract_title_from_cite(result["cite_format"])
|
335
282
|
|
336
|
-
logger.info(f"成功处理链接: {link}")
|
337
283
|
return result
|
338
284
|
|
339
|
-
except Exception
|
340
|
-
logger.error(f"处理链接 {link} 时出错: {str(e)}")
|
341
|
-
logger.error(traceback.format_exc())
|
285
|
+
except Exception:
|
342
286
|
return {}
|
343
287
|
|
344
288
|
finally:
|
345
289
|
if page:
|
346
290
|
await page.close()
|
347
|
-
logger.info(f"已关闭链接页面: {link}")
|
348
291
|
|
349
292
|
async def extract_from_html(self, page):
|
350
293
|
"""从HTML页面提取引用信息"""
|
351
294
|
try:
|
352
|
-
logger.info(f"开始从页面提取引用信息")
|
353
|
-
|
354
295
|
# 等待页面加载完成
|
355
296
|
await page.wait_for_load_state("networkidle")
|
356
297
|
|
@@ -371,14 +312,11 @@ class Citzer:
|
|
371
312
|
if result.get('title') and not result.get('cite_format'):
|
372
313
|
result['cite_format'] = f"{result['title']}"
|
373
314
|
|
374
|
-
logger.info(f"成功提取引用信息")
|
375
315
|
return result
|
376
316
|
else:
|
377
|
-
logger.warning("无法提取引用信息")
|
378
317
|
return None
|
379
318
|
|
380
|
-
except Exception
|
381
|
-
logger.error(f"提取引用信息时出错: {str(e)}")
|
319
|
+
except Exception:
|
382
320
|
return None
|
383
321
|
|
384
322
|
@staticmethod
|
@@ -412,13 +350,9 @@ class Citzer:
|
|
412
350
|
abstract = re.sub(r'\s+', ' ', abstract).strip()
|
413
351
|
result["abstract"] = abstract
|
414
352
|
|
415
|
-
logger.info(f"Citation data processed, title: {result['title']}")
|
416
|
-
print(f"Citzer: processed citation, title: {result.get('title', '')[:30]}...")
|
417
353
|
return result
|
418
354
|
|
419
|
-
except Exception
|
420
|
-
logger.error(f"Error processing citation data: {str(e)}")
|
421
|
-
print(f"Citzer error: {str(e)}")
|
355
|
+
except Exception:
|
422
356
|
# Return raw data
|
423
357
|
return {
|
424
358
|
"title": citation_data.get("title", ""),
|
@@ -437,8 +371,8 @@ class Citzer:
|
|
437
371
|
match = re.search(r'[\.。]\s*([^\.。]+)[\.。]', cite_format)
|
438
372
|
if match:
|
439
373
|
return match.group(1).strip()
|
440
|
-
except Exception
|
441
|
-
|
374
|
+
except Exception:
|
375
|
+
pass
|
442
376
|
|
443
377
|
return ""
|
444
378
|
|
@@ -454,8 +388,8 @@ class Citzer:
|
|
454
388
|
if match:
|
455
389
|
authors_str = match.group(1)
|
456
390
|
return [a.strip() for a in re.split(r'[,,、]', authors_str) if a.strip()]
|
457
|
-
except Exception
|
458
|
-
|
391
|
+
except Exception:
|
392
|
+
pass
|
459
393
|
|
460
394
|
return []
|
461
395
|
|
@@ -541,22 +475,13 @@ class Citzer:
|
|
541
475
|
"""
|
542
476
|
abstract = await page.evaluate(abstract_js)
|
543
477
|
if abstract:
|
544
|
-
logger.info("从页面直接提取到摘要信息")
|
545
478
|
citation_data["abstract"] = abstract
|
546
|
-
except Exception
|
547
|
-
|
548
|
-
|
549
|
-
# 创建调试截图目录
|
550
|
-
debug_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug_screenshots")
|
551
|
-
os.makedirs(debug_dir, exist_ok=True)
|
479
|
+
except Exception:
|
480
|
+
pass
|
552
481
|
|
553
|
-
#
|
554
|
-
screenshot_path = os.path.join(debug_dir, f"pre_citation_click_{int(time.time())}.png")
|
555
|
-
await page.screenshot(path=screenshot_path)
|
556
|
-
logger.info(f"已保存点击前截图: {screenshot_path}")
|
482
|
+
# 不创建调试截图
|
557
483
|
|
558
484
|
# 使用JavaScript定位并点击引用按钮
|
559
|
-
logger.info("使用JavaScript查找并点击引用按钮")
|
560
485
|
js_button_finder = """
|
561
486
|
() => {
|
562
487
|
// 根据已知的HTML结构首先尝试精确定位
|
@@ -679,36 +604,27 @@ class Citzer:
|
|
679
604
|
|
680
605
|
button_info = await page.evaluate(js_button_finder)
|
681
606
|
if button_info and button_info.get('found'):
|
682
|
-
logger.info(f"通过JavaScript找到引用按钮: {button_info.get('text')} ({button_info.get('tag')}),方法: {button_info.get('method', 'unknown')}")
|
683
|
-
|
684
607
|
# 使用鼠标点击坐标
|
685
608
|
x, y = button_info.get('x'), button_info.get('y')
|
686
609
|
await page.mouse.click(x, y)
|
687
|
-
logger.info(f"已点击坐标 ({x}, {y}) 处的引用按钮")
|
688
610
|
|
689
611
|
# 等待引用对话框出现
|
690
612
|
await page.wait_for_timeout(2000)
|
691
613
|
|
692
|
-
#
|
693
|
-
screenshot_path = os.path.join(debug_dir, f"post_js_click_{int(time.time())}.png")
|
694
|
-
await page.screenshot(path=screenshot_path)
|
695
|
-
logger.info(f"已保存JavaScript点击后截图: {screenshot_path}")
|
614
|
+
# 不获取点击后截图
|
696
615
|
|
697
616
|
# 提取引用信息
|
698
617
|
cite_result = await self._extract_citation_text(page)
|
699
618
|
if cite_result:
|
700
619
|
cite_result.update(citation_data)
|
701
620
|
return cite_result
|
702
|
-
else:
|
703
|
-
logger.info("未能使用JavaScript找到引用按钮")
|
704
621
|
|
705
622
|
# 如果点击按钮都失败了,但我们已经提取到了其他信息
|
706
623
|
if citation_data:
|
707
624
|
return citation_data
|
708
625
|
|
709
|
-
except Exception
|
710
|
-
|
711
|
-
logger.error(traceback.format_exc())
|
626
|
+
except Exception:
|
627
|
+
pass
|
712
628
|
|
713
629
|
return {}
|
714
630
|
|
@@ -767,19 +683,15 @@ class Citzer:
|
|
767
683
|
''')
|
768
684
|
|
769
685
|
if cite_result:
|
770
|
-
logger.info("成功提取引用文本")
|
771
686
|
return {
|
772
687
|
"title": "",
|
773
688
|
"cite_format": cite_result,
|
774
689
|
"abstract": ""
|
775
690
|
}
|
776
691
|
else:
|
777
|
-
logger.warning("无法提取引用文本")
|
778
692
|
return {}
|
779
693
|
|
780
|
-
except Exception
|
781
|
-
logger.error(f"提取引用文本时出错: {str(e)}")
|
782
|
-
logger.error(traceback.format_exc())
|
694
|
+
except Exception:
|
783
695
|
return {}
|
784
696
|
|
785
697
|
async def _extract_abstract(self, page) -> str:
|
@@ -854,15 +766,12 @@ class Citzer:
|
|
854
766
|
|
855
767
|
abstract = await page.evaluate(abstract_js)
|
856
768
|
if abstract:
|
857
|
-
logger.info("成功从页面提取摘要")
|
858
769
|
# 清理摘要文本
|
859
770
|
abstract = abstract.replace("摘要:", "").replace("摘要:", "").replace("Abstract:", "").strip()
|
860
771
|
return abstract
|
861
772
|
else:
|
862
|
-
logger.info("未能找到页面摘要内容")
|
863
773
|
return ""
|
864
774
|
|
865
|
-
except Exception
|
866
|
-
logger.error(f"提取摘要时出错: {str(e)}")
|
775
|
+
except Exception:
|
867
776
|
return ""
|
868
777
|
|