cnks 0.3.1__tar.gz → 0.3.2__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cnks
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: 中国知网搜索与引文处理系统
5
5
  Author-email: bai-z-l <b@iziliang.com>
6
6
  Requires-Python: >=3.12
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "cnks"
3
- version = "0.3.1"
3
+ version = "0.3.2"
4
4
  description = "中国知网搜索与引文处理系统"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -13,7 +13,6 @@
13
13
  3. 处理验证页面和特殊情况
14
14
  4. 返回结构化引文数据
15
15
  """
16
- import logging
17
16
  import os
18
17
  import platform
19
18
  import traceback
@@ -21,45 +20,36 @@ import time
21
20
  import re
22
21
  from typing import Dict, List, Any, Optional, Union
23
22
 
24
- # 配置日志记录
25
- try:
26
- # 尝试使用绝对路径
27
- log_dir = os.path.dirname(os.path.abspath(__file__))
28
- log_file = os.path.join(os.path.dirname(log_dir), "cnks_citzer.log")
23
+ # 禁用日志记录
24
+ class DummyLogger:
25
+ """空日志记录器,用于禁用日志输出"""
26
+ def __init__(self, *args, **kwargs):
27
+ pass
28
+
29
+ def debug(self, *args, **kwargs):
30
+ pass
29
31
 
30
- # 创建处理器
31
- file_handler = logging.FileHandler(log_file, mode="a")
32
- console_handler = logging.StreamHandler()
32
+ def info(self, *args, **kwargs):
33
+ pass
33
34
 
34
- # 设置格式
35
- formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
36
- file_handler.setFormatter(formatter)
37
- console_handler.setFormatter(formatter)
35
+ def warning(self, *args, **kwargs):
36
+ pass
38
37
 
39
- # 获取日志记录器并添加处理器
40
- logger = logging.getLogger("cnks.citzer")
41
- logger.setLevel(logging.DEBUG)
38
+ def error(self, *args, **kwargs):
39
+ pass
42
40
 
43
- # 移除现有处理器以避免重复
44
- if logger.handlers:
45
- for handler in logger.handlers:
46
- logger.removeHandler(handler)
41
+ def critical(self, *args, **kwargs):
42
+ pass
47
43
 
48
- logger.addHandler(file_handler)
49
- logger.addHandler(console_handler)
44
+ def addHandler(self, *args, **kwargs):
45
+ pass
50
46
 
51
- # 打印确认信息
52
- print(f"Citzer logger initialized, logging to: {log_file}")
53
- logger.info(f"Citzer logging to: {log_file}")
54
- except Exception as e:
55
- # 回退到基本控制台日志记录
56
- logging.basicConfig(
57
- level=logging.DEBUG,
58
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
59
- )
60
- logger = logging.getLogger("cnks.citzer")
61
- logger.error(f"Failed to set up file logging: {str(e)}")
62
- print(f"Error setting up citzer file logging: {str(e)}")
47
+ def setLevel(self, *args, **kwargs):
48
+ pass
49
+
50
+ # 使用空日志记录器
51
+ logger = DummyLogger()
52
+ print = lambda *args, **kwargs: None # 禁用print函数
63
53
 
64
54
  # 导入必要的模块
65
55
  try:
@@ -67,7 +57,6 @@ try:
67
57
  PLAYWRIGHT_AVAILABLE = True
68
58
  except ImportError:
69
59
  PLAYWRIGHT_AVAILABLE = False
70
- logger.error("Playwright not available. Install with: pip install playwright")
71
60
 
72
61
  # 尝试导入其他模块
73
62
  try:
@@ -76,7 +65,7 @@ except ImportError:
76
65
  try:
77
66
  from ifverify import check_verification_needed, handle_verification
78
67
  except ImportError:
79
- logger.warning("无法导入验证处理模块,部分功能可能不可用")
68
+ pass
80
69
 
81
70
  class Citzer:
82
71
  """引文分析器类,负责浏览器管理和引文内容提取"""
@@ -88,11 +77,8 @@ class Citzer:
88
77
  self.context = None
89
78
  self.browser_started = False
90
79
 
91
- # 创建调试截图目录
92
- self.debug_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug_screenshots")
93
- os.makedirs(self.debug_dir, exist_ok=True)
94
-
95
- logger.info("Citzer初始化完成")
80
+ # 不创建调试截图目录
81
+ self.debug_dir = "/dev/null" if platform.system() != "Windows" else "NUL"
96
82
 
97
83
  async def start_browser(self) -> bool:
98
84
  """
@@ -102,24 +88,18 @@ class Citzer:
102
88
  bool: 浏览器是否成功启动
103
89
  """
104
90
  if self.browser_started:
105
- logger.info("浏览器已启动,重用现有实例")
106
91
  return True
107
92
 
108
93
  if not PLAYWRIGHT_AVAILABLE:
109
- logger.error("Playwright未安装,无法启动浏览器")
110
94
  return False
111
95
 
112
96
  try:
113
- logger.info("使用持久上下文启动浏览器")
114
-
115
97
  # 创建Playwright实例
116
98
  self.playwright = await async_playwright().start()
117
- logger.info("Playwright已启动")
118
99
 
119
100
  # 创建用户数据目录(如果不存在)
120
101
  user_data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chrome_data")
121
102
  os.makedirs(user_data_dir, exist_ok=True)
122
- logger.info(f"使用Chrome用户数据目录: {user_data_dir}")
123
103
 
124
104
  # 设置Chrome参数
125
105
  browser_args = [
@@ -129,8 +109,6 @@ class Citzer:
129
109
 
130
110
  # 查找Chrome可执行文件
131
111
  chrome_path = self._find_chrome_executable()
132
- if chrome_path:
133
- logger.info(f"使用Chrome路径: {chrome_path}")
134
112
 
135
113
  # 使用持久上下文启动浏览器
136
114
  self.context = await self.playwright.chromium.launch_persistent_context(
@@ -139,22 +117,17 @@ class Citzer:
139
117
  headless=False,
140
118
  args=browser_args
141
119
  )
142
- logger.info("使用持久上下文启动浏览器成功")
143
120
 
144
121
  # 创建一个初始页面确保上下文已激活
145
122
  init_page = await self.context.new_page()
146
123
  await init_page.goto("about:blank")
147
124
  await init_page.close()
148
- logger.info("使用空白页初始化浏览器")
149
125
 
150
126
  # 标记浏览器已启动
151
127
  self.browser_started = True
152
128
  return True
153
129
 
154
130
  except Exception as e:
155
- logger.error(f"启动浏览器时出错: {str(e)}")
156
- logger.error(traceback.format_exc())
157
-
158
131
  # 清理资源
159
132
  await self.close_browser()
160
133
  return False
@@ -166,29 +139,22 @@ class Citzer:
166
139
  Returns:
167
140
  bool: 是否成功关闭
168
141
  """
169
- logger.info("关闭浏览器资源")
170
-
171
142
  try:
172
143
  # 关闭浏览器上下文
173
144
  if self.context:
174
145
  await self.context.close()
175
146
  self.context = None
176
- logger.info("浏览器上下文已关闭")
177
147
 
178
148
  # 停止Playwright
179
149
  if self.playwright:
180
150
  await self.playwright.stop()
181
151
  self.playwright = None
182
- logger.info("Playwright已停止")
183
152
 
184
153
  # 重置浏览器状态
185
154
  self.browser_started = False
186
- logger.info("浏览器资源已成功关闭")
187
155
  return True
188
156
 
189
- except Exception as e:
190
- logger.error(f"关闭浏览器时出错: {str(e)}")
191
- logger.error(traceback.format_exc())
157
+ except Exception:
192
158
  self.browser_started = False
193
159
  return False
194
160
 
@@ -225,10 +191,8 @@ class Citzer:
225
191
  for path in candidates:
226
192
  expanded_path = os.path.expanduser(path)
227
193
  if os.path.exists(expanded_path) and os.access(expanded_path, os.X_OK):
228
- logger.info(f"找到Chrome路径: {expanded_path}")
229
194
  return expanded_path
230
195
 
231
- logger.warning("未找到Chrome可执行文件")
232
196
  return None
233
197
 
234
198
  async def open_page(self, url: str) -> Optional[Page]:
@@ -245,13 +209,11 @@ class Citzer:
245
209
  if self.context is None and not self.browser_started:
246
210
  success = await self.start_browser()
247
211
  if not success:
248
- logger.error("无法启动浏览器,放弃打开页面")
249
212
  return None
250
213
 
251
214
  try:
252
215
  # 创建新标签页
253
216
  page = await self.context.new_page()
254
- logger.info(f"已创建新标签页,正在导航到URL: {url}")
255
217
 
256
218
  # 导航到指定URL
257
219
  await page.goto(url, wait_until="domcontentloaded", timeout=30000)
@@ -259,19 +221,14 @@ class Citzer:
259
221
  # 检查是否需要验证
260
222
  verification_needed = await check_verification_needed(page)
261
223
  if verification_needed:
262
- logger.info("检测到需要人工验证,等待10秒钟...")
263
224
  await page.wait_for_timeout(10000) # 等待10秒
264
- else:
265
- logger.info("未检测到验证页面,继续执行")
266
225
 
267
226
  # 等待页面完全加载
268
227
  await page.wait_for_load_state("networkidle", timeout=30000)
269
228
 
270
229
  return page
271
230
 
272
- except Exception as e:
273
- logger.error(f"打开页面时出错: {str(e)}")
274
- logger.error(traceback.format_exc())
231
+ except Exception:
275
232
  return None
276
233
 
277
234
  async def process_link(self, link: str) -> Dict[str, Any]:
@@ -286,31 +243,22 @@ class Citzer:
286
243
  """
287
244
  page = None
288
245
  try:
289
- logger.info(f"处理链接: {link}")
290
-
291
246
  # 检查是否已经有浏览器上下文(可能是共享的)
292
247
  if self.context is None and not self.browser_started:
293
- logger.info("尝试启动浏览器")
294
248
  await self.start_browser()
295
249
 
296
250
  # 打开链接
297
251
  page = await self.open_page(link)
298
252
  if not page:
299
- logger.error(f"无法打开链接: {link}")
300
253
  return {}
301
254
 
302
- # 获取页面截图用于调试
303
- screenshot_path = os.path.join(self.debug_dir, f"link_page_{int(time.time())}.png")
304
- await page.screenshot(path=screenshot_path)
305
- logger.info(f"已保存页面截图: {screenshot_path}")
255
+ # 不进行截图操作
306
256
 
307
257
  # 提取摘要
308
258
  abstract = await self._extract_abstract(page)
309
- logger.info(f"提取的摘要: {abstract[:100]}..." if abstract else "未提取到摘要")
310
259
 
311
260
  # 提取引用信息
312
261
  citation = await self.extract_citation_from_button(page)
313
- logger.info(f"提取的引用信息: {citation}" if citation else "未提取到引用信息")
314
262
 
315
263
  # 如果提取失败,尝试从页面标题获取基本信息
316
264
  if not citation:
@@ -319,7 +267,6 @@ class Citzer:
319
267
  "title": title,
320
268
  "raw_extracted": True
321
269
  }
322
- logger.info(f"使用页面标题作为基本信息: {title}")
323
270
 
324
271
  # 组合结果
325
272
  result = {
@@ -333,24 +280,18 @@ class Citzer:
333
280
  if not result["title"] and result["cite_format"]:
334
281
  result["title"] = self.extract_title_from_cite(result["cite_format"])
335
282
 
336
- logger.info(f"成功处理链接: {link}")
337
283
  return result
338
284
 
339
- except Exception as e:
340
- logger.error(f"处理链接 {link} 时出错: {str(e)}")
341
- logger.error(traceback.format_exc())
285
+ except Exception:
342
286
  return {}
343
287
 
344
288
  finally:
345
289
  if page:
346
290
  await page.close()
347
- logger.info(f"已关闭链接页面: {link}")
348
291
 
349
292
  async def extract_from_html(self, page):
350
293
  """从HTML页面提取引用信息"""
351
294
  try:
352
- logger.info(f"开始从页面提取引用信息")
353
-
354
295
  # 等待页面加载完成
355
296
  await page.wait_for_load_state("networkidle")
356
297
 
@@ -371,14 +312,11 @@ class Citzer:
371
312
  if result.get('title') and not result.get('cite_format'):
372
313
  result['cite_format'] = f"{result['title']}"
373
314
 
374
- logger.info(f"成功提取引用信息")
375
315
  return result
376
316
  else:
377
- logger.warning("无法提取引用信息")
378
317
  return None
379
318
 
380
- except Exception as e:
381
- logger.error(f"提取引用信息时出错: {str(e)}")
319
+ except Exception:
382
320
  return None
383
321
 
384
322
  @staticmethod
@@ -412,13 +350,9 @@ class Citzer:
412
350
  abstract = re.sub(r'\s+', ' ', abstract).strip()
413
351
  result["abstract"] = abstract
414
352
 
415
- logger.info(f"Citation data processed, title: {result['title']}")
416
- print(f"Citzer: processed citation, title: {result.get('title', '')[:30]}...")
417
353
  return result
418
354
 
419
- except Exception as e:
420
- logger.error(f"Error processing citation data: {str(e)}")
421
- print(f"Citzer error: {str(e)}")
355
+ except Exception:
422
356
  # Return raw data
423
357
  return {
424
358
  "title": citation_data.get("title", ""),
@@ -437,8 +371,8 @@ class Citzer:
437
371
  match = re.search(r'[\.。]\s*([^\.。]+)[\.。]', cite_format)
438
372
  if match:
439
373
  return match.group(1).strip()
440
- except Exception as e:
441
- logger.error(f"Error extracting title from citation format: {str(e)}")
374
+ except Exception:
375
+ pass
442
376
 
443
377
  return ""
444
378
 
@@ -454,8 +388,8 @@ class Citzer:
454
388
  if match:
455
389
  authors_str = match.group(1)
456
390
  return [a.strip() for a in re.split(r'[,,、]', authors_str) if a.strip()]
457
- except Exception as e:
458
- logger.error(f"Error extracting authors from citation format: {str(e)}")
391
+ except Exception:
392
+ pass
459
393
 
460
394
  return []
461
395
 
@@ -541,22 +475,13 @@ class Citzer:
541
475
  """
542
476
  abstract = await page.evaluate(abstract_js)
543
477
  if abstract:
544
- logger.info("从页面直接提取到摘要信息")
545
478
  citation_data["abstract"] = abstract
546
- except Exception as e:
547
- logger.error(f"直接提取页面信息时出错: {str(e)}")
548
-
549
- # 创建调试截图目录
550
- debug_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug_screenshots")
551
- os.makedirs(debug_dir, exist_ok=True)
479
+ except Exception:
480
+ pass
552
481
 
553
- # 获取当前页面截图用于调试
554
- screenshot_path = os.path.join(debug_dir, f"pre_citation_click_{int(time.time())}.png")
555
- await page.screenshot(path=screenshot_path)
556
- logger.info(f"已保存点击前截图: {screenshot_path}")
482
+ # 不创建调试截图
557
483
 
558
484
  # 使用JavaScript定位并点击引用按钮
559
- logger.info("使用JavaScript查找并点击引用按钮")
560
485
  js_button_finder = """
561
486
  () => {
562
487
  // 根据已知的HTML结构首先尝试精确定位
@@ -679,36 +604,27 @@ class Citzer:
679
604
 
680
605
  button_info = await page.evaluate(js_button_finder)
681
606
  if button_info and button_info.get('found'):
682
- logger.info(f"通过JavaScript找到引用按钮: {button_info.get('text')} ({button_info.get('tag')}),方法: {button_info.get('method', 'unknown')}")
683
-
684
607
  # 使用鼠标点击坐标
685
608
  x, y = button_info.get('x'), button_info.get('y')
686
609
  await page.mouse.click(x, y)
687
- logger.info(f"已点击坐标 ({x}, {y}) 处的引用按钮")
688
610
 
689
611
  # 等待引用对话框出现
690
612
  await page.wait_for_timeout(2000)
691
613
 
692
- # 获取点击后截图
693
- screenshot_path = os.path.join(debug_dir, f"post_js_click_{int(time.time())}.png")
694
- await page.screenshot(path=screenshot_path)
695
- logger.info(f"已保存JavaScript点击后截图: {screenshot_path}")
614
+ # 不获取点击后截图
696
615
 
697
616
  # 提取引用信息
698
617
  cite_result = await self._extract_citation_text(page)
699
618
  if cite_result:
700
619
  cite_result.update(citation_data)
701
620
  return cite_result
702
- else:
703
- logger.info("未能使用JavaScript找到引用按钮")
704
621
 
705
622
  # 如果点击按钮都失败了,但我们已经提取到了其他信息
706
623
  if citation_data:
707
624
  return citation_data
708
625
 
709
- except Exception as e:
710
- logger.error(f"提取引用信息时出错: {str(e)}")
711
- logger.error(traceback.format_exc())
626
+ except Exception:
627
+ pass
712
628
 
713
629
  return {}
714
630
 
@@ -767,19 +683,15 @@ class Citzer:
767
683
  ''')
768
684
 
769
685
  if cite_result:
770
- logger.info("成功提取引用文本")
771
686
  return {
772
687
  "title": "",
773
688
  "cite_format": cite_result,
774
689
  "abstract": ""
775
690
  }
776
691
  else:
777
- logger.warning("无法提取引用文本")
778
692
  return {}
779
693
 
780
- except Exception as e:
781
- logger.error(f"提取引用文本时出错: {str(e)}")
782
- logger.error(traceback.format_exc())
694
+ except Exception:
783
695
  return {}
784
696
 
785
697
  async def _extract_abstract(self, page) -> str:
@@ -854,15 +766,12 @@ class Citzer:
854
766
 
855
767
  abstract = await page.evaluate(abstract_js)
856
768
  if abstract:
857
- logger.info("成功从页面提取摘要")
858
769
  # 清理摘要文本
859
770
  abstract = abstract.replace("摘要:", "").replace("摘要:", "").replace("Abstract:", "").strip()
860
771
  return abstract
861
772
  else:
862
- logger.info("未能找到页面摘要内容")
863
773
  return ""
864
774
 
865
- except Exception as e:
866
- logger.error(f"提取摘要时出错: {str(e)}")
775
+ except Exception:
867
776
  return ""
868
777