PyPI - cnks - Versions diffs - 0.3.1__tar.gz → 0.3.2__tar.gz - Mend

cnks 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{cnks-0.3.1 → cnks-0.3.2}/PKG-INFO +1 -1
{cnks-0.3.1 → cnks-0.3.2}/pyproject.toml +1 -1
{cnks-0.3.1 → cnks-0.3.2}/src/citzer.py +45 -136
{cnks-0.3.1 → cnks-0.3.2}/src/searcher.py +42 -155
{cnks-0.3.1 → cnks-0.3.2}/src/server.py +20 -170
{cnks-0.3.1 → cnks-0.3.2}/src/worker.py +29 -65
cnks-0.3.1/cache.json +0 -37
cnks-0.3.1/cnks_cache.log +0 -4
cnks-0.3.1/cnks_citzer.log +0 -4
cnks-0.3.1/cnks_main.log +0 -28
cnks-0.3.1/cnks_searcher.log +0 -4
cnks-0.3.1/cnks_server.log +0 -6
cnks-0.3.1/cnks_worker.log +0 -4
cnks-0.3.1/uv.lock +0 -432
cnks-0.3.1/新建文本文档.txt +0 -30
{cnks-0.3.1 → cnks-0.3.2}/.env +0 -0
{cnks-0.3.1 → cnks-0.3.2}/README.md +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/ThisIsAServerSample.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/__init__.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/cache.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/click50.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/client.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/cssci.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/extractlink.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/ifverify.py +0 -0
{cnks-0.3.1 → cnks-0.3.2}/src/main.py +0 -0

{cnks-0.3.1 → cnks-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cnks
-Version: 0.3.1
+Version: 0.3.2
 Summary: 中国知网搜索与引文处理系统
 Author-email: bai-z-l <b@iziliang.com>
 Requires-Python: >=3.12

{cnks-0.3.1 → cnks-0.3.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "cnks"
-version = "0.3.1"
+version = "0.3.2"
 description = "中国知网搜索与引文处理系统"
 readme = "README.md"
 requires-python = ">=3.12"

{cnks-0.3.1 → cnks-0.3.2}/src/citzer.py RENAMED Viewed

@@ -13,7 +13,6 @@
 3. 处理验证页面和特殊情况
 4. 返回结构化引文数据
 """
-import logging
 import os
 import platform
 import traceback
@@ -21,45 +20,36 @@ import time
 import re
 from typing import Dict, List, Any, Optional, Union
-# 配置日志记录
-try:
-    # 尝试使用绝对路径
-    log_dir = os.path.dirname(os.path.abspath(__file__))
-    log_file = os.path.join(os.path.dirname(log_dir), "cnks_citzer.log")
+# 禁用日志记录
+class DummyLogger:
+    """空日志记录器，用于禁用日志输出"""
+    def __init__(self, *args, **kwargs):
+        pass
+    def debug(self, *args, **kwargs):
+        pass
-    # 创建处理器
-    file_handler = logging.FileHandler(log_file, mode="a")
-    console_handler = logging.StreamHandler()
+    def info(self, *args, **kwargs):
+        pass
-    # 设置格式
-    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-    file_handler.setFormatter(formatter)
-    console_handler.setFormatter(formatter)
+    def warning(self, *args, **kwargs):
+        pass
-    # 获取日志记录器并添加处理器
-    logger = logging.getLogger("cnks.citzer")
-    logger.setLevel(logging.DEBUG)
+    def error(self, *args, **kwargs):
+        pass
-    # 移除现有处理器以避免重复
-    if logger.handlers:
-        for handler in logger.handlers:
-            logger.removeHandler(handler)
+    def critical(self, *args, **kwargs):
+        pass
-    logger.addHandler(file_handler)
-    logger.addHandler(console_handler)
+    def addHandler(self, *args, **kwargs):
+        pass
-    # 打印确认信息
-    print(f"Citzer logger initialized, logging to: {log_file}")
-    logger.info(f"Citzer logging to: {log_file}")
-except Exception as e:
-    # 回退到基本控制台日志记录
-    logging.basicConfig(
-        level=logging.DEBUG,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    )
-    logger = logging.getLogger("cnks.citzer")
-    logger.error(f"Failed to set up file logging: {str(e)}")
-    print(f"Error setting up citzer file logging: {str(e)}")
+    def setLevel(self, *args, **kwargs):
+        pass
+# 使用空日志记录器
+logger = DummyLogger()
+print = lambda *args, **kwargs: None  # 禁用print函数
 # 导入必要的模块
 try:
@@ -67,7 +57,6 @@ try:
     PLAYWRIGHT_AVAILABLE = True
 except ImportError:
     PLAYWRIGHT_AVAILABLE = False
-    logger.error("Playwright not available. Install with: pip install playwright")
 # 尝试导入其他模块
 try:
@@ -76,7 +65,7 @@ except ImportError:
     try:
         from ifverify import check_verification_needed, handle_verification
     except ImportError:
-        logger.warning("无法导入验证处理模块，部分功能可能不可用")
+        pass
 class Citzer:
     """引文分析器类，负责浏览器管理和引文内容提取"""
@@ -88,11 +77,8 @@ class Citzer:
         self.context = None
         self.browser_started = False
-        # 创建调试截图目录
-        self.debug_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug_screenshots")
-        os.makedirs(self.debug_dir, exist_ok=True)
-        logger.info("Citzer初始化完成")
+        # 不创建调试截图目录
+        self.debug_dir = "/dev/null" if platform.system() != "Windows" else "NUL"
     async def start_browser(self) -> bool:
         """
@@ -102,24 +88,18 @@ class Citzer:
             bool: 浏览器是否成功启动
         """
         if self.browser_started:
-            logger.info("浏览器已启动，重用现有实例")
             return True
         if not PLAYWRIGHT_AVAILABLE:
-            logger.error("Playwright未安装，无法启动浏览器")
             return False
         try:
-            logger.info("使用持久上下文启动浏览器")
             # 创建Playwright实例
             self.playwright = await async_playwright().start()
-            logger.info("Playwright已启动")
             # 创建用户数据目录（如果不存在）
             user_data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chrome_data")
             os.makedirs(user_data_dir, exist_ok=True)
-            logger.info(f"使用Chrome用户数据目录: {user_data_dir}")
             # 设置Chrome参数
             browser_args = [
@@ -129,8 +109,6 @@ class Citzer:
             # 查找Chrome可执行文件
             chrome_path = self._find_chrome_executable()
-            if chrome_path:
-                logger.info(f"使用Chrome路径: {chrome_path}")
             # 使用持久上下文启动浏览器
             self.context = await self.playwright.chromium.launch_persistent_context(
@@ -139,22 +117,17 @@ class Citzer:
                 headless=False,
                 args=browser_args
             )
-            logger.info("使用持久上下文启动浏览器成功")
             # 创建一个初始页面确保上下文已激活
             init_page = await self.context.new_page()
             await init_page.goto("about:blank")
             await init_page.close()
-            logger.info("使用空白页初始化浏览器")
             # 标记浏览器已启动
             self.browser_started = True
             return True
         except Exception as e:
-            logger.error(f"启动浏览器时出错: {str(e)}")
-            logger.error(traceback.format_exc())
             # 清理资源
             await self.close_browser()
             return False
@@ -166,29 +139,22 @@ class Citzer:
         Returns:
             bool: 是否成功关闭
         """
-        logger.info("关闭浏览器资源")
         try:
             # 关闭浏览器上下文
             if self.context:
                 await self.context.close()
                 self.context = None
-                logger.info("浏览器上下文已关闭")
             # 停止Playwright
             if self.playwright:
                 await self.playwright.stop()
                 self.playwright = None
-                logger.info("Playwright已停止")
             # 重置浏览器状态
             self.browser_started = False
-            logger.info("浏览器资源已成功关闭")
             return True
-        except Exception as e:
-            logger.error(f"关闭浏览器时出错: {str(e)}")
-            logger.error(traceback.format_exc())
+        except Exception:
             self.browser_started = False
             return False
@@ -225,10 +191,8 @@ class Citzer:
         for path in candidates:
             expanded_path = os.path.expanduser(path)
             if os.path.exists(expanded_path) and os.access(expanded_path, os.X_OK):
-                logger.info(f"找到Chrome路径: {expanded_path}")
                 return expanded_path
-        logger.warning("未找到Chrome可执行文件")
         return None
     async def open_page(self, url: str) -> Optional[Page]:
@@ -245,13 +209,11 @@ class Citzer:
         if self.context is None and not self.browser_started:
             success = await self.start_browser()
             if not success:
-                logger.error("无法启动浏览器，放弃打开页面")
                 return None
         try:
             # 创建新标签页
             page = await self.context.new_page()
-            logger.info(f"已创建新标签页，正在导航到URL: {url}")
             # 导航到指定URL
             await page.goto(url, wait_until="domcontentloaded", timeout=30000)
@@ -259,19 +221,14 @@ class Citzer:
             # 检查是否需要验证
             verification_needed = await check_verification_needed(page)
             if verification_needed:
-                logger.info("检测到需要人工验证，等待10秒钟...")
                 await page.wait_for_timeout(10000)  # 等待10秒
-            else:
-                logger.info("未检测到验证页面，继续执行")
             # 等待页面完全加载
             await page.wait_for_load_state("networkidle", timeout=30000)
             return page
-        except Exception as e:
-            logger.error(f"打开页面时出错: {str(e)}")
-            logger.error(traceback.format_exc())
+        except Exception:
             return None
     async def process_link(self, link: str) -> Dict[str, Any]:
@@ -286,31 +243,22 @@ class Citzer:
         """
         page = None
         try:
-            logger.info(f"处理链接: {link}")
             # 检查是否已经有浏览器上下文(可能是共享的)
             if self.context is None and not self.browser_started:
-                logger.info("尝试启动浏览器")
                 await self.start_browser()
             # 打开链接
             page = await self.open_page(link)
             if not page:
-                logger.error(f"无法打开链接: {link}")
                 return {}
-            # 获取页面截图用于调试
-            screenshot_path = os.path.join(self.debug_dir, f"link_page_{int(time.time())}.png")
-            await page.screenshot(path=screenshot_path)
-            logger.info(f"已保存页面截图: {screenshot_path}")
+            # 不进行截图操作
             # 提取摘要
             abstract = await self._extract_abstract(page)
-            logger.info(f"提取的摘要: {abstract[:100]}..." if abstract else "未提取到摘要")
             # 提取引用信息
             citation = await self.extract_citation_from_button(page)
-            logger.info(f"提取的引用信息: {citation}" if citation else "未提取到引用信息")
             # 如果提取失败，尝试从页面标题获取基本信息
             if not citation:
@@ -319,7 +267,6 @@ class Citzer:
                     "title": title,
                     "raw_extracted": True
                 }
-                logger.info(f"使用页面标题作为基本信息: {title}")
             # 组合结果
             result = {
@@ -333,24 +280,18 @@ class Citzer:
             if not result["title"] and result["cite_format"]:
                 result["title"] = self.extract_title_from_cite(result["cite_format"])
-            logger.info(f"成功处理链接: {link}")
             return result
-        except Exception as e:
-            logger.error(f"处理链接 {link} 时出错: {str(e)}")
-            logger.error(traceback.format_exc())
+        except Exception:
             return {}
         finally:
             if page:
                 await page.close()
-                logger.info(f"已关闭链接页面: {link}")
     async def extract_from_html(self, page):
         """从HTML页面提取引用信息"""
         try:
-            logger.info(f"开始从页面提取引用信息")
             # 等待页面加载完成
             await page.wait_for_load_state("networkidle")
@@ -371,14 +312,11 @@ class Citzer:
                 if result.get('title') and not result.get('cite_format'):
                     result['cite_format'] = f"{result['title']}"
-                logger.info(f"成功提取引用信息")
                 return result
             else:
-                logger.warning("无法提取引用信息")
                 return None
-        except Exception as e:
-            logger.error(f"提取引用信息时出错: {str(e)}")
+        except Exception:
             return None
     @staticmethod
@@ -412,13 +350,9 @@ class Citzer:
                 abstract = re.sub(r'\s+', ' ', abstract).strip()
                 result["abstract"] = abstract
-            logger.info(f"Citation data processed, title: {result['title']}")
-            print(f"Citzer: processed citation, title: {result.get('title', '')[:30]}...")
             return result
-        except Exception as e:
-            logger.error(f"Error processing citation data: {str(e)}")
-            print(f"Citzer error: {str(e)}")
+        except Exception:
             # Return raw data
             return {
                 "title": citation_data.get("title", ""),
@@ -437,8 +371,8 @@ class Citzer:
             match = re.search(r'[\.。]\s*([^\.。]+)[\.。]', cite_format)
             if match:
                 return match.group(1).strip()
-        except Exception as e:
-            logger.error(f"Error extracting title from citation format: {str(e)}")
+        except Exception:
+            pass
         return ""
@@ -454,8 +388,8 @@ class Citzer:
             if match:
                 authors_str = match.group(1)
                 return [a.strip() for a in re.split(r'[,，、]', authors_str) if a.strip()]
-        except Exception as e:
-            logger.error(f"Error extracting authors from citation format: {str(e)}")
+        except Exception:
+            pass
         return []
@@ -541,22 +475,13 @@ class Citzer:
                 """
                 abstract = await page.evaluate(abstract_js)
                 if abstract:
-                    logger.info("从页面直接提取到摘要信息")
                     citation_data["abstract"] = abstract
-            except Exception as e:
-                logger.error(f"直接提取页面信息时出错: {str(e)}")
-            # 创建调试截图目录
-            debug_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug_screenshots")
-            os.makedirs(debug_dir, exist_ok=True)
+            except Exception:
+                pass
-            # 获取当前页面截图用于调试
-            screenshot_path = os.path.join(debug_dir, f"pre_citation_click_{int(time.time())}.png")
-            await page.screenshot(path=screenshot_path)
-            logger.info(f"已保存点击前截图: {screenshot_path}")
+            # 不创建调试截图
             # 使用JavaScript定位并点击引用按钮
-            logger.info("使用JavaScript查找并点击引用按钮")
             js_button_finder = """
             () => {
                 // 根据已知的HTML结构首先尝试精确定位
@@ -679,36 +604,27 @@ class Citzer:
             button_info = await page.evaluate(js_button_finder)
             if button_info and button_info.get('found'):
-                logger.info(f"通过JavaScript找到引用按钮: {button_info.get('text')} ({button_info.get('tag')})，方法: {button_info.get('method', 'unknown')}")
                 # 使用鼠标点击坐标
                 x, y = button_info.get('x'), button_info.get('y')
                 await page.mouse.click(x, y)
-                logger.info(f"已点击坐标 ({x}, {y}) 处的引用按钮")
                 # 等待引用对话框出现
                 await page.wait_for_timeout(2000)
-                # 获取点击后截图
-                screenshot_path = os.path.join(debug_dir, f"post_js_click_{int(time.time())}.png")
-                await page.screenshot(path=screenshot_path)
-                logger.info(f"已保存JavaScript点击后截图: {screenshot_path}")
+                # 不获取点击后截图
                 # 提取引用信息
                 cite_result = await self._extract_citation_text(page)
                 if cite_result:
                     cite_result.update(citation_data)
                     return cite_result
-            else:
-                logger.info("未能使用JavaScript找到引用按钮")
             # 如果点击按钮都失败了，但我们已经提取到了其他信息
             if citation_data:
                 return citation_data
-        except Exception as e:
-            logger.error(f"提取引用信息时出错: {str(e)}")
-            logger.error(traceback.format_exc())
+        except Exception:
+            pass
         return {}
@@ -767,19 +683,15 @@ class Citzer:
             ''')
             if cite_result:
-                logger.info("成功提取引用文本")
                 return {
                     "title": "",
                     "cite_format": cite_result,
                     "abstract": ""
                 }
             else:
-                logger.warning("无法提取引用文本")
                 return {}
-        except Exception as e:
-            logger.error(f"提取引用文本时出错: {str(e)}")
-            logger.error(traceback.format_exc())
+        except Exception:
             return {}
     async def _extract_abstract(self, page) -> str:
@@ -854,15 +766,12 @@ class Citzer:
             abstract = await page.evaluate(abstract_js)
             if abstract:
-                logger.info("成功从页面提取摘要")
                 # 清理摘要文本
                 abstract = abstract.replace("摘要:", "").replace("摘要：", "").replace("Abstract:", "").strip()
                 return abstract
             else:
-                logger.info("未能找到页面摘要内容")
                 return ""
-        except Exception as e:
-            logger.error(f"提取摘要时出错: {str(e)}")
+        except Exception:
             return ""

cnks 0.3.1__tar.gz → 0.3.2__tar.gz

cnks 0.3.1__tar.gz → 0.3.2__tar.gz