PyPI - arxiv-pulse - Versions diffs - 0.5.0__py3-none-any.whl - Mend

arxiv-pulse 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

arxiv_pulse/.ENV.TEMPLATE +72 -0
arxiv_pulse/__init__.py +26 -0
arxiv_pulse/__version__.py +33 -0
arxiv_pulse/arxiv_crawler.py +377 -0
arxiv_pulse/cli.py +1608 -0
arxiv_pulse/config.py +64 -0
arxiv_pulse/models.py +255 -0
arxiv_pulse/output_manager.py +235 -0
arxiv_pulse/report_generator.py +768 -0
arxiv_pulse/search_engine.py +367 -0
arxiv_pulse/summarizer.py +356 -0
arxiv_pulse-0.5.0.dist-info/METADATA +546 -0
arxiv_pulse-0.5.0.dist-info/RECORD +17 -0
arxiv_pulse-0.5.0.dist-info/WHEEL +5 -0
arxiv_pulse-0.5.0.dist-info/entry_points.txt +2 -0
arxiv_pulse-0.5.0.dist-info/licenses/LICENSE +674 -0
arxiv_pulse-0.5.0.dist-info/top_level.txt +1 -0

arxiv_pulse/config.py ADDED Viewed

@@ -0,0 +1,64 @@
+import os
+import warnings
+class Config:
+    """Settings for arxiv-pulse, read from environment variables.
+
+    Every attribute is computed once, when this class body executes;
+    later changes to ``os.environ`` are not picked up.
+    """
+    # Database
+    DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///data/arxiv_papers.db")
+    # Crawler
+    MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", 100))
+    MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", 20))
+    # Search queries - use semicolon as separator to allow commas in queries
+    SEARCH_QUERIES_RAW = os.getenv(
+        "SEARCH_QUERIES",
+        "condensed matter physics; density functional theory; machine learning; force fields; first principles calculation; molecular dynamics; quantum chemistry; computational materials science",
+    )
+    # Blank entries (e.g. from a trailing semicolon) are dropped.
+    SEARCH_QUERIES = [q.strip() for q in SEARCH_QUERIES_RAW.split(";") if q.strip()]
+    # AI API (OpenAI-compatible format, e.g. DeepSeek, Paratera AI)
+    # Configured through the AI_* environment variables
+    AI_API_KEY = os.getenv("AI_API_KEY")  # may be None
+    AI_MODEL = os.getenv("AI_MODEL", "DeepSeek-V3.2-Thinking")
+    AI_BASE_URL = os.getenv("AI_BASE_URL", "https://llmapi.paratera.com")
+    # Model settings: SUMMARY_MODEL now reuses AI_MODEL
+    SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", 2000))
+    # Report generation settings
+    SUMMARY_SENTENCES_LIMIT = int(os.getenv("SUMMARY_SENTENCES_LIMIT", 3))
+    TOKEN_PRICE_PER_MILLION = float(os.getenv("TOKEN_PRICE_PER_MILLION", 3.0))
+    # Paths
+    REPORT_DIR = os.getenv("REPORT_DIR", "reports")
+    # Directory part of the SQLite file path. This is "" when the path has no
+    # directory component (e.g. "sqlite:///papers.db").
+    # NOTE(review): for a non-SQLite DATABASE_URL this is just the dirname of
+    # the raw URL string - confirm whether other backends are supported.
+    DATA_DIR = os.path.dirname(DATABASE_URL.replace("sqlite:///", ""))
+    # Report generation limits
+    REPORT_MAX_PAPERS = int(os.getenv("REPORT_MAX_PAPERS", "50"))
+    # ArXiv API
+    ARXIV_MAX_RESULTS = 1000
+    ARXIV_SORT_BY = "submittedDate"
+    ARXIV_SORT_ORDER = "descending"
+    # Sync configuration
+    YEARS_BACK = int(os.getenv("YEARS_BACK", 3))  # Years to look back for initial sync
+    IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "important_papers.txt")
+    @classmethod
+    def validate(cls):
+        """Report the AI key status and make sure the output directories exist.
+
+        Prints whether ``AI_API_KEY`` is set, then creates ``REPORT_DIR`` and
+        ``DATA_DIR`` if they are missing.
+
+        Returns:
+            True, always; a missing API key only produces a printed warning.
+        """
+        if cls.AI_API_KEY:
+            print(f"信息: 找到 AI API 密钥 (AI_API_KEY)。AI 总结和翻译功能已启用 (模型: {cls.AI_MODEL})。")
+        else:
+            print("警告: 未设置 AI_API_KEY。AI 总结和翻译功能将受限。")
+            print("      请设置 AI_API_KEY 环境变量以启用 AI 功能。")
+        # Ensure directories exist. os.makedirs("") raises FileNotFoundError,
+        # so an empty path (meaning "current directory") is skipped.
+        for directory in (cls.REPORT_DIR, cls.DATA_DIR):
+            if directory:
+                os.makedirs(directory, exist_ok=True)
+        return True

arxiv_pulse/models.py ADDED Viewed

@@ -0,0 +1,255 @@
+from sqlalchemy import (
+    create_engine,
+    Column,
+    Integer,
+    String,
+    Text,
+    DateTime,
+    Boolean,
+    Float,
+    JSON,
+)
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+from datetime import datetime, timedelta
+import json
+from typing import Optional
+from arxiv_pulse.config import Config
+# Declarative base class that the ORM models in this module inherit from.
+Base = declarative_base()
+class Paper(Base):
+    """ORM model for one arXiv paper, stored in the ``papers`` table."""
+    __tablename__ = "papers"
+    id = Column(Integer, primary_key=True)
+    arxiv_id = Column(String(50), unique=True, nullable=False, index=True)
+    title = Column(String(500), nullable=False)
+    authors = Column(Text)  # JSON string of authors list
+    abstract = Column(Text)
+    categories = Column(String(500))  # category names joined with ", "
+    primary_category = Column(String(100))
+    published = Column(DateTime, nullable=False)
+    updated = Column(DateTime)
+    pdf_url = Column(String(500))
+    doi = Column(String(200))
+    journal_ref = Column(String(500))
+    comment = Column(Text)
+    # Search relevance
+    search_query = Column(String(200))
+    relevance_score = Column(Float, default=0.0)
+    keywords = Column(Text)  # JSON string of extracted keywords
+    # Processing status
+    downloaded = Column(Boolean, default=False)
+    summarized = Column(Boolean, default=False)
+    summary = Column(Text)
+    # Metadata
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+    def to_dict(self):
+        """Return the paper as a plain dictionary.
+
+        ``authors`` and ``keywords`` are decoded from their JSON text columns
+        (empty list when unset); datetime columns are rendered with
+        ``isoformat()`` (``None`` when unset).
+        """
+        return {
+            "id": self.id,
+            "arxiv_id": self.arxiv_id,
+            "title": self.title,
+            "authors": json.loads(self.authors) if self.authors else [],
+            "abstract": self.abstract,
+            "categories": self.categories,
+            "primary_category": self.primary_category,
+            "published": self.published.isoformat() if self.published else None,
+            "updated": self.updated.isoformat() if self.updated else None,
+            "pdf_url": self.pdf_url,
+            "doi": self.doi,
+            "journal_ref": self.journal_ref,
+            "comment": self.comment,
+            "search_query": self.search_query,
+            "relevance_score": self.relevance_score,
+            "keywords": json.loads(self.keywords) if self.keywords else [],
+            "downloaded": self.downloaded,
+            "summarized": self.summarized,
+            "summary": self.summary,
+            "created_at": self.created_at.isoformat() if self.created_at else None,
+            "updated_at": self.updated_at.isoformat() if self.updated_at else None,
+        }
+    @classmethod
+    def from_arxiv_entry(cls, entry, search_query):
+        """Create a Paper instance from an arXiv entry.
+
+        Args:
+            entry: Result object exposing at least ``entry_id``, ``title``,
+                ``summary``, ``authors`` and ``published``; the remaining
+                attributes are optional and read with a default.
+            search_query: The query that produced this entry; stored as-is.
+
+        Returns:
+            A new, unsaved ``Paper`` with ``relevance_score`` set to 0.0.
+        """
+        # The arXiv id is the last path segment of the entry URL.
+        short_id = entry.entry_id.split("/")[-1]
+        authors = [{"name": author.name, "affiliation": getattr(author, "affiliation", "")} for author in entry.authors]
+        primary_category = getattr(entry, "primary_category", "")
+        # Fall back to the (guarded) primary category so an entry lacking both
+        # attributes yields "" instead of raising AttributeError.
+        if hasattr(entry, "categories"):
+            categories = ", ".join(entry.categories)
+        else:
+            categories = primary_category
+        # Build the PDF link ourselves when the attribute is missing or empty.
+        pdf_url = getattr(entry, "pdf_url", None) or f"https://arxiv.org/pdf/{short_id}.pdf"
+        return cls(
+            arxiv_id=short_id,
+            title=entry.title,
+            authors=json.dumps(authors),
+            abstract=entry.summary,
+            categories=categories,
+            primary_category=primary_category,
+            published=entry.published,
+            updated=getattr(entry, "updated", None),
+            pdf_url=pdf_url,
+            doi=getattr(entry, "doi", None),
+            journal_ref=getattr(entry, "journal_ref", None),
+            comment=getattr(entry, "comment", None),
+            search_query=search_query,
+            relevance_score=0.0,
+        )
+class TranslationCache(Base):
+    """Cached translation results, kept to avoid repeated API calls."""
+    __tablename__ = "translation_cache"
+    id = Column(Integer, primary_key=True)
+    source_text = Column(Text, nullable=False)
+    # Unique lookup key for a cached translation.
+    source_text_hash = Column(String(64), nullable=False, unique=True, index=True)
+    translated_text = Column(Text, nullable=False)
+    target_language = Column(String(10), default="zh")
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+    def __repr__(self):
+        """Return a short debug representation showing the id and the first 16 hash characters."""
+        # source_text_hash is None on an instance created without it; slicing
+        # None would make repr() itself raise TypeError.
+        hash_prefix = (self.source_text_hash or "")[:16]
+        return f"<TranslationCache(id={self.id}, hash={hash_prefix}...)>"
+class Database:
+    """Data-access helper around the SQLAlchemy engine built from ``Config.DATABASE_URL``."""
+    def __init__(self):
+        """Create the engine and session factory, then create any missing tables."""
+        self.engine = create_engine(Config.DATABASE_URL)
+        self.Session = sessionmaker(bind=self.engine)
+        Base.metadata.create_all(self.engine)
+    def get_session(self):
+        """Return a new session from the session factory."""
+        return self.Session()
+    @staticmethod
+    def _translation_hash(source_text, target_language):
+        """Return the SHA-256 hex digest used as the translation cache key.
+
+        The target language is part of the hashed text, so the same source
+        text translated into two languages gets two distinct keys.
+        """
+        import hashlib
+        cache_key = f"{source_text}:{target_language}"
+        return hashlib.sha256(cache_key.encode("utf-8")).hexdigest()
+    def paper_exists(self, arxiv_id):
+        """Return True when a paper with this ``arxiv_id`` is already stored."""
+        with self.get_session() as session:
+            return session.query(Paper).filter_by(arxiv_id=arxiv_id).first() is not None
+    def add_paper(self, paper):
+        """Insert ``paper``, commit, and return its primary key."""
+        with self.get_session() as session:
+            session.add(paper)
+            session.commit()
+            return paper.id
+    def update_paper(self, arxiv_id, **kwargs):
+        """Set the given attributes on the paper identified by ``arxiv_id``.
+
+        Returns:
+            True when the paper was found and updated, False when no paper
+            has that ``arxiv_id``.
+        """
+        with self.get_session() as session:
+            paper = session.query(Paper).filter_by(arxiv_id=arxiv_id).first()
+            if paper is None:
+                return False
+            for key, value in kwargs.items():
+                setattr(paper, key, value)
+            paper.updated_at = datetime.utcnow()
+            session.commit()
+            return True
+    def get_recent_papers(self, days=7, limit=100):
+        """Return up to ``limit`` papers published within the last ``days`` days, newest first."""
+        with self.get_session() as session:
+            cutoff_date = datetime.utcnow() - timedelta(days=days)
+            return (
+                session.query(Paper)
+                .filter(Paper.published >= cutoff_date)
+                .order_by(Paper.published.desc())
+                .limit(limit)
+                .all()
+            )
+    def get_papers_by_category(self, category, limit=50):
+        """Return up to ``limit`` papers whose categories or primary category contain ``category``, newest first."""
+        with self.get_session() as session:
+            return (
+                session.query(Paper)
+                .filter(Paper.categories.contains(category) | Paper.primary_category.contains(category))
+                .order_by(Paper.published.desc())
+                .limit(limit)
+                .all()
+            )
+    def get_papers_to_summarize(self, limit=20):
+        """Return up to ``limit`` not-yet-summarized papers that have an abstract, newest first."""
+        with self.get_session() as session:
+            return (
+                session.query(Paper)
+                # "== False" is deliberate: SQLAlchemy overloads == to build the SQL comparison.
+                .filter(Paper.summarized == False, Paper.abstract.isnot(None))  # noqa: E712
+                .order_by(Paper.published.desc())
+                .limit(limit)
+                .all()
+            )
+    def get_statistics(self):
+        """Return paper counts and a per-category tally.
+
+        Returns:
+            A dict with ``total_papers``, ``summarized_papers`` and
+            ``categories_distribution`` (category name -> number of papers).
+        """
+        with self.get_session() as session:
+            total = session.query(Paper).count()
+            summarized = session.query(Paper).filter_by(summarized=True).count()
+            categories = {}
+            # Only the categories column is needed, so select just that column
+            # rather than loading every full Paper row.
+            for (raw_categories,) in session.query(Paper.categories).all():
+                # The column is nullable; rows without categories are skipped
+                # instead of raising AttributeError on None.split().
+                if not raw_categories:
+                    continue
+                for cat in raw_categories.split(", "):
+                    categories[cat] = categories.get(cat, 0) + 1
+            return {
+                "total_papers": total,
+                "summarized_papers": summarized,
+                "categories_distribution": categories,
+            }
+    def get_translation_cache(self, source_text: str, target_language: str = "zh") -> Optional[str]:
+        """Return the cached translation of ``source_text``, or None when nothing is cached."""
+        text_hash = self._translation_hash(source_text, target_language)
+        with self.get_session() as session:
+            cache_entry = session.query(TranslationCache).filter_by(source_text_hash=text_hash).first()
+            if cache_entry:
+                return cache_entry.translated_text
+            return None
+    def set_translation_cache(self, source_text: str, translated_text: str, target_language: str = "zh") -> None:
+        """Store a translation, overwriting any entry already cached under the same key."""
+        text_hash = self._translation_hash(source_text, target_language)
+        with self.get_session() as session:
+            existing = session.query(TranslationCache).filter_by(source_text_hash=text_hash).first()
+            if existing:
+                # Refresh the existing entry in place.
+                existing.translated_text = translated_text
+                existing.updated_at = datetime.utcnow()
+            else:
+                # First translation seen for this key: create a new entry.
+                cache_entry = TranslationCache(
+                    source_text=source_text,
+                    source_text_hash=text_hash,
+                    translated_text=translated_text,
+                    target_language=target_language,
+                )
+                session.add(cache_entry)
+            session.commit()
+    def clear_old_translation_cache(self, days_old: int = 30) -> int:
+        """Delete cache entries last updated more than ``days_old`` days ago and return how many were removed."""
+        with self.get_session() as session:
+            cutoff_date = datetime.utcnow() - timedelta(days=days_old)
+            deleted_count = session.query(TranslationCache).filter(TranslationCache.updated_at < cutoff_date).delete()
+            session.commit()
+            return deleted_count

arxiv_pulse/output_manager.py ADDED Viewed

@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+"""
+统一输出管理器 - 提供优雅的控制台输出体验
+提供以下标签级别的输出：
+[do]      - 正在执行的操作
+[done]    - 操作完成
+[tips]    - 提示信息
+[info]    - 一般信息
+[warn]    - 警告信息
+[error]   - 错误信息（简洁）
+[debug]   - 调试信息（默认不显示）
+所有详细日志同时写入日志文件，控制台只显示简洁信息。
+"""
+import sys
+import os
+import time
+from typing import Optional, Dict, Any
+import logging
+from enum import Enum
+class OutputLevel(Enum):
+    """Console output levels; each value is the key used to look up the level's colour."""
+    DO = "do"
+    DONE = "done"
+    TIPS = "tips"
+    INFO = "info"
+    WARN = "warn"
+    ERROR = "error"
+    DEBUG = "debug"
+class OutputManager:
+    """Singleton that prints short tagged messages to the console and mirrors every message to a log file.
+
+    All public entry points are classmethods, so callers never hold an
+    instance themselves; ``cls()`` always returns the single shared one.
+    """
+    # ANSI colour escape codes, keyed by OutputLevel value (plus "reset").
+    COLORS = {
+        "do": "\033[94m",  # blue
+        "done": "\033[92m",  # green
+        "tips": "\033[96m",  # cyan
+        "info": "\033[97m",  # white
+        "warn": "\033[93m",  # yellow
+        "error": "\033[91m",  # red
+        "debug": "\033[90m",  # grey
+        "reset": "\033[0m",  # reset
+    }
+    # Tag printed in front of each console message.
+    LABELS = {
+        OutputLevel.DO: "[执行]",
+        OutputLevel.DONE: "[完成]",
+        OutputLevel.TIPS: "[提示]",
+        OutputLevel.INFO: "[信息]",
+        OutputLevel.WARN: "[警告]",
+        OutputLevel.ERROR: "[错误]",
+        OutputLevel.DEBUG: "[调试]",
+    }
+    # Console verbosity ranking: a message is printed when its rank is >= the
+    # rank of the configured minimum level. DEBUG ranks lowest so that it is
+    # hidden under the default minimum (DO) and appears, together with every
+    # other level, only after set_min_level(OutputLevel.DEBUG).
+    _LEVEL_ORDER = {
+        OutputLevel.DEBUG: 0,
+        OutputLevel.DO: 1,
+        OutputLevel.DONE: 2,
+        OutputLevel.TIPS: 3,
+        OutputLevel.INFO: 4,
+        OutputLevel.WARN: 5,
+        OutputLevel.ERROR: 6,
+    }
+    # logging severity used when a message is mirrored to the log file.
+    _LOG_LEVELS = {
+        OutputLevel.DO: logging.INFO,
+        OutputLevel.DONE: logging.INFO,
+        OutputLevel.TIPS: logging.INFO,
+        OutputLevel.INFO: logging.INFO,
+        OutputLevel.WARN: logging.WARNING,
+        OutputLevel.ERROR: logging.ERROR,
+        OutputLevel.DEBUG: logging.DEBUG,
+    }
+    _instance = None
+    _initialized = False
+    def __new__(cls):
+        """Return the shared instance, creating it on first use."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    def __init__(self):
+        """Initialise the shared instance once; later constructions are no-ops."""
+        if not self._initialized:
+            self._initialized = True
+            self._console_enabled = True
+            self._file_logger = None
+            self._min_level = OutputLevel.DO  # show DO and above by default (DONE, TIPS, INFO, ...)
+            self._suppressed_modules = set()
+            self._setup_file_logger()
+            # Quieten verbose third-party loggers.
+            self._suppress_third_party_logs()
+    def _setup_file_logger(self):
+        """Configure the file logger that receives every message."""
+        # Create the log directory.
+        os.makedirs("logs", exist_ok=True)
+        self._file_logger = logging.getLogger("arxiv_crawler")
+        self._file_logger.setLevel(logging.DEBUG)
+        # Drop handlers left from an earlier setup, closing each one so its
+        # underlying stream is released.
+        for handler in self._file_logger.handlers[:]:
+            self._file_logger.removeHandler(handler)
+            handler.close()
+        # Attach the file handler.
+        file_handler = logging.FileHandler("logs/arxiv_pulse.log", encoding="utf-8")
+        file_handler.setLevel(logging.DEBUG)
+        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+        file_handler.setFormatter(formatter)
+        self._file_logger.addHandler(file_handler)
+    def _suppress_third_party_logs(self):
+        """Raise third-party loggers to WARNING and stop them propagating to the root logger."""
+        suppressed_modules = ["arxiv", "httpx", "httpcore", "urllib3", "asyncio"]
+        for module in suppressed_modules:
+            logger = logging.getLogger(module)
+            logger.setLevel(logging.WARNING)
+            # Disable propagation so the root logger does not handle these records.
+            logger.propagate = False
+    def _should_output(self, level: OutputLevel, module: Optional[str] = None) -> bool:
+        """Return True when a message at ``level`` from ``module`` should reach the console."""
+        if module and module in self._suppressed_modules:
+            return False
+        return self._LEVEL_ORDER[level] >= self._LEVEL_ORDER[self._min_level]
+    def _output(
+        self,
+        level: OutputLevel,
+        message: str,
+        module: Optional[str] = None,
+        details: Optional[Dict[str, Any]] = None,
+    ):
+        """Write ``message`` to the log file and, when permitted, to the console.
+
+        Args:
+            level: Output level of the message.
+            message: Text to emit; the console shows only this text plus the level tag.
+            module: Optional origin name; prefixed to the log line and checked
+                against the suppressed-module set.
+            details: Optional key/value pairs appended to the log line only.
+        """
+        # Build the detailed log line.
+        log_message = message
+        if module:
+            log_message = f"[{module}] {message}"
+        if details:
+            details_str = " ".join(f"{k}={v}" for k, v in details.items())
+            log_message = f"{log_message} | {details_str}"
+        # The log file receives every message regardless of console settings.
+        self._file_logger.log(self._LOG_LEVELS[level], log_message)
+        if self._console_enabled and self._should_output(level, module):
+            label = self.LABELS[level]
+            color = self.COLORS[level.value]
+            reset = self.COLORS["reset"]
+            # Errors go to stderr, everything else to stdout; flush the stream
+            # that was actually written to.
+            stream = sys.stderr if level == OutputLevel.ERROR else sys.stdout
+            print(f"{color}{label}{reset} {message}", file=stream)
+            stream.flush()
+    # Public methods
+    @classmethod
+    def do(cls, message: str, module: Optional[str] = None, **details):
+        """Report an operation that is in progress."""
+        cls()._output(OutputLevel.DO, message, module, details)
+    @classmethod
+    def done(cls, message: str, module: Optional[str] = None, **details):
+        """Report a completed operation."""
+        cls()._output(OutputLevel.DONE, message, module, details)
+    @classmethod
+    def tips(cls, message: str, module: Optional[str] = None, **details):
+        """Show a hint."""
+        cls()._output(OutputLevel.TIPS, message, module, details)
+    @classmethod
+    def info(cls, message: str, module: Optional[str] = None, **details):
+        """Show general information."""
+        cls()._output(OutputLevel.INFO, message, module, details)
+    @classmethod
+    def warn(cls, message: str, module: Optional[str] = None, **details):
+        """Show a warning."""
+        cls()._output(OutputLevel.WARN, message, module, details)
+    @classmethod
+    def error(cls, message: str, module: Optional[str] = None, **details):
+        """Show a concise error message (printed to stderr)."""
+        cls()._output(OutputLevel.ERROR, message, module, details)
+    @classmethod
+    def debug(cls, message: str, module: Optional[str] = None, **details):
+        """Emit a debug message; it is always logged to file but printed only when the minimum level is DEBUG."""
+        cls()._output(OutputLevel.DEBUG, message, module, details)
+    # Configuration methods
+    @classmethod
+    def set_min_level(cls, level: OutputLevel):
+        """Set the minimum level that is printed to the console."""
+        cls()._min_level = level
+    @classmethod
+    def suppress_module(cls, module: str):
+        """Stop printing console messages tagged with ``module``."""
+        cls()._suppressed_modules.add(module)
+    @classmethod
+    def enable_console(cls, enabled: bool = True):
+        """Enable or disable console output; file logging is unaffected."""
+        cls()._console_enabled = enabled
+    @classmethod
+    def get_file_logger(cls) -> logging.Logger:
+        """Return the logger that writes to the log file."""
+        return cls()._file_logger
+# Short alias for OutputManager.
+output = OutputManager