arxiv-pulse 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arxiv_pulse/config.py ADDED
@@ -0,0 +1,64 @@
1
+ import os
2
+ import warnings
3
+
4
+
5
class Config:
    """Application settings, read from environment variables.

    Every attribute is a class attribute evaluated once, when the class body
    runs at import time; later changes to the environment are not picked up.
    """

    # Database
    DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///data/arxiv_papers.db")

    # Crawler
    MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", 100))
    MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", 20))

    # Search queries - use semicolon as separator to allow commas in queries
    SEARCH_QUERIES_RAW = os.getenv(
        "SEARCH_QUERIES",
        "condensed matter physics; density functional theory; machine learning; force fields; first principles calculation; molecular dynamics; quantum chemistry; computational materials science",
    )
    # Blank segments (e.g. from a trailing ';') are dropped.
    SEARCH_QUERIES = [q.strip() for q in SEARCH_QUERIES_RAW.split(";") if q.strip()]

    # AI API (OpenAI-compatible endpoints such as DeepSeek or Paratera AI),
    # configured through the AI_* environment variables.
    AI_API_KEY = os.getenv("AI_API_KEY")  # may be None
    AI_MODEL = os.getenv("AI_MODEL", "DeepSeek-V3.2-Thinking")
    AI_BASE_URL = os.getenv("AI_BASE_URL", "https://llmapi.paratera.com")

    # Model settings: the summary model now reuses AI_MODEL.
    SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", 2000))

    # Report generation settings
    SUMMARY_SENTENCES_LIMIT = int(os.getenv("SUMMARY_SENTENCES_LIMIT", 3))
    TOKEN_PRICE_PER_MILLION = float(os.getenv("TOKEN_PRICE_PER_MILLION", 3.0))

    # Paths
    REPORT_DIR = os.getenv("REPORT_DIR", "reports")
    # Directory part of the SQLite file path. This is the empty string when
    # the URL has no directory component (e.g. "sqlite:///papers.db").
    DATA_DIR = os.path.dirname(DATABASE_URL.replace("sqlite:///", ""))

    # Report generation limits
    REPORT_MAX_PAPERS = int(os.getenv("REPORT_MAX_PAPERS", "50"))

    # ArXiv API
    ARXIV_MAX_RESULTS = 1000
    ARXIV_SORT_BY = "submittedDate"
    ARXIV_SORT_ORDER = "descending"

    # Sync configuration
    YEARS_BACK = int(os.getenv("YEARS_BACK", 3))  # Years to look back for initial sync
    IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "important_papers.txt")

    @classmethod
    def validate(cls):
        """Report the AI key status and create the output directories.

        Prints a notice saying whether ``AI_API_KEY`` is set, then makes sure
        ``REPORT_DIR`` and ``DATA_DIR`` exist.

        Returns:
            True, always.

        Raises:
            OSError: if a directory cannot be created.
        """
        if not cls.AI_API_KEY:
            print("警告: 未设置 AI_API_KEY。AI 总结和翻译功能将受限。")
            print(" 请设置 AI_API_KEY 环境变量以启用 AI 功能。")
        else:
            print(f"信息: 找到 AI API 密钥 (AI_API_KEY)。AI 总结和翻译功能已启用 (模型: {cls.AI_MODEL})。")

        # Ensure directories exist
        os.makedirs(cls.REPORT_DIR, exist_ok=True)
        # os.makedirs("") raises FileNotFoundError, so skip the call when the
        # database path has no directory component.
        if cls.DATA_DIR:
            os.makedirs(cls.DATA_DIR, exist_ok=True)

        return True
arxiv_pulse/models.py ADDED
@@ -0,0 +1,255 @@
1
+ from sqlalchemy import (
2
+ create_engine,
3
+ Column,
4
+ Integer,
5
+ String,
6
+ Text,
7
+ DateTime,
8
+ Boolean,
9
+ Float,
10
+ JSON,
11
+ )
12
+ from sqlalchemy.ext.declarative import declarative_base
13
+ from sqlalchemy.orm import sessionmaker
14
+ from datetime import datetime, timedelta
15
+ import json
16
+ from typing import Optional
17
+
18
+ from arxiv_pulse.config import Config
19
+
20
# Declarative base shared by every ORM model in this module.
Base = declarative_base()


class Paper(Base):
    """ORM row for one arXiv paper plus its local processing state."""

    __tablename__ = "papers"

    id = Column(Integer, primary_key=True)
    arxiv_id = Column(String(50), unique=True, nullable=False, index=True)
    title = Column(String(500), nullable=False)
    authors = Column(Text)  # JSON string of authors list
    abstract = Column(Text)
    categories = Column(String(500))
    primary_category = Column(String(100))
    published = Column(DateTime, nullable=False)
    updated = Column(DateTime)
    pdf_url = Column(String(500))
    doi = Column(String(200))
    journal_ref = Column(String(500))
    comment = Column(Text)

    # Search relevance
    search_query = Column(String(200))
    relevance_score = Column(Float, default=0.0)
    keywords = Column(Text)  # JSON string of extracted keywords

    # Processing status
    downloaded = Column(Boolean, default=False)
    summarized = Column(Boolean, default=False)
    summary = Column(Text)

    # Metadata
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def to_dict(self):
        """Return the row as a plain dict.

        Timestamps are rendered with ``isoformat()`` (``None`` when unset);
        the JSON text columns ``authors`` and ``keywords`` are decoded, with
        an empty list standing in for an empty or missing value.
        """

        def iso(moment):
            return moment.isoformat() if moment else None

        def decoded(text):
            return json.loads(text) if text else []

        data = {}
        data["id"] = self.id
        data["arxiv_id"] = self.arxiv_id
        data["title"] = self.title
        data["authors"] = decoded(self.authors)
        data["abstract"] = self.abstract
        data["categories"] = self.categories
        data["primary_category"] = self.primary_category
        data["published"] = iso(self.published)
        data["updated"] = iso(self.updated)
        data["pdf_url"] = self.pdf_url
        data["doi"] = self.doi
        data["journal_ref"] = self.journal_ref
        data["comment"] = self.comment
        data["search_query"] = self.search_query
        data["relevance_score"] = self.relevance_score
        data["keywords"] = decoded(self.keywords)
        data["downloaded"] = self.downloaded
        data["summarized"] = self.summarized
        data["summary"] = self.summary
        data["created_at"] = iso(self.created_at)
        data["updated_at"] = iso(self.updated_at)
        return data

    @classmethod
    def from_arxiv_entry(cls, entry, search_query):
        """Build an unsaved Paper from an arXiv result object.

        Args:
            entry: result object; ``entry_id``, ``title``, ``summary``,
                ``authors`` and ``published`` are read unconditionally, the
                remaining attributes only when present.
            search_query: the query string that produced this entry.

        Returns:
            A new ``Paper`` with ``relevance_score`` set to 0.0.
        """
        author_records = []
        for person in entry.authors:
            author_records.append({"name": person.name, "affiliation": getattr(person, "affiliation", "")})

        # The arXiv identifier is the last path segment of the entry URL.
        short_id = entry.entry_id.split("/")[-1]

        if hasattr(entry, "categories"):
            category_text = ", ".join(entry.categories)
        else:
            category_text = entry.primary_category

        if hasattr(entry, "pdf_url"):
            pdf_link = entry.pdf_url
        else:
            pdf_link = f"https://arxiv.org/pdf/{short_id}.pdf"

        return cls(
            arxiv_id=short_id,
            title=entry.title,
            authors=json.dumps(author_records),
            abstract=entry.summary,
            categories=category_text,
            primary_category=getattr(entry, "primary_category", ""),
            published=entry.published,
            updated=getattr(entry, "updated", None),
            pdf_url=pdf_link,
            doi=getattr(entry, "doi", None),
            journal_ref=getattr(entry, "journal_ref", None),
            comment=getattr(entry, "comment", None),
            search_query=search_query,
            relevance_score=0.0,
        )
103
+
104
+
105
class TranslationCache(Base):
    """Cached translation results, kept to avoid repeated API calls."""

    __tablename__ = "translation_cache"

    id = Column(Integer, primary_key=True)
    source_text = Column(Text, nullable=False)
    # Unique lookup key for a cached translation.
    source_text_hash = Column(String(64), nullable=False, unique=True, index=True)
    translated_text = Column(Text, nullable=False)
    target_language = Column(String(10), default="zh")
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def __repr__(self):
        """Return a short debug representation showing the first 16 hash characters."""
        # The hash is None on an instance built without one; a repr must not
        # raise, so fall back to an empty prefix in that case.
        hash_prefix = (self.source_text_hash or "")[:16]
        return f"<TranslationCache(id={self.id}, hash={hash_prefix}...)>"
120
+
121
+
122
class Database:
    """Data-access helper around a SQLAlchemy engine and session factory."""

    def __init__(self):
        """Create the engine from ``Config.DATABASE_URL`` and ensure all tables exist."""
        self.engine = create_engine(Config.DATABASE_URL)
        self.Session = sessionmaker(bind=self.engine)
        Base.metadata.create_all(self.engine)

    def get_session(self):
        """Return a new session from the session factory."""
        return self.Session()

    @staticmethod
    def _translation_cache_key(source_text: str, target_language: str) -> str:
        """Return the SHA-256 hex digest used as the translation cache key.

        The target language is part of the hashed text, so the same source
        text gets a distinct key per language.
        """
        import hashlib

        cache_key = f"{source_text}:{target_language}"
        return hashlib.sha256(cache_key.encode("utf-8")).hexdigest()

    @staticmethod
    def _count_categories(raw_values) -> dict:
        """Count category occurrences across ", "-separated category strings.

        ``None`` entries (rows whose ``categories`` column is NULL) are
        skipped instead of raising ``AttributeError``.
        """
        counts = {}
        for raw in raw_values:
            if raw is None:
                continue
            for cat in raw.split(", "):
                counts[cat] = counts.get(cat, 0) + 1
        return counts

    def paper_exists(self, arxiv_id):
        """Return True if a paper with this arXiv id is already stored."""
        with self.get_session() as session:
            return session.query(Paper).filter_by(arxiv_id=arxiv_id).first() is not None

    def add_paper(self, paper):
        """Insert ``paper``, commit, and return its primary key."""
        with self.get_session() as session:
            session.add(paper)
            session.commit()
            return paper.id

    def update_paper(self, arxiv_id, **kwargs):
        """Set the given fields on the paper with ``arxiv_id``.

        Returns:
            True if the paper was found and updated, False otherwise.
        """
        with self.get_session() as session:
            paper = session.query(Paper).filter_by(arxiv_id=arxiv_id).first()
            if paper:
                for key, value in kwargs.items():
                    setattr(paper, key, value)
                paper.updated_at = datetime.utcnow()
                session.commit()
                return True
            return False

    def get_recent_papers(self, days=7, limit=100):
        """Return up to ``limit`` papers published in the last ``days`` days, newest first."""
        with self.get_session() as session:
            cutoff_date = datetime.utcnow() - timedelta(days=days)
            return (
                session.query(Paper)
                .filter(Paper.published >= cutoff_date)
                .order_by(Paper.published.desc())
                .limit(limit)
                .all()
            )

    def get_papers_by_category(self, category, limit=50):
        """Return up to ``limit`` papers whose category fields contain ``category``, newest first."""
        with self.get_session() as session:
            return (
                session.query(Paper)
                .filter(Paper.categories.contains(category) | Paper.primary_category.contains(category))
                .order_by(Paper.published.desc())
                .limit(limit)
                .all()
            )

    def get_papers_to_summarize(self, limit=20):
        """Return up to ``limit`` unsummarized papers that have an abstract, newest first."""
        with self.get_session() as session:
            return (
                session.query(Paper)
                # "== False" is intentional: it builds a SQL comparison.
                .filter(Paper.summarized == False, Paper.abstract.isnot(None))  # noqa: E712
                .order_by(Paper.published.desc())
                .limit(limit)
                .all()
            )

    def get_statistics(self):
        """Return total count, summarized count, and per-category counts.

        Returns:
            dict with keys ``total_papers``, ``summarized_papers`` and
            ``categories_distribution`` (category name -> paper count).
        """
        with self.get_session() as session:
            total = session.query(Paper).count()
            summarized = session.query(Paper).filter_by(summarized=True).count()
            # Only the categories column is needed, so avoid loading full rows.
            raw_values = [row[0] for row in session.query(Paper.categories).all()]
            categories = self._count_categories(raw_values)

            return {
                "total_papers": total,
                "summarized_papers": summarized,
                "categories_distribution": categories,
            }

    def get_translation_cache(self, source_text: str, target_language: str = "zh") -> Optional[str]:
        """Return the cached translation for the text and language, or None if absent."""
        text_hash = self._translation_cache_key(source_text, target_language)

        with self.get_session() as session:
            cache_entry = session.query(TranslationCache).filter_by(source_text_hash=text_hash).first()

            if cache_entry:
                return cache_entry.translated_text
            return None

    def set_translation_cache(self, source_text: str, translated_text: str, target_language: str = "zh") -> None:
        """Store a translation, overwriting any entry with the same cache key."""
        text_hash = self._translation_cache_key(source_text, target_language)

        with self.get_session() as session:
            # Check whether an entry already exists for this key.
            existing = session.query(TranslationCache).filter_by(source_text_hash=text_hash).first()

            if existing:
                # Refresh the existing entry.
                existing.translated_text = translated_text
                existing.updated_at = datetime.utcnow()
            else:
                # Create a new entry.
                cache_entry = TranslationCache(
                    source_text=source_text,
                    source_text_hash=text_hash,
                    translated_text=translated_text,
                    target_language=target_language,
                )
                session.add(cache_entry)

            session.commit()

    def clear_old_translation_cache(self, days_old: int = 30) -> int:
        """Delete cache entries not updated within ``days_old`` days; return the number deleted."""
        with self.get_session() as session:
            cutoff_date = datetime.utcnow() - timedelta(days=days_old)
            deleted_count = session.query(TranslationCache).filter(TranslationCache.updated_at < cutoff_date).delete()
            session.commit()
            return deleted_count
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ 统一输出管理器 - 提供优雅的控制台输出体验
4
+
5
+ 提供以下标签级别的输出:
6
+ [do] - 正在执行的操作
7
+ [done] - 操作完成
8
+ [tips] - 提示信息
9
+ [info] - 一般信息
10
+ [warn] - 警告信息
11
+ [error] - 错误信息(简洁)
12
+ [debug] - 调试信息(默认不显示)
13
+
14
+ 所有详细日志同时写入日志文件,控制台只显示简洁信息。
15
+ """
16
+
17
+ import sys
18
+ import os
19
+ import time
20
+ from typing import Optional, Dict, Any
21
+ import logging
22
+ from enum import Enum
23
+
24
+
25
class OutputLevel(Enum):
    """Console output levels; each value doubles as the key into ``COLORS``."""

    DO = "do"
    DONE = "done"
    TIPS = "tips"
    INFO = "info"
    WARN = "warn"
    ERROR = "error"
    DEBUG = "debug"


class OutputManager:
    """Singleton that prints short tagged console lines and logs everything to a file."""

    # ANSI color escape per level value.
    COLORS = {
        "do": "\033[94m",  # blue
        "done": "\033[92m",  # green
        "tips": "\033[96m",  # cyan
        "info": "\033[97m",  # white
        "warn": "\033[93m",  # yellow
        "error": "\033[91m",  # red
        "debug": "\033[90m",  # gray
        "reset": "\033[0m",  # reset
    }

    # Tag shown in front of each console line.
    LABELS = {
        OutputLevel.DO: "[执行]",
        OutputLevel.DONE: "[完成]",
        OutputLevel.TIPS: "[提示]",
        OutputLevel.INFO: "[信息]",
        OutputLevel.WARN: "[警告]",
        OutputLevel.ERROR: "[错误]",
        OutputLevel.DEBUG: "[调试]",
    }

    # Console verbosity ranking. DEBUG ranks lowest so that the default
    # minimum level (DO) hides debug output, and raising the minimum level
    # never re-enables it.
    _LEVEL_ORDER = {
        OutputLevel.DEBUG: 0,
        OutputLevel.DO: 1,
        OutputLevel.DONE: 2,
        OutputLevel.TIPS: 3,
        OutputLevel.INFO: 4,
        OutputLevel.WARN: 5,
        OutputLevel.ERROR: 6,
    }

    # Mapping from output level to the stdlib logging level used in the file log.
    _LOG_LEVELS = {
        OutputLevel.DO: logging.INFO,
        OutputLevel.DONE: logging.INFO,
        OutputLevel.TIPS: logging.INFO,
        OutputLevel.INFO: logging.INFO,
        OutputLevel.WARN: logging.WARNING,
        OutputLevel.ERROR: logging.ERROR,
        OutputLevel.DEBUG: logging.DEBUG,
    }

    _instance = None
    _initialized = False

    def __new__(cls):
        """Return the single shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Initialize state once; later ``OutputManager()`` calls are no-ops."""
        if not self._initialized:
            self._initialized = True
            self._console_enabled = True
            self._file_logger = None
            # Show DO and everything ranked above it; DEBUG stays hidden.
            self._min_level = OutputLevel.DO
            self._suppressed_modules = set()
            self._setup_file_logger()

            # Quiet the verbose logging of third-party libraries.
            self._suppress_third_party_logs()

    def _setup_file_logger(self):
        """Create ``logs/`` and attach a DEBUG-level file handler to the file logger."""
        os.makedirs("logs", exist_ok=True)

        self._file_logger = logging.getLogger("arxiv_crawler")
        self._file_logger.setLevel(logging.DEBUG)

        # Drop handlers left over from an earlier setup so lines are not duplicated.
        for handler in self._file_logger.handlers[:]:
            self._file_logger.removeHandler(handler)

        file_handler = logging.FileHandler("logs/arxiv_pulse.log", encoding="utf-8")
        file_handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        file_handler.setFormatter(formatter)
        self._file_logger.addHandler(file_handler)

    def _suppress_third_party_logs(self):
        """Raise selected third-party loggers to WARNING and stop their propagation."""
        suppressed_modules = ["arxiv", "httpx", "httpcore", "urllib3", "asyncio"]

        for module in suppressed_modules:
            logger = logging.getLogger(module)
            logger.setLevel(logging.WARNING)
            # Disable propagation so the root logger does not handle these records.
            logger.propagate = False

    def _should_output(self, level: OutputLevel, module: Optional[str] = None) -> bool:
        """Return True if a message at ``level`` from ``module`` belongs on the console.

        A message is shown when its module is not suppressed and its level
        ranks at or above the configured minimum level.
        """
        if module and module in self._suppressed_modules:
            return False

        return self._LEVEL_ORDER[level] >= self._LEVEL_ORDER[self._min_level]

    def _output(
        self,
        level: OutputLevel,
        message: str,
        module: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        """Write the message to the file log and, if allowed, to the console.

        Args:
            level: output level of the message.
            message: text shown on the console.
            module: optional source tag; prefixed in the file log and checked
                against the suppressed-module set for console output.
            details: optional key/value pairs appended to the file log line only.
        """
        # Build the detailed line for the file log.
        log_message = message
        if module:
            log_message = f"[{module}] {message}"
        if details:
            details_str = " ".join(f"{k}={v}" for k, v in details.items())
            log_message = f"{log_message} | {details_str}"

        self._file_logger.log(self._LOG_LEVELS[level], log_message)

        if self._console_enabled and self._should_output(level, module):
            label = self.LABELS[level]
            color = self.COLORS[level.value]
            reset = self.COLORS["reset"]

            # Errors go to stderr, everything else to stdout.
            stream = sys.stderr if level == OutputLevel.ERROR else sys.stdout
            print(f"{color}{label}{reset} {message}", file=stream)
            # Flush the stream that was actually written to.
            stream.flush()

    # Public output methods
    @classmethod
    def do(cls, message: str, module: Optional[str] = None, **details):
        """Report an operation in progress."""
        cls()._output(OutputLevel.DO, message, module, details)

    @classmethod
    def done(cls, message: str, module: Optional[str] = None, **details):
        """Report a completed operation."""
        cls()._output(OutputLevel.DONE, message, module, details)

    @classmethod
    def tips(cls, message: str, module: Optional[str] = None, **details):
        """Emit a hint for the user."""
        cls()._output(OutputLevel.TIPS, message, module, details)

    @classmethod
    def info(cls, message: str, module: Optional[str] = None, **details):
        """Emit general information."""
        cls()._output(OutputLevel.INFO, message, module, details)

    @classmethod
    def warn(cls, message: str, module: Optional[str] = None, **details):
        """Emit a warning."""
        cls()._output(OutputLevel.WARN, message, module, details)

    @classmethod
    def error(cls, message: str, module: Optional[str] = None, **details):
        """Emit a concise error message (printed to stderr)."""
        cls()._output(OutputLevel.ERROR, message, module, details)

    @classmethod
    def debug(cls, message: str, module: Optional[str] = None, **details):
        """Emit debug information (always logged to file; hidden on the console by default)."""
        cls()._output(OutputLevel.DEBUG, message, module, details)

    # Configuration methods
    @classmethod
    def set_min_level(cls, level: OutputLevel):
        """Set the minimum console level; ``OutputLevel.DEBUG`` shows everything."""
        cls()._min_level = level

    @classmethod
    def suppress_module(cls, module: str):
        """Hide console output tagged with ``module``."""
        cls()._suppressed_modules.add(module)

    @classmethod
    def enable_console(cls, enabled: bool = True):
        """Enable or disable console output."""
        cls()._console_enabled = enabled

    @classmethod
    def get_file_logger(cls) -> logging.Logger:
        """Return the logger that writes to the log file."""
        return cls()._file_logger


# Short alias
output = OutputManager