arxiv-pulse 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ # arXiv Pulse 配置文件模板
2
+ # 将此文件复制为 .env 并进行配置
3
+
4
+ # ========================
5
+ # AI API 配置 (支持 OpenAI 格式)
6
+ # ========================
7
+ # 使用 OpenAI 格式的 API,支持 DeepSeek、Paratera AI 等
8
+ # 示例:Paratera AI: https://llmapi.paratera.com
9
+ # 示例:DeepSeek: https://api.deepseek.com
10
+
11
+ AI_API_KEY=your_api_key_here
12
+ AI_MODEL=DeepSeek-V3.2-Thinking
13
+ AI_BASE_URL=https://llmapi.paratera.com
14
+
15
+
16
+ # ========================
17
+ # 数据库配置
18
+ # ========================
19
+ DATABASE_URL=sqlite:///data/arxiv_papers.db
20
+
21
+
22
+ # ========================
23
+ # 爬虫配置
24
+ # ========================
25
+ MAX_RESULTS_INITIAL=100 # init命令每个查询的论文数
26
+ MAX_RESULTS_DAILY=20 # sync命令每个查询的论文数
27
+
28
+
29
+ # ========================
30
+ # 搜索查询配置
31
+ # ========================
32
+ # 分号分隔,允许查询中包含逗号
33
+ # 默认监控凝聚态物理、DFT、机器学习、力场等研究领域
34
+ SEARCH_QUERIES=condensed matter physics AND cat:cond-mat.*; (ti:"density functional" OR abs:"density functional") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"machine learning" OR abs:"machine learning") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"force field" OR abs:"force field") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"first principles" OR abs:"first principles" OR ti:"ab initio" OR abs:"ab initio") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci); (ti:"molecular dynamics" OR abs:"molecular dynamics") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"quantum chemistry" OR abs:"quantum chemistry") AND (cat:physics.chem-ph OR cat:physics.comp-ph); cat:cond-mat.mtrl-sci AND (ti:"computational" OR abs:"computational" OR ti:"simulation" OR abs:"simulation")
35
+
36
+
37
+ # ========================
38
+ # 报告配置
39
+ # ========================
40
+ REPORT_DIR=reports
41
+ # SUMMARY_MODEL 现在由 AI_MODEL 控制,不再需要单独设置
42
+ SUMMARY_MAX_TOKENS=2000 # 总结和翻译的最大token数
43
+ SUMMARY_SENTENCES_LIMIT=3
44
+ TOKEN_PRICE_PER_MILLION=3.0
45
+ REPORT_MAX_PAPERS=50
46
+
47
+
48
+ # ========================
49
+ # 同步配置
50
+ # ========================
51
+ YEARS_BACK=3 # 同步回溯的年数
52
+ IMPORTANT_PAPERS_FILE=important_papers.txt
53
+
54
+
55
+ # ========================
56
+ # 可选配置
57
+ # ========================
58
+ # 日志级别: DEBUG, INFO, WARNING, ERROR (默认: INFO)
59
+ LOG_LEVEL=INFO
60
+
61
+ # 爬虫延迟(秒,避免频繁请求 arXiv API)
62
+ CRAWL_DELAY=1.0
63
+
64
+ # ========================
65
+ # 使用说明
66
+ # ========================
67
+ # 1. 将此文件复制为 .env
68
+ # 2. 设置您的 AI API 密钥
69
+ # 3. 根据需要调整其他配置
70
+ # 4. 运行 pulse init . 初始化目录
71
+ # 5. 运行 pulse sync . 同步论文
72
+ # 6. 使用 pulse search "查询内容" . 搜索论文
@@ -0,0 +1,26 @@
1
+ """
2
+ arXiv Pulse: An intelligent arXiv literature crawler and analyzer for physics research.
3
+ """
4
+
5
+ from .__version__ import __version__
6
+
7
+ __author__ = "arXiv Pulse Team"
8
+
9
+ from .arxiv_crawler import ArXivCrawler
10
+ from .config import Config
11
+ from .models import Database, Paper, TranslationCache
12
+ from .output_manager import OutputManager, output
13
+ from .report_generator import ReportGenerator
14
+ from .summarizer import PaperSummarizer
15
+
16
+ __all__ = [
17
+ "ArXivCrawler",
18
+ "Config",
19
+ "Database",
20
+ "Paper",
21
+ "TranslationCache",
22
+ "OutputManager",
23
+ "output",
24
+ "ReportGenerator",
25
+ "PaperSummarizer",
26
+ ]
@@ -0,0 +1,33 @@
1
+ """
2
+ arXiv Pulse 版本信息 - 动态从包元数据读取
3
+ """
4
+
5
+ import importlib.metadata
6
+
7
# Resolve the package version from the installed distribution metadata and
# derive a numeric ``__version_info__`` tuple from it.


def _leading_int(segment):
    """Return the int formed by the leading digits of *segment*, or 0 if none."""
    end = 0
    while end < len(segment) and segment[end].isdigit():
        end += 1
    return int(segment[:end]) if end else 0


def _parse_version_info(version):
    """Turn a dotted version string into a tuple of ints.

    A strict ``int()`` conversion of every dot-separated component is tried
    first. If any component is not a plain integer (for example it carries a
    letter suffix), every component is reduced to its leading digits instead,
    with 0 used for components that do not start with a digit.
    """
    pieces = version.split(".")
    try:
        return tuple(int(piece) for piece in pieces)
    except ValueError:
        return tuple(_leading_int(piece) for piece in pieces)


try:
    __version__ = importlib.metadata.version("arxiv-pulse")
except importlib.metadata.PackageNotFoundError:
    # The distribution is not installed: fall back to the built-in default.
    __version__ = "0.5.0"
    __version_info__ = (0, 5, 0)
else:
    __version_info__ = _parse_version_info(__version__)
@@ -0,0 +1,377 @@
1
+ import arxiv
2
+ import asyncio
3
+ import aiohttp
4
+ import os
5
+ from typing import List, Dict, Any, Optional
6
+ from datetime import datetime, timedelta, timezone
7
+ from tqdm import tqdm
8
+ import time
9
+ import logging
10
+
11
+ from arxiv_pulse.models import Database, Paper
12
+ from arxiv_pulse.config import Config
13
+ from arxiv_pulse.output_manager import output
14
+
15
+ # 使用根日志记录器的配置(保留用于向后兼容)
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class ArXivCrawler:
    """Search arXiv and store papers that are not yet in the local database.

    The crawler owns a ``Database`` handle and an ``arxiv.Client``. The search
    queries and result limits are read from ``Config``.

    NOTE(review): the database key is derived as the last path segment of
    ``entry_id`` (which keeps any version suffix), while
    ``sync_important_papers`` looks papers up by version-less ID. Confirm
    against ``Paper.from_arxiv_entry`` / ``Database.paper_exists`` that both
    forms are matched as intended.
    """

    def __init__(self):
        """Create the database handle and the arXiv API client."""
        self.db = Database()
        # Client settings chosen to respect arXiv's request-rate limits.
        self.client = arxiv.Client(page_size=100, delay_seconds=3.0, num_retries=3)
        self.config = Config

        # Silence verbose logging from third-party libraries.
        logging.getLogger("arxiv").setLevel(logging.WARNING)
        logging.getLogger("httpx").setLevel(logging.WARNING)

    @staticmethod
    def _as_utc(moment: datetime) -> datetime:
        """Return *moment* as a timezone-aware UTC datetime.

        Naive datetimes are assumed to already be expressed in UTC; aware
        datetimes are converted.
        """
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    @staticmethod
    def _strip_version_suffix(arxiv_id: str) -> str:
        """Remove a trailing ``v<digits>`` version marker from an arXiv ID.

        Only a suffix made of "v" followed by digits is removed, so old-style
        IDs whose archive name contains a "v" (e.g. ``solv-int/9901001``) are
        left intact.
        """
        base, marker, version = arxiv_id.rpartition("v")
        if marker and base and version.isdigit():
            return base
        return arxiv_id

    def search_arxiv(
        self,
        query: str,
        max_results: int = 100,
        days_back: Optional[int] = None,
        cutoff_date: Optional[datetime] = None,
    ) -> List["arxiv.Result"]:
        """Search arXiv for papers matching *query*, newest submissions first.

        Args:
            query: arXiv search query.
            max_results: Maximum number of results to return.
            days_back: Deprecated alternative to ``cutoff_date``. When given
                and ``cutoff_date`` is not, the cutoff is "now minus
                ``days_back`` days" in UTC.
            cutoff_date: Optional cutoff. A naive value is treated as UTC.
                Because results are requested in descending submission-date
                order, iteration stops at the first paper published before
                the cutoff.

        Returns:
            The matching results, at most ``max_results`` of them.
        """
        # Honour the deprecated parameter instead of silently ignoring it.
        if cutoff_date is None and days_back is not None:
            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        # Normalise so the comparison below never mixes naive and aware values.
        if cutoff_date is not None:
            cutoff_date = self._as_utc(cutoff_date)

        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending,
        )

        results = []
        for paper in self.client.results(search):
            if cutoff_date is not None and hasattr(paper, "published") and paper.published:
                paper_date = self._as_utc(paper.published)
                if paper_date < cutoff_date:
                    output.debug(f"遇到旧论文 ({paper_date.date()}),停止爬取")
                    break

            results.append(paper)

            # Safety net: never return more than max_results.
            if len(results) >= max_results:
                break

        output.debug(f"Found {len(results)} papers for query: {query}")
        return results

    def filter_new_papers(self, papers: List["arxiv.Result"]) -> List["arxiv.Result"]:
        """Return the subset of *papers* that is not yet in the database."""
        new_papers = []
        for paper in papers:
            arxiv_id = paper.entry_id.split("/")[-1]
            if self.db.paper_exists(arxiv_id):
                output.debug(f"Paper {arxiv_id} already exists in database")
            else:
                new_papers.append(paper)

        output.debug(f"Filtered to {len(new_papers)} new papers")
        return new_papers

    def save_papers(self, papers: List["arxiv.Result"], search_query: str) -> List["Paper"]:
        """Store *papers* in the database and return the ``Paper`` rows created.

        Papers already present are skipped. A failure on one paper is reported
        through ``output.error`` and does not stop the remaining papers from
        being saved.
        """
        saved_papers = []
        for paper in tqdm(papers, desc="Saving papers"):
            try:
                # Re-check existence: the paper may have been stored since filtering.
                arxiv_id = paper.entry_id.split("/")[-1]
                if self.db.paper_exists(arxiv_id):
                    continue

                paper_obj = Paper.from_arxiv_entry(paper, search_query)
                self.db.add_paper(paper_obj)
                saved_papers.append(paper_obj)

            except Exception as e:
                # Deliberate best-effort: report and carry on with the next paper.
                output.error(
                    "保存论文失败",
                    details={"paper_id": paper.entry_id, "exception": str(e)},
                )

        output.done(f"保存完成: {len(saved_papers)} 篇新论文")
        return saved_papers

    def initial_crawl(self) -> Dict[str, Any]:
        """Run every configured query with the initial result limit.

        Returns:
            A dict with ``total_saved``, ``queries_searched`` and
            ``saved_papers``.
        """
        output.do("开始初始爬取")
        all_saved = []

        for query in self.config.SEARCH_QUERIES:
            output.do(f"搜索: {query}")
            try:
                papers = self.search_arxiv(query, max_results=self.config.MAX_RESULTS_INITIAL)
                new_papers = self.filter_new_papers(papers)
                saved = self.save_papers(new_papers, query)
                all_saved.extend(saved)

                output.done(f"保存: {len(saved)} 篇论文")
                time.sleep(1)  # Be nice to the arXiv API.

            except Exception as e:
                output.error(f"爬取查询失败: {query}", details={"exception": str(e)})

        output.done(f"初始爬取完成: 共保存 {len(all_saved)} 篇论文")
        return {
            "total_saved": len(all_saved),
            "queries_searched": len(self.config.SEARCH_QUERIES),
            "saved_papers": all_saved,
        }

    def daily_update(self) -> Dict[str, Any]:
        """Run every configured query for papers from the last two days.

        Returns:
            A dict with ``total_saved``, ``queries_searched``, ``date_range``
            and ``saved_papers``.
        """
        output.do("开始每日更新")
        all_saved = []

        # Use a two-day window because arXiv usually publishes its daily
        # update around 00:00-02:00 UTC.
        cutoff_date = datetime.now(timezone.utc) - timedelta(days=2)
        output.info(f"查找 {cutoff_date.date()} 之后的新论文")

        for query in self.config.SEARCH_QUERIES:
            output.do(f"搜索: {query}")
            try:
                # The cutoff lets the search stop as soon as older papers appear.
                papers = self.search_arxiv(
                    query,
                    max_results=self.config.MAX_RESULTS_DAILY,
                    cutoff_date=cutoff_date,
                )

                output.debug(f"找到 {len(papers)} 篇最近论文")

                new_papers = self.filter_new_papers(papers)
                saved = self.save_papers(new_papers, query)
                all_saved.extend(saved)

                output.done(f"保存: {len(saved)} 篇新论文")
                time.sleep(1)

            except Exception as e:
                output.error(f"每日更新失败: {query}", details={"exception": str(e)})

        output.done(f"每日更新完成: 共保存 {len(all_saved)} 篇新论文")
        return {
            "total_saved": len(all_saved),
            "queries_searched": len(self.config.SEARCH_QUERIES),
            "date_range": f"Since {cutoff_date.date()}",
            "saved_papers": all_saved,
        }

    def crawl_by_categories(self, categories: List[str], max_results: int = 50) -> Dict[str, Any]:
        """Crawl each arXiv category in *categories* with a ``cat:<name>`` query.

        Returns:
            A dict with ``total_saved``, ``categories`` and ``saved_papers``.
        """
        all_saved = []

        for category in categories:
            query = f"cat:{category}"
            output.do(f"搜索类别: {category}")
            try:
                papers = self.search_arxiv(query, max_results=max_results)
                new_papers = self.filter_new_papers(papers)
                saved = self.save_papers(new_papers, query)
                all_saved.extend(saved)

                output.done(f"保存: {len(saved)} 篇论文")
                time.sleep(1)

            except Exception as e:
                output.error(f"爬取类别失败: {category}", details={"exception": str(e)})

        return {
            "total_saved": len(all_saved),
            "categories": categories,
            "saved_papers": all_saved,
        }

    def get_latest_paper_date_for_query(self, query: str) -> Optional[datetime]:
        """Return the newest ``published`` date stored for *query*, or None."""
        with self.db.get_session() as session:
            latest_paper = (
                session.query(Paper)
                .filter(Paper.search_query == query)
                .order_by(Paper.published.desc())
                .first()
            )
            return latest_paper.published if latest_paper else None  # type: ignore

    def sync_query(self, query: str, years_back: int = 3) -> Dict[str, Any]:
        """Fetch the papers missing from the database for one query.

        If papers for *query* are already stored, the search goes back to one
        day before the newest stored paper; otherwise it goes back
        ``years_back`` years (counted as 365 days each).

        Returns:
            On success a dict with ``query``, ``start_date``, ``total_found``,
            ``new_papers`` and ``saved_papers``. On failure a dict with
            ``query``, ``error`` and ``new_papers`` set to 0.
        """
        output.do(f"同步查询: {query}")

        latest_date = self.get_latest_paper_date_for_query(query)

        if latest_date:
            # Step back one day so that papers are not missed because of
            # timestamp precision.
            start_date = self._as_utc(latest_date) - timedelta(days=1)
            output.debug(f"获取论文从 {start_date.strftime('%Y-%m-%d')} 到现在")
        else:
            start_date = datetime.now(timezone.utc) - timedelta(days=365 * years_back)
            output.debug(f"获取最近 {years_back} 年的论文 ({start_date.strftime('%Y-%m-%d')} 到现在)")

        # No date filter is put into the query itself; the cutoff only makes
        # the result iteration stop early.
        try:
            papers = self.search_arxiv(
                query,
                max_results=self.config.ARXIV_MAX_RESULTS,
                cutoff_date=start_date,
            )
            new_papers = self.filter_new_papers(papers)
            saved = self.save_papers(new_papers, query)

            output.done(f"同步完成: {len(saved)} 篇新论文")
            time.sleep(1)  # Rate limiting.

            return {
                "query": query,
                "start_date": start_date,
                "total_found": len(papers),
                "new_papers": len(saved),
                "saved_papers": saved,
            }

        except Exception as e:
            output.error(f"同步查询失败: {query}", details={"exception": str(e)})
            return {"query": query, "error": str(e), "new_papers": 0}

    def sync_all_queries(self, years_back: int = 3) -> Dict[str, Any]:
        """Run ``sync_query`` for every configured search query.

        Returns:
            A dict with ``total_new_papers``, ``query_results`` (one entry per
            query) and ``years_back``.
        """
        output.do(f"同步所有查询 (回溯 {years_back} 年)")

        all_results = []
        total_new = 0

        for query in self.config.SEARCH_QUERIES:
            result = self.sync_query(query, years_back)
            all_results.append(result)
            total_new += result.get("new_papers", 0)

        output.done(f"同步完成: 共 {total_new} 篇新论文")
        return {
            "total_new_papers": total_new,
            "query_results": all_results,
            "years_back": years_back,
        }

    def sync_important_papers(self) -> Dict[str, Any]:
        """Make sure every paper listed in the important-papers file is stored.

        The file holds one arXiv ID per line, optionally followed by
        whitespace and free text. Blank lines and lines starting with ``#``
        are ignored, and a trailing version suffix such as ``v2`` is dropped.

        Returns:
            A dict with ``total_processed`` (papers added plus errors; papers
            already stored are not counted), ``added`` and ``errors``.
        """
        important_file = self.config.IMPORTANT_PAPERS_FILE
        if not os.path.exists(important_file):
            output.warn(f"重要论文文件未找到: {important_file}")
            return {"total_processed": 0, "added": 0, "errors": []}

        added = 0
        errors = []

        # IDs are plain ASCII; undecodable bytes can only occur in comments or
        # notes, so they are replaced instead of aborting the whole sync.
        with open(important_file, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # The ID is the first whitespace-separated token (spaces or tabs).
                arxiv_id = self._strip_version_suffix(line.split()[0])

                if self.db.paper_exists(arxiv_id):
                    output.debug(f"重要论文已在数据库中: {arxiv_id}")
                    continue

                try:
                    search = arxiv.Search(id_list=[arxiv_id])
                    results = list(self.client.results(search))

                    if results:
                        paper_obj = Paper.from_arxiv_entry(results[0], "important")
                        self.db.add_paper(paper_obj)
                        added += 1
                        output.done(f"添加重要论文: {arxiv_id}")
                    else:
                        errors.append(f"Paper not found on arXiv: {arxiv_id}")

                except Exception as e:
                    errors.append(f"Error fetching paper {arxiv_id}: {e}")

                time.sleep(0.5)  # Rate limiting.

        return {
            "total_processed": added + len(errors),
            "added": added,
            "errors": errors,
        }

    def get_crawler_stats(self) -> Dict[str, Any]:
        """Return paper counts: total, created today, and per search query.

        NOTE(review): "today" starts at local midnight as a naive datetime;
        confirm that this matches how ``Paper.created_at`` is stored.
        """
        with self.db.get_session() as session:
            total = session.query(Paper).count()
            today = datetime.now().date()
            today_start = datetime.combine(today, datetime.min.time())
            today_count = session.query(Paper).filter(Paper.created_at >= today_start).count()

            # Only the search_query column is needed for the tally, so full
            # Paper rows are not loaded.
            by_query: Dict[Any, int] = {}
            for (stored_query,) in session.query(Paper.search_query).all():
                by_query[stored_query] = by_query.get(stored_query, 0) + 1

        return {
            "total_papers": total,
            "papers_today": today_count,
            "papers_by_query": by_query,
        }
345
+
346
+
347
def main():
    """Smoke-test the crawler: run one small search and print database stats."""
    crawler = ArXivCrawler()

    print("Testing arXiv crawler...")
    print(f"Search queries: {Config.SEARCH_QUERIES}")

    # Exercise the search with the first configured query and a tiny limit.
    first_query = Config.SEARCH_QUERIES[0]
    print(f"\nTesting search for: {first_query}")

    found = crawler.search_arxiv(first_query, max_results=5)
    print(f"Found {len(found)} papers")

    if found:
        sample = found[0]
        print("\nSample paper:")
        print(f"Title: {sample.title[:100]}...")
        lead_authors = [author.name for author in sample.authors[:3]]
        print(f"Authors: {lead_authors}")
        print(f"Published: {sample.published}")
        # Fall back to the primary category when no category list is exposed.
        if hasattr(sample, "categories"):
            shown_categories = sample.categories
        else:
            shown_categories = sample.primary_category
        print(f"Categories: {shown_categories}")

    # Summarise what is currently stored in the database.
    stats = crawler.get_crawler_stats()
    print("\nDatabase stats:")
    print(f"Total papers: {stats['total_papers']}")
    print(f"Papers today: {stats['papers_today']}")


if __name__ == "__main__":
    main()