arxiv-pulse 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_pulse/.ENV.TEMPLATE +72 -0
- arxiv_pulse/__init__.py +26 -0
- arxiv_pulse/__version__.py +33 -0
- arxiv_pulse/arxiv_crawler.py +377 -0
- arxiv_pulse/cli.py +1608 -0
- arxiv_pulse/config.py +64 -0
- arxiv_pulse/models.py +255 -0
- arxiv_pulse/output_manager.py +235 -0
- arxiv_pulse/report_generator.py +768 -0
- arxiv_pulse/search_engine.py +367 -0
- arxiv_pulse/summarizer.py +356 -0
- arxiv_pulse-0.5.0.dist-info/METADATA +546 -0
- arxiv_pulse-0.5.0.dist-info/RECORD +17 -0
- arxiv_pulse-0.5.0.dist-info/WHEEL +5 -0
- arxiv_pulse-0.5.0.dist-info/entry_points.txt +2 -0
- arxiv_pulse-0.5.0.dist-info/licenses/LICENSE +674 -0
- arxiv_pulse-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# arXiv Pulse 配置文件模板
|
|
2
|
+
# 将此文件复制为 .env 并进行配置
|
|
3
|
+
|
|
4
|
+
# ========================
|
|
5
|
+
# AI API 配置 (支持 OpenAI 格式)
|
|
6
|
+
# ========================
|
|
7
|
+
# 使用 OpenAI 格式的 API,支持 DeepSeek、Paratera AI 等
|
|
8
|
+
# 示例:Paratera AI: https://llmapi.paratera.com
|
|
9
|
+
# 示例:DeepSeek: https://api.deepseek.com
|
|
10
|
+
|
|
11
|
+
AI_API_KEY=your_api_key_here
|
|
12
|
+
AI_MODEL=DeepSeek-V3.2-Thinking
|
|
13
|
+
AI_BASE_URL=https://llmapi.paratera.com
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# ========================
|
|
17
|
+
# 数据库配置
|
|
18
|
+
# ========================
|
|
19
|
+
DATABASE_URL=sqlite:///data/arxiv_papers.db
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ========================
|
|
23
|
+
# 爬虫配置
|
|
24
|
+
# ========================
|
|
25
|
+
MAX_RESULTS_INITIAL=100 # init命令每个查询的论文数
|
|
26
|
+
MAX_RESULTS_DAILY=20 # sync命令每个查询的论文数
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ========================
|
|
30
|
+
# 搜索查询配置
|
|
31
|
+
# ========================
|
|
32
|
+
# 分号分隔,允许查询中包含逗号
|
|
33
|
+
# 默认监控凝聚态物理、DFT、机器学习、力场等研究领域
|
|
34
|
+
SEARCH_QUERIES=condensed matter physics AND cat:cond-mat.*; (ti:"density functional" OR abs:"density functional") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"machine learning" OR abs:"machine learning") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"force field" OR abs:"force field") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"first principles" OR abs:"first principles" OR ti:"ab initio" OR abs:"ab initio") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci); (ti:"molecular dynamics" OR abs:"molecular dynamics") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"quantum chemistry" OR abs:"quantum chemistry") AND (cat:physics.chem-ph OR cat:physics.comp-ph); cat:cond-mat.mtrl-sci AND (ti:"computational" OR abs:"computational" OR ti:"simulation" OR abs:"simulation")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ========================
|
|
38
|
+
# 报告配置
|
|
39
|
+
# ========================
|
|
40
|
+
REPORT_DIR=reports
|
|
41
|
+
# SUMMARY_MODEL 现在由 AI_MODEL 控制,不再需要单独设置
|
|
42
|
+
SUMMARY_MAX_TOKENS=2000 # 总结和翻译的最大token数
|
|
43
|
+
SUMMARY_SENTENCES_LIMIT=3
|
|
44
|
+
TOKEN_PRICE_PER_MILLION=3.0
|
|
45
|
+
REPORT_MAX_PAPERS=50
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ========================
|
|
49
|
+
# 同步配置
|
|
50
|
+
# ========================
|
|
51
|
+
YEARS_BACK=3 # 同步回溯的年数
|
|
52
|
+
IMPORTANT_PAPERS_FILE=important_papers.txt
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ========================
|
|
56
|
+
# 可选配置
|
|
57
|
+
# ========================
|
|
58
|
+
# 日志级别: DEBUG, INFO, WARNING, ERROR (默认: INFO)
|
|
59
|
+
LOG_LEVEL=INFO
|
|
60
|
+
|
|
61
|
+
# 爬虫延迟(秒,避免频繁请求 arXiv API)
|
|
62
|
+
CRAWL_DELAY=1.0
|
|
63
|
+
|
|
64
|
+
# ========================
|
|
65
|
+
# 使用说明
|
|
66
|
+
# ========================
|
|
67
|
+
# 1. 将此文件复制为 .env
|
|
68
|
+
# 2. 设置您的 AI API 密钥
|
|
69
|
+
# 3. 根据需要调整其他配置
|
|
70
|
+
# 4. 运行 pulse init . 初始化目录
|
|
71
|
+
# 5. 运行 pulse sync . 同步论文
|
|
72
|
+
# 6. 使用 pulse search "查询内容" . 搜索论文
|
arxiv_pulse/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
arXiv Pulse: An intelligent arXiv literature crawler and analyzer for physics research.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .__version__ import __version__
|
|
6
|
+
|
|
7
|
+
__author__ = "arXiv Pulse Team"
|
|
8
|
+
|
|
9
|
+
from .arxiv_crawler import ArXivCrawler
|
|
10
|
+
from .config import Config
|
|
11
|
+
from .models import Database, Paper, TranslationCache
|
|
12
|
+
from .output_manager import OutputManager, output
|
|
13
|
+
from .report_generator import ReportGenerator
|
|
14
|
+
from .summarizer import PaperSummarizer
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"ArXivCrawler",
|
|
18
|
+
"Config",
|
|
19
|
+
"Database",
|
|
20
|
+
"Paper",
|
|
21
|
+
"TranslationCache",
|
|
22
|
+
"OutputManager",
|
|
23
|
+
"output",
|
|
24
|
+
"ReportGenerator",
|
|
25
|
+
"PaperSummarizer",
|
|
26
|
+
]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
arXiv Pulse version information, read dynamically from the installed package metadata.
"""

import importlib.metadata

try:
    __version__ = importlib.metadata.version("arxiv-pulse")
except importlib.metadata.PackageNotFoundError:
    # The distribution is not installed (e.g. running from a source tree):
    # fall back to a hard-coded default version.
    __version__ = "0.5.0"
    __version_info__ = (0, 5, 0)
else:
    # Build the numeric version tuple from the dotted version string.
    try:
        __version_info__ = tuple(int(piece) for piece in __version__.split("."))
    except ValueError:
        # Some component is not purely numeric (e.g. "0rc1"): keep only the
        # leading run of digits of each component, or 0 when there is none.
        numbers = []
        for piece in __version__.split("."):
            end = 0
            while end < len(piece) and piece[end].isdigit():
                end += 1
            numbers.append(int(piece[:end]) if end else 0)
        __version_info__ = tuple(numbers)
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
import arxiv
|
|
2
|
+
import asyncio
|
|
3
|
+
import aiohttp
|
|
4
|
+
import os
|
|
5
|
+
from typing import List, Dict, Any, Optional
|
|
6
|
+
from datetime import datetime, timedelta, timezone
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
import time
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from arxiv_pulse.models import Database, Paper
|
|
12
|
+
from arxiv_pulse.config import Config
|
|
13
|
+
from arxiv_pulse.output_manager import output
|
|
14
|
+
|
|
15
|
+
# 使用根日志记录器的配置(保留用于向后兼容)
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ArXivCrawler:
    """Fetch papers from the arXiv API and store the new ones in the local database.

    The crawler wraps an ``arxiv.Client`` and a ``Database`` handle. Every public
    crawl/sync method returns a plain ``dict`` summarising what was saved.
    """

    def __init__(self):
        """Create the database handle and the arXiv API client."""
        self.db = Database()
        # Configure the arXiv client so that it respects the API rate limits:
        # 100 results per page, 3 seconds between requests, up to 3 retries.
        self.client = arxiv.Client(page_size=100, delay_seconds=3.0, num_retries=3)
        self.config = Config

        # Suppress verbose logging from third-party libraries.
        logging.getLogger("arxiv").setLevel(logging.WARNING)
        logging.getLogger("httpx").setLevel(logging.WARNING)

    @staticmethod
    def _to_utc(moment: datetime) -> datetime:
        """Return *moment* as a timezone-aware UTC datetime.

        Naive datetimes are assumed to already be expressed in UTC; aware
        datetimes are converted.
        """
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    @staticmethod
    def _strip_version_suffix(arxiv_id: str) -> str:
        """Remove a trailing ``vN`` version marker from an arXiv identifier.

        Only a final ``v`` followed exclusively by digits is treated as a
        version, so identifiers whose archive name contains a "v"
        (e.g. ``solv-int/9901001``) are left intact.
        """
        base, marker, version = arxiv_id.rpartition("v")
        if marker and base and version.isdigit():
            return base
        return arxiv_id

    def search_arxiv(
        self,
        query: str,
        max_results: int = 100,
        days_back: Optional[int] = None,
        cutoff_date: Optional[datetime] = None,
    ) -> "List[arxiv.Result]":
        """Search arXiv for papers matching *query*, newest submissions first.

        Args:
            query: arXiv search query.
            max_results: Maximum number of results to return.
            days_back: Optional number of days to look back (deprecated, use
                cutoff_date). Only used when cutoff_date is not given.
            cutoff_date: Optional datetime cutoff (naive values are treated as
                UTC). Iteration stops at the first paper published before it,
                relying on the descending date order of the results.

        Returns:
            The matching ``arxiv.Result`` objects, at most *max_results* of them.
        """
        if cutoff_date is not None:
            # Normalise so the comparison below never mixes naive and aware values.
            cutoff_date = self._to_utc(cutoff_date)
        elif days_back is not None:
            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)

        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending,
        )

        results = []
        for paper in self.client.results(search):
            published = getattr(paper, "published", None)
            if cutoff_date is not None and published:
                paper_date = self._to_utc(published)
                if paper_date < cutoff_date:
                    output.debug(f"遇到旧论文 ({paper_date.date()}),停止爬取")
                    break

            results.append(paper)

            # Safety: don't exceed max_results even if cutoff_date not reached.
            if max_results is not None and len(results) >= max_results:
                break

        output.debug(f"Found {len(results)} papers for query: {query}")
        return results

    def filter_new_papers(self, papers: "List[arxiv.Result]") -> "List[arxiv.Result]":
        """Return only the papers whose arXiv id is not yet in the database."""
        new_papers = []
        for paper in papers:
            # The id is the last path component of the entry URL.
            arxiv_id = paper.entry_id.split("/")[-1]
            if self.db.paper_exists(arxiv_id):
                output.debug(f"Paper {arxiv_id} already exists in database")
            else:
                new_papers.append(paper)

        output.debug(f"Filtered to {len(new_papers)} new papers")
        return new_papers

    def save_papers(self, papers: "List[arxiv.Result]", search_query: str) -> "List[Paper]":
        """Save *papers* to the database and return the ``Paper`` rows that were added.

        A failure on one paper is reported through ``output.error`` and does
        not abort the remaining papers.
        """
        saved_papers = []
        for paper in tqdm(papers, desc="Saving papers"):
            try:
                # Check again so a paper stored since filtering is not added twice.
                arxiv_id = paper.entry_id.split("/")[-1]
                if self.db.paper_exists(arxiv_id):
                    continue

                paper_obj = Paper.from_arxiv_entry(paper, search_query)
                self.db.add_paper(paper_obj)
                saved_papers.append(paper_obj)

            except Exception as e:
                output.error(
                    "保存论文失败",
                    details={"paper_id": paper.entry_id, "exception": str(e)},
                )

        output.done(f"保存完成: {len(saved_papers)} 篇新论文")
        return saved_papers

    def initial_crawl(self) -> Dict[str, Any]:
        """Run the initial crawl over every configured search query.

        Returns:
            Dict with ``total_saved``, ``queries_searched`` and ``saved_papers``.
        """
        output.do("开始初始爬取")
        all_saved = []

        for query in self.config.SEARCH_QUERIES:
            output.do(f"搜索: {query}")
            try:
                papers = self.search_arxiv(query, max_results=self.config.MAX_RESULTS_INITIAL)
                new_papers = self.filter_new_papers(papers)
                saved = self.save_papers(new_papers, query)
                all_saved.extend(saved)

                output.done(f"保存: {len(saved)} 篇论文")
                time.sleep(1)  # Be nice to arXiv API

            except Exception as e:
                # One failing query must not abort the remaining queries.
                output.error(f"爬取查询失败: {query}", details={"exception": str(e)})

        output.done(f"初始爬取完成: 共保存 {len(all_saved)} 篇论文")
        return {
            "total_saved": len(all_saved),
            "queries_searched": len(self.config.SEARCH_QUERIES),
            "saved_papers": all_saved,
        }

    def daily_update(self) -> Dict[str, Any]:
        """Run the daily update crawl, stopping early at papers older than the window.

        Returns:
            Dict with ``total_saved``, ``queries_searched``, ``date_range`` and
            ``saved_papers``.
        """
        output.do("开始每日更新")
        all_saved = []

        # Use a 2-day window because arXiv usually updates around UTC 00:00-02:00.
        cutoff_date = datetime.now(timezone.utc) - timedelta(days=2)
        output.info(f"查找 {cutoff_date.date()} 之后的新论文")

        for query in self.config.SEARCH_QUERIES:
            output.do(f"搜索: {query}")
            try:
                # cutoff_date lets the search stop as soon as an older paper appears.
                papers = self.search_arxiv(
                    query,
                    max_results=self.config.MAX_RESULTS_DAILY,
                    cutoff_date=cutoff_date,
                )

                output.debug(f"找到 {len(papers)} 篇最近论文")

                new_papers = self.filter_new_papers(papers)
                saved = self.save_papers(new_papers, query)
                all_saved.extend(saved)

                output.done(f"保存: {len(saved)} 篇新论文")
                time.sleep(1)

            except Exception as e:
                # One failing query must not abort the remaining queries.
                output.error(f"每日更新失败: {query}", details={"exception": str(e)})

        output.done(f"每日更新完成: 共保存 {len(all_saved)} 篇新论文")
        return {
            "total_saved": len(all_saved),
            "queries_searched": len(self.config.SEARCH_QUERIES),
            "date_range": f"Since {cutoff_date.date()}",
            "saved_papers": all_saved,
        }

    def crawl_by_categories(self, categories: List[str], max_results: int = 50) -> Dict[str, Any]:
        """Crawl the given arXiv categories, one ``cat:<category>`` query each.

        Args:
            categories: arXiv category identifiers.
            max_results: Maximum number of results fetched per category.

        Returns:
            Dict with ``total_saved``, ``categories`` and ``saved_papers``.
        """
        all_saved = []

        for category in categories:
            query = f"cat:{category}"
            output.do(f"搜索类别: {category}")
            try:
                papers = self.search_arxiv(query, max_results=max_results)
                new_papers = self.filter_new_papers(papers)
                saved = self.save_papers(new_papers, query)
                all_saved.extend(saved)

                output.done(f"保存: {len(saved)} 篇论文")
                time.sleep(1)

            except Exception as e:
                # One failing category must not abort the remaining categories.
                output.error(f"爬取类别失败: {category}", details={"exception": str(e)})

        return {
            "total_saved": len(all_saved),
            "categories": categories,
            "saved_papers": all_saved,
        }

    def get_latest_paper_date_for_query(self, query: str) -> Optional[datetime]:
        """Return the newest ``published`` value stored for *query*, or None if none is stored."""
        with self.db.get_session() as session:
            latest_paper = (
                session.query(Paper).filter(Paper.search_query == query).order_by(Paper.published.desc()).first()
            )
            return latest_paper.published if latest_paper else None  # type: ignore

    def sync_query(self, query: str, years_back: int = 3) -> Dict[str, Any]:
        """Sync one query, fetching papers that are missing from recent years.

        Args:
            query: arXiv search query.
            years_back: How many years to look back when nothing is stored yet
                for this query.

        Returns:
            On success a dict with ``query``, ``start_date``, ``total_found``,
            ``new_papers`` and ``saved_papers``; on failure a dict with
            ``query``, ``error`` and ``new_papers`` set to 0.
        """
        output.do(f"同步查询: {query}")

        # Newest paper already stored for this query (None when there is none).
        latest_date = self.get_latest_paper_date_for_query(query)

        if latest_date:
            # Resume from the newest stored paper, minus one day so papers are
            # not missed because of timestamp precision.
            start_date = self._to_utc(latest_date) - timedelta(days=1)
            output.debug(f"获取论文从 {start_date.strftime('%Y-%m-%d')} 到现在")
        else:
            # Nothing stored yet: go back years_back years.
            start_date = datetime.now(timezone.utc) - timedelta(days=365 * years_back)
            output.debug(f"获取最近 {years_back} 年的论文 ({start_date.strftime('%Y-%m-%d')} 到现在)")

        # The query itself carries no date filter; cutoff_date stops iteration early.
        try:
            papers = self.search_arxiv(
                query,
                max_results=self.config.ARXIV_MAX_RESULTS,
                cutoff_date=start_date,
            )
            new_papers = self.filter_new_papers(papers)
            saved = self.save_papers(new_papers, query)

            output.done(f"同步完成: {len(saved)} 篇新论文")
            time.sleep(1)  # Rate limiting

            return {
                "query": query,
                "start_date": start_date,
                "total_found": len(papers),
                "new_papers": len(saved),
                "saved_papers": saved,
            }

        except Exception as e:
            output.error(f"同步查询失败: {query}", details={"exception": str(e)})
            return {"query": query, "error": str(e), "new_papers": 0}

    def sync_all_queries(self, years_back: int = 3) -> Dict[str, Any]:
        """Sync every configured search query.

        Returns:
            Dict with ``total_new_papers``, the per-query ``query_results`` and
            ``years_back``.
        """
        output.do(f"同步所有查询 (回溯 {years_back} 年)")

        all_results = [self.sync_query(query, years_back) for query in self.config.SEARCH_QUERIES]
        total_new = sum(result.get("new_papers", 0) for result in all_results)

        output.done(f"同步完成: 共 {total_new} 篇新论文")
        return {
            "total_new_papers": total_new,
            "query_results": all_results,
            "years_back": years_back,
        }

    def sync_important_papers(self) -> Dict[str, Any]:
        """Ensure the papers listed in the important-papers file are in the database.

        The file holds one arXiv id per line (``1234.56789`` or
        ``1234.56789v1``), optionally followed by whitespace and free text.
        Blank lines and lines starting with ``#`` are skipped.

        Returns:
            Dict with ``total_processed`` (papers added plus errors), ``added``
            and the list of ``errors``.
        """
        important_file = self.config.IMPORTANT_PAPERS_FILE
        if not os.path.exists(important_file):
            output.warn(f"重要论文文件未找到: {important_file}")
            return {"total_processed": 0, "added": 0, "errors": []}

        added = 0
        errors = []

        # Ids are plain ASCII; decode leniently so notes written in another
        # encoding cannot abort the whole sync.
        with open(important_file, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # The id is the first whitespace-separated token on the line.
                arxiv_id = self._strip_version_suffix(line.split()[0])

                if self.db.paper_exists(arxiv_id):
                    output.debug(f"重要论文已在数据库中: {arxiv_id}")
                    continue

                # Not stored yet: try to fetch it from arXiv by id.
                try:
                    search = arxiv.Search(id_list=[arxiv_id])
                    results = list(self.client.results(search))

                    if results:
                        paper_obj = Paper.from_arxiv_entry(results[0], "important")
                        self.db.add_paper(paper_obj)
                        added += 1
                        output.done(f"添加重要论文: {arxiv_id}")
                    else:
                        errors.append(f"Paper not found on arXiv: {arxiv_id}")

                except Exception as e:
                    errors.append(f"Error fetching paper {arxiv_id}: {e}")

                time.sleep(0.5)  # Rate limiting

        return {
            "total_processed": added + len(errors),
            "added": added,
            "errors": errors,
        }

    def get_crawler_stats(self) -> Dict[str, Any]:
        """Return database statistics for the crawler.

        Returns:
            Dict with ``total_papers``, ``papers_today`` (rows whose
            ``created_at`` is at or after local midnight) and
            ``papers_by_query`` (row count per stored search query).
        """
        with self.db.get_session() as session:
            total = session.query(Paper).count()
            # NOTE(review): local naive midnight is compared with created_at;
            # confirm which timezone created_at is stored in.
            today_start = datetime.combine(datetime.now().date(), datetime.min.time())
            today_count = session.query(Paper).filter(Paper.created_at >= today_start).count()

            # Select only the search_query column instead of loading full rows.
            by_query: Dict[Any, int] = {}
            for (search_query,) in session.query(Paper.search_query):
                by_query[search_query] = by_query.get(search_query, 0) + 1

            return {
                "total_papers": total,
                "papers_today": today_count,
                "papers_by_query": by_query,
            }
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def main():
    """Smoke-test the crawler: run one small search and print database statistics."""
    crawler = ArXivCrawler()

    print("Testing arXiv crawler...")
    print(f"Search queries: {Config.SEARCH_QUERIES}")

    # Exercise the search path with the first configured query only.
    first_query = Config.SEARCH_QUERIES[0]
    print(f"\nTesting search for: {first_query}")

    found = crawler.search_arxiv(first_query, max_results=5)
    print(f"Found {len(found)} papers")

    if found:
        sample = found[0]
        print("\nSample paper:")
        print(f"Title: {sample.title[:100]}...")
        first_authors = [author.name for author in sample.authors[:3]]
        print(f"Authors: {first_authors}")
        print(f"Published: {sample.published}")
        # Fall back to the primary category when the result has no category list.
        if hasattr(sample, "categories"):
            shown_categories = sample.categories
        else:
            shown_categories = sample.primary_category
        print(f"Categories: {shown_categories}")

    # Report what is currently stored in the database.
    stats = crawler.get_crawler_stats()
    print("\nDatabase stats:")
    print(f"Total papers: {stats['total_papers']}")
    print(f"Papers today: {stats['papers_today']}")


if __name__ == "__main__":
    main()
|