arxiv-pulse 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_pulse/.ENV.TEMPLATE +93 -41
- arxiv_pulse/__version__.py +2 -2
- arxiv_pulse/arxiv_crawler.py +65 -23
- arxiv_pulse/cli.py +228 -433
- arxiv_pulse/config.py +6 -8
- arxiv_pulse/models.py +17 -9
- arxiv_pulse/output_manager.py +38 -54
- arxiv_pulse/report_generator.py +3 -46
- arxiv_pulse/search_engine.py +105 -53
- arxiv_pulse/summarizer.py +0 -1
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/METADATA +61 -124
- arxiv_pulse-0.6.1.dist-info/RECORD +17 -0
- arxiv_pulse-0.5.0.dist-info/RECORD +0 -17
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/WHEEL +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/entry_points.txt +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/top_level.txt +0 -0
arxiv_pulse/.ENV.TEMPLATE
CHANGED
|
@@ -1,72 +1,124 @@
|
|
|
1
1
|
# arXiv Pulse 配置文件模板
|
|
2
2
|
# 将此文件复制为 .env 并进行配置
|
|
3
|
+
# 所有配置项均可通过环境变量覆盖
|
|
3
4
|
|
|
4
|
-
#
|
|
5
|
-
# AI API 配置 (
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
5
|
+
# ============================================================================
|
|
6
|
+
# 1. AI API 配置 (支持所有 OpenAI 兼容服务)
|
|
7
|
+
# ============================================================================
|
|
8
|
+
# 支持 DeepSeek、Paratera AI、OpenAI 等所有 OpenAI 兼容服务
|
|
9
|
+
# 示例: Paratera AI: https://llmapi.paratera.com
|
|
10
|
+
# 示例: DeepSeek: https://api.deepseek.com
|
|
11
|
+
# 示例: OpenAI: https://api.openai.com
|
|
10
12
|
|
|
13
|
+
# AI API 密钥 (必需,用于论文总结和翻译)
|
|
11
14
|
AI_API_KEY=your_api_key_here
|
|
15
|
+
|
|
16
|
+
# AI 模型名称 (默认: DeepSeek-V3.2-Thinking)
|
|
12
17
|
AI_MODEL=DeepSeek-V3.2-Thinking
|
|
18
|
+
|
|
19
|
+
# AI API 基础 URL (默认: Paratera AI)
|
|
13
20
|
AI_BASE_URL=https://llmapi.paratera.com
|
|
14
21
|
|
|
15
22
|
|
|
16
|
-
#
|
|
17
|
-
# 数据库配置
|
|
18
|
-
#
|
|
23
|
+
# ============================================================================
|
|
24
|
+
# 2. 数据库配置
|
|
25
|
+
# ============================================================================
|
|
26
|
+
|
|
27
|
+
# 数据库连接 URL (默认: SQLite)
|
|
19
28
|
DATABASE_URL=sqlite:///data/arxiv_papers.db
|
|
20
29
|
|
|
21
30
|
|
|
22
|
-
#
|
|
23
|
-
# 爬虫配置
|
|
24
|
-
#
|
|
25
|
-
|
|
26
|
-
|
|
31
|
+
# ============================================================================
|
|
32
|
+
# 3. 爬虫配置
|
|
33
|
+
# ============================================================================
|
|
34
|
+
|
|
35
|
+
# 初始同步每个查询的最大论文数 (默认: 10000)
|
|
36
|
+
# 注意: 实际取 MIN(MAX_RESULTS_INITIAL, ARXIV_MAX_RESULTS, 30000)
|
|
37
|
+
MAX_RESULTS_INITIAL=10000
|
|
38
|
+
|
|
39
|
+
# 每日同步每个查询的最大论文数 (默认: 500)
|
|
40
|
+
# 日常更新通常只需要最新论文,因此设置较小值 (注意: 实际取 MIN(MAX_RESULTS_DAILY, ARXIV_MAX_RESULTS))
|
|
41
|
+
MAX_RESULTS_DAILY=500
|
|
42
|
+
|
|
43
|
+
# 爬虫延迟 (秒,避免频繁请求 arXiv API) (默认: 1.0)
|
|
44
|
+
# 注意: 当前代码中未使用此配置,arXiv 客户端使用固定延迟 3.0 秒
|
|
45
|
+
CRAWL_DELAY=1.0
|
|
46
|
+
|
|
47
|
+
# arXiv API 最大返回论文数限制 (默认: 30000)
|
|
48
|
+
# 注意: arXiv API 最大限制为 30000 篇,但实际查询可能受性能影响
|
|
49
|
+
ARXIV_MAX_RESULTS=30000
|
|
50
|
+
|
|
51
|
+
# arXiv API 排序方式 (默认: submittedDate)
|
|
52
|
+
# 可选: submittedDate, lastUpdatedDate, relevance
|
|
53
|
+
ARXIV_SORT_BY=submittedDate
|
|
27
54
|
|
|
55
|
+
# arXiv API 排序顺序 (默认: descending)
|
|
56
|
+
# 可选: descending, ascending
|
|
57
|
+
ARXIV_SORT_ORDER=descending
|
|
28
58
|
|
|
29
|
-
|
|
30
|
-
#
|
|
31
|
-
#
|
|
32
|
-
#
|
|
59
|
+
|
|
60
|
+
# ============================================================================
|
|
61
|
+
# 4. 搜索查询配置
|
|
62
|
+
# ============================================================================
|
|
63
|
+
# 分号分隔的搜索查询列表,允许查询中包含逗号
|
|
33
64
|
# 默认监控凝聚态物理、DFT、机器学习、力场等研究领域
|
|
65
|
+
# 通过交互式配置向导 (pulse init .) 可以自动生成优化查询
|
|
66
|
+
|
|
34
67
|
SEARCH_QUERIES=condensed matter physics AND cat:cond-mat.*; (ti:"density functional" OR abs:"density functional") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"machine learning" OR abs:"machine learning") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"force field" OR abs:"force field") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"first principles" OR abs:"first principles" OR ti:"ab initio" OR abs:"ab initio") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci); (ti:"molecular dynamics" OR abs:"molecular dynamics") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"quantum chemistry" OR abs:"quantum chemistry") AND (cat:physics.chem-ph OR cat:physics.comp-ph); cat:cond-mat.mtrl-sci AND (ti:"computational" OR abs:"computational" OR ti:"simulation" OR abs:"simulation")
|
|
35
68
|
|
|
36
69
|
|
|
37
|
-
#
|
|
38
|
-
# 报告配置
|
|
39
|
-
#
|
|
70
|
+
# ============================================================================
|
|
71
|
+
# 5. 报告配置
|
|
72
|
+
# ============================================================================
|
|
73
|
+
|
|
74
|
+
# 报告输出目录 (默认: reports)
|
|
40
75
|
REPORT_DIR=reports
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
SUMMARY_SENTENCES_LIMIT=3
|
|
44
|
-
TOKEN_PRICE_PER_MILLION=3.0
|
|
76
|
+
|
|
77
|
+
# 报告中包含的最大论文数 (默认: 50)
|
|
45
78
|
REPORT_MAX_PAPERS=50
|
|
46
79
|
|
|
80
|
+
# AI 总结和翻译的最大 token 数 (默认: 2000)
|
|
81
|
+
SUMMARY_MAX_TOKENS=2000
|
|
82
|
+
|
|
83
|
+
# AI API 每百万 token 价格 (人民币,用于费用估算) (默认: 3.0)
|
|
84
|
+
TOKEN_PRICE_PER_MILLION=3.0
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ============================================================================
|
|
88
|
+
# 6. 同步配置
|
|
89
|
+
# ============================================================================
|
|
47
90
|
|
|
48
|
-
#
|
|
49
|
-
|
|
50
|
-
# ========================
|
|
51
|
-
YEARS_BACK=3 # 同步回溯的年数
|
|
52
|
-
IMPORTANT_PAPERS_FILE=important_papers.txt
|
|
91
|
+
# 初始同步回溯的年数 (默认: 3 年)
|
|
92
|
+
YEARS_BACK=3
|
|
53
93
|
|
|
94
|
+
# 重要论文列表文件路径 (默认: data/important_papers.txt)
|
|
95
|
+
IMPORTANT_PAPERS_FILE=data/important_papers.txt
|
|
54
96
|
|
|
55
|
-
|
|
56
|
-
#
|
|
57
|
-
#
|
|
58
|
-
#
|
|
97
|
+
|
|
98
|
+
# ============================================================================
|
|
99
|
+
# 7. 日志配置
|
|
100
|
+
# ============================================================================
|
|
101
|
+
|
|
102
|
+
# 日志级别 (默认: INFO)
|
|
103
|
+
# 可选: DEBUG, INFO, WARNING, ERROR
|
|
59
104
|
LOG_LEVEL=INFO
|
|
60
105
|
|
|
61
|
-
# 爬虫延迟(秒,避免频繁请求 arXiv API)
|
|
62
|
-
CRAWL_DELAY=1.0
|
|
63
106
|
|
|
64
|
-
#
|
|
107
|
+
# ============================================================================
|
|
65
108
|
# 使用说明
|
|
66
|
-
#
|
|
109
|
+
# ============================================================================
|
|
67
110
|
# 1. 将此文件复制为 .env
|
|
68
|
-
# 2. 设置您的 AI API 密钥
|
|
111
|
+
# 2. 设置您的 AI API 密钥 (AI_API_KEY)
|
|
69
112
|
# 3. 根据需要调整其他配置
|
|
70
|
-
# 4. 运行 pulse init . 初始化目录
|
|
113
|
+
# 4. 运行 pulse init . 初始化目录 (推荐使用交互式配置向导)
|
|
71
114
|
# 5. 运行 pulse sync . 同步论文
|
|
72
|
-
# 6. 使用 pulse search "查询内容" . 搜索论文
|
|
115
|
+
# 6. 使用 pulse search "查询内容" . 搜索论文
|
|
116
|
+
# 7. 使用 pulse recent . 生成最近论文报告
|
|
117
|
+
|
|
118
|
+
# 注意: 通过交互式配置向导 (pulse init .) 可以自动优化配置,
|
|
119
|
+
# 包括研究领域选择、智能参数推荐等。
|
|
120
|
+
|
|
121
|
+
# 关于 MAX_RESULTS_INITIAL 和 MAX_RESULTS_DAILY 的区别:
|
|
122
|
+
# - MAX_RESULTS_INITIAL: 首次爬取某个查询时使用的限制,通常需要获取较多历史论文
|
|
123
|
+
# - MAX_RESULTS_DAILY: 日常更新已有查询时使用的限制,通常只需要获取最新论文
|
|
124
|
+
# 两者有不同的使用场景,因此需要分别配置。
|
arxiv_pulse/__version__.py
CHANGED
arxiv_pulse/arxiv_crawler.py
CHANGED
|
@@ -43,11 +43,26 @@ class ArXivCrawler:
|
|
|
43
43
|
cutoff_date: Optional UTC datetime cutoff; papers older than this will be skipped
|
|
44
44
|
and iteration will stop early; early stopping assumes date-sorted results in descending order (the default ARXIV_SORT_BY / ARXIV_SORT_ORDER configuration).
|
|
45
45
|
"""
|
|
46
|
+
# Map sort_by string to arxiv enum
|
|
47
|
+
sort_by_map = {
|
|
48
|
+
"submittedDate": arxiv.SortCriterion.SubmittedDate,
|
|
49
|
+
"lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
|
|
50
|
+
"relevance": arxiv.SortCriterion.Relevance,
|
|
51
|
+
}
|
|
52
|
+
sort_by = sort_by_map.get(Config.ARXIV_SORT_BY, arxiv.SortCriterion.SubmittedDate)
|
|
53
|
+
|
|
54
|
+
# Map sort_order string to arxiv enum
|
|
55
|
+
sort_order_map = {
|
|
56
|
+
"descending": arxiv.SortOrder.Descending,
|
|
57
|
+
"ascending": arxiv.SortOrder.Ascending,
|
|
58
|
+
}
|
|
59
|
+
sort_order = sort_order_map.get(Config.ARXIV_SORT_ORDER, arxiv.SortOrder.Descending)
|
|
60
|
+
|
|
46
61
|
search = arxiv.Search(
|
|
47
62
|
query=query,
|
|
48
63
|
max_results=max_results,
|
|
49
|
-
sort_by=
|
|
50
|
-
sort_order=
|
|
64
|
+
sort_by=sort_by,
|
|
65
|
+
sort_order=sort_order,
|
|
51
66
|
)
|
|
52
67
|
|
|
53
68
|
results = []
|
|
@@ -208,31 +223,51 @@ class ArXivCrawler:
|
|
|
208
223
|
)
|
|
209
224
|
return latest_paper.published if latest_paper else None # type: ignore
|
|
210
225
|
|
|
211
|
-
def sync_query(self, query: str, years_back: int = 3) -> Dict[str, Any]:
|
|
212
|
-
"""Sync papers for a specific query, fetching missing papers from recent years
|
|
213
|
-
output.do(f"同步查询: {query}")
|
|
226
|
+
def sync_query(self, query: str, years_back: int = 3, force: bool = False) -> Dict[str, Any]:
|
|
227
|
+
"""Sync papers for a specific query, fetching missing papers from recent years
|
|
214
228
|
|
|
215
|
-
|
|
216
|
-
|
|
229
|
+
Args:
|
|
230
|
+
query: arXiv search query
|
|
231
|
+
years_back: Number of years to look back
|
|
232
|
+
force: If True, ignore all max results limits (MAX_RESULTS_INITIAL,
|
|
233
|
+
MAX_RESULTS_DAILY, ARXIV_MAX_RESULTS) and use a fixed cap of 30000 results instead; existing papers are still skipped
|
|
234
|
+
"""
|
|
235
|
+
output.do(f"同步查询: {query}" + (" (强制模式)" if force else ""))
|
|
217
236
|
|
|
218
|
-
if
|
|
219
|
-
#
|
|
220
|
-
start_date = latest_date.replace(tzinfo=timezone.utc)
|
|
221
|
-
# 减去一天以确保获取所有可能的新论文,避免因时间精度问题错过论文
|
|
222
|
-
start_date = start_date - timedelta(days=1)
|
|
223
|
-
output.debug(f"获取论文从 {start_date.strftime('%Y-%m-%d')} 到现在")
|
|
224
|
-
else:
|
|
225
|
-
# If no papers, fetch from years_back years ago
|
|
237
|
+
if force:
|
|
238
|
+
# Force mode: always start from years_back years ago, ignore all limits
|
|
226
239
|
start_date = datetime.now(timezone.utc) - timedelta(days=365 * years_back)
|
|
227
|
-
output.debug(f"获取最近 {years_back}
|
|
240
|
+
output.debug(f"强制同步: 获取最近 {years_back} 年的所有论文 ({start_date.strftime('%Y-%m-%d')} 到现在)")
|
|
241
|
+
# Use a very large number to bypass all limits (arxiv API may have its own limits)
|
|
242
|
+
max_results = 30000 # Ignore MAX_RESULTS_INITIAL, MAX_RESULTS_DAILY, and ARXIV_MAX_RESULTS
|
|
243
|
+
else:
|
|
244
|
+
# Normal mode: get latest paper date in database for this query
|
|
245
|
+
latest_date = self.get_latest_paper_date_for_query(query)
|
|
246
|
+
|
|
247
|
+
if latest_date:
|
|
248
|
+
# If we have papers, fetch from latest date onward
|
|
249
|
+
start_date = latest_date.replace(tzinfo=timezone.utc)
|
|
250
|
+
# 减去一天以确保获取所有可能的新论文,避免因时间精度问题错过论文
|
|
251
|
+
start_date = start_date - timedelta(days=1)
|
|
252
|
+
output.debug(f"获取论文从 {start_date.strftime('%Y-%m-%d')} 到现在")
|
|
253
|
+
# Use daily limit, but respect arXiv API maximum
|
|
254
|
+
max_results = min(Config.MAX_RESULTS_DAILY, Config.ARXIV_MAX_RESULTS)
|
|
255
|
+
else:
|
|
256
|
+
# If no papers, fetch from years_back years ago
|
|
257
|
+
start_date = datetime.now(timezone.utc) - timedelta(days=365 * years_back)
|
|
258
|
+
output.debug(f"获取最近 {years_back} 年的论文 ({start_date.strftime('%Y-%m-%d')} 到现在)")
|
|
259
|
+
# Use initial limit, but respect arXiv API maximum
|
|
260
|
+
max_results = min(Config.MAX_RESULTS_INITIAL, Config.ARXIV_MAX_RESULTS)
|
|
228
261
|
|
|
229
262
|
# Search arXiv without date filter (use cutoff_date for early stopping)
|
|
230
263
|
try:
|
|
231
264
|
papers = self.search_arxiv(
|
|
232
265
|
query,
|
|
233
|
-
max_results=
|
|
266
|
+
max_results=max_results,
|
|
234
267
|
cutoff_date=start_date,
|
|
235
268
|
)
|
|
269
|
+
|
|
270
|
+
# Always filter out existing papers (even in force mode)
|
|
236
271
|
new_papers = self.filter_new_papers(papers)
|
|
237
272
|
saved = self.save_papers(new_papers, query)
|
|
238
273
|
|
|
@@ -245,29 +280,36 @@ class ArXivCrawler:
|
|
|
245
280
|
"total_found": len(papers),
|
|
246
281
|
"new_papers": len(saved),
|
|
247
282
|
"saved_papers": saved,
|
|
283
|
+
"force_mode": force,
|
|
248
284
|
}
|
|
249
285
|
|
|
250
286
|
except Exception as e:
|
|
251
287
|
output.error(f"同步查询失败: {query}", details={"exception": str(e)})
|
|
252
|
-
return {"query": query, "error": str(e), "new_papers": 0}
|
|
288
|
+
return {"query": query, "error": str(e), "new_papers": 0, "force_mode": force}
|
|
253
289
|
|
|
254
|
-
def sync_all_queries(self, years_back: int = 3) -> Dict[str, Any]:
|
|
255
|
-
"""Sync all configured search queries
|
|
256
|
-
|
|
290
|
+
def sync_all_queries(self, years_back: int = 3, force: bool = False) -> Dict[str, Any]:
|
|
291
|
+
"""Sync all configured search queries
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
years_back: Number of years to look back
|
|
295
|
+
force: If True, ignore all max results limits but still skip existing papers
|
|
296
|
+
"""
|
|
297
|
+
output.do(f"同步所有查询 (回溯 {years_back} 年)" + (" (强制模式)" if force else ""))
|
|
257
298
|
|
|
258
299
|
all_results = []
|
|
259
300
|
total_new = 0
|
|
260
301
|
|
|
261
302
|
for query in self.config.SEARCH_QUERIES:
|
|
262
|
-
result = self.sync_query(query, years_back)
|
|
303
|
+
result = self.sync_query(query, years_back, force)
|
|
263
304
|
all_results.append(result)
|
|
264
305
|
total_new += result.get("new_papers", 0)
|
|
265
306
|
|
|
266
|
-
output.done(f"同步完成: 共 {total_new}
|
|
307
|
+
output.done(f"同步完成: 共 {total_new} 篇论文")
|
|
267
308
|
return {
|
|
268
309
|
"total_new_papers": total_new,
|
|
269
310
|
"query_results": all_results,
|
|
270
311
|
"years_back": years_back,
|
|
312
|
+
"force_mode": force,
|
|
271
313
|
}
|
|
272
314
|
|
|
273
315
|
def sync_important_papers(self) -> Dict[str, Any]:
|