arxiv-pulse 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_pulse/.ENV.TEMPLATE +72 -0
- arxiv_pulse/__init__.py +26 -0
- arxiv_pulse/__version__.py +33 -0
- arxiv_pulse/arxiv_crawler.py +377 -0
- arxiv_pulse/cli.py +1608 -0
- arxiv_pulse/config.py +64 -0
- arxiv_pulse/models.py +255 -0
- arxiv_pulse/output_manager.py +235 -0
- arxiv_pulse/report_generator.py +768 -0
- arxiv_pulse/search_engine.py +367 -0
- arxiv_pulse/summarizer.py +356 -0
- arxiv_pulse-0.5.0.dist-info/METADATA +546 -0
- arxiv_pulse-0.5.0.dist-info/RECORD +17 -0
- arxiv_pulse-0.5.0.dist-info/WHEEL +5 -0
- arxiv_pulse-0.5.0.dist-info/entry_points.txt +2 -0
- arxiv_pulse-0.5.0.dist-info/licenses/LICENSE +674 -0
- arxiv_pulse-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,768 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
import markdown
|
|
5
|
+
from typing import Dict, List, Any, Optional
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
from arxiv_pulse.models import Database, Paper
|
|
10
|
+
from arxiv_pulse.config import Config
|
|
11
|
+
from arxiv_pulse.output_manager import output
|
|
12
|
+
|
|
13
|
+
# 使用根日志记录器的配置(保留用于向后兼容)
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ReportGenerator:
|
|
18
|
+
def __init__(self):
    """Set up storage, cost counters and the arXiv category glossary.

    Creates a ``Database`` handle, keeps a reference to ``Config``, zeroes the
    running token/cost counters, copies the token price and the summary
    sentence limit from ``Config``, raises the ``httpx``/``httpcore`` loggers
    to WARNING, and builds ``self.category_explanations`` (arXiv category
    code -> bilingual label).
    """
    self.db = Database()
    self.config = Config

    # Running totals for the translation calls made while rendering a report.
    self.total_tokens_used = 0
    self.total_cost = 0.0
    # Price per million tokens taken from Config; other methods treat None
    # as "use the built-in per-direction prices".
    self.token_price_per_million = Config.TOKEN_PRICE_PER_MILLION
    # Default number of sentences kept by _truncate_to_sentences().
    self.summary_sentences_limit = Config.SUMMARY_SENTENCES_LIMIT

    # Silence the chatty per-request logging of the HTTP client libraries.
    for noisy_logger in ("httpx", "httpcore"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    computer_science = {
        "cs.AI": "人工智能 (Artificial Intelligence)",
        "cs.CL": "计算语言学 (Computation and Language)",
        "cs.CR": "密码学与安全 (Cryptography and Security)",
        "cs.CV": "计算机视觉 (Computer Vision)",
        "cs.LG": "机器学习 (Machine Learning)",
        "cs.NE": "神经网络 (Neural and Evolutionary Computing)",
        "cs.SE": "软件工程 (Software Engineering)",
        "cs.PL": "编程语言 (Programming Languages)",
        "cs.DC": "分布式计算 (Distributed, Parallel, and Cluster Computing)",
        "cs.DS": "数据结构与算法 (Data Structures and Algorithms)",
        "cs.IT": "信息论 (Information Theory)",
        "cs.SY": "系统与控制 (Systems and Control)",
    }
    condensed_matter = {
        "cond-mat": "凝聚态物理 (Condensed Matter)",
        "cond-mat.mtrl-sci": "材料科学 (Materials Science)",
        "cond-mat.str-el": "强关联电子系统 (Strongly Correlated Electrons)",
        "cond-mat.supr-con": "超导 (Superconductivity)",
        "cond-mat.mes-hall": "介观系统与量子霍尔效应 (Mesoscopic Systems and Quantum Hall Effect)",
        "cond-mat.soft": "软凝聚态物质 (Soft Condensed Matter)",
        "cond-mat.dis-nn": "无序系统与神经网络 (Disordered Systems and Neural Networks)",
        "cond-mat.stat-mech": "统计力学 (Statistical Mechanics)",
        "cond-mat.quant-gas": "量子气体 (Quantum Gases)",
    }
    other_physics = {
        "physics": "物理学 (Physics)",
        "physics.comp-ph": "计算物理 (Computational Physics)",
        "physics.chem-ph": "化学物理 (Chemical Physics)",
        "physics.data-an": "数据分析 (Data Analysis, Statistics and Probability)",
        "physics.ins-det": "仪器与探测器 (Instrumentation and Detectors)",
    }
    mathematics = {
        "math": "数学 (Mathematics)",
        "math.NA": "数值分析 (Numerical Analysis)",
        "math.OC": "优化与控制 (Optimization and Control)",
        "math.ST": "统计 (Statistics)",
    }
    quantitative_biology = {
        "q-bio": "定量生物学 (Quantitative Biology)",
        "q-bio.BM": "生物分子 (Biomolecules)",
        "q-bio.QM": "定量方法 (Quantitative Methods)",
    }
    quantitative_finance = {
        "q-fin": "定量金融 (Quantitative Finance)",
    }
    statistics = {
        "stat": "统计学 (Statistics)",
        "stat.ML": "机器学习 (Machine Learning)",
        "stat.AP": "应用 (Applications)",
        "stat.CO": "计算 (Computation)",
        "stat.ME": "方法学 (Methodology)",
        "stat.OT": "其他 (Other)",
        "stat.TH": "理论 (Theory)",
    }

    # arXiv category code -> "Chinese name (English name)"; merged in the
    # same order the disciplines are listed above.
    self.category_explanations = {
        **computer_science,
        **condensed_matter,
        **other_physics,
        **mathematics,
        **quantitative_biology,
        **quantitative_finance,
        **statistics,
    }
|
|
83
|
+
|
|
84
|
+
def get_category_explanation(self, category_code: str) -> str:
    """Turn a comma-separated list of arXiv category codes into readable labels.

    Each code is looked up in ``self.category_explanations``. A code that is
    not listed falls back to its archive prefix (the part before the first
    dot) and is rendered as ``"<code> (<Chinese archive name>)"``; a code
    whose archive is unknown too is passed through unchanged.

    Args:
        category_code: Codes separated by commas, e.g. ``"cs.AI, cond-mat.other"``.
            ``None`` or an empty string yields an empty result.

    Returns:
        The labels joined with ``"; "``.
    """
    labels = []
    for raw_code in (category_code or "").split(","):
        code = raw_code.strip()
        if not code:
            # Skip blanks from "a,,b" or a trailing comma so the joined
            # result has no dangling separators.
            continue

        exact_label = self.category_explanations.get(code)
        if exact_label is not None:
            labels.append(exact_label)
            continue

        # Fall back to the archive, e.g. "cond-mat" for "cond-mat.other".
        archive = code.split(".", 1)[0]
        archive_label = self.category_explanations.get(archive)
        if archive_label is None:
            labels.append(code)
        else:
            # Labels look like "中文名 (English Name)"; keep only the Chinese
            # part and strip the space that precedes the parenthesis.
            chinese_name = archive_label.split("(")[0].strip()
            labels.append(f"{code} ({chinese_name})")

    return "; ".join(labels)
|
|
103
|
+
|
|
104
|
+
def clean_json_response(self, text):
    """Strip Markdown code-fence markers from an AI reply that should be JSON.

    A complete fenced block is preferred: first one tagged ``json``, then any
    fenced block; the text inside the first match is returned. When no
    complete fence exists, dangling fence markers at the start or end of the
    text are peeled off instead.

    Args:
        text: Raw model reply. Falsy values are returned unchanged.

    Returns:
        The reply with fence markers and surrounding whitespace removed.
    """
    import re

    if not text:
        return text

    stripped = text.strip()

    # Try the tagged fence before the generic one, exactly in this order.
    for fence_pattern in (r"```json\s*(.*?)\s*```", r"```\s*(.*?)\s*```"):
        fenced = re.search(fence_pattern, stripped, re.DOTALL)
        if fenced:
            return fenced.group(1).strip()

    # No closed fence: remove an unterminated opener and/or a stray closer.
    if stripped.startswith("```json"):
        stripped = stripped[len("```json"):].strip()
    if stripped.startswith("```"):
        stripped = stripped[len("```"):].strip()
    if stripped.endswith("```"):
        stripped = stripped[: -len("```")].strip()
    return stripped
|
|
128
|
+
|
|
129
|
+
def calculate_relevance_score(self, paper) -> int:
    """Rate a paper's relevance on a 1-5 star scale.

    The score starts at 3 and is adjusted by case-insensitive substring
    checks on ``paper.search_query`` and ``paper.categories``:

    * +2 if the query names a core topic, +1 if it names a related topic;
    * +2 if the categories contain a target category, otherwise +1 if they
      contain a related category;
    * -1 if the categories contain an unrelated category, and a further -1
      (but never below 2) when the query is "computational materials science".

    Args:
        paper: Object exposing ``search_query`` and ``categories`` (either may
            be ``None``).

    Returns:
        The score clamped to the range 1..5.
    """
    query_text = (paper.search_query or "").lower()
    category_text = (paper.categories or "").lower()

    # Core research topics (condensed matter, DFT, force fields, ...).
    core_topics = (
        "condensed matter physics",
        "density functional theory",
        "first principles calculation",
        "force fields",
        "molecular dynamics",
        "computational materials science",
        "quantum chemistry",
    )
    # Adjacent topics.
    related_topics = ("machine learning",)
    # Categories squarely on target.
    target_codes = ("cond-mat", "physics.comp-ph", "physics.chem-ph", "quant-ph")
    # Categories that are adjacent (ML / AI / data analysis).
    related_codes = ("cs.ai", "cs.lg", "stat.ml", "cs.ne", "physics.data-an")
    # Categories considered off-topic.
    unrelated_codes = ("cs.cr", "cs.pl", "cs.se", "cs.cv", "cs.cl")

    def mentions(haystack, needles):
        return any(needle in haystack for needle in needles)

    stars = 3  # neutral default

    # 1. Search query.
    if mentions(query_text, core_topics):
        stars += 2
    if mentions(query_text, related_topics):
        stars += 1

    # 2. Category match: target beats related, they do not stack.
    if mentions(category_text, target_codes):
        stars += 2
    elif mentions(category_text, related_codes):
        stars += 1

    # 3. Off-topic categories, with an extra penalty for materials-science
    #    queries that landed in a general CS category.
    if mentions(category_text, unrelated_codes):
        stars -= 1
        if "computational materials science" in query_text:
            stars = max(2, stars - 1)

    return max(1, min(5, stars))
|
|
213
|
+
|
|
214
|
+
def _truncate_to_sentences(self, text: str, max_sentences: Optional[int] = None) -> str:
    """Keep only the first few sentences of a Chinese or English text.

    Sentences are delimited by runs of the punctuation characters in the
    split pattern below. When the kept part is shorter than 80% of the
    original, its trailing punctuation is replaced by a single "…".

    Args:
        text: Text to shorten; falsy input yields ``""``.
        max_sentences: Number of sentences to keep. ``None`` means
            ``self.summary_sentences_limit``.

    Returns:
        The (possibly shortened) text.
    """
    if not text:
        return ""

    import re

    limit = self.summary_sentences_limit if max_sentences is None else max_sentences

    # With one capture group re.split() alternates body, delimiter, body, ...
    # and always ends on a (possibly empty) body.
    pieces = re.split(r"([。!?;…\.\?!;]+|\.{3,})", text)
    sentences = [body + stop for body, stop in zip(pieces[0::2], pieces[1::2])]

    # Text after the last delimiter counts as one more, unterminated sentence.
    unterminated_tail = pieces[-1]
    if unterminated_tail:
        sentences.append(unterminated_tail)

    # Fallback when nothing could be split out: cut by length.
    if not sentences:
        return text[:200] + "..." if len(text) > 200 else text

    excerpt = "".join(sentences[:limit])

    # Mark a substantial cut with an ellipsis in place of the final punctuation.
    if len(excerpt) < len(text) * 0.8:
        excerpt = excerpt.rstrip("。!?;….?!;") + "…"

    return excerpt
|
|
254
|
+
|
|
255
|
+
def translate_text(self, text: str, target_lang: str = "zh") -> str:
    """Translate ``text`` through the configured AI API, consulting the cache first.

    Never raises: every failure is reported as a string wrapped in asterisks
    (``"*...*"``), which callers test for with ``startswith("*")``.

    Args:
        text: Text to translate; empty or whitespace-only input yields ``""``.
        target_lang: Target language code, ``"zh"`` by default.

    Returns:
        The cached or freshly produced translation, or an ``"*...*"`` marker
        when the API key is missing, the ``openai`` package is not installed,
        or the request failed.
    """
    if not (text and text.strip()):
        return ""

    try:
        # Imported only to fail fast (ImportError) when the SDK is missing.
        import openai  # noqa: F401

        # 1. Cache lookup.
        cache_hit = self.db.get_translation_cache(text, target_lang)
        if cache_hit:
            output.info(f"✓ 缓存命中 ({len(text)} 字符)")
            return cache_hit

        # 2. A key is required for a live translation.
        if not Config.AI_API_KEY:
            return "*翻译需要配置DeepSeek API密钥*"

        fresh = self._translate_with_deepseek(text, target_lang)

        # 3. Only real translations are cached, never "*...*" markers.
        if fresh and not fresh.startswith("*"):
            self.db.set_translation_cache(text, fresh, target_lang)
        return fresh

    except ImportError:
        return "*需要安装openai库*"
    except Exception as e:
        output.error("翻译文本失败", details={"exception": str(e)})
        return f"*翻译出错: {str(e)[:100]}*"
|
|
284
|
+
|
|
285
|
+
def _translate_with_deepseek(self, text: str, target_lang: str = "zh") -> str:
    """Translate ``text`` with an OpenAI-compatible chat-completions endpoint.

    Uses ``Config.AI_API_KEY`` / ``Config.AI_BASE_URL`` / ``Config.AI_MODEL``.
    Input longer than 3000 characters is truncated before sending. Token
    usage and cost are added to ``self.total_tokens_used`` and
    ``self.total_cost`` and logged through ``output.info``.

    Args:
        text: Text to translate.
        target_lang: Target language code; ``"zh"`` selects the
            English-to-Chinese prompt, anything else is named in the prompt.

    Returns:
        The stripped translation, or ``""`` if the model returned no content.

    Raises:
        Exception: Whatever the client raises; it is logged and re-raised
            with its original traceback.
    """
    import openai

    client = openai.OpenAI(api_key=Config.AI_API_KEY, base_url=Config.AI_BASE_URL)

    # Keep the request within the API's context limits.
    max_chars = 3000
    if len(text) > max_chars:
        text_to_translate = text[:max_chars] + "... [文本过长,已截断]"
    else:
        text_to_translate = text

    if target_lang == "zh":
        system_prompt = "你是一个专业的翻译助手。将以下英文文本翻译成中文,保持专业术语准确,语言流畅。"
    else:
        system_prompt = f"你是一个专业的翻译助手。将以下文本翻译成{target_lang},保持专业术语准确,语言流畅。"

    # Half a token per input character, capped at 2000. The floor matters for
    # short inputs such as titles: len // 2 would be 0 for a one-character
    # text and far too small for a complete translation of a short title.
    min_completion_tokens = 256
    completion_budget = max(min_completion_tokens, min(2000, len(text_to_translate) // 2))

    try:
        response = client.chat.completions.create(
            model=Config.AI_MODEL or "DeepSeek-V3.2-Thinking",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text_to_translate},
            ],
            max_tokens=completion_budget,
            temperature=0.3,
        )

        usage = getattr(response, "usage", None)
        if usage:
            tokens = usage.total_tokens
            approx_mark = ""
            if self.token_price_per_million is not None:
                # Flat user-configured price.
                cost = (tokens / 1_000_000) * self.token_price_per_million
            else:
                # Built-in prices: 0.14 per million input tokens,
                # 0.28 per million output tokens.
                cost = (usage.prompt_tokens / 1_000_000) * 0.14
                cost += (usage.completion_tokens / 1_000_000) * 0.28
        else:
            # No usage block: estimate ~4 characters per token plus a fixed
            # overhead, based on the text that was actually sent.
            tokens = len(text_to_translate) // 4 + 500
            approx_mark = "~"
            price_per_million = (
                self.token_price_per_million if self.token_price_per_million is not None else 0.21
            )  # 0.21 = average of the built-in input/output prices
            cost = (tokens / 1_000_000) * price_per_million

        self.total_tokens_used += tokens
        self.total_cost += cost
        output.info(
            f"✓ 翻译完成 | 本次: {approx_mark}{tokens} tokens ({cost:.4f}¥) | 累计: {self.total_tokens_used} tokens (¥{self.total_cost:.4f})"
        )

        translated = response.choices[0].message.content
        return translated.strip() if translated else ""

    except Exception as e:
        output.error("DeepSeek翻译失败", details={"exception": str(e)})
        # Bare raise keeps the original traceback intact.
        raise
|
|
360
|
+
|
|
361
|
+
def generate_daily_report(self) -> Dict[str, Any]:
    """Collect the papers stored during the last 24 hours.

    Queries ``Paper`` rows whose ``created_at`` is within the last 24 hours
    (UTC), newest publication first, limited to ``Config.REPORT_MAX_PAPERS``.

    Returns:
        A dict with keys ``stats`` (counts, per-query counts, generation
        time, ``report_type == "daily"``), ``papers``, ``summarized_papers``
        and ``grouped_papers`` (papers keyed by search query, ``"Unknown"``
        when a paper has none).
    """
    output.do("生成每日报告")

    with self.db.get_session() as session:
        window_start = datetime.utcnow() - timedelta(hours=24)
        fresh_papers = (
            session.query(Paper)
            .filter(Paper.created_at >= window_start)
            .order_by(Paper.published.desc())
            .limit(self.config.REPORT_MAX_PAPERS)
            .all()
        )

        # Papers that already carry an AI summary.
        with_summary = [
            entry for entry in fresh_papers if getattr(entry, "summarized", False) == True  # noqa: E712
        ]

        # Bucket the papers by the search query that found them.
        papers_per_query = {}
        for entry in fresh_papers:
            papers_per_query.setdefault(entry.search_query or "Unknown", []).append(entry)

        return {
            "stats": {
                "total_new": len(fresh_papers),
                "summarized_new": len(with_summary),
                "papers_by_query": {name: len(group) for name, group in papers_per_query.items()},
                "date_generated": datetime.now().isoformat(),
                "report_type": "daily",
            },
            "papers": fresh_papers,
            "summarized_papers": with_summary,
            "grouped_papers": papers_per_query,
        }
|
|
402
|
+
|
|
403
|
+
def generate_weekly_report(self) -> Dict[str, Any]:
    """Collect the papers stored during the last 7 days plus database totals.

    Queries ``Paper`` rows whose ``created_at`` is within the last 7 days
    (UTC), newest publication first, limited to ``Config.REPORT_MAX_PAPERS``,
    and combines them with ``self.db.get_statistics()``.

    Returns:
        A dict with keys ``stats`` (recent count, database statistics, the
        ten most frequent categories, generation time,
        ``report_type == "weekly"``), ``recent_papers`` and ``database_stats``.
    """
    output.do("生成每周报告")

    with self.db.get_session() as session:
        window_start = datetime.utcnow() - timedelta(days=7)
        week_papers = (
            session.query(Paper)
            .filter(Paper.created_at >= window_start)
            .order_by(Paper.published.desc())
            .limit(self.config.REPORT_MAX_PAPERS)
            .all()
        )

        database_totals = self.db.get_statistics()

        # Ten most frequent categories; sorted() is stable, so ties keep the
        # order in which the statistics listed them.
        distribution = database_totals.get("categories_distribution", {})
        ranked_names = sorted(distribution, key=distribution.get, reverse=True)[:10]
        leading_categories = {name: distribution[name] for name in ranked_names}

        return {
            "stats": {
                "total_recent": len(week_papers),
                "database_stats": database_totals,
                "top_categories": leading_categories,
                "date_generated": datetime.now().isoformat(),
                "report_type": "weekly",
            },
            "recent_papers": week_papers,
            "database_stats": database_totals,
        }
|
|
439
|
+
|
|
440
|
+
def save_markdown_report(self, report_data: Dict[str, Any], filename: Optional[str] = None) -> str:
    """Render a report as Markdown and write it into ``Config.REPORT_DIR``.

    The document consists of a header, a statistics section that depends on
    ``stats["report_type"]`` (``daily``, ``weekly``, ``recent`` or
    ``search``), up to 50 paper entries taken from ``report_data["papers"]``
    (titles and abstracts are translated via ``translate_text``), and a
    closing recommendations section. Token/cost counters are reset at the
    start and logged at the end.

    A paper whose entry cannot be rendered is logged and left out entirely,
    so the report never contains a half-written entry. Summary rates are
    shown as ``N/A`` when the total is zero instead of raising
    ``ZeroDivisionError``.

    Args:
        report_data: Report dict; must contain ``"stats"`` with a
            ``"report_type"`` entry.
        filename: Target file name inside the report directory. Defaults to
            ``"<report_type>_<YYYYmmdd_HHMMSS>.md"``.

    Returns:
        Path of the written file.

    Raises:
        KeyError: If ``report_data`` lacks the statistics required for its
            report type.
        OSError: If the file cannot be written.
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        type_code = report_data.get("stats", {}).get("report_type", "report")
        filename = f"{type_code}_{timestamp}.md"

    filepath = os.path.join(self.config.REPORT_DIR, filename)

    # Each report starts its own token/cost accounting.
    self.total_tokens_used = 0
    self.total_cost = 0.0
    output.info("开始生成报告 - token计数已重置")

    stats = report_data["stats"]
    report_kind = stats["report_type"]

    # Chinese display names of the report types; unknown types are shown as-is.
    kind_labels = {"daily": "每日", "weekly": "每周", "recent": "最近", "search": "搜索"}
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Collect fragments and join once instead of repeated string "+=".
    sections = [
        "# arXiv 文献报告\n"
        f"**生成时间**: {generated_at}\n"
        f"**报告类型**: {kind_labels.get(report_kind, report_kind)}报告\n"
        "\n"
        "## 统计摘要\n",
        self._render_stats_markdown(stats),
    ]

    papers = report_data.get("papers")
    if papers:
        sections.append("\n## 新论文\n\n")
        shown = papers[:50]  # hard cap on the number of detailed entries
        for position, paper in enumerate(shown, 1):
            try:
                sections.append(self._render_paper_markdown(paper, position, len(shown)))
            except Exception as e:
                # Best effort: skip the broken entry, keep the rest.
                output.error(
                    f"格式化论文失败: {paper.arxiv_id}",
                    details={"exception": str(e)},
                )

    sections.append("\n## 建议\n\n")
    if report_kind == "daily":
        sections.append("1. 浏览您感兴趣领域的新论文\n")
        sections.append("2. 查看已总结的论文以快速了解内容\n")
        sections.append("3. 将相关论文添加到阅读列表\n")
    else:
        sections.append("1. 回顾您研究领域的每周趋势\n")
        sections.append("2. 从热门分类中识别新兴主题\n")
        sections.append("3. 规划下周的阅读计划\n")

    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("".join(sections))

    if self.total_tokens_used > 0:
        output.info(f"报告生成完成 - 总计: {self.total_tokens_used} tokens (¥{self.total_cost:.4f})")

    output.done(f"报告已保存: {filepath}")
    return filepath


def _format_ratio(self, numerator, denominator) -> str:
    """Format ``numerator / denominator`` as a percentage, ``"N/A"`` for a zero or missing denominator."""
    if not denominator:
        return "N/A"
    return f"{numerator / denominator:.1%}"


def _render_stats_markdown(self, stats: Dict[str, Any]) -> str:
    """Render the statistics section that follows the report header for ``stats["report_type"]``."""
    kind = stats["report_type"]

    def category_lines(distribution):
        return [f"- **{name}**: {count} 篇论文\n" for name, count in distribution.items()]

    out = []
    if kind == "daily":
        rate = self._format_ratio(stats["summarized_new"], stats["total_new"])
        out.append(
            "\n"
            f"- **今日新论文**: {stats['total_new']}\n"
            f"- **今日已总结**: {stats['summarized_new']}\n"
            f"- **总结率**: {rate}\n"
            "\n"
            "### 按搜索查询统计\n"
        )
        out.extend(f"- **{query}**: {count} 篇论文\n" for query, count in stats["papers_by_query"].items())

    elif kind == "weekly":
        totals = stats["database_stats"]
        rate = self._format_ratio(totals["summarized_papers"], totals["total_papers"])
        out.append(
            "\n"
            f"- **本周论文**: {stats['total_recent']}\n"
            f"- **数据库总论文**: {totals['total_papers']}\n"
            f"- **已总结论文**: {totals['summarized_papers']}\n"
            f"- **总体总结率**: {rate}\n"
            "\n"
            "### 热门分类\n"
        )
        out.extend(category_lines(stats["top_categories"]))

    elif kind == "recent":
        totals = stats.get("database_stats", {})
        out.append(
            "\n"
            f"- **最近论文**: {stats['total_recent']} (最近 {stats['days_back']} 天)\n"
            f"- **数据库总论文**: {totals.get('total_papers', 'N/A')}\n"
            f"- **已总结论文**: {totals.get('summarized_papers', 'N/A')}\n"
            "\n"
            "### 热门分类\n"
        )
        out.extend(category_lines(stats.get("top_categories", {})))

    elif kind == "search":
        totals = stats.get("database_stats", {})
        out.append(
            "\n"
            f"- **搜索查询**: {stats.get('original_query', 'N/A')}\n"
            f"- **找到论文**: {stats.get('total_found', 'N/A')}\n"
            f"- **数据库总论文**: {totals.get('total_papers', 'N/A')}\n"
            f"- **已总结论文**: {totals.get('summarized_papers', 'N/A')}\n"
            "\n"
            "### 搜索词\n"
        )
        search_terms = stats.get("search_terms", [])
        if isinstance(search_terms, list):
            # Only the first five terms are listed.
            out.extend(f"- **{term}**\n" for term in search_terms[:5])
        else:
            out.append(f"- **{search_terms}**\n")
        out.append("\n### 热门分类\n")
        out.extend(category_lines(stats.get("top_categories", {})))

    return "".join(out)


def _render_paper_markdown(self, paper, index: int, total: int) -> str:
    """Render one paper entry (heading, metadata, key findings, abstract, links); may raise on malformed data."""
    output.do(f"[{index}/{total}] 处理论文: {paper.arxiv_id}")

    authors = json.loads(paper.authors) if paper.authors else []
    # Authors are expected to be {"name": ...} dicts; tolerate plain strings.
    author_names = [a.get("name", "") if isinstance(a, dict) else str(a) for a in authors[:3]]
    if len(authors) > 3:
        author_names.append("et al.")

    relevance_score = self.calculate_relevance_score(paper)
    stars = "★" * relevance_score + "☆" * (5 - relevance_score)

    # Heading: Chinese title first, original title in parentheses.
    output.do(f"[{index}/{total}] 翻译标题")
    title_translation = self.translate_text(paper.title, "zh")
    if title_translation and not title_translation.startswith("*"):
        heading = f"### {title_translation} ({paper.title})\n\n"
    else:
        heading = f"### {paper.title}\n\n"

    published = paper.published.strftime("%Y-%m-%d") if paper.published else "N/A"
    category_explanation = self.get_category_explanation(paper.categories or "")

    chunks = [
        heading,
        f"**作者 (Authors)**: {', '.join(author_names)}\n",
        f"**发表日期 (Published)**: {published}\n",
        f"**分类解释 (Categories)**: {category_explanation}\n",
        f"**原始分类代码 (Original Codes)**: {paper.categories}\n",
        f"**搜索查询 (Search Query)**: {paper.search_query}\n",
        f"**相关度评级 (Relevance)**: {stars} ({relevance_score}/5)\n\n",
    ]

    # Key findings from the stored summary, if it is (possibly fenced) JSON.
    if paper.summary:
        summary_data = None
        try:
            summary_data = json.loads(paper.summary)
        except json.JSONDecodeError:
            try:
                summary_data = json.loads(self.clean_json_response(paper.summary))
            except json.JSONDecodeError:
                summary_data = None  # not JSON at all: no findings to show

        # Only a JSON object can carry "key_findings"; ignore lists/strings.
        if isinstance(summary_data, dict) and summary_data.get("key_findings"):
            chunks.append("**关键发现 (Key Findings)**:\n")
            chunks.extend(f"- {finding}\n" for finding in summary_data["key_findings"][:5])
            chunks.append("\n")

    # Full English abstract followed by its Chinese translation.
    if paper.abstract:
        chunks.append(f"**完整英文摘要 (Full Abstract)**:\n{paper.abstract}\n\n")

        output.do(f"[{index}/{total}] 翻译摘要")
        chinese_translation = self.translate_text(paper.abstract, "zh")
        if chinese_translation and not chinese_translation.startswith("*"):
            chunks.append(f"**中文翻译 (Chinese Translation)**:\n{chinese_translation}\n\n")
        elif chinese_translation:
            # An "*...*" marker explaining why no translation is available.
            chunks.append(f"**中文翻译 (Chinese Translation)**: {chinese_translation}\n\n")
        else:
            chunks.append("**中文翻译 (Chinese Translation)**: *翻译服务不可用*\n\n")
    else:
        chunks.append("**摘要 (Abstract)**: 无摘要\n\n")

    chunks.append(f"**arXiv ID**: [{paper.arxiv_id}](https://arxiv.org/abs/{paper.arxiv_id})\n")
    chunks.append(f"**PDF**: [下载 (Download)]({paper.pdf_url})\n\n")
    chunks.append("---\n\n")
    return "".join(chunks)
|
|
638
|
+
|
|
639
|
+
def save_csv_report(self, report_data: Dict[str, Any], filename: Optional[str] = None) -> Optional[str]:
    """Write the papers of a report to a CSV file in ``Config.REPORT_DIR``.

    One row is written per entry of ``report_data["papers"]``; a paper whose
    fields cannot be read is logged and skipped. Column headers are the
    Chinese display names defined below.

    Args:
        report_data: Report dict; ``"stats"]["report_type"`` is needed when
            ``filename`` is omitted.
        filename: Target file name. Defaults to
            ``"<report_type>_report_<YYYYmmdd_HHMMSS>.csv"``.

    Returns:
        Path of the written file, or ``None`` when there were no rows.
    """
    if filename is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        kind = report_data["stats"]["report_type"]
        filename = f"{kind}_report_{stamp}.csv"

    filepath = os.path.join(self.config.REPORT_DIR, filename)

    def to_row(entry):
        # Timestamps are exported as ISO-8601 strings, missing ones as None.
        return {
            "arxiv_id": entry.arxiv_id,
            "title": entry.title,
            "authors": entry.authors,
            "published": entry.published.isoformat() if entry.published else None,
            "categories": entry.categories,
            "primary_category": entry.primary_category,
            "search_query": entry.search_query,
            "summarized": entry.summarized,
            "pdf_url": entry.pdf_url,
            "doi": entry.doi,
            "created_at": entry.created_at.isoformat() if entry.created_at else None,
        }

    rows = []
    for paper in report_data.get("papers", []):
        try:
            rows.append(to_row(paper))
        except Exception as e:
            output.error(
                f"处理论文到CSV失败: {paper.arxiv_id}",
                details={"exception": str(e)},
            )

    if not rows:
        output.warn("没有论文数据可保存为CSV")
        return None

    # Internal field name -> column header shown in the CSV.
    header_names = {
        "arxiv_id": "arXiv ID",
        "title": "标题",
        "authors": "作者",
        "published": "发表日期",
        "categories": "分类",
        "primary_category": "主要分类",
        "search_query": "搜索查询",
        "summarized": "已总结",
        "pdf_url": "PDF链接",
        "doi": "DOI",
        "created_at": "创建时间",
    }

    table = pd.DataFrame(rows).rename(columns=header_names)
    table.to_csv(filepath, index=False, encoding="utf-8")
    output.done(f"CSV报告已保存: {filepath}")
    return filepath
|
|
700
|
+
|
|
701
|
+
def generate_and_save_daily_report(self) -> List[str]:
    """Build the daily report and save it as Markdown and CSV.

    Returns:
        Paths of the files that were actually written, Markdown first; a
        writer that returns a falsy value contributes nothing.
    """
    daily = self.generate_daily_report()

    # The tuple is evaluated left to right: Markdown is written before CSV.
    written = (
        self.save_markdown_report(daily),
        self.save_csv_report(daily),
    )
    return [path for path in written if path]
|
|
718
|
+
|
|
719
|
+
def generate_and_save_weekly_report(self) -> List[str]:
    """Build the weekly report and save it as Markdown and CSV.

    Returns:
        Paths of the files that were actually written, Markdown first; a
        writer that returns a falsy value contributes nothing.
    """
    weekly = self.generate_weekly_report()

    # The tuple is evaluated left to right: Markdown is written before CSV.
    written = (
        self.save_markdown_report(weekly),
        self.save_csv_report(weekly),
    )
    return [path for path in written if path]
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def main():
    """Smoke-test the report generator: build and save a daily report, build a weekly one, list the report directory."""
    generator = ReportGenerator()

    print("Testing report generator...")

    # Daily report: statistics only.
    print("\nGenerating daily report...")
    daily = generator.generate_daily_report()
    print(f"Daily stats: {daily['stats']}")

    # Daily report again, this time written to disk.
    print("\nSaving reports...")
    written = generator.generate_and_save_daily_report()
    print(f"Saved files: {written}")

    # Weekly report: statistics only.
    print("\nGenerating weekly report...")
    weekly = generator.generate_weekly_report()
    recent_count = weekly["stats"].get("total_recent", 0)
    print(f"Weekly stats: {recent_count} recent papers")

    # Show how many files the report directory currently holds.
    report_dir = Config.REPORT_DIR
    print(f"\nReport directory: {report_dir}")
    if os.path.exists(report_dir):
        print(f"Existing reports: {len(os.listdir(report_dir))} files")


if __name__ == "__main__":
    main()
|