arxiv-pulse 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,768 @@
1
+ import json
2
+ import pandas as pd
3
+ from datetime import datetime, timedelta
4
+ import markdown
5
+ from typing import Dict, List, Any, Optional
6
+ import logging
7
+ import os
8
+
9
+ from arxiv_pulse.models import Database, Paper
10
+ from arxiv_pulse.config import Config
11
+ from arxiv_pulse.output_manager import output
12
+
13
+ # Use the root logger's configuration (kept for backward compatibility)
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ReportGenerator:
    """Build daily/weekly arXiv paper reports and save them as Markdown and CSV.

    Papers are read through ``Database``. Titles and abstracts are translated
    through an OpenAI-compatible chat API (configured via ``Config.AI_*``), and
    the running token/cost totals of those calls are tracked on the instance.
    """

    # Markdown reports list at most this many papers.
    MAX_MARKDOWN_PAPERS = 50

    # Longest text (in characters) sent to the translation API in one request.
    MAX_TRANSLATION_CHARS = 3000
    # Bounds for the ``max_tokens`` request parameter. The floor keeps short
    # inputs (titles, single words) from requesting zero or a handful of tokens.
    MIN_TRANSLATION_TOKENS = 256
    MAX_TRANSLATION_TOKENS = 2000

    # Fallback prices (CNY per million tokens) used when no unified price is
    # configured. The values come from the original inline notes.
    INPUT_PRICE_PER_MILLION = 0.14
    OUTPUT_PRICE_PER_MILLION = 0.28
    AVERAGE_PRICE_PER_MILLION = 0.21

    # Sentence terminators: ideographic full stop, full-width ! ? ;, ellipsis,
    # followed by their ASCII counterparts. Written as escapes so the
    # full-width forms cannot be lost to text normalisation.
    SENTENCE_END_CHARS = "\u3002\uff01\uff1f\uff1b\u2026.?!;"

    # Chinese labels for the known report types.
    REPORT_TYPE_LABELS = {"daily": "每日", "weekly": "每周", "recent": "最近", "search": "搜索"}

    # --- relevance scoring tables (all matched case-insensitively) ---------
    # Search queries naming a core research area.
    CORE_QUERY_DOMAINS = (
        "condensed matter physics",
        "density functional theory",
        "first principles calculation",
        "force fields",
        "molecular dynamics",
        "computational materials science",
        "quantum chemistry",
    )
    # Search queries naming a related area.
    RELATED_QUERY_DOMAINS = ("machine learning",)
    # arXiv categories that are on target.
    TARGET_CATEGORIES = ("cond-mat", "physics.comp-ph", "physics.chem-ph", "quant-ph")
    # arXiv categories that are related.
    RELATED_CATEGORIES = ("cs.ai", "cs.lg", "stat.ml", "cs.ne", "physics.data-an")
    # arXiv categories treated as off-topic.
    UNRELATED_CATEGORIES = ("cs.cr", "cs.pl", "cs.se", "cs.cv", "cs.cl")

    # Human-readable labels for arXiv category codes.
    CATEGORY_EXPLANATIONS = {
        # Computer Science
        "cs.AI": "人工智能 (Artificial Intelligence)",
        "cs.CL": "计算语言学 (Computation and Language)",
        "cs.CR": "密码学与安全 (Cryptography and Security)",
        "cs.CV": "计算机视觉 (Computer Vision)",
        "cs.LG": "机器学习 (Machine Learning)",
        "cs.NE": "神经网络 (Neural and Evolutionary Computing)",
        "cs.SE": "软件工程 (Software Engineering)",
        "cs.PL": "编程语言 (Programming Languages)",
        "cs.DC": "分布式计算 (Distributed, Parallel, and Cluster Computing)",
        "cs.DS": "数据结构与算法 (Data Structures and Algorithms)",
        "cs.IT": "信息论 (Information Theory)",
        "cs.SY": "系统与控制 (Systems and Control)",
        # Condensed matter
        "cond-mat": "凝聚态物理 (Condensed Matter)",
        "cond-mat.mtrl-sci": "材料科学 (Materials Science)",
        "cond-mat.str-el": "强关联电子系统 (Strongly Correlated Electrons)",
        "cond-mat.supr-con": "超导 (Superconductivity)",
        "cond-mat.mes-hall": "介观系统与量子霍尔效应 (Mesoscopic Systems and Quantum Hall Effect)",
        "cond-mat.soft": "软凝聚态物质 (Soft Condensed Matter)",
        "cond-mat.dis-nn": "无序系统与神经网络 (Disordered Systems and Neural Networks)",
        "cond-mat.stat-mech": "统计力学 (Statistical Mechanics)",
        "cond-mat.quant-gas": "量子气体 (Quantum Gases)",
        # Other physics
        "physics": "物理学 (Physics)",
        "physics.comp-ph": "计算物理 (Computational Physics)",
        "physics.chem-ph": "化学物理 (Chemical Physics)",
        "physics.data-an": "数据分析 (Data Analysis, Statistics and Probability)",
        "physics.ins-det": "仪器与探测器 (Instrumentation and Detectors)",
        # Mathematics
        "math": "数学 (Mathematics)",
        "math.NA": "数值分析 (Numerical Analysis)",
        "math.OC": "优化与控制 (Optimization and Control)",
        "math.ST": "统计 (Statistics)",
        # Quantitative Biology
        "q-bio": "定量生物学 (Quantitative Biology)",
        "q-bio.BM": "生物分子 (Biomolecules)",
        "q-bio.QM": "定量方法 (Quantitative Methods)",
        # Quantitative Finance
        "q-fin": "定量金融 (Quantitative Finance)",
        # Statistics
        "stat": "统计学 (Statistics)",
        "stat.ML": "机器学习 (Machine Learning)",
        "stat.AP": "应用 (Applications)",
        "stat.CO": "计算 (Computation)",
        "stat.ME": "方法学 (Methodology)",
        "stat.OT": "其他 (Other)",
        "stat.TH": "理论 (Theory)",
    }

    def __init__(self):
        """Open the database and load pricing/summary settings from ``Config``."""
        self.db = Database()
        self.config = Config
        self.total_tokens_used = 0  # tokens consumed since the last reset
        self.total_cost = 0.0  # accumulated cost (CNY) since the last reset
        # Unified price per million tokens; ``None`` selects the fallback prices.
        self.token_price_per_million = Config.TOKEN_PRICE_PER_MILLION
        self.summary_sentences_limit = Config.SUMMARY_SENTENCES_LIMIT

        # Silence the verbose per-request logging of the HTTP client libraries.
        logging.getLogger("httpx").setLevel(logging.WARNING)
        logging.getLogger("httpcore").setLevel(logging.WARNING)

        # Per-instance copy so callers may extend it without touching the class.
        self.category_explanations = dict(self.CATEGORY_EXPLANATIONS)

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    def get_category_explanation(self, category_code: str) -> str:
        """Return readable labels for a comma-separated list of category codes.

        A code without its own label falls back to its top-level archive
        (``math.AG`` -> ``math``) and is rendered as ``"<code> (<label>)"``;
        a code that is unknown altogether is returned as-is. Labels are joined
        with ``"; "``.
        """
        explanations = []
        for cat in (part.strip() for part in category_code.split(",")):
            label = self.category_explanations.get(cat)
            if label is not None:
                explanations.append(label)
                continue

            archive_label = self.category_explanations.get(cat.split(".")[0])
            if archive_label is None:
                explanations.append(cat)
            else:
                # Keep only the Chinese part; strip the space that preceded "(".
                chinese = archive_label.split("(")[0].strip()
                explanations.append(f"{cat} ({chinese})")

        return "; ".join(explanations)

    def clean_json_response(self, text):
        """Strip Markdown code-fence markers from an AI response.

        Returns the content of the first ```` ```json ```` fence, else of the
        first plain fence, else *text* with dangling fence markers removed.
        Falsy input is returned unchanged.
        """
        import re

        if not text:
            return text

        text = text.strip()
        for fence in (r"```json\s*(.*?)\s*```", r"```\s*(.*?)\s*```"):
            match = re.search(fence, text, re.DOTALL)
            if match:
                return match.group(1).strip()

        # No complete fence: peel off an unclosed opener and/or a stray closer.
        if text.startswith("```json"):
            text = text[7:].strip()
        if text.startswith("```"):
            text = text[3:].strip()
        if text.endswith("```"):
            text = text[:-3].strip()
        return text

    def calculate_relevance_score(self, paper) -> int:
        """Rate a paper from 1 to 5 stars from its search query and categories.

        Starts at 3; a core-domain query adds 2, a related-domain query adds 1,
        a target category adds 2 (otherwise a related category adds 1), and an
        off-topic category subtracts 1. A "computational materials science"
        query that landed in an off-topic category is lowered once more, but
        not below 2. The result is clamped to 1..5.
        """
        query_lower = (paper.search_query or "").lower()
        categories_lower = (paper.categories or "").lower()
        is_off_topic = any(cat in categories_lower for cat in self.UNRELATED_CATEGORIES)

        score = 3
        if any(domain in query_lower for domain in self.CORE_QUERY_DOMAINS):
            score += 2
        if any(domain in query_lower for domain in self.RELATED_QUERY_DOMAINS):
            score += 1

        if any(cat in categories_lower for cat in self.TARGET_CATEGORIES):
            score += 2
        elif any(cat in categories_lower for cat in self.RELATED_CATEGORIES):
            score += 1

        if is_off_topic:
            score -= 1
            if "computational materials science" in query_lower:
                score = max(2, score - 1)

        return max(1, min(5, score))

    def _truncate_to_sentences(self, text: str, max_sentences: Optional[int] = None) -> str:
        """Keep the first *max_sentences* sentences of *text* (Chinese or English).

        ``max_sentences`` defaults to ``self.summary_sentences_limit``. When
        the kept part is shorter than 80% of the input, its trailing
        terminators are replaced by a single ellipsis character.
        """
        if not text:
            return ""
        if max_sentences is None:
            max_sentences = self.summary_sentences_limit

        import re

        # The capture group makes re.split return text and terminator runs
        # alternately, so consecutive pairs rebuild whole sentences.
        parts = re.split("([" + self.SENTENCE_END_CHARS + "]+)", text)
        sentences = [body + end for body, end in zip(parts[0::2], parts[1::2])]
        if len(parts) % 2 and parts[-1]:
            sentences.append(parts[-1])  # trailing text without a terminator

        result = "".join(sentences[:max_sentences])
        if len(result) < len(text) * 0.8:
            result = result.rstrip(self.SENTENCE_END_CHARS) + "\u2026"
        return result

    @staticmethod
    def _format_rate(part, total) -> str:
        """Return ``part / total`` as a percentage, or ``"N/A"`` when *total* is 0."""
        if not total:
            return "N/A"
        return f"{part / total:.1%}"

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------
    def translate_text(self, text: str, target_lang: str = "zh") -> str:
        """Translate *text*, preferring the database translation cache.

        Returns the translation, ``""`` for blank input, or a message wrapped
        in ``*...*`` when translation is unavailable or failed. Callers detect
        those messages with ``startswith("*")``; they are never cached.
        """
        if not text or not text.strip():
            return ""

        try:
            # Check the cache first so stored translations remain usable even
            # when the API client library is not installed.
            cached_translation = self.db.get_translation_cache(text, target_lang)
            if cached_translation:
                output.info(f"✓ 缓存命中 ({len(text)} 字符)")
                return cached_translation

            import openai  # noqa: F401 - fail early with a readable message

            if not Config.AI_API_KEY:
                return "*翻译需要配置DeepSeek API密钥*"

            translated = self._translate_with_deepseek(text, target_lang)
            if translated and not translated.startswith("*"):
                self.db.set_translation_cache(text, translated, target_lang)
            return translated

        except ImportError:
            return "*需要安装openai库*"
        except Exception as e:
            output.error("翻译文本失败", details={"exception": str(e)})
            return f"*翻译出错: {str(e)[:100]}*"

    def _translate_with_deepseek(self, text: str, target_lang: str = "zh") -> str:
        """Translate *text* with one chat-completion call and record its cost.

        Text longer than ``MAX_TRANSLATION_CHARS`` is cut and marked as
        truncated. API errors are logged and re-raised.
        """
        import openai

        client = openai.OpenAI(api_key=Config.AI_API_KEY, base_url=Config.AI_BASE_URL)

        if len(text) > self.MAX_TRANSLATION_CHARS:
            text_to_translate = text[: self.MAX_TRANSLATION_CHARS] + "... [文本过长,已截断]"
        else:
            text_to_translate = text

        if target_lang == "zh":
            system_prompt = "你是一个专业的翻译助手。将以下英文文本翻译成中文,保持专业术语准确,语言流畅。"
        else:
            system_prompt = f"你是一个专业的翻译助手。将以下文本翻译成{target_lang},保持专业术语准确,语言流畅。"

        # Scale the budget with the input, but never below the floor: the plain
        # ``len // 2`` is 0 for one-character input and too small for titles.
        max_tokens = max(
            self.MIN_TRANSLATION_TOKENS,
            min(self.MAX_TRANSLATION_TOKENS, len(text_to_translate) // 2),
        )

        try:
            response = client.chat.completions.create(
                model=Config.AI_MODEL or "DeepSeek-V3.2-Thinking",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text_to_translate},
                ],
                max_tokens=max_tokens,
                temperature=0.3,
            )
            self._record_usage(getattr(response, "usage", None), text)

            translated = response.choices[0].message.content
            return translated.strip() if translated else ""

        except Exception as e:
            output.error("DeepSeek翻译失败", details={"exception": str(e)})
            raise

    def _record_usage(self, usage, source_text: str) -> None:
        """Add one call's tokens and cost to the running totals and log them.

        Without usage data from the API the token count is estimated as
        ``len(source_text) // 4 + 500``.
        """
        unified_price = self.token_price_per_million
        if usage:
            tokens = usage.total_tokens
            shown_tokens = str(tokens)
            if unified_price is not None:
                cost = (tokens / 1_000_000) * unified_price
            else:
                cost = (usage.prompt_tokens / 1_000_000) * self.INPUT_PRICE_PER_MILLION + (
                    usage.completion_tokens / 1_000_000
                ) * self.OUTPUT_PRICE_PER_MILLION
        else:
            tokens = len(source_text) // 4 + 500
            shown_tokens = f"~{tokens}"
            price = unified_price if unified_price is not None else self.AVERAGE_PRICE_PER_MILLION
            cost = (tokens / 1_000_000) * price

        self.total_tokens_used += tokens
        self.total_cost += cost
        output.info(
            f"✓ 翻译完成 | 本次: {shown_tokens} tokens ({cost:.4f}¥) | "
            f"累计: {self.total_tokens_used} tokens (¥{self.total_cost:.4f})"
        )

    # ------------------------------------------------------------------
    # Report data
    # ------------------------------------------------------------------
    def generate_daily_report(self) -> Dict[str, Any]:
        """Collect papers stored in the last 24 hours.

        Returns a dict with ``stats``, ``papers``, ``summarized_papers`` and
        ``grouped_papers`` (papers keyed by their search query).
        """
        output.do("生成每日报告")

        with self.db.get_session() as session:
            # NOTE(review): naive UTC; presumably matches how created_at is
            # stored - confirm against the model before switching to aware
            # datetimes.
            cutoff = datetime.utcnow() - timedelta(hours=24)
            new_papers = (
                session.query(Paper)
                .filter(Paper.created_at >= cutoff)
                .order_by(Paper.published.desc())
                .limit(self.config.REPORT_MAX_PAPERS)
                .all()
            )

            summarized = [p for p in new_papers if getattr(p, "summarized", False)]

            by_query: Dict[str, list] = {}
            for paper in new_papers:
                by_query.setdefault(paper.search_query or "Unknown", []).append(paper)

            stats = {
                "total_new": len(new_papers),
                "summarized_new": len(summarized),
                "papers_by_query": {query: len(group) for query, group in by_query.items()},
                "date_generated": datetime.now().isoformat(),
                "report_type": "daily",
            }

            return {
                "stats": stats,
                "papers": new_papers,
                "summarized_papers": summarized,
                "grouped_papers": by_query,
            }

    def generate_weekly_report(self) -> Dict[str, Any]:
        """Collect papers stored in the last 7 days plus database-wide statistics.

        The papers are returned under both ``recent_papers`` and ``papers``;
        the latter is the key the Markdown/CSV writers read.
        """
        output.do("生成每周报告")

        with self.db.get_session() as session:
            cutoff = datetime.utcnow() - timedelta(days=7)
            recent_papers = (
                session.query(Paper)
                .filter(Paper.created_at >= cutoff)
                .order_by(Paper.published.desc())
                .limit(self.config.REPORT_MAX_PAPERS)
                .all()
            )

            db_stats = self.db.get_statistics()
            categories = db_stats.get("categories_distribution", {})
            top_categories = sorted(categories.items(), key=lambda item: item[1], reverse=True)[:10]

            stats = {
                "total_recent": len(recent_papers),
                "database_stats": db_stats,
                "top_categories": dict(top_categories),
                "date_generated": datetime.now().isoformat(),
                "report_type": "weekly",
            }

            return {
                "stats": stats,
                "papers": recent_papers,
                "recent_papers": recent_papers,
                "database_stats": db_stats,
            }

    # ------------------------------------------------------------------
    # Markdown output
    # ------------------------------------------------------------------
    def save_markdown_report(self, report_data: Dict[str, Any], filename: Optional[str] = None) -> str:
        """Render *report_data* as Markdown under ``REPORT_DIR`` and return the path.

        ``report_data["stats"]["report_type"]`` selects the statistics layout.
        Up to ``MAX_MARKDOWN_PAPERS`` entries of ``report_data["papers"]`` are
        rendered with translated title and abstract; a paper that fails to
        render is logged and left out entirely. Token/cost counters are reset
        at the start, so the final totals describe this report only.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            default_type = report_data.get("stats", {}).get("report_type", "report")
            filename = f"{default_type}_{timestamp}.md"
        filepath = os.path.join(self.config.REPORT_DIR, filename)

        self.total_tokens_used = 0
        self.total_cost = 0.0
        output.info("开始生成报告 - token计数已重置")

        stats = report_data["stats"]
        report_type = stats["report_type"]
        type_label = self.REPORT_TYPE_LABELS.get(report_type, report_type)
        generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        sections = [
            "# arXiv 文献报告\n"
            f"**生成时间**: {generated_at}\n"
            f"**报告类型**: {type_label}报告\n"
            "\n"
            "## 统计摘要\n",
            self._format_stats_section(stats),
        ]

        papers = report_data.get("papers")
        if papers:
            sections.append("\n## 新论文\n\n")
            shown = papers[: self.MAX_MARKDOWN_PAPERS]
            for index, paper in enumerate(shown, 1):
                try:
                    sections.append(self._format_paper_entry(paper, index, len(shown)))
                except Exception as e:
                    output.error(
                        f"格式化论文失败: {paper.arxiv_id}",
                        details={"exception": str(e)},
                    )

        sections.append("\n## 建议\n\n")
        if report_type == "daily":
            sections.append("1. 浏览您感兴趣领域的新论文\n2. 查看已总结的论文以快速了解内容\n3. 将相关论文添加到阅读列表\n")
        else:
            sections.append("1. 回顾您研究领域的每周趋势\n2. 从热门分类中识别新兴主题\n3. 规划下周的阅读计划\n")

        report_dir = os.path.dirname(filepath)
        if report_dir:
            os.makedirs(report_dir, exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write("".join(sections))

        if self.total_tokens_used > 0:
            output.info(f"报告生成完成 - 总计: {self.total_tokens_used} tokens (¥{self.total_cost:.4f})")

        output.done(f"报告已保存: {filepath}")
        return filepath

    @staticmethod
    def _format_count_lines(counts: Dict[str, Any]) -> str:
        """Render a ``{name: count}`` mapping as Markdown bullet lines."""
        return "".join(f"- **{name}**: {count} 篇论文\n" for name, count in counts.items())

    def _format_stats_section(self, stats: Dict[str, Any]) -> str:
        """Render the statistics block for the report type named in *stats*."""
        report_type = stats["report_type"]
        db_stats = stats.get("database_stats", {})

        if report_type == "daily":
            total_new = stats["total_new"]
            summarized_new = stats["summarized_new"]
            return (
                "\n"
                f"- **今日新论文**: {total_new}\n"
                f"- **今日已总结**: {summarized_new}\n"
                f"- **总结率**: {self._format_rate(summarized_new, total_new)}\n"
                "\n"
                "### 按搜索查询统计\n"
            ) + self._format_count_lines(stats["papers_by_query"])

        if report_type == "weekly":
            total_papers = stats["database_stats"]["total_papers"]
            summarized_papers = stats["database_stats"]["summarized_papers"]
            return (
                "\n"
                f"- **本周论文**: {stats['total_recent']}\n"
                f"- **数据库总论文**: {total_papers}\n"
                f"- **已总结论文**: {summarized_papers}\n"
                f"- **总体总结率**: {self._format_rate(summarized_papers, total_papers)}\n"
                "\n"
                "### 热门分类\n"
            ) + self._format_count_lines(stats["top_categories"])

        if report_type == "recent":
            return (
                "\n"
                f"- **最近论文**: {stats['total_recent']} (最近 {stats['days_back']} 天)\n"
                f"- **数据库总论文**: {db_stats.get('total_papers', 'N/A')}\n"
                f"- **已总结论文**: {db_stats.get('summarized_papers', 'N/A')}\n"
                "\n"
                "### 热门分类\n"
            ) + self._format_count_lines(stats.get("top_categories", {}))

        if report_type == "search":
            search_terms = stats.get("search_terms", [])
            if isinstance(search_terms, list):
                # Only the first five search terms are shown.
                term_lines = "".join(f"- **{term}**\n" for term in search_terms[:5])
            else:
                term_lines = f"- **{search_terms}**\n"
            return (
                "\n"
                f"- **搜索查询**: {stats.get('original_query', 'N/A')}\n"
                f"- **找到论文**: {stats.get('total_found', 'N/A')}\n"
                f"- **数据库总论文**: {db_stats.get('total_papers', 'N/A')}\n"
                f"- **已总结论文**: {db_stats.get('summarized_papers', 'N/A')}\n"
                "\n"
                "### 搜索词\n"
                + term_lines
                + "\n### 热门分类\n"
                + self._format_count_lines(stats.get("top_categories", {}))
            )

        return ""

    def _parse_summary(self, raw_summary):
        """Parse a stored summary as JSON, retrying after stripping code fences.

        Returns the decoded value, or ``None`` when it is not valid JSON.
        """
        for candidate in (raw_summary, None):
            if candidate is None:
                candidate = self.clean_json_response(raw_summary)
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue
        return None

    def _format_paper_entry(self, paper, index: int, total: int) -> str:
        """Render one paper (metadata, key findings, abstract, links) as Markdown."""
        output.do(f"[{index}/{total}] 处理论文: {paper.arxiv_id}")

        authors = json.loads(paper.authors) if paper.authors else []
        # Author entries are expected to be {"name": ...} dicts; plain values
        # are rendered as-is instead of failing the whole paper.
        author_names = [a.get("name", "") if isinstance(a, dict) else str(a) for a in authors[:3]]
        if len(authors) > 3:
            author_names.append("et al.")

        relevance_score = self.calculate_relevance_score(paper)
        stars = "★" * relevance_score + "☆" * (5 - relevance_score)

        output.do(f"[{index}/{total}] 翻译标题")
        title_translation = self.translate_text(paper.title, "zh")

        # Translated title first, original title in parentheses.
        if title_translation and not title_translation.startswith("*"):
            heading = f"### {title_translation} ({paper.title})\n\n"
        else:
            heading = f"### {paper.title}\n\n"

        published = paper.published.strftime("%Y-%m-%d") if paper.published else "N/A"
        category_explanation = self.get_category_explanation(paper.categories or "")
        lines = [
            heading,
            f"**作者 (Authors)**: {', '.join(author_names)}\n",
            f"**发表日期 (Published)**: {published}\n",
            f"**分类解释 (Categories)**: {category_explanation}\n",
            f"**原始分类代码 (Original Codes)**: {paper.categories}\n",
            f"**搜索查询 (Search Query)**: {paper.search_query}\n",
            f"**相关度评级 (Relevance)**: {stars} ({relevance_score}/5)\n\n",
        ]

        if paper.summary:
            summary_data = self._parse_summary(paper.summary)
            findings = summary_data.get("key_findings") if isinstance(summary_data, dict) else None
            if isinstance(findings, str):
                findings = [findings]
            if findings:
                lines.append("**关键发现 (Key Findings)**:\n")
                lines.extend(f"- {finding}\n" for finding in findings[:5])
                lines.append("\n")

        if paper.abstract:
            lines.append(f"**完整英文摘要 (Full Abstract)**:\n{paper.abstract}\n\n")

            output.do(f"[{index}/{total}] 翻译摘要")
            chinese_translation = self.translate_text(paper.abstract, "zh")
            if chinese_translation and not chinese_translation.startswith("*"):
                lines.append(f"**中文翻译 (Chinese Translation)**:\n{chinese_translation}\n\n")
            elif chinese_translation:
                # A "*...*" status message from translate_text; show it inline.
                lines.append(f"**中文翻译 (Chinese Translation)**: {chinese_translation}\n\n")
            else:
                lines.append("**中文翻译 (Chinese Translation)**: *翻译服务不可用*\n\n")
        else:
            lines.append("**摘要 (Abstract)**: 无摘要\n\n")

        lines.append(f"**arXiv ID**: [{paper.arxiv_id}](https://arxiv.org/abs/{paper.arxiv_id})\n")
        lines.append(f"**PDF**: [下载 (Download)]({paper.pdf_url})\n\n")
        lines.append("---\n\n")
        return "".join(lines)

    # ------------------------------------------------------------------
    # CSV output
    # ------------------------------------------------------------------
    def save_csv_report(self, report_data: Dict[str, Any], filename: Optional[str] = None) -> Optional[str]:
        """Write ``report_data["papers"]`` to a CSV file under ``REPORT_DIR``.

        Returns the file path, or ``None`` (after a warning) when there is no
        paper to write. Papers that cannot be converted are logged and skipped.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_type = report_data.get("stats", {}).get("report_type", "report")
            filename = f"{report_type}_report_{timestamp}.csv"
        filepath = os.path.join(self.config.REPORT_DIR, filename)

        rows = []
        for paper in report_data.get("papers") or []:
            try:
                rows.append(
                    {
                        "arxiv_id": paper.arxiv_id,
                        "title": paper.title,
                        "authors": paper.authors,
                        "published": paper.published.isoformat() if paper.published else None,
                        "categories": paper.categories,
                        "primary_category": paper.primary_category,
                        "search_query": paper.search_query,
                        "summarized": paper.summarized,
                        "pdf_url": paper.pdf_url,
                        "doi": paper.doi,
                        "created_at": paper.created_at.isoformat() if paper.created_at else None,
                    }
                )
            except Exception as e:
                output.error(
                    f"处理论文到CSV失败: {paper.arxiv_id}",
                    details={"exception": str(e)},
                )

        if not rows:
            output.warn("没有论文数据可保存为CSV")
            return None

        # Column headers are written in Chinese.
        chinese_columns = {
            "arxiv_id": "arXiv ID",
            "title": "标题",
            "authors": "作者",
            "published": "发表日期",
            "categories": "分类",
            "primary_category": "主要分类",
            "search_query": "搜索查询",
            "summarized": "已总结",
            "pdf_url": "PDF链接",
            "doi": "DOI",
            "created_at": "创建时间",
        }
        df = pd.DataFrame(rows).rename(columns=chinese_columns)

        report_dir = os.path.dirname(filepath)
        if report_dir:
            os.makedirs(report_dir, exist_ok=True)
        df.to_csv(filepath, index=False, encoding="utf-8")
        output.done(f"CSV报告已保存: {filepath}")
        return filepath

    # ------------------------------------------------------------------
    # Convenience entry points
    # ------------------------------------------------------------------
    def _save_report_files(self, report_data: Dict[str, Any]) -> List[str]:
        """Save *report_data* as Markdown, then CSV; return the paths written."""
        candidates = [
            self.save_markdown_report(report_data),
            self.save_csv_report(report_data),
        ]
        return [path for path in candidates if path]

    def generate_and_save_daily_report(self) -> List[str]:
        """Generate the daily report and return the list of files saved."""
        return self._save_report_files(self.generate_daily_report())

    def generate_and_save_weekly_report(self) -> List[str]:
        """Generate the weekly report and return the list of files saved."""
        return self._save_report_files(self.generate_weekly_report())
736
+
737
+
738
def main():
    """Smoke-test the report generator: build, save and count reports."""
    generator = ReportGenerator()
    print("Testing report generator...")

    # Daily report data only (nothing is written yet).
    print("\nGenerating daily report...")
    daily_stats = generator.generate_daily_report()["stats"]
    print(f"Daily stats: {daily_stats}")

    # Regenerates the daily report and writes the Markdown/CSV files.
    print("\nSaving reports...")
    saved_files = generator.generate_and_save_daily_report()
    print(f"Saved files: {saved_files}")

    # Weekly report data only.
    print("\nGenerating weekly report...")
    weekly_stats = generator.generate_weekly_report()["stats"]
    print(f"Weekly stats: {weekly_stats.get('total_recent', 0)} recent papers")

    # Report how many files the report directory holds, if it exists.
    report_dir = Config.REPORT_DIR
    print(f"\nReport directory: {report_dir}")
    if not os.path.exists(report_dir):
        return
    files = os.listdir(report_dir)
    print(f"Existing reports: {len(files)} files")


if __name__ == "__main__":
    main()