arxiv-pulse 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_pulse/.ENV.TEMPLATE +93 -41
- arxiv_pulse/__version__.py +2 -2
- arxiv_pulse/arxiv_crawler.py +65 -23
- arxiv_pulse/cli.py +144 -365
- arxiv_pulse/config.py +6 -8
- arxiv_pulse/models.py +17 -9
- arxiv_pulse/output_manager.py +38 -54
- arxiv_pulse/report_generator.py +3 -46
- arxiv_pulse/search_engine.py +105 -53
- arxiv_pulse/summarizer.py +0 -1
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.0.dist-info}/METADATA +61 -124
- arxiv_pulse-0.6.0.dist-info/RECORD +17 -0
- arxiv_pulse-0.5.0.dist-info/RECORD +0 -17
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.0.dist-info}/WHEEL +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.0.dist-info}/entry_points.txt +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.0.dist-info}/top_level.txt +0 -0
arxiv_pulse/cli.py
CHANGED
|
@@ -10,14 +10,15 @@ from pathlib import Path
|
|
|
10
10
|
import click
|
|
11
11
|
from dotenv import load_dotenv
|
|
12
12
|
import json
|
|
13
|
-
from datetime import datetime, timedelta
|
|
13
|
+
from datetime import datetime, timedelta, timezone
|
|
14
14
|
import questionary
|
|
15
|
+
import wcwidth
|
|
15
16
|
|
|
16
17
|
from arxiv_pulse.config import Config
|
|
17
18
|
from arxiv_pulse.arxiv_crawler import ArXivCrawler
|
|
18
19
|
from arxiv_pulse.summarizer import PaperSummarizer
|
|
19
20
|
from arxiv_pulse.report_generator import ReportGenerator
|
|
20
|
-
from arxiv_pulse.output_manager import output
|
|
21
|
+
from arxiv_pulse.output_manager import output, OutputLevel
|
|
21
22
|
from arxiv_pulse.search_engine import SearchEngine, SearchFilter
|
|
22
23
|
from arxiv_pulse.__version__ import __version__
|
|
23
24
|
|
|
@@ -234,7 +235,6 @@ def setup_environment(directory: Path):
|
|
|
234
235
|
# 创建必要的目录
|
|
235
236
|
os.makedirs("data", exist_ok=True)
|
|
236
237
|
os.makedirs("reports", exist_ok=True)
|
|
237
|
-
os.makedirs("logs", exist_ok=True)
|
|
238
238
|
|
|
239
239
|
# 加载 .env 文件(如果存在)
|
|
240
240
|
env_file = directory / ".env"
|
|
@@ -267,12 +267,14 @@ def setup_environment(directory: Path):
|
|
|
267
267
|
Config.AI_MODEL = os.getenv("AI_MODEL", "DeepSeek-V3.2-Thinking")
|
|
268
268
|
Config.AI_BASE_URL = os.getenv("AI_BASE_URL", "https://llmapi.paratera.com")
|
|
269
269
|
Config.SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", "2000"))
|
|
270
|
-
Config.SUMMARY_SENTENCES_LIMIT = int(os.getenv("SUMMARY_SENTENCES_LIMIT", "3"))
|
|
271
270
|
Config.TOKEN_PRICE_PER_MILLION = float(os.getenv("TOKEN_PRICE_PER_MILLION", "3.0"))
|
|
272
|
-
Config.MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", "
|
|
273
|
-
Config.MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", "
|
|
271
|
+
Config.MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", "10000"))
|
|
272
|
+
Config.MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", "500"))
|
|
274
273
|
Config.YEARS_BACK = int(os.getenv("YEARS_BACK", "3"))
|
|
275
|
-
Config.IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "important_papers.txt")
|
|
274
|
+
Config.IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "data/important_papers.txt")
|
|
275
|
+
Config.ARXIV_MAX_RESULTS = int(os.getenv("ARXIV_MAX_RESULTS", "30000"))
|
|
276
|
+
Config.ARXIV_SORT_BY = os.getenv("ARXIV_SORT_BY", "submittedDate")
|
|
277
|
+
Config.ARXIV_SORT_ORDER = os.getenv("ARXIV_SORT_ORDER", "descending")
|
|
276
278
|
Config.REPORT_MAX_PAPERS = int(os.getenv("REPORT_MAX_PAPERS", "50"))
|
|
277
279
|
|
|
278
280
|
# 更新 SEARCH_QUERIES
|
|
@@ -407,54 +409,74 @@ def print_banner_custom(fields):
|
|
|
407
409
|
else:
|
|
408
410
|
field_str = f"{fields[0]} • {fields[1]} • {fields[2]} • {fields[3]}"
|
|
409
411
|
|
|
410
|
-
#
|
|
411
|
-
# 第一行标题:"arXiv Pulse - 文献追踪系统" (25字符)
|
|
412
|
-
# 需要将字段字符串居中显示
|
|
412
|
+
# 横幅尺寸
|
|
413
413
|
banner_width = 55
|
|
414
414
|
content_width = 53
|
|
415
415
|
|
|
416
|
-
#
|
|
416
|
+
# 辅助函数:计算字符串显示宽度
|
|
417
|
+
def display_width(text):
|
|
418
|
+
return wcwidth.wcswidth(text)
|
|
419
|
+
|
|
420
|
+
# 辅助函数:截断字符串到指定显示宽度,添加省略号
|
|
421
|
+
def truncate_to_width(text, max_width):
|
|
422
|
+
if display_width(text) <= max_width:
|
|
423
|
+
return text
|
|
424
|
+
# 逐步减少字符直到宽度合适
|
|
425
|
+
result = ""
|
|
426
|
+
for char in text:
|
|
427
|
+
if display_width(result + char) > max_width - 3: # 为"..."留出空间
|
|
428
|
+
break
|
|
429
|
+
result += char
|
|
430
|
+
return result + "..." if result else "..." # 至少返回省略号
|
|
431
|
+
|
|
432
|
+
# 创建横幅边框
|
|
417
433
|
border_top = "╔" + "═" * (banner_width - 2) + "╗"
|
|
418
434
|
border_bottom = "╚" + "═" * (banner_width - 2) + "╝"
|
|
419
435
|
|
|
420
436
|
# 第一行标题
|
|
421
437
|
title = "arXiv Pulse - 文献追踪系统"
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
title_line = "║" + " " *
|
|
438
|
+
title_width = display_width(title)
|
|
439
|
+
# 计算左右填充
|
|
440
|
+
left_padding = (content_width - title_width) // 2
|
|
441
|
+
right_padding = content_width - title_width - left_padding
|
|
442
|
+
title_line = "║" + " " * left_padding + title + " " * right_padding + "║"
|
|
427
443
|
|
|
428
444
|
# 第二行字段
|
|
429
|
-
#
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
field_line =
|
|
439
|
-
"║" + " " * field_padding + field_str + " " * (content_width - len(field_str) * 2 - field_padding) + "║"
|
|
440
|
-
)
|
|
445
|
+
# 最大字段显示宽度(留出边距)
|
|
446
|
+
max_field_width = content_width - 4
|
|
447
|
+
# 截断字段字符串如果太长
|
|
448
|
+
field_str = truncate_to_width(field_str, max_field_width)
|
|
449
|
+
field_width = display_width(field_str)
|
|
450
|
+
|
|
451
|
+
# 计算字段行的左右填充
|
|
452
|
+
left_padding = (content_width - field_width) // 2
|
|
453
|
+
right_padding = content_width - field_width - left_padding
|
|
454
|
+
field_line = "║" + " " * left_padding + field_str + " " * right_padding + "║"
|
|
441
455
|
|
|
442
456
|
banner = f"\n{border_top}\n{title_line}\n{field_line}\n{border_bottom}\n"
|
|
443
457
|
click.echo(banner)
|
|
444
458
|
|
|
445
459
|
|
|
446
|
-
def sync_papers(years_back=1, summarize=False):
|
|
447
|
-
"""同步论文(内部函数)
|
|
460
|
+
def sync_papers(years_back=1, summarize=False, force=False):
|
|
461
|
+
"""同步论文(内部函数)
|
|
462
|
+
|
|
463
|
+
Args:
|
|
464
|
+
years_back: 回溯的年数
|
|
465
|
+
summarize: 是否总结新论文
|
|
466
|
+
force: 是否强制同步(重新下载所有论文,忽略重复检查)
|
|
467
|
+
"""
|
|
448
468
|
crawler = ArXivCrawler()
|
|
449
469
|
summarizer = PaperSummarizer()
|
|
450
470
|
|
|
451
|
-
|
|
471
|
+
mode_text = "强制同步" if force else "同步缺失论文"
|
|
472
|
+
click.echo(f"正在{mode_text}(回溯 {years_back} 年)...")
|
|
452
473
|
click.echo("=" * 50)
|
|
453
474
|
|
|
454
475
|
# 同步所有查询
|
|
455
476
|
click.echo("1. 正在同步搜索查询...")
|
|
456
|
-
sync_result = crawler.sync_all_queries(years_back=years_back)
|
|
457
|
-
|
|
477
|
+
sync_result = crawler.sync_all_queries(years_back=years_back, force=force)
|
|
478
|
+
result_text = "处理了" if force else "添加了"
|
|
479
|
+
click.echo(f" 从查询{result_text} {sync_result['total_new_papers']} 篇论文")
|
|
458
480
|
|
|
459
481
|
# 同步重要论文
|
|
460
482
|
click.echo("2. 正在同步重要论文...")
|
|
@@ -480,7 +502,7 @@ def sync_papers(years_back=1, summarize=False):
|
|
|
480
502
|
|
|
481
503
|
click.echo("\n" + "=" * 50)
|
|
482
504
|
click.echo("同步完成!")
|
|
483
|
-
click.echo(f"
|
|
505
|
+
click.echo(f"总共{result_text}论文: {total_new}")
|
|
484
506
|
click.echo(f"数据库现有 {crawl_stats['total_papers']} 篇论文")
|
|
485
507
|
click.echo(f"已总结: {summary_stats['summarized_papers']} ({summary_stats['summarization_rate']:.1%})")
|
|
486
508
|
|
|
@@ -490,12 +512,13 @@ def sync_papers(years_back=1, summarize=False):
|
|
|
490
512
|
"sync_result": sync_result,
|
|
491
513
|
"important_result": important_result,
|
|
492
514
|
"stats": {"crawl_stats": crawl_stats, "summary_stats": summary_stats},
|
|
515
|
+
"force_mode": force,
|
|
493
516
|
}
|
|
494
517
|
|
|
495
518
|
|
|
496
519
|
def get_workday_cutoff(days_back):
|
|
497
520
|
"""计算排除周末的截止日期"""
|
|
498
|
-
current = datetime.
|
|
521
|
+
current = datetime.now(timezone.utc).replace(tzinfo=None)
|
|
499
522
|
workdays_counted = 0
|
|
500
523
|
days_to_go_back = 0
|
|
501
524
|
|
|
@@ -609,6 +632,11 @@ def generate_search_report(query, search_terms, papers, paper_limit=50, summariz
|
|
|
609
632
|
"""生成搜索结果的报告(内部函数)"""
|
|
610
633
|
reporter = ReportGenerator()
|
|
611
634
|
|
|
635
|
+
# 如果没有找到论文,不生成报告
|
|
636
|
+
if not papers:
|
|
637
|
+
output.info("未找到论文,跳过报告生成")
|
|
638
|
+
return []
|
|
639
|
+
|
|
612
640
|
# 设置报告限制
|
|
613
641
|
original_limit = Config.REPORT_MAX_PAPERS
|
|
614
642
|
Config.REPORT_MAX_PAPERS = paper_limit
|
|
@@ -698,10 +726,15 @@ def generate_search_report(query, search_terms, papers, paper_limit=50, summariz
|
|
|
698
726
|
|
|
699
727
|
|
|
700
728
|
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
|
|
729
|
+
@click.option("--verbose", "-v", is_flag=True, help="显示详细输出(包括调试信息)")
|
|
701
730
|
@click.version_option(version=__version__, prog_name="arXiv Pulse")
|
|
702
|
-
def cli():
|
|
731
|
+
def cli(verbose):
|
|
703
732
|
"""arXiv Pulse: 智能arXiv文献追踪和分析系统"""
|
|
704
|
-
|
|
733
|
+
if verbose:
|
|
734
|
+
output.set_min_level(OutputLevel.DEBUG)
|
|
735
|
+
else:
|
|
736
|
+
# 确保使用环境变量中的LOG_LEVEL(output manager已经处理)
|
|
737
|
+
pass
|
|
705
738
|
|
|
706
739
|
|
|
707
740
|
def interactive_configuration():
|
|
@@ -903,9 +936,6 @@ def interactive_configuration():
|
|
|
903
936
|
report_max_papers = click.prompt("每份报告显示的最大论文数", default=50, type=int, show_default=True)
|
|
904
937
|
config["REPORT_MAX_PAPERS"] = str(report_max_papers)
|
|
905
938
|
|
|
906
|
-
summary_sentences_limit = click.prompt("摘要句子数限制", default=3, type=int, show_default=True)
|
|
907
|
-
config["SUMMARY_SENTENCES_LIMIT"] = str(summary_sentences_limit)
|
|
908
|
-
|
|
909
939
|
click.echo("\n✅ 配置完成!")
|
|
910
940
|
return config, int(years_back)
|
|
911
941
|
|
|
@@ -920,7 +950,6 @@ def init(directory, years_back):
|
|
|
920
950
|
# 创建目录结构
|
|
921
951
|
(directory / "data").mkdir(exist_ok=True)
|
|
922
952
|
(directory / "reports").mkdir(exist_ok=True)
|
|
923
|
-
(directory / "logs").mkdir(exist_ok=True)
|
|
924
953
|
|
|
925
954
|
# 创建 .env 文件(如果不存在)
|
|
926
955
|
env_file = directory / ".env"
|
|
@@ -971,7 +1000,6 @@ SEARCH_QUERIES={config.get("SEARCH_QUERIES", 'condensed matter physics AND cat:c
|
|
|
971
1000
|
# ========================
|
|
972
1001
|
REPORT_DIR=reports
|
|
973
1002
|
SUMMARY_MAX_TOKENS=2000 # 总结和翻译的最大token数
|
|
974
|
-
SUMMARY_SENTENCES_LIMIT={config.get("SUMMARY_SENTENCES_LIMIT", "3")}
|
|
975
1003
|
TOKEN_PRICE_PER_MILLION=3.0
|
|
976
1004
|
REPORT_MAX_PAPERS={config.get("REPORT_MAX_PAPERS", "50")}
|
|
977
1005
|
|
|
@@ -1000,10 +1028,12 @@ CRAWL_DELAY=1.0
|
|
|
1000
1028
|
years_back = 5 # 默认值
|
|
1001
1029
|
|
|
1002
1030
|
# 创建 important_papers.txt(如果不存在)
|
|
1003
|
-
important_file = directory /
|
|
1031
|
+
important_file = directory / Config.IMPORTANT_PAPERS_FILE
|
|
1032
|
+
# 确保父目录存在
|
|
1033
|
+
important_file.parent.mkdir(parents=True, exist_ok=True)
|
|
1004
1034
|
if not important_file.exists():
|
|
1005
1035
|
important_file.write_text("# 在此添加重要论文的arXiv ID,每行一个\n")
|
|
1006
|
-
click.echo(f"✅
|
|
1036
|
+
click.echo(f"✅ 已创建重要论文文件: {important_file}")
|
|
1007
1037
|
|
|
1008
1038
|
# 设置环境并验证配置
|
|
1009
1039
|
if not setup_environment(directory):
|
|
@@ -1046,10 +1076,15 @@ CRAWL_DELAY=1.0
|
|
|
1046
1076
|
|
|
1047
1077
|
@cli.command()
|
|
1048
1078
|
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
|
|
1049
|
-
@click.option("--years-back", type=int, default=
|
|
1079
|
+
@click.option("--years-back", type=int, default=None, help="同步回溯的年数(默认:强制模式5年,普通模式1年)")
|
|
1050
1080
|
@click.option("--summarize/--no-summarize", default=False, help="是否总结新论文(默认:否)")
|
|
1051
|
-
|
|
1052
|
-
|
|
1081
|
+
@click.option("--force", is_flag=True, default=False, help="强制同步:重新下载最近N年的所有论文,忽略重复检查")
|
|
1082
|
+
def sync(directory, years_back, summarize, force):
|
|
1083
|
+
"""同步最新论文到数据库
|
|
1084
|
+
|
|
1085
|
+
强制模式(--force): 重新下载最近N年的所有论文,忽略重复检查,默认回溯5年。
|
|
1086
|
+
普通模式: 只下载缺失的新论文,默认回溯1年。
|
|
1087
|
+
"""
|
|
1053
1088
|
directory = Path(directory).resolve()
|
|
1054
1089
|
click.echo(f"正在同步 arXiv Pulse 于 {directory}")
|
|
1055
1090
|
|
|
@@ -1058,8 +1093,13 @@ def sync(directory, years_back, summarize):
|
|
|
1058
1093
|
|
|
1059
1094
|
print_banner()
|
|
1060
1095
|
|
|
1096
|
+
# 设置默认years_back值
|
|
1097
|
+
if years_back is None:
|
|
1098
|
+
years_back = 5 if force else 1
|
|
1099
|
+
click.echo(f"使用默认回溯年数: {years_back} 年")
|
|
1100
|
+
|
|
1061
1101
|
# 同步论文
|
|
1062
|
-
sync_result = sync_papers(years_back=years_back, summarize=summarize)
|
|
1102
|
+
sync_result = sync_papers(years_back=years_back, summarize=summarize, force=force)
|
|
1063
1103
|
|
|
1064
1104
|
click.echo("\n" + "=" * 50)
|
|
1065
1105
|
click.echo("同步完成!数据库已更新。")
|
|
@@ -1073,8 +1113,19 @@ def sync(directory, years_back, summarize):
|
|
|
1073
1113
|
@click.option("--use-ai/--no-ai", default=True, help="是否使用AI理解自然语言查询(默认:是)")
|
|
1074
1114
|
@click.option("--summarize/--no-summarize", default=True, help="是否自动总结未总结的论文(默认:是)")
|
|
1075
1115
|
@click.option("--max-summarize", type=int, default=0, help="最大总结论文数(默认:0表示无限制)")
|
|
1076
|
-
|
|
1077
|
-
|
|
1116
|
+
@click.option("--categories", "-c", multiple=True, help="包含的分类(可多次使用)")
|
|
1117
|
+
@click.option("--days-back", type=int, help="回溯天数(例如:30表示最近30天)")
|
|
1118
|
+
@click.option("--authors", "-a", multiple=True, help="作者姓名(可多次使用)")
|
|
1119
|
+
@click.option(
|
|
1120
|
+
"--sort-by",
|
|
1121
|
+
type=click.Choice(["published", "relevance_score", "title", "updated"]),
|
|
1122
|
+
default="published",
|
|
1123
|
+
help="排序字段",
|
|
1124
|
+
)
|
|
1125
|
+
def search(
|
|
1126
|
+
query, directory, limit, years_back, use_ai, summarize, max_summarize, categories, days_back, authors, sort_by
|
|
1127
|
+
):
|
|
1128
|
+
"""智能搜索论文(支持自然语言查询和基本过滤)"""
|
|
1078
1129
|
directory = Path(directory).resolve()
|
|
1079
1130
|
|
|
1080
1131
|
if not setup_environment(directory):
|
|
@@ -1086,7 +1137,7 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1086
1137
|
crawler = ArXivCrawler()
|
|
1087
1138
|
if years_back > 0:
|
|
1088
1139
|
click.echo(f"搜索前先同步最近 {years_back} 年论文...")
|
|
1089
|
-
sync_result = sync_papers(years_back=years_back, summarize=False)
|
|
1140
|
+
sync_result = sync_papers(years_back=years_back, summarize=False, force=False)
|
|
1090
1141
|
crawler = sync_result["crawler"]
|
|
1091
1142
|
|
|
1092
1143
|
click.echo(f"\n正在搜索: '{query}'")
|
|
@@ -1105,10 +1156,22 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1105
1156
|
用户正在搜索arXiv物理/计算材料科学论文,查询是: "{query}"
|
|
1106
1157
|
|
|
1107
1158
|
请将自然语言查询转换为适合arXiv搜索的关键词或短语。
|
|
1108
|
-
考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学。
|
|
1109
1159
|
|
|
1110
|
-
|
|
1111
|
-
|
|
1160
|
+
重要规则:
|
|
1161
|
+
1. 如果查询已经是明确的搜索词(如"DeepH"、"deep learning Hamiltonian"、"DFT计算"),直接使用它,不要添加同义词
|
|
1162
|
+
2. 如果查询包含专业术语、缩写或专有名词,保持原样作为主要搜索词
|
|
1163
|
+
3. 仅当查询非常模糊或一般性时(如"机器学习在材料科学中的应用"),才生成1-2个相关关键词
|
|
1164
|
+
4. 优先保持查询的原始意图,不要添加不相关的关键词
|
|
1165
|
+
5. 对于英文查询,保持原样;对于中文查询,翻译为英文关键词
|
|
1166
|
+
考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学、计算材料科学。
|
|
1167
|
+
|
|
1168
|
+
返回格式:JSON数组,包含1-2个搜索关键词/短语。
|
|
1169
|
+
示例:
|
|
1170
|
+
- 查询"DeepH": ["DeepH"]
|
|
1171
|
+
- 查询"deep learning Hamiltonian": ["deep learning Hamiltonian"]
|
|
1172
|
+
- 查询"DFT计算": ["DFT"]
|
|
1173
|
+
- 查询"分子动力学模拟": ["molecular dynamics simulation"]
|
|
1174
|
+
- 查询"机器学习在材料科学中的应用": ["machine learning materials science"]
|
|
1112
1175
|
|
|
1113
1176
|
只返回JSON数组,不要其他文本。
|
|
1114
1177
|
"""
|
|
@@ -1116,7 +1179,10 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1116
1179
|
response = client.chat.completions.create(
|
|
1117
1180
|
model=Config.AI_MODEL,
|
|
1118
1181
|
messages=[
|
|
1119
|
-
{
|
|
1182
|
+
{
|
|
1183
|
+
"role": "system",
|
|
1184
|
+
"content": "你是arXiv论文搜索助手,擅长识别专业术语并将自然语言查询转换为学术搜索关键词。",
|
|
1185
|
+
},
|
|
1120
1186
|
{"role": "user", "content": ai_prompt},
|
|
1121
1187
|
],
|
|
1122
1188
|
max_tokens=200,
|
|
@@ -1141,30 +1207,30 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1141
1207
|
with crawler.db.get_session() as session:
|
|
1142
1208
|
from arxiv_pulse.models import Paper
|
|
1143
1209
|
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
Paper.title.contains(term)
|
|
1150
|
-
| Paper.abstract.contains(term)
|
|
1151
|
-
| Paper.categories.contains(term)
|
|
1152
|
-
| Paper.search_query.contains(term)
|
|
1153
|
-
)
|
|
1154
|
-
.order_by(Paper.published.desc())
|
|
1155
|
-
.limit(limit)
|
|
1156
|
-
.all()
|
|
1157
|
-
)
|
|
1158
|
-
all_results.extend(papers)
|
|
1210
|
+
# 使用增强搜索引擎进行模糊搜索
|
|
1211
|
+
search_engine = SearchEngine(session)
|
|
1212
|
+
|
|
1213
|
+
# 将搜索词合并为一个查询(搜索引擎会处理单词拆分和同义词扩展)
|
|
1214
|
+
combined_query = " ".join(search_terms)
|
|
1159
1215
|
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
if
|
|
1164
|
-
|
|
1216
|
+
filter_config = SearchFilter(
|
|
1217
|
+
query=combined_query,
|
|
1218
|
+
search_fields=["title", "abstract"],
|
|
1219
|
+
categories=list(categories) if categories else None,
|
|
1220
|
+
authors=list(authors) if authors else None,
|
|
1221
|
+
author_match="contains", # 默认使用包含匹配
|
|
1222
|
+
days_back=days_back,
|
|
1223
|
+
limit=limit * min(len(search_terms), 2), # 扩大限制但最多2倍,避免过多结果
|
|
1224
|
+
sort_by=sort_by,
|
|
1225
|
+
sort_order="desc",
|
|
1226
|
+
match_all=True, # AND逻辑:匹配所有搜索词
|
|
1227
|
+
)
|
|
1165
1228
|
|
|
1166
|
-
|
|
1167
|
-
papers_to_show =
|
|
1229
|
+
# 执行搜索
|
|
1230
|
+
papers_to_show = search_engine.search_papers(filter_config)
|
|
1231
|
+
|
|
1232
|
+
# 确保不超过限制
|
|
1233
|
+
papers_to_show = papers_to_show[:limit]
|
|
1168
1234
|
|
|
1169
1235
|
click.echo(f"找到 {len(papers_to_show)} 篇论文:")
|
|
1170
1236
|
|
|
@@ -1219,7 +1285,7 @@ def recent(directory, limit, days_back, years_back, summarize, max_summarize):
|
|
|
1219
1285
|
# 先同步论文
|
|
1220
1286
|
if years_back > 0:
|
|
1221
1287
|
click.echo(f"报告前先同步最近 {years_back} 年论文...")
|
|
1222
|
-
sync_papers(years_back=years_back, summarize=False)
|
|
1288
|
+
sync_papers(years_back=years_back, summarize=False, force=False)
|
|
1223
1289
|
|
|
1224
1290
|
# 生成报告
|
|
1225
1291
|
click.echo("\n" + "=" * 50)
|
|
@@ -1294,8 +1360,6 @@ def stat(directory):
|
|
|
1294
1360
|
# 时间分布
|
|
1295
1361
|
click.echo(f"\n📅 时间分布:")
|
|
1296
1362
|
with crawler.db.get_session() as session:
|
|
1297
|
-
from datetime import datetime, timedelta
|
|
1298
|
-
|
|
1299
1363
|
# 按年统计
|
|
1300
1364
|
year_stats = {}
|
|
1301
1365
|
for paper in papers:
|
|
@@ -1319,290 +1383,5 @@ def stat(directory):
|
|
|
1319
1383
|
click.echo("统计完成 ✅")
|
|
1320
1384
|
|
|
1321
1385
|
|
|
1322
|
-
@cli.command()
|
|
1323
|
-
@click.argument("paper_id")
|
|
1324
|
-
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
|
|
1325
|
-
@click.option("--limit", default=10, help="返回结果的最大数量(默认:10)")
|
|
1326
|
-
@click.option("--threshold", type=float, default=0.5, help="相似度阈值(0.0-1.0,默认:0.5)")
|
|
1327
|
-
@click.option("--years-back", type=int, default=0, help="搜索前同步回溯的年数(默认:0,不更新)")
|
|
1328
|
-
def similar(paper_id, directory, limit, threshold, years_back):
|
|
1329
|
-
"""查找与指定论文相似的论文"""
|
|
1330
|
-
directory = Path(directory).resolve()
|
|
1331
|
-
|
|
1332
|
-
if not setup_environment(directory):
|
|
1333
|
-
sys.exit(1)
|
|
1334
|
-
|
|
1335
|
-
print_banner()
|
|
1336
|
-
|
|
1337
|
-
# 如果需要,先同步最新论文
|
|
1338
|
-
crawler = ArXivCrawler()
|
|
1339
|
-
if years_back > 0:
|
|
1340
|
-
click.echo(f"搜索前先同步最近 {years_back} 年论文...")
|
|
1341
|
-
sync_result = sync_papers(years_back=years_back, summarize=False)
|
|
1342
|
-
crawler = sync_result["crawler"]
|
|
1343
|
-
|
|
1344
|
-
click.echo(f"\n查找与论文 '{paper_id}' 相似的论文")
|
|
1345
|
-
click.echo("=" * 50)
|
|
1346
|
-
|
|
1347
|
-
with crawler.db.get_session() as session:
|
|
1348
|
-
# 创建搜索引擎
|
|
1349
|
-
search_engine = SearchEngine(session)
|
|
1350
|
-
|
|
1351
|
-
# 查找相似论文
|
|
1352
|
-
click.echo(f"正在查找相似度≥{threshold}的论文...")
|
|
1353
|
-
similar_papers_with_scores = search_engine.search_similar_papers(paper_id, limit=limit, threshold=threshold)
|
|
1354
|
-
|
|
1355
|
-
if not similar_papers_with_scores:
|
|
1356
|
-
click.echo("未找到相似论文。")
|
|
1357
|
-
return
|
|
1358
|
-
|
|
1359
|
-
click.echo(f"找到 {len(similar_papers_with_scores)} 篇相似论文:")
|
|
1360
|
-
|
|
1361
|
-
# 提取paper列表用于报告生成
|
|
1362
|
-
similar_papers = [paper for paper, _ in similar_papers_with_scores]
|
|
1363
|
-
|
|
1364
|
-
# 显示结果
|
|
1365
|
-
for i, (paper, similarity) in enumerate(similar_papers_with_scores, 1):
|
|
1366
|
-
authors = json.loads(paper.authors) if paper.authors else []
|
|
1367
|
-
author_names = [a.get("name", "") for a in authors[:2]]
|
|
1368
|
-
if len(authors) > 2:
|
|
1369
|
-
author_names.append("等")
|
|
1370
|
-
|
|
1371
|
-
click.echo(f"\n{i}. {paper.title}")
|
|
1372
|
-
click.echo(f" 相似度: {similarity:.2f}")
|
|
1373
|
-
click.echo(f" 作者: {', '.join(author_names)}")
|
|
1374
|
-
click.echo(f" arXiv ID: {paper.arxiv_id}")
|
|
1375
|
-
click.echo(f" 分类: {paper.categories}")
|
|
1376
|
-
click.echo(f" 发布日期: {paper.published.strftime('%Y-%m-%d') if paper.published else 'N/A'}")
|
|
1377
|
-
|
|
1378
|
-
# 生成报告
|
|
1379
|
-
click.echo("\n正在生成相似论文报告...")
|
|
1380
|
-
report_files = generate_search_report(
|
|
1381
|
-
f"与 {paper_id} 相似的论文", [f"similar to {paper_id}"], similar_papers, paper_limit=limit
|
|
1382
|
-
)
|
|
1383
|
-
|
|
1384
|
-
click.echo(f"报告生成完成:")
|
|
1385
|
-
for f in report_files:
|
|
1386
|
-
click.echo(f" - {f}")
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
@cli.command()
|
|
1390
|
-
@click.argument("query")
|
|
1391
|
-
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
|
|
1392
|
-
@click.option("--limit", default=20, help="返回结果的最大数量(默认:20)")
|
|
1393
|
-
@click.option("--years-back", type=int, default=0, help="搜索前同步回溯的年数(默认:0,不更新)")
|
|
1394
|
-
@click.option("--use-ai/--no-ai", default=True, help="是否使用AI理解自然语言查询(默认:是)")
|
|
1395
|
-
@click.option("--categories", "-c", multiple=True, help="包含的分类(可多次使用)")
|
|
1396
|
-
@click.option("--exclude-categories", "-ec", multiple=True, help="排除的分类(可多次使用)")
|
|
1397
|
-
@click.option("--primary-category", "-pc", help="主要分类")
|
|
1398
|
-
@click.option("--authors", "-a", multiple=True, help="作者姓名(可多次使用)")
|
|
1399
|
-
@click.option(
|
|
1400
|
-
"--author-match",
|
|
1401
|
-
type=click.Choice(["contains", "exact", "any"]),
|
|
1402
|
-
default="contains",
|
|
1403
|
-
help="作者匹配方式:contains(包含)、exact(精确)、any(任一)",
|
|
1404
|
-
)
|
|
1405
|
-
@click.option("--date-from", type=click.DateTime(formats=["%Y-%m-%d"]), help="起始日期(格式:YYYY-MM-DD)")
|
|
1406
|
-
@click.option("--date-to", type=click.DateTime(formats=["%Y-%m-%d"]), help="结束日期(格式:YYYY-MM-DD)")
|
|
1407
|
-
@click.option("--days-back", type=int, help="回溯天数(例如:30表示最近30天)")
|
|
1408
|
-
@click.option("--summarized-only/--no-summarized-only", default=False, help="仅显示已总结的论文")
|
|
1409
|
-
@click.option("--downloaded-only/--no-downloaded-only", default=False, help="仅显示已下载的论文")
|
|
1410
|
-
@click.option(
|
|
1411
|
-
"--sort-by",
|
|
1412
|
-
type=click.Choice(["published", "relevance_score", "title", "updated", "created_at"]),
|
|
1413
|
-
default="published",
|
|
1414
|
-
help="排序字段",
|
|
1415
|
-
)
|
|
1416
|
-
@click.option("--sort-order", type=click.Choice(["asc", "desc"]), default="desc", help="排序顺序")
|
|
1417
|
-
@click.option("--match-all/--match-any", default=False, help="匹配所有条件(AND逻辑)或任一条件(OR逻辑)")
|
|
1418
|
-
def search_advanced(
|
|
1419
|
-
query,
|
|
1420
|
-
directory,
|
|
1421
|
-
limit,
|
|
1422
|
-
years_back,
|
|
1423
|
-
use_ai,
|
|
1424
|
-
categories,
|
|
1425
|
-
exclude_categories,
|
|
1426
|
-
primary_category,
|
|
1427
|
-
authors,
|
|
1428
|
-
author_match,
|
|
1429
|
-
date_from,
|
|
1430
|
-
date_to,
|
|
1431
|
-
days_back,
|
|
1432
|
-
summarized_only,
|
|
1433
|
-
downloaded_only,
|
|
1434
|
-
sort_by,
|
|
1435
|
-
sort_order,
|
|
1436
|
-
match_all,
|
|
1437
|
-
):
|
|
1438
|
-
"""高级搜索论文(支持多字段过滤)"""
|
|
1439
|
-
directory = Path(directory).resolve()
|
|
1440
|
-
|
|
1441
|
-
if not setup_environment(directory):
|
|
1442
|
-
sys.exit(1)
|
|
1443
|
-
|
|
1444
|
-
print_banner()
|
|
1445
|
-
|
|
1446
|
-
# 如果需要,先同步最新论文
|
|
1447
|
-
crawler = ArXivCrawler()
|
|
1448
|
-
if years_back > 0:
|
|
1449
|
-
click.echo(f"搜索前先同步最近 {years_back} 年论文...")
|
|
1450
|
-
sync_result = sync_papers(years_back=years_back, summarize=False)
|
|
1451
|
-
crawler = sync_result["crawler"]
|
|
1452
|
-
|
|
1453
|
-
click.echo(f"\n高级搜索: '{query}'")
|
|
1454
|
-
click.echo("=" * 50)
|
|
1455
|
-
|
|
1456
|
-
search_terms = [query]
|
|
1457
|
-
|
|
1458
|
-
# 如果启用AI且配置了AI API密钥,尝试解析自然语言查询
|
|
1459
|
-
if use_ai and Config.AI_API_KEY:
|
|
1460
|
-
try:
|
|
1461
|
-
import openai
|
|
1462
|
-
|
|
1463
|
-
client = openai.OpenAI(api_key=Config.AI_API_KEY, base_url=Config.AI_BASE_URL)
|
|
1464
|
-
|
|
1465
|
-
ai_prompt = f"""
|
|
1466
|
-
用户正在搜索arXiv物理/计算材料科学论文,查询是: "{query}"
|
|
1467
|
-
|
|
1468
|
-
请将自然语言查询转换为适合arXiv搜索的关键词或短语。
|
|
1469
|
-
考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学。
|
|
1470
|
-
|
|
1471
|
-
返回格式:JSON数组,包含最多5个搜索关键词/短语。
|
|
1472
|
-
示例:["machine learning materials science", "density functional theory", "condensed matter physics"]
|
|
1473
|
-
|
|
1474
|
-
只返回JSON数组,不要其他文本。
|
|
1475
|
-
"""
|
|
1476
|
-
|
|
1477
|
-
response = client.chat.completions.create(
|
|
1478
|
-
model=Config.AI_MODEL,
|
|
1479
|
-
messages=[
|
|
1480
|
-
{"role": "system", "content": "你是arXiv论文搜索助手,擅长将自然语言查询转换为学术搜索关键词。"},
|
|
1481
|
-
{"role": "user", "content": ai_prompt},
|
|
1482
|
-
],
|
|
1483
|
-
max_tokens=200,
|
|
1484
|
-
temperature=0.3,
|
|
1485
|
-
)
|
|
1486
|
-
|
|
1487
|
-
ai_response = response.choices[0].message.content
|
|
1488
|
-
try:
|
|
1489
|
-
search_terms = json.loads(ai_response)
|
|
1490
|
-
if isinstance(search_terms, list) and len(search_terms) > 0:
|
|
1491
|
-
click.echo(f"AI解析的搜索词: {', '.join(search_terms[:3])}")
|
|
1492
|
-
if len(search_terms) > 3:
|
|
1493
|
-
click.echo(f" 以及 {len(search_terms) - 3} 个其他关键词")
|
|
1494
|
-
except:
|
|
1495
|
-
# 如果AI响应不是有效JSON,使用原始查询
|
|
1496
|
-
pass
|
|
1497
|
-
|
|
1498
|
-
except Exception as e:
|
|
1499
|
-
click.echo(f"AI解析失败,使用原始查询: {e}")
|
|
1500
|
-
|
|
1501
|
-
# 使用增强搜索引擎
|
|
1502
|
-
with crawler.db.get_session() as session:
|
|
1503
|
-
# 创建搜索过滤器
|
|
1504
|
-
filter_config = SearchFilter(
|
|
1505
|
-
query=query,
|
|
1506
|
-
categories=list(categories) if categories else None,
|
|
1507
|
-
exclude_categories=list(exclude_categories) if exclude_categories else None,
|
|
1508
|
-
primary_category=primary_category,
|
|
1509
|
-
authors=list(authors) if authors else None,
|
|
1510
|
-
author_match=author_match,
|
|
1511
|
-
date_from=date_from,
|
|
1512
|
-
date_to=date_to,
|
|
1513
|
-
days_back=days_back,
|
|
1514
|
-
summarized_only=summarized_only,
|
|
1515
|
-
downloaded_only=downloaded_only,
|
|
1516
|
-
limit=limit,
|
|
1517
|
-
sort_by=sort_by,
|
|
1518
|
-
sort_order=sort_order,
|
|
1519
|
-
match_all=match_all,
|
|
1520
|
-
)
|
|
1521
|
-
|
|
1522
|
-
# 创建搜索引擎
|
|
1523
|
-
search_engine = SearchEngine(session)
|
|
1524
|
-
|
|
1525
|
-
# 执行搜索
|
|
1526
|
-
click.echo(f"正在搜索...")
|
|
1527
|
-
papers = search_engine.search_papers(filter_config)
|
|
1528
|
-
|
|
1529
|
-
if not papers:
|
|
1530
|
-
click.echo("未找到匹配的论文。")
|
|
1531
|
-
return
|
|
1532
|
-
|
|
1533
|
-
click.echo(f"找到 {len(papers)} 篇论文:")
|
|
1534
|
-
|
|
1535
|
-
# 显示简要结果
|
|
1536
|
-
for i, paper in enumerate(papers[:5], 1): # 只显示前5篇作为预览
|
|
1537
|
-
authors_list = json.loads(paper.authors) if paper.authors else []
|
|
1538
|
-
author_names = [a.get("name", "") for a in authors_list[:2]]
|
|
1539
|
-
if len(authors_list) > 2:
|
|
1540
|
-
author_names.append("等")
|
|
1541
|
-
|
|
1542
|
-
click.echo(f"\n{i}. {paper.title}")
|
|
1543
|
-
click.echo(f" 作者: {', '.join(author_names)}")
|
|
1544
|
-
click.echo(f" arXiv ID: {paper.arxiv_id}")
|
|
1545
|
-
click.echo(f" 分类: {paper.categories}")
|
|
1546
|
-
click.echo(f" 发布日期: {paper.published.strftime('%Y-%m-%d') if paper.published else 'N/A'}")
|
|
1547
|
-
click.echo(f" 总结状态: {'已总结' if paper.summarized else '未总结'}")
|
|
1548
|
-
|
|
1549
|
-
if len(papers) > 5:
|
|
1550
|
-
click.echo(f"\n... 以及 {len(papers) - 5} 篇更多论文")
|
|
1551
|
-
|
|
1552
|
-
# 生成搜索报告
|
|
1553
|
-
click.echo("\n正在生成搜索报告...")
|
|
1554
|
-
files = generate_search_report(directory, query, search_terms, papers, paper_limit=limit)
|
|
1555
|
-
|
|
1556
|
-
click.echo(f"报告生成完成:")
|
|
1557
|
-
for f in files:
|
|
1558
|
-
click.echo(f" - {f}")
|
|
1559
|
-
click.echo(f"\n详细论文信息、中文翻译和PDF链接请查看生成的Markdown报告。")
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
@cli.command()
|
|
1563
|
-
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
|
|
1564
|
-
@click.option("--limit", default=10, help="显示的搜索查询数量(默认:10)")
|
|
1565
|
-
def search_history(directory, limit):
|
|
1566
|
-
"""显示搜索历史(按使用频率排序)"""
|
|
1567
|
-
directory = Path(directory).resolve()
|
|
1568
|
-
|
|
1569
|
-
if not setup_environment(directory):
|
|
1570
|
-
sys.exit(1)
|
|
1571
|
-
|
|
1572
|
-
print_banner()
|
|
1573
|
-
|
|
1574
|
-
crawler = ArXivCrawler()
|
|
1575
|
-
|
|
1576
|
-
click.echo("\n" + "=" * 50)
|
|
1577
|
-
click.echo("搜索历史")
|
|
1578
|
-
click.echo("=" * 50)
|
|
1579
|
-
|
|
1580
|
-
with crawler.db.get_session() as session:
|
|
1581
|
-
# 创建搜索引擎
|
|
1582
|
-
search_engine = SearchEngine(session)
|
|
1583
|
-
|
|
1584
|
-
# 获取搜索历史
|
|
1585
|
-
click.echo(f"正在获取搜索历史...")
|
|
1586
|
-
history = search_engine.get_search_history(limit=limit)
|
|
1587
|
-
|
|
1588
|
-
if not history:
|
|
1589
|
-
click.echo("暂无搜索历史。")
|
|
1590
|
-
return
|
|
1591
|
-
|
|
1592
|
-
click.echo(f"\n找到 {len(history)} 个搜索查询:")
|
|
1593
|
-
click.echo("-" * 50)
|
|
1594
|
-
|
|
1595
|
-
for i, item in enumerate(history, 1):
|
|
1596
|
-
last_used = item["last_used"].strftime("%Y-%m-%d") if item["last_used"] else "N/A"
|
|
1597
|
-
click.echo(f"\n{i}. 查询: {item['query']}")
|
|
1598
|
-
click.echo(f" 使用次数: {item['count']}")
|
|
1599
|
-
click.echo(f" 最后使用: {last_used}")
|
|
1600
|
-
if item["last_paper_id"]:
|
|
1601
|
-
click.echo(f" 最后论文ID: {item['last_paper_id']}")
|
|
1602
|
-
|
|
1603
|
-
click.echo(f"\n💡 提示: 使用 'pulse search \"查询内容\" .' 重用搜索")
|
|
1604
|
-
click.echo(f" 或 'pulse search-advanced \"查询内容\" . --categories 分类' 进行高级搜索")
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
1386
|
if __name__ == "__main__":
|
|
1608
1387
|
cli()
|