arxiv-pulse 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_pulse/.ENV.TEMPLATE +93 -41
- arxiv_pulse/__version__.py +2 -2
- arxiv_pulse/arxiv_crawler.py +65 -23
- arxiv_pulse/cli.py +228 -433
- arxiv_pulse/config.py +6 -8
- arxiv_pulse/models.py +17 -9
- arxiv_pulse/output_manager.py +38 -54
- arxiv_pulse/report_generator.py +3 -46
- arxiv_pulse/search_engine.py +105 -53
- arxiv_pulse/summarizer.py +0 -1
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/METADATA +61 -124
- arxiv_pulse-0.6.1.dist-info/RECORD +17 -0
- arxiv_pulse-0.5.0.dist-info/RECORD +0 -17
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/WHEEL +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/entry_points.txt +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/top_level.txt +0 -0
arxiv_pulse/cli.py
CHANGED
|
@@ -10,14 +10,15 @@ from pathlib import Path
|
|
|
10
10
|
import click
|
|
11
11
|
from dotenv import load_dotenv
|
|
12
12
|
import json
|
|
13
|
-
from datetime import datetime, timedelta
|
|
13
|
+
from datetime import datetime, timedelta, timezone
|
|
14
14
|
import questionary
|
|
15
|
+
import wcwidth
|
|
15
16
|
|
|
16
17
|
from arxiv_pulse.config import Config
|
|
17
18
|
from arxiv_pulse.arxiv_crawler import ArXivCrawler
|
|
18
19
|
from arxiv_pulse.summarizer import PaperSummarizer
|
|
19
20
|
from arxiv_pulse.report_generator import ReportGenerator
|
|
20
|
-
from arxiv_pulse.output_manager import output
|
|
21
|
+
from arxiv_pulse.output_manager import output, OutputLevel
|
|
21
22
|
from arxiv_pulse.search_engine import SearchEngine, SearchFilter
|
|
22
23
|
from arxiv_pulse.__version__ import __version__
|
|
23
24
|
|
|
@@ -234,7 +235,6 @@ def setup_environment(directory: Path):
|
|
|
234
235
|
# 创建必要的目录
|
|
235
236
|
os.makedirs("data", exist_ok=True)
|
|
236
237
|
os.makedirs("reports", exist_ok=True)
|
|
237
|
-
os.makedirs("logs", exist_ok=True)
|
|
238
238
|
|
|
239
239
|
# 加载 .env 文件(如果存在)
|
|
240
240
|
env_file = directory / ".env"
|
|
@@ -267,12 +267,14 @@ def setup_environment(directory: Path):
|
|
|
267
267
|
Config.AI_MODEL = os.getenv("AI_MODEL", "DeepSeek-V3.2-Thinking")
|
|
268
268
|
Config.AI_BASE_URL = os.getenv("AI_BASE_URL", "https://llmapi.paratera.com")
|
|
269
269
|
Config.SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", "2000"))
|
|
270
|
-
Config.SUMMARY_SENTENCES_LIMIT = int(os.getenv("SUMMARY_SENTENCES_LIMIT", "3"))
|
|
271
270
|
Config.TOKEN_PRICE_PER_MILLION = float(os.getenv("TOKEN_PRICE_PER_MILLION", "3.0"))
|
|
272
|
-
Config.MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", "
|
|
273
|
-
Config.MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", "
|
|
271
|
+
Config.MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", "10000"))
|
|
272
|
+
Config.MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", "500"))
|
|
274
273
|
Config.YEARS_BACK = int(os.getenv("YEARS_BACK", "3"))
|
|
275
|
-
Config.IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "important_papers.txt")
|
|
274
|
+
Config.IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "data/important_papers.txt")
|
|
275
|
+
Config.ARXIV_MAX_RESULTS = int(os.getenv("ARXIV_MAX_RESULTS", "30000"))
|
|
276
|
+
Config.ARXIV_SORT_BY = os.getenv("ARXIV_SORT_BY", "submittedDate")
|
|
277
|
+
Config.ARXIV_SORT_ORDER = os.getenv("ARXIV_SORT_ORDER", "descending")
|
|
276
278
|
Config.REPORT_MAX_PAPERS = int(os.getenv("REPORT_MAX_PAPERS", "50"))
|
|
277
279
|
|
|
278
280
|
# 更新 SEARCH_QUERIES
|
|
@@ -407,54 +409,74 @@ def print_banner_custom(fields):
|
|
|
407
409
|
else:
|
|
408
410
|
field_str = f"{fields[0]} • {fields[1]} • {fields[2]} • {fields[3]}"
|
|
409
411
|
|
|
410
|
-
#
|
|
411
|
-
# 第一行标题:"arXiv Pulse - 文献追踪系统" (25字符)
|
|
412
|
-
# 需要将字段字符串居中显示
|
|
412
|
+
# 横幅尺寸
|
|
413
413
|
banner_width = 55
|
|
414
414
|
content_width = 53
|
|
415
415
|
|
|
416
|
-
#
|
|
416
|
+
# 辅助函数:计算字符串显示宽度
|
|
417
|
+
def display_width(text):
|
|
418
|
+
return wcwidth.wcswidth(text)
|
|
419
|
+
|
|
420
|
+
# 辅助函数:截断字符串到指定显示宽度,添加省略号
|
|
421
|
+
def truncate_to_width(text, max_width):
|
|
422
|
+
if display_width(text) <= max_width:
|
|
423
|
+
return text
|
|
424
|
+
# 逐步减少字符直到宽度合适
|
|
425
|
+
result = ""
|
|
426
|
+
for char in text:
|
|
427
|
+
if display_width(result + char) > max_width - 3: # 为"..."留出空间
|
|
428
|
+
break
|
|
429
|
+
result += char
|
|
430
|
+
return result + "..." if result else "..." # 至少返回省略号
|
|
431
|
+
|
|
432
|
+
# 创建横幅边框
|
|
417
433
|
border_top = "╔" + "═" * (banner_width - 2) + "╗"
|
|
418
434
|
border_bottom = "╚" + "═" * (banner_width - 2) + "╝"
|
|
419
435
|
|
|
420
436
|
# 第一行标题
|
|
421
437
|
title = "arXiv Pulse - 文献追踪系统"
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
title_line = "║" + " " *
|
|
438
|
+
title_width = display_width(title)
|
|
439
|
+
# 计算左右填充
|
|
440
|
+
left_padding = (content_width - title_width) // 2
|
|
441
|
+
right_padding = content_width - title_width - left_padding
|
|
442
|
+
title_line = "║" + " " * left_padding + title + " " * right_padding + "║"
|
|
427
443
|
|
|
428
444
|
# 第二行字段
|
|
429
|
-
#
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
field_line =
|
|
439
|
-
"║" + " " * field_padding + field_str + " " * (content_width - len(field_str) * 2 - field_padding) + "║"
|
|
440
|
-
)
|
|
445
|
+
# 最大字段显示宽度(留出边距)
|
|
446
|
+
max_field_width = content_width - 4
|
|
447
|
+
# 截断字段字符串如果太长
|
|
448
|
+
field_str = truncate_to_width(field_str, max_field_width)
|
|
449
|
+
field_width = display_width(field_str)
|
|
450
|
+
|
|
451
|
+
# 计算字段行的左右填充
|
|
452
|
+
left_padding = (content_width - field_width) // 2
|
|
453
|
+
right_padding = content_width - field_width - left_padding
|
|
454
|
+
field_line = "║" + " " * left_padding + field_str + " " * right_padding + "║"
|
|
441
455
|
|
|
442
456
|
banner = f"\n{border_top}\n{title_line}\n{field_line}\n{border_bottom}\n"
|
|
443
457
|
click.echo(banner)
|
|
444
458
|
|
|
445
459
|
|
|
446
|
-
def sync_papers(years_back=1, summarize=False):
|
|
447
|
-
"""同步论文(内部函数)
|
|
460
|
+
def sync_papers(years_back=1, summarize=False, force=False):
|
|
461
|
+
"""同步论文(内部函数)
|
|
462
|
+
|
|
463
|
+
Args:
|
|
464
|
+
years_back: 回溯的年数
|
|
465
|
+
summarize: 是否总结新论文
|
|
466
|
+
force: 是否强制同步(重新下载所有论文,忽略重复检查)
|
|
467
|
+
"""
|
|
448
468
|
crawler = ArXivCrawler()
|
|
449
469
|
summarizer = PaperSummarizer()
|
|
450
470
|
|
|
451
|
-
|
|
471
|
+
mode_text = "强制同步" if force else "同步缺失论文"
|
|
472
|
+
click.echo(f"正在{mode_text}(回溯 {years_back} 年)...")
|
|
452
473
|
click.echo("=" * 50)
|
|
453
474
|
|
|
454
475
|
# 同步所有查询
|
|
455
476
|
click.echo("1. 正在同步搜索查询...")
|
|
456
|
-
sync_result = crawler.sync_all_queries(years_back=years_back)
|
|
457
|
-
|
|
477
|
+
sync_result = crawler.sync_all_queries(years_back=years_back, force=force)
|
|
478
|
+
result_text = "处理了" if force else "添加了"
|
|
479
|
+
click.echo(f" 从查询{result_text} {sync_result['total_new_papers']} 篇论文")
|
|
458
480
|
|
|
459
481
|
# 同步重要论文
|
|
460
482
|
click.echo("2. 正在同步重要论文...")
|
|
@@ -480,7 +502,7 @@ def sync_papers(years_back=1, summarize=False):
|
|
|
480
502
|
|
|
481
503
|
click.echo("\n" + "=" * 50)
|
|
482
504
|
click.echo("同步完成!")
|
|
483
|
-
click.echo(f"
|
|
505
|
+
click.echo(f"总共{result_text}论文: {total_new}")
|
|
484
506
|
click.echo(f"数据库现有 {crawl_stats['total_papers']} 篇论文")
|
|
485
507
|
click.echo(f"已总结: {summary_stats['summarized_papers']} ({summary_stats['summarization_rate']:.1%})")
|
|
486
508
|
|
|
@@ -490,12 +512,13 @@ def sync_papers(years_back=1, summarize=False):
|
|
|
490
512
|
"sync_result": sync_result,
|
|
491
513
|
"important_result": important_result,
|
|
492
514
|
"stats": {"crawl_stats": crawl_stats, "summary_stats": summary_stats},
|
|
515
|
+
"force_mode": force,
|
|
493
516
|
}
|
|
494
517
|
|
|
495
518
|
|
|
496
519
|
def get_workday_cutoff(days_back):
|
|
497
520
|
"""计算排除周末的截止日期"""
|
|
498
|
-
current = datetime.
|
|
521
|
+
current = datetime.now(timezone.utc).replace(tzinfo=None)
|
|
499
522
|
workdays_counted = 0
|
|
500
523
|
days_to_go_back = 0
|
|
501
524
|
|
|
@@ -609,6 +632,11 @@ def generate_search_report(query, search_terms, papers, paper_limit=50, summariz
|
|
|
609
632
|
"""生成搜索结果的报告(内部函数)"""
|
|
610
633
|
reporter = ReportGenerator()
|
|
611
634
|
|
|
635
|
+
# 如果没有找到论文,不生成报告
|
|
636
|
+
if not papers:
|
|
637
|
+
output.info("未找到论文,跳过报告生成")
|
|
638
|
+
return []
|
|
639
|
+
|
|
612
640
|
# 设置报告限制
|
|
613
641
|
original_limit = Config.REPORT_MAX_PAPERS
|
|
614
642
|
Config.REPORT_MAX_PAPERS = paper_limit
|
|
@@ -698,10 +726,15 @@ def generate_search_report(query, search_terms, papers, paper_limit=50, summariz
|
|
|
698
726
|
|
|
699
727
|
|
|
700
728
|
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
|
|
729
|
+
@click.option("--verbose", "-v", is_flag=True, help="显示详细输出(包括调试信息)")
|
|
701
730
|
@click.version_option(version=__version__, prog_name="arXiv Pulse")
|
|
702
|
-
def cli():
|
|
731
|
+
def cli(verbose):
|
|
703
732
|
"""arXiv Pulse: 智能arXiv文献追踪和分析系统"""
|
|
704
|
-
|
|
733
|
+
if verbose:
|
|
734
|
+
output.set_min_level(OutputLevel.DEBUG)
|
|
735
|
+
else:
|
|
736
|
+
# 确保使用环境变量中的LOG_LEVEL(output manager已经处理)
|
|
737
|
+
pass
|
|
705
738
|
|
|
706
739
|
|
|
707
740
|
def interactive_configuration():
|
|
@@ -778,10 +811,10 @@ def interactive_configuration():
|
|
|
778
811
|
click.echo("\n📊 爬虫配置")
|
|
779
812
|
click.echo("-" * 40)
|
|
780
813
|
|
|
781
|
-
max_results_initial = click.prompt("初始同步每个查询的最大论文数", default=
|
|
814
|
+
max_results_initial = click.prompt("初始同步每个查询的最大论文数", default=10000, type=int, show_default=True)
|
|
782
815
|
config["MAX_RESULTS_INITIAL"] = str(max_results_initial)
|
|
783
816
|
|
|
784
|
-
max_results_daily = click.prompt("每日同步每个查询的最大论文数", default=
|
|
817
|
+
max_results_daily = click.prompt("每日同步每个查询的最大论文数", default=500, type=int, show_default=True)
|
|
785
818
|
config["MAX_RESULTS_DAILY"] = str(max_results_daily)
|
|
786
819
|
|
|
787
820
|
years_back = click.prompt("初始同步回溯的年数", default=5, type=int, show_default=True)
|
|
@@ -866,27 +899,24 @@ def interactive_configuration():
|
|
|
866
899
|
click.echo("-" * 40)
|
|
867
900
|
|
|
868
901
|
# 根据领域数量提供建议
|
|
869
|
-
|
|
870
|
-
recommended_daily = 20
|
|
871
|
-
|
|
872
|
-
if num_selected_fields <= 3:
|
|
902
|
+
if num_selected_fields <= 6:
|
|
873
903
|
click.echo("✅ 您选择了少量领域,保持默认配置即可。")
|
|
874
|
-
elif num_selected_fields <=
|
|
875
|
-
recommended_initial =
|
|
876
|
-
recommended_daily =
|
|
904
|
+
elif num_selected_fields <= 10:
|
|
905
|
+
recommended_initial = 4000
|
|
906
|
+
recommended_daily = 200
|
|
877
907
|
click.echo(f"⚠️ 您选择了中等数量领域,建议调整爬虫配置以避免过多论文:")
|
|
878
|
-
click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial}
|
|
879
|
-
click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily}
|
|
908
|
+
click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial}")
|
|
909
|
+
click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily}")
|
|
880
910
|
else:
|
|
881
|
-
recommended_initial =
|
|
882
|
-
recommended_daily =
|
|
911
|
+
recommended_initial = 1000
|
|
912
|
+
recommended_daily = 50
|
|
883
913
|
click.echo(f"⚠️ 您选择了大量领域 ({num_selected_fields}个),强烈建议调整爬虫配置:")
|
|
884
|
-
click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial}
|
|
885
|
-
click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily}
|
|
914
|
+
click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial}")
|
|
915
|
+
click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily}")
|
|
886
916
|
click.echo(f" - 注意:同步大量领域可能需要较长时间和更多存储空间。")
|
|
887
917
|
|
|
888
918
|
# 询问用户是否应用建议
|
|
889
|
-
if num_selected_fields >
|
|
919
|
+
if num_selected_fields > 6:
|
|
890
920
|
if click.confirm("\n💡 是否应用上述建议调整爬虫配置?", default=True):
|
|
891
921
|
config["MAX_RESULTS_INITIAL"] = str(recommended_initial)
|
|
892
922
|
config["MAX_RESULTS_DAILY"] = str(recommended_daily)
|
|
@@ -903,9 +933,6 @@ def interactive_configuration():
|
|
|
903
933
|
report_max_papers = click.prompt("每份报告显示的最大论文数", default=50, type=int, show_default=True)
|
|
904
934
|
config["REPORT_MAX_PAPERS"] = str(report_max_papers)
|
|
905
935
|
|
|
906
|
-
summary_sentences_limit = click.prompt("摘要句子数限制", default=3, type=int, show_default=True)
|
|
907
|
-
config["SUMMARY_SENTENCES_LIMIT"] = str(summary_sentences_limit)
|
|
908
|
-
|
|
909
936
|
click.echo("\n✅ 配置完成!")
|
|
910
937
|
return config, int(years_back)
|
|
911
938
|
|
|
@@ -920,7 +947,6 @@ def init(directory, years_back):
|
|
|
920
947
|
# 创建目录结构
|
|
921
948
|
(directory / "data").mkdir(exist_ok=True)
|
|
922
949
|
(directory / "reports").mkdir(exist_ok=True)
|
|
923
|
-
(directory / "logs").mkdir(exist_ok=True)
|
|
924
950
|
|
|
925
951
|
# 创建 .env 文件(如果不存在)
|
|
926
952
|
env_file = directory / ".env"
|
|
@@ -937,59 +963,77 @@ def init(directory, years_back):
|
|
|
937
963
|
if years_back is None:
|
|
938
964
|
years_back = interactive_years_back
|
|
939
965
|
|
|
940
|
-
#
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
#
|
|
953
|
-
#
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
#
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
#
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
#
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
"""
|
|
966
|
+
# 读取 .ENV.TEMPLATE 文件作为基础模板
|
|
967
|
+
template_file = Path(__file__).parent / ".ENV.TEMPLATE"
|
|
968
|
+
if not template_file.exists():
|
|
969
|
+
click.echo(f"❌ 找不到模板文件: {template_file}")
|
|
970
|
+
click.echo("请确保 .ENV.TEMPLATE 文件存在于 arxiv_pulse 目录中")
|
|
971
|
+
return
|
|
972
|
+
|
|
973
|
+
env_content = template_file.read_text(encoding="utf-8")
|
|
974
|
+
|
|
975
|
+
# 添加生成时间戳注释(插入到第一行之后)
|
|
976
|
+
timestamp_comment = f"# 由交互式配置向导于 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 生成\n"
|
|
977
|
+
lines = env_content.split("\n")
|
|
978
|
+
if lines and lines[0].startswith("#"):
|
|
979
|
+
# 在第一行注释后插入时间戳
|
|
980
|
+
lines.insert(1, timestamp_comment)
|
|
981
|
+
else:
|
|
982
|
+
# 如果没有注释行,添加到开头
|
|
983
|
+
lines.insert(0, timestamp_comment)
|
|
984
|
+
env_content = "\n".join(lines)
|
|
985
|
+
|
|
986
|
+
# 替换配置项(使用 config 字典中的值)
|
|
987
|
+
# 统一替换逻辑:搜索以键名开头的行,替换整行
|
|
988
|
+
lines = env_content.split("\n")
|
|
989
|
+
|
|
990
|
+
# AI API 配置
|
|
991
|
+
for i, line in enumerate(lines):
|
|
992
|
+
if line.strip().startswith("AI_API_KEY="):
|
|
993
|
+
lines[i] = f"AI_API_KEY={config.get('AI_API_KEY', 'your_api_key_here')}"
|
|
994
|
+
break
|
|
995
|
+
|
|
996
|
+
for i, line in enumerate(lines):
|
|
997
|
+
if line.strip().startswith("AI_MODEL="):
|
|
998
|
+
lines[i] = f"AI_MODEL={config.get('AI_MODEL', 'DeepSeek-V3.2-Thinking')}"
|
|
999
|
+
break
|
|
1000
|
+
|
|
1001
|
+
for i, line in enumerate(lines):
|
|
1002
|
+
if line.strip().startswith("AI_BASE_URL="):
|
|
1003
|
+
lines[i] = f"AI_BASE_URL={config.get('AI_BASE_URL', 'https://llmapi.paratera.com')}"
|
|
1004
|
+
break
|
|
1005
|
+
|
|
1006
|
+
# 爬虫配置
|
|
1007
|
+
for i, line in enumerate(lines):
|
|
1008
|
+
if line.strip().startswith("MAX_RESULTS_INITIAL="):
|
|
1009
|
+
lines[i] = f"MAX_RESULTS_INITIAL={config.get('MAX_RESULTS_INITIAL', '10000')}"
|
|
1010
|
+
break
|
|
1011
|
+
|
|
1012
|
+
for i, line in enumerate(lines):
|
|
1013
|
+
if line.strip().startswith("MAX_RESULTS_DAILY="):
|
|
1014
|
+
lines[i] = f"MAX_RESULTS_DAILY={config.get('MAX_RESULTS_DAILY', '500')}"
|
|
1015
|
+
break
|
|
1016
|
+
|
|
1017
|
+
# 搜索查询配置
|
|
1018
|
+
default_search_queries = 'condensed matter physics AND cat:cond-mat.*; (ti:"density functional" OR abs:"density functional") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"machine learning" OR abs:"machine learning") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph)'
|
|
1019
|
+
for i, line in enumerate(lines):
|
|
1020
|
+
if line.strip().startswith("SEARCH_QUERIES="):
|
|
1021
|
+
lines[i] = f"SEARCH_QUERIES={config.get('SEARCH_QUERIES', default_search_queries)}"
|
|
1022
|
+
break
|
|
1023
|
+
|
|
1024
|
+
# 报告配置
|
|
1025
|
+
for i, line in enumerate(lines):
|
|
1026
|
+
if line.strip().startswith("REPORT_MAX_PAPERS="):
|
|
1027
|
+
lines[i] = f"REPORT_MAX_PAPERS={config.get('REPORT_MAX_PAPERS', '50')}"
|
|
1028
|
+
break
|
|
1029
|
+
|
|
1030
|
+
# 同步配置
|
|
1031
|
+
for i, line in enumerate(lines):
|
|
1032
|
+
if line.strip().startswith("YEARS_BACK="):
|
|
1033
|
+
lines[i] = f"YEARS_BACK={config.get('YEARS_BACK', '5')}"
|
|
1034
|
+
break
|
|
1035
|
+
|
|
1036
|
+
env_content = "\n".join(lines)
|
|
993
1037
|
|
|
994
1038
|
env_file.write_text(env_content)
|
|
995
1039
|
click.echo(f"\n✅ 已在 {directory} 创建 .env 配置文件")
|
|
@@ -1000,10 +1044,12 @@ CRAWL_DELAY=1.0
|
|
|
1000
1044
|
years_back = 5 # 默认值
|
|
1001
1045
|
|
|
1002
1046
|
# 创建 important_papers.txt(如果不存在)
|
|
1003
|
-
important_file = directory /
|
|
1047
|
+
important_file = directory / Config.IMPORTANT_PAPERS_FILE
|
|
1048
|
+
# 确保父目录存在
|
|
1049
|
+
important_file.parent.mkdir(parents=True, exist_ok=True)
|
|
1004
1050
|
if not important_file.exists():
|
|
1005
1051
|
important_file.write_text("# 在此添加重要论文的arXiv ID,每行一个\n")
|
|
1006
|
-
click.echo(f"✅
|
|
1052
|
+
click.echo(f"✅ 已创建重要论文文件: {important_file}")
|
|
1007
1053
|
|
|
1008
1054
|
# 设置环境并验证配置
|
|
1009
1055
|
if not setup_environment(directory):
|
|
@@ -1046,10 +1092,15 @@ CRAWL_DELAY=1.0
|
|
|
1046
1092
|
|
|
1047
1093
|
@cli.command()
|
|
1048
1094
|
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
|
|
1049
|
-
@click.option("--years-back", type=int, default=
|
|
1095
|
+
@click.option("--years-back", type=int, default=None, help="同步回溯的年数(默认:强制模式5年,普通模式1年)")
|
|
1050
1096
|
@click.option("--summarize/--no-summarize", default=False, help="是否总结新论文(默认:否)")
|
|
1051
|
-
|
|
1052
|
-
|
|
1097
|
+
@click.option("--force", is_flag=True, default=False, help="强制同步:重新下载最近N年的所有论文,忽略重复检查")
|
|
1098
|
+
def sync(directory, years_back, summarize, force):
|
|
1099
|
+
"""同步最新论文到数据库
|
|
1100
|
+
|
|
1101
|
+
强制模式(--force): 重新下载最近N年的所有论文,忽略重复检查,默认回溯5年。
|
|
1102
|
+
普通模式: 只下载缺失的新论文,默认回溯1年。
|
|
1103
|
+
"""
|
|
1053
1104
|
directory = Path(directory).resolve()
|
|
1054
1105
|
click.echo(f"正在同步 arXiv Pulse 于 {directory}")
|
|
1055
1106
|
|
|
@@ -1058,8 +1109,13 @@ def sync(directory, years_back, summarize):
|
|
|
1058
1109
|
|
|
1059
1110
|
print_banner()
|
|
1060
1111
|
|
|
1112
|
+
# 设置默认years_back值
|
|
1113
|
+
if years_back is None:
|
|
1114
|
+
years_back = 5 if force else 1
|
|
1115
|
+
click.echo(f"使用默认回溯年数: {years_back} 年")
|
|
1116
|
+
|
|
1061
1117
|
# 同步论文
|
|
1062
|
-
sync_result = sync_papers(years_back=years_back, summarize=summarize)
|
|
1118
|
+
sync_result = sync_papers(years_back=years_back, summarize=summarize, force=force)
|
|
1063
1119
|
|
|
1064
1120
|
click.echo("\n" + "=" * 50)
|
|
1065
1121
|
click.echo("同步完成!数据库已更新。")
|
|
@@ -1073,8 +1129,19 @@ def sync(directory, years_back, summarize):
|
|
|
1073
1129
|
@click.option("--use-ai/--no-ai", default=True, help="是否使用AI理解自然语言查询(默认:是)")
|
|
1074
1130
|
@click.option("--summarize/--no-summarize", default=True, help="是否自动总结未总结的论文(默认:是)")
|
|
1075
1131
|
@click.option("--max-summarize", type=int, default=0, help="最大总结论文数(默认:0表示无限制)")
|
|
1076
|
-
|
|
1077
|
-
|
|
1132
|
+
@click.option("--categories", "-c", multiple=True, help="包含的分类(可多次使用)")
|
|
1133
|
+
@click.option("--days-back", type=int, help="回溯天数(例如:30表示最近30天)")
|
|
1134
|
+
@click.option("--authors", "-a", multiple=True, help="作者姓名(可多次使用)")
|
|
1135
|
+
@click.option(
|
|
1136
|
+
"--sort-by",
|
|
1137
|
+
type=click.Choice(["published", "relevance_score", "title", "updated"]),
|
|
1138
|
+
default="published",
|
|
1139
|
+
help="排序字段",
|
|
1140
|
+
)
|
|
1141
|
+
def search(
|
|
1142
|
+
query, directory, limit, years_back, use_ai, summarize, max_summarize, categories, days_back, authors, sort_by
|
|
1143
|
+
):
|
|
1144
|
+
"""智能搜索论文(支持自然语言查询和基本过滤)"""
|
|
1078
1145
|
directory = Path(directory).resolve()
|
|
1079
1146
|
|
|
1080
1147
|
if not setup_environment(directory):
|
|
@@ -1086,7 +1153,7 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1086
1153
|
crawler = ArXivCrawler()
|
|
1087
1154
|
if years_back > 0:
|
|
1088
1155
|
click.echo(f"搜索前先同步最近 {years_back} 年论文...")
|
|
1089
|
-
sync_result = sync_papers(years_back=years_back, summarize=False)
|
|
1156
|
+
sync_result = sync_papers(years_back=years_back, summarize=False, force=False)
|
|
1090
1157
|
crawler = sync_result["crawler"]
|
|
1091
1158
|
|
|
1092
1159
|
click.echo(f"\n正在搜索: '{query}'")
|
|
@@ -1105,10 +1172,22 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1105
1172
|
用户正在搜索arXiv物理/计算材料科学论文,查询是: "{query}"
|
|
1106
1173
|
|
|
1107
1174
|
请将自然语言查询转换为适合arXiv搜索的关键词或短语。
|
|
1108
|
-
考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学。
|
|
1109
1175
|
|
|
1110
|
-
|
|
1111
|
-
|
|
1176
|
+
重要规则:
|
|
1177
|
+
1. 如果查询已经是明确的搜索词(如"DeepH"、"deep learning Hamiltonian"、"DFT计算"),直接使用它,不要添加同义词
|
|
1178
|
+
2. 如果查询包含专业术语、缩写或专有名词,保持原样作为主要搜索词
|
|
1179
|
+
3. 仅当查询非常模糊或一般性时(如"机器学习在材料科学中的应用"),才生成1-2个相关关键词
|
|
1180
|
+
4. 优先保持查询的原始意图,不要添加不相关的关键词
|
|
1181
|
+
5. 对于英文查询,保持原样;对于中文查询,翻译为英文关键词
|
|
1182
|
+
考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学、计算材料科学。
|
|
1183
|
+
|
|
1184
|
+
返回格式:JSON数组,包含1-2个搜索关键词/短语。
|
|
1185
|
+
示例:
|
|
1186
|
+
- 查询"DeepH": ["DeepH"]
|
|
1187
|
+
- 查询"deep learning Hamiltonian": ["deep learning Hamiltonian"]
|
|
1188
|
+
- 查询"DFT计算": ["DFT"]
|
|
1189
|
+
- 查询"分子动力学模拟": ["molecular dynamics simulation"]
|
|
1190
|
+
- 查询"机器学习在材料科学中的应用": ["machine learning materials science"]
|
|
1112
1191
|
|
|
1113
1192
|
只返回JSON数组,不要其他文本。
|
|
1114
1193
|
"""
|
|
@@ -1116,7 +1195,10 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1116
1195
|
response = client.chat.completions.create(
|
|
1117
1196
|
model=Config.AI_MODEL,
|
|
1118
1197
|
messages=[
|
|
1119
|
-
{
|
|
1198
|
+
{
|
|
1199
|
+
"role": "system",
|
|
1200
|
+
"content": "你是arXiv论文搜索助手,擅长识别专业术语并将自然语言查询转换为学术搜索关键词。",
|
|
1201
|
+
},
|
|
1120
1202
|
{"role": "user", "content": ai_prompt},
|
|
1121
1203
|
],
|
|
1122
1204
|
max_tokens=200,
|
|
@@ -1141,30 +1223,30 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
|
|
|
1141
1223
|
with crawler.db.get_session() as session:
|
|
1142
1224
|
from arxiv_pulse.models import Paper
|
|
1143
1225
|
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
papers = (
|
|
1147
|
-
session.query(Paper)
|
|
1148
|
-
.filter(
|
|
1149
|
-
Paper.title.contains(term)
|
|
1150
|
-
| Paper.abstract.contains(term)
|
|
1151
|
-
| Paper.categories.contains(term)
|
|
1152
|
-
| Paper.search_query.contains(term)
|
|
1153
|
-
)
|
|
1154
|
-
.order_by(Paper.published.desc())
|
|
1155
|
-
.limit(limit)
|
|
1156
|
-
.all()
|
|
1157
|
-
)
|
|
1158
|
-
all_results.extend(papers)
|
|
1226
|
+
# 使用增强搜索引擎进行模糊搜索
|
|
1227
|
+
search_engine = SearchEngine(session)
|
|
1159
1228
|
|
|
1160
|
-
#
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1229
|
+
# 将搜索词合并为一个查询(搜索引擎会处理单词拆分和同义词扩展)
|
|
1230
|
+
combined_query = " ".join(search_terms)
|
|
1231
|
+
|
|
1232
|
+
filter_config = SearchFilter(
|
|
1233
|
+
query=combined_query,
|
|
1234
|
+
search_fields=["title", "abstract"],
|
|
1235
|
+
categories=list(categories) if categories else None,
|
|
1236
|
+
authors=list(authors) if authors else None,
|
|
1237
|
+
author_match="contains", # 默认使用包含匹配
|
|
1238
|
+
days_back=days_back,
|
|
1239
|
+
limit=limit * min(len(search_terms), 2), # 扩大限制但最多2倍,避免过多结果
|
|
1240
|
+
sort_by=sort_by,
|
|
1241
|
+
sort_order="desc",
|
|
1242
|
+
match_all=True, # AND逻辑:匹配所有搜索词
|
|
1243
|
+
)
|
|
1244
|
+
|
|
1245
|
+
# 执行搜索
|
|
1246
|
+
papers_to_show = search_engine.search_papers(filter_config)
|
|
1165
1247
|
|
|
1166
|
-
|
|
1167
|
-
papers_to_show =
|
|
1248
|
+
# 确保不超过限制
|
|
1249
|
+
papers_to_show = papers_to_show[:limit]
|
|
1168
1250
|
|
|
1169
1251
|
click.echo(f"找到 {len(papers_to_show)} 篇论文:")
|
|
1170
1252
|
|
|
@@ -1219,7 +1301,7 @@ def recent(directory, limit, days_back, years_back, summarize, max_summarize):
|
|
|
1219
1301
|
# 先同步论文
|
|
1220
1302
|
if years_back > 0:
|
|
1221
1303
|
click.echo(f"报告前先同步最近 {years_back} 年论文...")
|
|
1222
|
-
sync_papers(years_back=years_back, summarize=False)
|
|
1304
|
+
sync_papers(years_back=years_back, summarize=False, force=False)
|
|
1223
1305
|
|
|
1224
1306
|
# 生成报告
|
|
1225
1307
|
click.echo("\n" + "=" * 50)
|
|
@@ -1294,8 +1376,6 @@ def stat(directory):
|
|
|
1294
1376
|
# 时间分布
|
|
1295
1377
|
click.echo(f"\n📅 时间分布:")
|
|
1296
1378
|
with crawler.db.get_session() as session:
|
|
1297
|
-
from datetime import datetime, timedelta
|
|
1298
|
-
|
|
1299
1379
|
# 按年统计
|
|
1300
1380
|
year_stats = {}
|
|
1301
1381
|
for paper in papers:
|
|
@@ -1319,290 +1399,5 @@ def stat(directory):
|
|
|
1319
1399
|
click.echo("统计完成 ✅")
|
|
1320
1400
|
|
|
1321
1401
|
|
|
1322
|
-
@cli.command()
|
|
1323
|
-
@click.argument("paper_id")
|
|
1324
|
-
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
|
|
1325
|
-
@click.option("--limit", default=10, help="返回结果的最大数量(默认:10)")
|
|
1326
|
-
@click.option("--threshold", type=float, default=0.5, help="相似度阈值(0.0-1.0,默认:0.5)")
|
|
1327
|
-
@click.option("--years-back", type=int, default=0, help="搜索前同步回溯的年数(默认:0,不更新)")
|
|
1328
|
-
def similar(paper_id, directory, limit, threshold, years_back):
|
|
1329
|
-
"""查找与指定论文相似的论文"""
|
|
1330
|
-
directory = Path(directory).resolve()
|
|
1331
|
-
|
|
1332
|
-
if not setup_environment(directory):
|
|
1333
|
-
sys.exit(1)
|
|
1334
|
-
|
|
1335
|
-
print_banner()
|
|
1336
|
-
|
|
1337
|
-
# 如果需要,先同步最新论文
|
|
1338
|
-
crawler = ArXivCrawler()
|
|
1339
|
-
if years_back > 0:
|
|
1340
|
-
click.echo(f"搜索前先同步最近 {years_back} 年论文...")
|
|
1341
|
-
sync_result = sync_papers(years_back=years_back, summarize=False)
|
|
1342
|
-
crawler = sync_result["crawler"]
|
|
1343
|
-
|
|
1344
|
-
click.echo(f"\n查找与论文 '{paper_id}' 相似的论文")
|
|
1345
|
-
click.echo("=" * 50)
|
|
1346
|
-
|
|
1347
|
-
with crawler.db.get_session() as session:
|
|
1348
|
-
# 创建搜索引擎
|
|
1349
|
-
search_engine = SearchEngine(session)
|
|
1350
|
-
|
|
1351
|
-
# 查找相似论文
|
|
1352
|
-
click.echo(f"正在查找相似度≥{threshold}的论文...")
|
|
1353
|
-
similar_papers_with_scores = search_engine.search_similar_papers(paper_id, limit=limit, threshold=threshold)
|
|
1354
|
-
|
|
1355
|
-
if not similar_papers_with_scores:
|
|
1356
|
-
click.echo("未找到相似论文。")
|
|
1357
|
-
return
|
|
1358
|
-
|
|
1359
|
-
click.echo(f"找到 {len(similar_papers_with_scores)} 篇相似论文:")
|
|
1360
|
-
|
|
1361
|
-
# 提取paper列表用于报告生成
|
|
1362
|
-
similar_papers = [paper for paper, _ in similar_papers_with_scores]
|
|
1363
|
-
|
|
1364
|
-
# 显示结果
|
|
1365
|
-
for i, (paper, similarity) in enumerate(similar_papers_with_scores, 1):
|
|
1366
|
-
authors = json.loads(paper.authors) if paper.authors else []
|
|
1367
|
-
author_names = [a.get("name", "") for a in authors[:2]]
|
|
1368
|
-
if len(authors) > 2:
|
|
1369
|
-
author_names.append("等")
|
|
1370
|
-
|
|
1371
|
-
click.echo(f"\n{i}. {paper.title}")
|
|
1372
|
-
click.echo(f" 相似度: {similarity:.2f}")
|
|
1373
|
-
click.echo(f" 作者: {', '.join(author_names)}")
|
|
1374
|
-
click.echo(f" arXiv ID: {paper.arxiv_id}")
|
|
1375
|
-
click.echo(f" 分类: {paper.categories}")
|
|
1376
|
-
click.echo(f" 发布日期: {paper.published.strftime('%Y-%m-%d') if paper.published else 'N/A'}")
|
|
1377
|
-
|
|
1378
|
-
# 生成报告
|
|
1379
|
-
click.echo("\n正在生成相似论文报告...")
|
|
1380
|
-
report_files = generate_search_report(
|
|
1381
|
-
f"与 {paper_id} 相似的论文", [f"similar to {paper_id}"], similar_papers, paper_limit=limit
|
|
1382
|
-
)
|
|
1383
|
-
|
|
1384
|
-
click.echo(f"报告生成完成:")
|
|
1385
|
-
for f in report_files:
|
|
1386
|
-
click.echo(f" - {f}")
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
@cli.command()
|
|
1390
|
-
@click.argument("query")
|
|
1391
|
-
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
|
|
1392
|
-
@click.option("--limit", default=20, help="返回结果的最大数量(默认:20)")
|
|
1393
|
-
@click.option("--years-back", type=int, default=0, help="搜索前同步回溯的年数(默认:0,不更新)")
|
|
1394
|
-
@click.option("--use-ai/--no-ai", default=True, help="是否使用AI理解自然语言查询(默认:是)")
|
|
1395
|
-
@click.option("--categories", "-c", multiple=True, help="包含的分类(可多次使用)")
|
|
1396
|
-
@click.option("--exclude-categories", "-ec", multiple=True, help="排除的分类(可多次使用)")
|
|
1397
|
-
@click.option("--primary-category", "-pc", help="主要分类")
|
|
1398
|
-
@click.option("--authors", "-a", multiple=True, help="作者姓名(可多次使用)")
|
|
1399
|
-
@click.option(
|
|
1400
|
-
"--author-match",
|
|
1401
|
-
type=click.Choice(["contains", "exact", "any"]),
|
|
1402
|
-
default="contains",
|
|
1403
|
-
help="作者匹配方式:contains(包含)、exact(精确)、any(任一)",
|
|
1404
|
-
)
|
|
1405
|
-
@click.option("--date-from", type=click.DateTime(formats=["%Y-%m-%d"]), help="起始日期(格式:YYYY-MM-DD)")
|
|
1406
|
-
@click.option("--date-to", type=click.DateTime(formats=["%Y-%m-%d"]), help="结束日期(格式:YYYY-MM-DD)")
|
|
1407
|
-
@click.option("--days-back", type=int, help="回溯天数(例如:30表示最近30天)")
|
|
1408
|
-
@click.option("--summarized-only/--no-summarized-only", default=False, help="仅显示已总结的论文")
|
|
1409
|
-
@click.option("--downloaded-only/--no-downloaded-only", default=False, help="仅显示已下载的论文")
|
|
1410
|
-
@click.option(
|
|
1411
|
-
"--sort-by",
|
|
1412
|
-
type=click.Choice(["published", "relevance_score", "title", "updated", "created_at"]),
|
|
1413
|
-
default="published",
|
|
1414
|
-
help="排序字段",
|
|
1415
|
-
)
|
|
1416
|
-
@click.option("--sort-order", type=click.Choice(["asc", "desc"]), default="desc", help="排序顺序")
|
|
1417
|
-
@click.option("--match-all/--match-any", default=False, help="匹配所有条件(AND逻辑)或任一条件(OR逻辑)")
|
|
1418
|
-
def search_advanced(
    query,
    directory,
    limit,
    years_back,
    use_ai,
    categories,
    exclude_categories,
    primary_category,
    authors,
    author_match,
    date_from,
    date_to,
    days_back,
    summarized_only,
    downloaded_only,
    sort_by,
    sort_order,
    match_all,
):
    """高级搜索论文(支持多字段过滤)"""
    # The docstring above is left verbatim: the decorator stack on this
    # function (which starts above this block) presumably registers it as a
    # Click command, and Click shows a command's docstring as its help text.
    #
    # Purpose: advanced, multi-field paper search.
    #
    # Flow:
    #   1. Resolve `directory` and prepare the environment; exit with status 1
    #      if `setup_environment` reports failure.
    #   2. When `years_back > 0`, sync papers first and reuse the crawler that
    #      `sync_papers` returns.
    #   3. When `use_ai` is set and `Config.AI_API_KEY` is truthy, ask the
    #      configured model to turn `query` into keywords. This is best-effort;
    #      any failure falls back to `[query]`.
    #   4. Run the search with a `SearchFilter` built from the raw `query` and
    #      the remaining options, echo a preview of the first five hits, and
    #      write a report through `generate_search_report`.
    #
    # Note: the AI-derived keywords are only handed to the report generator;
    # the database search itself always filters on the raw `query`.
    directory = Path(directory).resolve()

    if not setup_environment(directory):
        sys.exit(1)

    print_banner()

    # Sync the latest papers first when requested.
    crawler = ArXivCrawler()
    if years_back > 0:
        click.echo(f"搜索前先同步最近 {years_back} 年论文...")
        sync_result = sync_papers(years_back=years_back, summarize=False)
        crawler = sync_result["crawler"]

    click.echo(f"\n高级搜索: '{query}'")
    click.echo("=" * 50)

    # Fallback used whenever AI expansion is disabled, fails, or returns
    # something unusable.
    search_terms = [query]

    if use_ai and Config.AI_API_KEY:
        try:
            import openai

            client = openai.OpenAI(api_key=Config.AI_API_KEY, base_url=Config.AI_BASE_URL)
            response = client.chat.completions.create(
                model=Config.AI_MODEL,
                messages=[
                    {"role": "system", "content": "你是arXiv论文搜索助手,擅长将自然语言查询转换为学术搜索关键词。"},
                    {"role": "user", "content": _build_keyword_prompt(query)},
                ],
                max_tokens=200,
                temperature=0.3,
            )

            # Only replace the fallback once the reply has been validated, so
            # a malformed reply can never leak into the report generator.
            ai_terms = _parse_ai_search_terms(response.choices[0].message.content)
            if ai_terms:
                search_terms = ai_terms
                click.echo(f"AI解析的搜索词: {', '.join(search_terms[:3])}")
                if len(search_terms) > 3:
                    click.echo(f" 以及 {len(search_terms) - 3} 个其他关键词")
        except Exception as e:
            # Deliberate best-effort boundary: AI expansion is optional, so
            # import, network and API errors must not abort the search.
            click.echo(f"AI解析失败,使用原始查询: {e}")

    # Everything that touches `papers` stays inside the session context so the
    # result objects are used while the session is still open.
    with crawler.db.get_session() as session:
        filter_config = SearchFilter(
            query=query,
            categories=list(categories) if categories else None,
            exclude_categories=list(exclude_categories) if exclude_categories else None,
            primary_category=primary_category,
            authors=list(authors) if authors else None,
            author_match=author_match,
            date_from=date_from,
            date_to=date_to,
            days_back=days_back,
            summarized_only=summarized_only,
            downloaded_only=downloaded_only,
            limit=limit,
            sort_by=sort_by,
            sort_order=sort_order,
            match_all=match_all,
        )

        search_engine = SearchEngine(session)

        click.echo("正在搜索...")
        papers = search_engine.search_papers(filter_config)

        if not papers:
            click.echo("未找到匹配的论文。")
            return

        click.echo(f"找到 {len(papers)} 篇论文:")

        # Preview only the first five hits; the report carries the full list.
        for i, paper in enumerate(papers[:5], 1):
            # `paper.authors` is stored as a JSON string; entries are read
            # with `.get("name")`, i.e. they are expected to be objects.
            authors_list = json.loads(paper.authors) if paper.authors else []
            author_names = [a.get("name", "") for a in authors_list[:2]]
            if len(authors_list) > 2:
                author_names.append("等")

            click.echo(f"\n{i}. {paper.title}")
            click.echo(f" 作者: {', '.join(author_names)}")
            click.echo(f" arXiv ID: {paper.arxiv_id}")
            click.echo(f" 分类: {paper.categories}")
            click.echo(f" 发布日期: {paper.published.strftime('%Y-%m-%d') if paper.published else 'N/A'}")
            click.echo(f" 总结状态: {'已总结' if paper.summarized else '未总结'}")

        if len(papers) > 5:
            click.echo(f"\n... 以及 {len(papers) - 5} 篇更多论文")

        click.echo("\n正在生成搜索报告...")
        files = generate_search_report(directory, query, search_terms, papers, paper_limit=limit)

        click.echo("报告生成完成:")
        for f in files:
            click.echo(f" - {f}")
        click.echo("\n详细论文信息、中文翻译和PDF链接请查看生成的Markdown报告。")


# The helpers below are defined after `search_advanced` on purpose: the
# decorator stack that precedes this block must stay attached to the command
# function, and the helpers are only looked up when the command runs.
def _build_keyword_prompt(query):
    """Return the user prompt asking the model to turn `query` into keywords."""
    return (
        "\n"
        f'用户正在搜索arXiv物理/计算材料科学论文,查询是: "{query}"\n'
        "\n"
        "请将自然语言查询转换为适合arXiv搜索的关键词或短语。\n"
        "考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学。\n"
        "\n"
        "返回格式:JSON数组,包含最多5个搜索关键词/短语。\n"
        '示例:["machine learning materials science", "density functional theory", "condensed matter physics"]\n'
        "\n"
        "只返回JSON数组,不要其他文本。\n"
    )


def _parse_ai_search_terms(raw_response):
    """Extract a non-empty list of keyword strings from the model's reply.

    Returns None when the reply is missing, is not valid JSON, is not a JSON
    array, or contains no non-blank string entries, so the caller can keep its
    fallback search terms.
    """
    try:
        parsed = json.loads(raw_response)
    except (TypeError, ValueError):
        # TypeError: reply is None / not text; ValueError covers JSONDecodeError.
        return None

    if not isinstance(parsed, list):
        return None

    terms = [term for term in parsed if isinstance(term, str) and term.strip()]
    return terms or None
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
@cli.command(help="显示搜索历史(按使用频率排序)")
@click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
@click.option("--limit", default=10, help="显示的搜索查询数量(默认:10)")
def search_history(directory, limit):
    """Print the stored search history for a working directory.

    The command help text (passed explicitly via ``help=`` so the CLI output
    stays unchanged) describes the listing as ordered by usage frequency; the
    ordering itself is whatever ``SearchEngine.get_search_history`` returns.

    Args:
        directory: Existing directory to operate in; resolved to an absolute
            path before the environment is set up.
        limit: Maximum number of history entries requested from the search
            engine.

    Exits with status 1 when ``setup_environment`` reports failure.
    """
    workspace = Path(directory).resolve()
    if not setup_environment(workspace):
        sys.exit(1)

    print_banner()

    pulse_crawler = ArXivCrawler()

    heavy_rule = "=" * 50
    click.echo("\n" + heavy_rule)
    click.echo("搜索历史")
    click.echo(heavy_rule)

    with pulse_crawler.db.get_session() as db_session:
        engine = SearchEngine(db_session)

        click.echo("正在获取搜索历史...")
        records = engine.get_search_history(limit=limit)

        # Nothing recorded yet: report it and stop before printing the list.
        if not records:
            click.echo("暂无搜索历史。")
            return

        click.echo(f"\n找到 {len(records)} 个搜索查询:")
        click.echo("-" * 50)

        for rank, record in enumerate(records, start=1):
            stamp = record["last_used"]
            if stamp:
                last_used = stamp.strftime("%Y-%m-%d")
            else:
                last_used = "N/A"

            click.echo(f"\n{rank}. 查询: {record['query']}")
            click.echo(f" 使用次数: {record['count']}")
            click.echo(f" 最后使用: {last_used}")
            if record["last_paper_id"]:
                click.echo(f" 最后论文ID: {record['last_paper_id']}")

        # Usage hints for re-running a listed query.
        click.echo("\n💡 提示: 使用 'pulse search \"查询内容\" .' 重用搜索")
        click.echo(" 或 'pulse search-advanced \"查询内容\" . --categories 分类' 进行高级搜索")
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
1402
|
# Script entry point: invoke the `cli` command object only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    cli()
|