arxiv-pulse 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
arxiv_pulse/cli.py CHANGED
@@ -10,14 +10,15 @@ from pathlib import Path
10
10
  import click
11
11
  from dotenv import load_dotenv
12
12
  import json
13
- from datetime import datetime, timedelta
13
+ from datetime import datetime, timedelta, timezone
14
14
  import questionary
15
+ import wcwidth
15
16
 
16
17
  from arxiv_pulse.config import Config
17
18
  from arxiv_pulse.arxiv_crawler import ArXivCrawler
18
19
  from arxiv_pulse.summarizer import PaperSummarizer
19
20
  from arxiv_pulse.report_generator import ReportGenerator
20
- from arxiv_pulse.output_manager import output
21
+ from arxiv_pulse.output_manager import output, OutputLevel
21
22
  from arxiv_pulse.search_engine import SearchEngine, SearchFilter
22
23
  from arxiv_pulse.__version__ import __version__
23
24
 
@@ -234,7 +235,6 @@ def setup_environment(directory: Path):
234
235
  # 创建必要的目录
235
236
  os.makedirs("data", exist_ok=True)
236
237
  os.makedirs("reports", exist_ok=True)
237
- os.makedirs("logs", exist_ok=True)
238
238
 
239
239
  # 加载 .env 文件(如果存在)
240
240
  env_file = directory / ".env"
@@ -267,12 +267,14 @@ def setup_environment(directory: Path):
267
267
  Config.AI_MODEL = os.getenv("AI_MODEL", "DeepSeek-V3.2-Thinking")
268
268
  Config.AI_BASE_URL = os.getenv("AI_BASE_URL", "https://llmapi.paratera.com")
269
269
  Config.SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", "2000"))
270
- Config.SUMMARY_SENTENCES_LIMIT = int(os.getenv("SUMMARY_SENTENCES_LIMIT", "3"))
271
270
  Config.TOKEN_PRICE_PER_MILLION = float(os.getenv("TOKEN_PRICE_PER_MILLION", "3.0"))
272
- Config.MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", "100"))
273
- Config.MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", "20"))
271
+ Config.MAX_RESULTS_INITIAL = int(os.getenv("MAX_RESULTS_INITIAL", "10000"))
272
+ Config.MAX_RESULTS_DAILY = int(os.getenv("MAX_RESULTS_DAILY", "500"))
274
273
  Config.YEARS_BACK = int(os.getenv("YEARS_BACK", "3"))
275
- Config.IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "important_papers.txt")
274
+ Config.IMPORTANT_PAPERS_FILE = os.getenv("IMPORTANT_PAPERS_FILE", "data/important_papers.txt")
275
+ Config.ARXIV_MAX_RESULTS = int(os.getenv("ARXIV_MAX_RESULTS", "30000"))
276
+ Config.ARXIV_SORT_BY = os.getenv("ARXIV_SORT_BY", "submittedDate")
277
+ Config.ARXIV_SORT_ORDER = os.getenv("ARXIV_SORT_ORDER", "descending")
276
278
  Config.REPORT_MAX_PAPERS = int(os.getenv("REPORT_MAX_PAPERS", "50"))
277
279
 
278
280
  # 更新 SEARCH_QUERIES
@@ -407,54 +409,74 @@ def print_banner_custom(fields):
407
409
  else:
408
410
  field_str = f"{fields[0]} • {fields[1]} • {fields[2]} • {fields[3]}"
409
411
 
410
- # 计算居中位置 (横幅宽度为55字符,边框各占1字符,内容宽度53字符)
411
- # 第一行标题:"arXiv Pulse - 文献追踪系统" (25字符)
412
- # 需要将字段字符串居中显示
412
+ # 横幅尺寸
413
413
  banner_width = 55
414
414
  content_width = 53
415
415
 
416
- # 创建横幅
416
+ # 辅助函数:计算字符串显示宽度
417
+ def display_width(text):
418
+ return wcwidth.wcswidth(text)
419
+
420
+ # 辅助函数:截断字符串到指定显示宽度,添加省略号
421
+ def truncate_to_width(text, max_width):
422
+ if display_width(text) <= max_width:
423
+ return text
424
+ # 逐步减少字符直到宽度合适
425
+ result = ""
426
+ for char in text:
427
+ if display_width(result + char) > max_width - 3: # 为"..."留出空间
428
+ break
429
+ result += char
430
+ return result + "..." if result else "..." # 至少返回省略号
431
+
432
+ # 创建横幅边框
417
433
  border_top = "╔" + "═" * (banner_width - 2) + "╗"
418
434
  border_bottom = "╚" + "═" * (banner_width - 2) + "╝"
419
435
 
420
436
  # 第一行标题
421
437
  title = "arXiv Pulse - 文献追踪系统"
422
- # 标题居中
423
- title_padding = (content_width - len(title) * 2) // 2 # 中文占2个英文字符宽度
424
- if title_padding < 0:
425
- title_padding = 0
426
- title_line = "║" + " " * title_padding + title + " " * (content_width - len(title) * 2 - title_padding) + "║"
438
+ title_width = display_width(title)
439
+ # 计算左右填充
440
+ left_padding = (content_width - title_width) // 2
441
+ right_padding = content_width - title_width - left_padding
442
+ title_line = "║" + " " * left_padding + title + " " * right_padding + "║"
427
443
 
428
444
  # 第二行字段
429
- # 简单处理:如果字段字符串太长,截断
430
- max_field_len = content_width - 4 # 留出一些边距
431
- if len(field_str) * 2 > max_field_len: # 中文占2个英文字符宽度
432
- # 截断字段字符串
433
- field_str = field_str[: max_field_len // 2] + "..."
434
-
435
- field_padding = (content_width - len(field_str) * 2) // 2
436
- if field_padding < 0:
437
- field_padding = 0
438
- field_line = (
439
- "║" + " " * field_padding + field_str + " " * (content_width - len(field_str) * 2 - field_padding) + "║"
440
- )
445
+ # 最大字段显示宽度(留出边距)
446
+ max_field_width = content_width - 4
447
+ # 截断字段字符串如果太长
448
+ field_str = truncate_to_width(field_str, max_field_width)
449
+ field_width = display_width(field_str)
450
+
451
+ # 计算字段行的左右填充
452
+ left_padding = (content_width - field_width) // 2
453
+ right_padding = content_width - field_width - left_padding
454
+ field_line = "║" + " " * left_padding + field_str + " " * right_padding + "║"
441
455
 
442
456
  banner = f"\n{border_top}\n{title_line}\n{field_line}\n{border_bottom}\n"
443
457
  click.echo(banner)
444
458
 
445
459
 
446
- def sync_papers(years_back=1, summarize=False):
447
- """同步论文(内部函数)"""
460
+ def sync_papers(years_back=1, summarize=False, force=False):
461
+ """同步论文(内部函数)
462
+
463
+ Args:
464
+ years_back: 回溯的年数
465
+ summarize: 是否总结新论文
466
+ force: 是否强制同步(重新下载所有论文,忽略重复检查)
467
+ """
448
468
  crawler = ArXivCrawler()
449
469
  summarizer = PaperSummarizer()
450
470
 
451
- click.echo(f"正在同步缺失论文(回溯 {years_back} 年)...")
471
+ mode_text = "强制同步" if force else "同步缺失论文"
472
+ click.echo(f"正在{mode_text}(回溯 {years_back} 年)...")
452
473
  click.echo("=" * 50)
453
474
 
454
475
  # 同步所有查询
455
476
  click.echo("1. 正在同步搜索查询...")
456
- sync_result = crawler.sync_all_queries(years_back=years_back)
457
- click.echo(f" 从查询添加了 {sync_result['total_new_papers']} 篇新论文")
477
+ sync_result = crawler.sync_all_queries(years_back=years_back, force=force)
478
+ result_text = "处理了" if force else "添加了"
479
+ click.echo(f" 从查询{result_text} {sync_result['total_new_papers']} 篇论文")
458
480
 
459
481
  # 同步重要论文
460
482
  click.echo("2. 正在同步重要论文...")
@@ -480,7 +502,7 @@ def sync_papers(years_back=1, summarize=False):
480
502
 
481
503
  click.echo("\n" + "=" * 50)
482
504
  click.echo("同步完成!")
483
- click.echo(f"总共添加了新论文: {total_new}")
505
+ click.echo(f"总共{result_text}论文: {total_new}")
484
506
  click.echo(f"数据库现有 {crawl_stats['total_papers']} 篇论文")
485
507
  click.echo(f"已总结: {summary_stats['summarized_papers']} ({summary_stats['summarization_rate']:.1%})")
486
508
 
@@ -490,12 +512,13 @@ def sync_papers(years_back=1, summarize=False):
490
512
  "sync_result": sync_result,
491
513
  "important_result": important_result,
492
514
  "stats": {"crawl_stats": crawl_stats, "summary_stats": summary_stats},
515
+ "force_mode": force,
493
516
  }
494
517
 
495
518
 
496
519
  def get_workday_cutoff(days_back):
497
520
  """计算排除周末的截止日期"""
498
- current = datetime.utcnow()
521
+ current = datetime.now(timezone.utc).replace(tzinfo=None)
499
522
  workdays_counted = 0
500
523
  days_to_go_back = 0
501
524
 
@@ -609,6 +632,11 @@ def generate_search_report(query, search_terms, papers, paper_limit=50, summariz
609
632
  """生成搜索结果的报告(内部函数)"""
610
633
  reporter = ReportGenerator()
611
634
 
635
+ # 如果没有找到论文,不生成报告
636
+ if not papers:
637
+ output.info("未找到论文,跳过报告生成")
638
+ return []
639
+
612
640
  # 设置报告限制
613
641
  original_limit = Config.REPORT_MAX_PAPERS
614
642
  Config.REPORT_MAX_PAPERS = paper_limit
@@ -698,10 +726,15 @@ def generate_search_report(query, search_terms, papers, paper_limit=50, summariz
698
726
 
699
727
 
700
728
  @click.group(context_settings={"help_option_names": ["-h", "--help"]})
729
+ @click.option("--verbose", "-v", is_flag=True, help="显示详细输出(包括调试信息)")
701
730
  @click.version_option(version=__version__, prog_name="arXiv Pulse")
702
- def cli():
731
+ def cli(verbose):
703
732
  """arXiv Pulse: 智能arXiv文献追踪和分析系统"""
704
- pass
733
+ if verbose:
734
+ output.set_min_level(OutputLevel.DEBUG)
735
+ else:
736
+ # 确保使用环境变量中的LOG_LEVEL(output manager已经处理)
737
+ pass
705
738
 
706
739
 
707
740
  def interactive_configuration():
@@ -778,10 +811,10 @@ def interactive_configuration():
778
811
  click.echo("\n📊 爬虫配置")
779
812
  click.echo("-" * 40)
780
813
 
781
- max_results_initial = click.prompt("初始同步每个查询的最大论文数", default=100, type=int, show_default=True)
814
+ max_results_initial = click.prompt("初始同步每个查询的最大论文数", default=10000, type=int, show_default=True)
782
815
  config["MAX_RESULTS_INITIAL"] = str(max_results_initial)
783
816
 
784
- max_results_daily = click.prompt("每日同步每个查询的最大论文数", default=20, type=int, show_default=True)
817
+ max_results_daily = click.prompt("每日同步每个查询的最大论文数", default=500, type=int, show_default=True)
785
818
  config["MAX_RESULTS_DAILY"] = str(max_results_daily)
786
819
 
787
820
  years_back = click.prompt("初始同步回溯的年数", default=5, type=int, show_default=True)
@@ -866,27 +899,24 @@ def interactive_configuration():
866
899
  click.echo("-" * 40)
867
900
 
868
901
  # 根据领域数量提供建议
869
- recommended_initial = 100
870
- recommended_daily = 20
871
-
872
- if num_selected_fields <= 3:
902
+ if num_selected_fields <= 6:
873
903
  click.echo("✅ 您选择了少量领域,保持默认配置即可。")
874
- elif num_selected_fields <= 6:
875
- recommended_initial = 70
876
- recommended_daily = 15
904
+ elif num_selected_fields <= 10:
905
+ recommended_initial = 4000
906
+ recommended_daily = 200
877
907
  click.echo(f"⚠️ 您选择了中等数量领域,建议调整爬虫配置以避免过多论文:")
878
- click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial} (原默认: 100)")
879
- click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily} (原默认: 20)")
908
+ click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial}")
909
+ click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily}")
880
910
  else:
881
- recommended_initial = 50
882
- recommended_daily = 10
911
+ recommended_initial = 1000
912
+ recommended_daily = 50
883
913
  click.echo(f"⚠️ 您选择了大量领域 ({num_selected_fields}个),强烈建议调整爬虫配置:")
884
- click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial} (原默认: 100)")
885
- click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily} (原默认: 20)")
914
+ click.echo(f" - 初始同步每个查询最大论文数: {recommended_initial}")
915
+ click.echo(f" - 每日同步每个查询最大论文数: {recommended_daily}")
886
916
  click.echo(f" - 注意:同步大量领域可能需要较长时间和更多存储空间。")
887
917
 
888
918
  # 询问用户是否应用建议
889
- if num_selected_fields > 3:
919
+ if num_selected_fields > 6:
890
920
  if click.confirm("\n💡 是否应用上述建议调整爬虫配置?", default=True):
891
921
  config["MAX_RESULTS_INITIAL"] = str(recommended_initial)
892
922
  config["MAX_RESULTS_DAILY"] = str(recommended_daily)
@@ -903,9 +933,6 @@ def interactive_configuration():
903
933
  report_max_papers = click.prompt("每份报告显示的最大论文数", default=50, type=int, show_default=True)
904
934
  config["REPORT_MAX_PAPERS"] = str(report_max_papers)
905
935
 
906
- summary_sentences_limit = click.prompt("摘要句子数限制", default=3, type=int, show_default=True)
907
- config["SUMMARY_SENTENCES_LIMIT"] = str(summary_sentences_limit)
908
-
909
936
  click.echo("\n✅ 配置完成!")
910
937
  return config, int(years_back)
911
938
 
@@ -920,7 +947,6 @@ def init(directory, years_back):
920
947
  # 创建目录结构
921
948
  (directory / "data").mkdir(exist_ok=True)
922
949
  (directory / "reports").mkdir(exist_ok=True)
923
- (directory / "logs").mkdir(exist_ok=True)
924
950
 
925
951
  # 创建 .env 文件(如果不存在)
926
952
  env_file = directory / ".env"
@@ -937,59 +963,77 @@ def init(directory, years_back):
937
963
  if years_back is None:
938
964
  years_back = interactive_years_back
939
965
 
940
- # 生成 .env 文件内容
941
- env_content = f"""# arXiv Pulse 配置文件
942
- # 由交互式配置向导于 {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} 生成
943
-
944
- # ========================
945
- # AI API 配置 (支持 OpenAI 格式)
946
- # ========================
947
- AI_API_KEY={config.get("AI_API_KEY", "your_api_key_here")}
948
- AI_MODEL={config.get("AI_MODEL", "DeepSeek-V3.2-Thinking")}
949
- AI_BASE_URL={config.get("AI_BASE_URL", "https://llmapi.paratera.com")}
950
-
951
- # ========================
952
- # 数据库配置
953
- # ========================
954
- DATABASE_URL=sqlite:///data/arxiv_papers.db
955
-
956
- # ========================
957
- # 爬虫配置
958
- # ========================
959
- MAX_RESULTS_INITIAL={config.get("MAX_RESULTS_INITIAL", "100")} # init命令每个查询的论文数
960
- MAX_RESULTS_DAILY={config.get("MAX_RESULTS_DAILY", "20")} # sync命令每个查询的论文数
961
-
962
- # ========================
963
- # 搜索查询配置
964
- # ========================
965
- # 分号分隔,允许查询中包含逗号
966
- # 根据您的选择生成的研究领域查询
967
- SEARCH_QUERIES={config.get("SEARCH_QUERIES", 'condensed matter physics AND cat:cond-mat.*; (ti:"density functional" OR abs:"density functional") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"machine learning" OR abs:"machine learning") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph)')}
968
-
969
- # ========================
970
- # 报告配置
971
- # ========================
972
- REPORT_DIR=reports
973
- SUMMARY_MAX_TOKENS=2000 # 总结和翻译的最大token数
974
- SUMMARY_SENTENCES_LIMIT={config.get("SUMMARY_SENTENCES_LIMIT", "3")}
975
- TOKEN_PRICE_PER_MILLION=3.0
976
- REPORT_MAX_PAPERS={config.get("REPORT_MAX_PAPERS", "50")}
977
-
978
- # ========================
979
- # 同步配置
980
- # ========================
981
- YEARS_BACK={config.get("YEARS_BACK", "3")} # 同步回溯的年数
982
- IMPORTANT_PAPERS_FILE=important_papers.txt
983
-
984
- # ========================
985
- # 可选配置
986
- # ========================
987
- # 日志级别: DEBUG, INFO, WARNING, ERROR (默认: INFO)
988
- LOG_LEVEL=INFO
989
-
990
- # 爬虫延迟(秒,避免频繁请求 arXiv API)
991
- CRAWL_DELAY=1.0
992
- """
966
+ # 读取 .ENV.TEMPLATE 文件作为基础模板
967
+ template_file = Path(__file__).parent / ".ENV.TEMPLATE"
968
+ if not template_file.exists():
969
+ click.echo(f"❌ 找不到模板文件: {template_file}")
970
+ click.echo("请确保 .ENV.TEMPLATE 文件存在于 arxiv_pulse 目录中")
971
+ return
972
+
973
+ env_content = template_file.read_text(encoding="utf-8")
974
+
975
+ # 添加生成时间戳注释(插入到第一行之后)
976
+ timestamp_comment = f"# 由交互式配置向导于 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 生成\n"
977
+ lines = env_content.split("\n")
978
+ if lines and lines[0].startswith("#"):
979
+ # 在第一行注释后插入时间戳
980
+ lines.insert(1, timestamp_comment)
981
+ else:
982
+ # 如果没有注释行,添加到开头
983
+ lines.insert(0, timestamp_comment)
984
+ env_content = "\n".join(lines)
985
+
986
+ # 替换配置项(使用 config 字典中的值)
987
+ # 统一替换逻辑:搜索以键名开头的行,替换整行
988
+ lines = env_content.split("\n")
989
+
990
+ # AI API 配置
991
+ for i, line in enumerate(lines):
992
+ if line.strip().startswith("AI_API_KEY="):
993
+ lines[i] = f"AI_API_KEY={config.get('AI_API_KEY', 'your_api_key_here')}"
994
+ break
995
+
996
+ for i, line in enumerate(lines):
997
+ if line.strip().startswith("AI_MODEL="):
998
+ lines[i] = f"AI_MODEL={config.get('AI_MODEL', 'DeepSeek-V3.2-Thinking')}"
999
+ break
1000
+
1001
+ for i, line in enumerate(lines):
1002
+ if line.strip().startswith("AI_BASE_URL="):
1003
+ lines[i] = f"AI_BASE_URL={config.get('AI_BASE_URL', 'https://llmapi.paratera.com')}"
1004
+ break
1005
+
1006
+ # 爬虫配置
1007
+ for i, line in enumerate(lines):
1008
+ if line.strip().startswith("MAX_RESULTS_INITIAL="):
1009
+ lines[i] = f"MAX_RESULTS_INITIAL={config.get('MAX_RESULTS_INITIAL', '10000')}"
1010
+ break
1011
+
1012
+ for i, line in enumerate(lines):
1013
+ if line.strip().startswith("MAX_RESULTS_DAILY="):
1014
+ lines[i] = f"MAX_RESULTS_DAILY={config.get('MAX_RESULTS_DAILY', '500')}"
1015
+ break
1016
+
1017
+ # 搜索查询配置
1018
+ default_search_queries = 'condensed matter physics AND cat:cond-mat.*; (ti:"density functional" OR abs:"density functional") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"machine learning" OR abs:"machine learning") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph)'
1019
+ for i, line in enumerate(lines):
1020
+ if line.strip().startswith("SEARCH_QUERIES="):
1021
+ lines[i] = f"SEARCH_QUERIES={config.get('SEARCH_QUERIES', default_search_queries)}"
1022
+ break
1023
+
1024
+ # 报告配置
1025
+ for i, line in enumerate(lines):
1026
+ if line.strip().startswith("REPORT_MAX_PAPERS="):
1027
+ lines[i] = f"REPORT_MAX_PAPERS={config.get('REPORT_MAX_PAPERS', '50')}"
1028
+ break
1029
+
1030
+ # 同步配置
1031
+ for i, line in enumerate(lines):
1032
+ if line.strip().startswith("YEARS_BACK="):
1033
+ lines[i] = f"YEARS_BACK={config.get('YEARS_BACK', '5')}"
1034
+ break
1035
+
1036
+ env_content = "\n".join(lines)
993
1037
 
994
1038
  env_file.write_text(env_content)
995
1039
  click.echo(f"\n✅ 已在 {directory} 创建 .env 配置文件")
@@ -1000,10 +1044,12 @@ CRAWL_DELAY=1.0
1000
1044
  years_back = 5 # 默认值
1001
1045
 
1002
1046
  # 创建 important_papers.txt(如果不存在)
1003
- important_file = directory / "important_papers.txt"
1047
+ important_file = directory / Config.IMPORTANT_PAPERS_FILE
1048
+ # 确保父目录存在
1049
+ important_file.parent.mkdir(parents=True, exist_ok=True)
1004
1050
  if not important_file.exists():
1005
1051
  important_file.write_text("# 在此添加重要论文的arXiv ID,每行一个\n")
1006
- click.echo(f"✅ 已在 {directory} 创建 important_papers.txt 文件")
1052
+ click.echo(f"✅ 已创建重要论文文件: {important_file}")
1007
1053
 
1008
1054
  # 设置环境并验证配置
1009
1055
  if not setup_environment(directory):
@@ -1046,10 +1092,15 @@ CRAWL_DELAY=1.0
1046
1092
 
1047
1093
  @cli.command()
1048
1094
  @click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
1049
- @click.option("--years-back", type=int, default=1, help="同步回溯的年数(默认:1年)")
1095
+ @click.option("--years-back", type=int, default=None, help="同步回溯的年数(默认:强制模式5年,普通模式1年)")
1050
1096
  @click.option("--summarize/--no-summarize", default=False, help="是否总结新论文(默认:否)")
1051
- def sync(directory, years_back, summarize):
1052
- """同步最新论文到数据库"""
1097
+ @click.option("--force", is_flag=True, default=False, help="强制同步:重新下载最近N年的所有论文,忽略重复检查")
1098
+ def sync(directory, years_back, summarize, force):
1099
+ """同步最新论文到数据库
1100
+
1101
+ 强制模式(--force): 重新下载最近N年的所有论文,忽略重复检查,默认回溯5年。
1102
+ 普通模式: 只下载缺失的新论文,默认回溯1年。
1103
+ """
1053
1104
  directory = Path(directory).resolve()
1054
1105
  click.echo(f"正在同步 arXiv Pulse 于 {directory}")
1055
1106
 
@@ -1058,8 +1109,13 @@ def sync(directory, years_back, summarize):
1058
1109
 
1059
1110
  print_banner()
1060
1111
 
1112
+ # 设置默认years_back值
1113
+ if years_back is None:
1114
+ years_back = 5 if force else 1
1115
+ click.echo(f"使用默认回溯年数: {years_back} 年")
1116
+
1061
1117
  # 同步论文
1062
- sync_result = sync_papers(years_back=years_back, summarize=summarize)
1118
+ sync_result = sync_papers(years_back=years_back, summarize=summarize, force=force)
1063
1119
 
1064
1120
  click.echo("\n" + "=" * 50)
1065
1121
  click.echo("同步完成!数据库已更新。")
@@ -1073,8 +1129,19 @@ def sync(directory, years_back, summarize):
1073
1129
  @click.option("--use-ai/--no-ai", default=True, help="是否使用AI理解自然语言查询(默认:是)")
1074
1130
  @click.option("--summarize/--no-summarize", default=True, help="是否自动总结未总结的论文(默认:是)")
1075
1131
  @click.option("--max-summarize", type=int, default=0, help="最大总结论文数(默认:0表示无限制)")
1076
- def search(query, directory, limit, years_back, use_ai, summarize, max_summarize):
1077
- """智能搜索论文(支持自然语言查询)"""
1132
+ @click.option("--categories", "-c", multiple=True, help="包含的分类(可多次使用)")
1133
+ @click.option("--days-back", type=int, help="回溯天数(例如:30表示最近30天)")
1134
+ @click.option("--authors", "-a", multiple=True, help="作者姓名(可多次使用)")
1135
+ @click.option(
1136
+ "--sort-by",
1137
+ type=click.Choice(["published", "relevance_score", "title", "updated"]),
1138
+ default="published",
1139
+ help="排序字段",
1140
+ )
1141
+ def search(
1142
+ query, directory, limit, years_back, use_ai, summarize, max_summarize, categories, days_back, authors, sort_by
1143
+ ):
1144
+ """智能搜索论文(支持自然语言查询和基本过滤)"""
1078
1145
  directory = Path(directory).resolve()
1079
1146
 
1080
1147
  if not setup_environment(directory):
@@ -1086,7 +1153,7 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
1086
1153
  crawler = ArXivCrawler()
1087
1154
  if years_back > 0:
1088
1155
  click.echo(f"搜索前先同步最近 {years_back} 年论文...")
1089
- sync_result = sync_papers(years_back=years_back, summarize=False)
1156
+ sync_result = sync_papers(years_back=years_back, summarize=False, force=False)
1090
1157
  crawler = sync_result["crawler"]
1091
1158
 
1092
1159
  click.echo(f"\n正在搜索: '{query}'")
@@ -1105,10 +1172,22 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
1105
1172
  用户正在搜索arXiv物理/计算材料科学论文,查询是: "{query}"
1106
1173
 
1107
1174
  请将自然语言查询转换为适合arXiv搜索的关键词或短语。
1108
- 考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学。
1109
1175
 
1110
- 返回格式:JSON数组,包含最多5个搜索关键词/短语。
1111
- 示例:["machine learning materials science", "density functional theory", "condensed matter physics"]
1176
+ 重要规则:
1177
+ 1. 如果查询已经是明确的搜索词(如"DeepH"、"deep learning Hamiltonian""DFT计算"),直接使用它,不要添加同义词
1178
+ 2. 如果查询包含专业术语、缩写或专有名词,保持原样作为主要搜索词
1179
+ 3. 仅当查询非常模糊或一般性时(如"机器学习在材料科学中的应用"),才生成1-2个相关关键词
1180
+ 4. 优先保持查询的原始意图,不要添加不相关的关键词
1181
+ 5. 对于英文查询,保持原样;对于中文查询,翻译为英文关键词
1182
+ 考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学、计算材料科学。
1183
+
1184
+ 返回格式:JSON数组,包含1-2个搜索关键词/短语。
1185
+ 示例:
1186
+ - 查询"DeepH": ["DeepH"]
1187
+ - 查询"deep learning Hamiltonian": ["deep learning Hamiltonian"]
1188
+ - 查询"DFT计算": ["DFT"]
1189
+ - 查询"分子动力学模拟": ["molecular dynamics simulation"]
1190
+ - 查询"机器学习在材料科学中的应用": ["machine learning materials science"]
1112
1191
 
1113
1192
  只返回JSON数组,不要其他文本。
1114
1193
  """
@@ -1116,7 +1195,10 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
1116
1195
  response = client.chat.completions.create(
1117
1196
  model=Config.AI_MODEL,
1118
1197
  messages=[
1119
- {"role": "system", "content": "你是arXiv论文搜索助手,擅长将自然语言查询转换为学术搜索关键词。"},
1198
+ {
1199
+ "role": "system",
1200
+ "content": "你是arXiv论文搜索助手,擅长识别专业术语并将自然语言查询转换为学术搜索关键词。",
1201
+ },
1120
1202
  {"role": "user", "content": ai_prompt},
1121
1203
  ],
1122
1204
  max_tokens=200,
@@ -1141,30 +1223,30 @@ def search(query, directory, limit, years_back, use_ai, summarize, max_summarize
1141
1223
  with crawler.db.get_session() as session:
1142
1224
  from arxiv_pulse.models import Paper
1143
1225
 
1144
- all_results = []
1145
- for term in search_terms:
1146
- papers = (
1147
- session.query(Paper)
1148
- .filter(
1149
- Paper.title.contains(term)
1150
- | Paper.abstract.contains(term)
1151
- | Paper.categories.contains(term)
1152
- | Paper.search_query.contains(term)
1153
- )
1154
- .order_by(Paper.published.desc())
1155
- .limit(limit)
1156
- .all()
1157
- )
1158
- all_results.extend(papers)
1226
+ # 使用增强搜索引擎进行模糊搜索
1227
+ search_engine = SearchEngine(session)
1159
1228
 
1160
- # 去重并排序
1161
- unique_papers = {}
1162
- for paper in all_results:
1163
- if paper.arxiv_id not in unique_papers:
1164
- unique_papers[paper.arxiv_id] = paper
1229
+ # 将搜索词合并为一个查询(搜索引擎会处理单词拆分和同义词扩展)
1230
+ combined_query = " ".join(search_terms)
1231
+
1232
+ filter_config = SearchFilter(
1233
+ query=combined_query,
1234
+ search_fields=["title", "abstract"],
1235
+ categories=list(categories) if categories else None,
1236
+ authors=list(authors) if authors else None,
1237
+ author_match="contains", # 默认使用包含匹配
1238
+ days_back=days_back,
1239
+ limit=limit * min(len(search_terms), 2), # 扩大限制但最多2倍,避免过多结果
1240
+ sort_by=sort_by,
1241
+ sort_order="desc",
1242
+ match_all=True, # AND逻辑:匹配所有搜索词
1243
+ )
1244
+
1245
+ # 执行搜索
1246
+ papers_to_show = search_engine.search_papers(filter_config)
1165
1247
 
1166
- sorted_papers = sorted(unique_papers.values(), key=lambda p: p.published or datetime.min, reverse=True)
1167
- papers_to_show = sorted_papers[:limit]
1248
+ # 确保不超过限制
1249
+ papers_to_show = papers_to_show[:limit]
1168
1250
 
1169
1251
  click.echo(f"找到 {len(papers_to_show)} 篇论文:")
1170
1252
 
@@ -1219,7 +1301,7 @@ def recent(directory, limit, days_back, years_back, summarize, max_summarize):
1219
1301
  # 先同步论文
1220
1302
  if years_back > 0:
1221
1303
  click.echo(f"报告前先同步最近 {years_back} 年论文...")
1222
- sync_papers(years_back=years_back, summarize=False)
1304
+ sync_papers(years_back=years_back, summarize=False, force=False)
1223
1305
 
1224
1306
  # 生成报告
1225
1307
  click.echo("\n" + "=" * 50)
@@ -1294,8 +1376,6 @@ def stat(directory):
1294
1376
  # 时间分布
1295
1377
  click.echo(f"\n📅 时间分布:")
1296
1378
  with crawler.db.get_session() as session:
1297
- from datetime import datetime, timedelta
1298
-
1299
1379
  # 按年统计
1300
1380
  year_stats = {}
1301
1381
  for paper in papers:
@@ -1319,290 +1399,5 @@ def stat(directory):
1319
1399
  click.echo("统计完成 ✅")
1320
1400
 
1321
1401
 
1322
- @cli.command()
1323
- @click.argument("paper_id")
1324
- @click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
1325
- @click.option("--limit", default=10, help="返回结果的最大数量(默认:10)")
1326
- @click.option("--threshold", type=float, default=0.5, help="相似度阈值(0.0-1.0,默认:0.5)")
1327
- @click.option("--years-back", type=int, default=0, help="搜索前同步回溯的年数(默认:0,不更新)")
1328
- def similar(paper_id, directory, limit, threshold, years_back):
1329
- """查找与指定论文相似的论文"""
1330
- directory = Path(directory).resolve()
1331
-
1332
- if not setup_environment(directory):
1333
- sys.exit(1)
1334
-
1335
- print_banner()
1336
-
1337
- # 如果需要,先同步最新论文
1338
- crawler = ArXivCrawler()
1339
- if years_back > 0:
1340
- click.echo(f"搜索前先同步最近 {years_back} 年论文...")
1341
- sync_result = sync_papers(years_back=years_back, summarize=False)
1342
- crawler = sync_result["crawler"]
1343
-
1344
- click.echo(f"\n查找与论文 '{paper_id}' 相似的论文")
1345
- click.echo("=" * 50)
1346
-
1347
- with crawler.db.get_session() as session:
1348
- # 创建搜索引擎
1349
- search_engine = SearchEngine(session)
1350
-
1351
- # 查找相似论文
1352
- click.echo(f"正在查找相似度≥{threshold}的论文...")
1353
- similar_papers_with_scores = search_engine.search_similar_papers(paper_id, limit=limit, threshold=threshold)
1354
-
1355
- if not similar_papers_with_scores:
1356
- click.echo("未找到相似论文。")
1357
- return
1358
-
1359
- click.echo(f"找到 {len(similar_papers_with_scores)} 篇相似论文:")
1360
-
1361
- # 提取paper列表用于报告生成
1362
- similar_papers = [paper for paper, _ in similar_papers_with_scores]
1363
-
1364
- # 显示结果
1365
- for i, (paper, similarity) in enumerate(similar_papers_with_scores, 1):
1366
- authors = json.loads(paper.authors) if paper.authors else []
1367
- author_names = [a.get("name", "") for a in authors[:2]]
1368
- if len(authors) > 2:
1369
- author_names.append("等")
1370
-
1371
- click.echo(f"\n{i}. {paper.title}")
1372
- click.echo(f" 相似度: {similarity:.2f}")
1373
- click.echo(f" 作者: {', '.join(author_names)}")
1374
- click.echo(f" arXiv ID: {paper.arxiv_id}")
1375
- click.echo(f" 分类: {paper.categories}")
1376
- click.echo(f" 发布日期: {paper.published.strftime('%Y-%m-%d') if paper.published else 'N/A'}")
1377
-
1378
- # 生成报告
1379
- click.echo("\n正在生成相似论文报告...")
1380
- report_files = generate_search_report(
1381
- f"与 {paper_id} 相似的论文", [f"similar to {paper_id}"], similar_papers, paper_limit=limit
1382
- )
1383
-
1384
- click.echo(f"报告生成完成:")
1385
- for f in report_files:
1386
- click.echo(f" - {f}")
1387
-
1388
-
1389
- @cli.command()
1390
- @click.argument("query")
1391
- @click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
1392
- @click.option("--limit", default=20, help="返回结果的最大数量(默认:20)")
1393
- @click.option("--years-back", type=int, default=0, help="搜索前同步回溯的年数(默认:0,不更新)")
1394
- @click.option("--use-ai/--no-ai", default=True, help="是否使用AI理解自然语言查询(默认:是)")
1395
- @click.option("--categories", "-c", multiple=True, help="包含的分类(可多次使用)")
1396
- @click.option("--exclude-categories", "-ec", multiple=True, help="排除的分类(可多次使用)")
1397
- @click.option("--primary-category", "-pc", help="主要分类")
1398
- @click.option("--authors", "-a", multiple=True, help="作者姓名(可多次使用)")
1399
- @click.option(
1400
- "--author-match",
1401
- type=click.Choice(["contains", "exact", "any"]),
1402
- default="contains",
1403
- help="作者匹配方式:contains(包含)、exact(精确)、any(任一)",
1404
- )
1405
- @click.option("--date-from", type=click.DateTime(formats=["%Y-%m-%d"]), help="起始日期(格式:YYYY-MM-DD)")
1406
- @click.option("--date-to", type=click.DateTime(formats=["%Y-%m-%d"]), help="结束日期(格式:YYYY-MM-DD)")
1407
- @click.option("--days-back", type=int, help="回溯天数(例如:30表示最近30天)")
1408
- @click.option("--summarized-only/--no-summarized-only", default=False, help="仅显示已总结的论文")
1409
- @click.option("--downloaded-only/--no-downloaded-only", default=False, help="仅显示已下载的论文")
1410
- @click.option(
1411
- "--sort-by",
1412
- type=click.Choice(["published", "relevance_score", "title", "updated", "created_at"]),
1413
- default="published",
1414
- help="排序字段",
1415
- )
1416
- @click.option("--sort-order", type=click.Choice(["asc", "desc"]), default="desc", help="排序顺序")
1417
- @click.option("--match-all/--match-any", default=False, help="匹配所有条件(AND逻辑)或任一条件(OR逻辑)")
1418
- def search_advanced(
1419
- query,
1420
- directory,
1421
- limit,
1422
- years_back,
1423
- use_ai,
1424
- categories,
1425
- exclude_categories,
1426
- primary_category,
1427
- authors,
1428
- author_match,
1429
- date_from,
1430
- date_to,
1431
- days_back,
1432
- summarized_only,
1433
- downloaded_only,
1434
- sort_by,
1435
- sort_order,
1436
- match_all,
1437
- ):
1438
- """高级搜索论文(支持多字段过滤)"""
1439
- directory = Path(directory).resolve()
1440
-
1441
- if not setup_environment(directory):
1442
- sys.exit(1)
1443
-
1444
- print_banner()
1445
-
1446
- # 如果需要,先同步最新论文
1447
- crawler = ArXivCrawler()
1448
- if years_back > 0:
1449
- click.echo(f"搜索前先同步最近 {years_back} 年论文...")
1450
- sync_result = sync_papers(years_back=years_back, summarize=False)
1451
- crawler = sync_result["crawler"]
1452
-
1453
- click.echo(f"\n高级搜索: '{query}'")
1454
- click.echo("=" * 50)
1455
-
1456
- search_terms = [query]
1457
-
1458
- # 如果启用AI且配置了AI API密钥,尝试解析自然语言查询
1459
- if use_ai and Config.AI_API_KEY:
1460
- try:
1461
- import openai
1462
-
1463
- client = openai.OpenAI(api_key=Config.AI_API_KEY, base_url=Config.AI_BASE_URL)
1464
-
1465
- ai_prompt = f"""
1466
- 用户正在搜索arXiv物理/计算材料科学论文,查询是: "{query}"
1467
-
1468
- 请将自然语言查询转换为适合arXiv搜索的关键词或短语。
1469
- 考虑以下领域:凝聚态物理、密度泛函理论(DFT)、机器学习、力场、分子动力学、量子化学。
1470
-
1471
- 返回格式:JSON数组,包含最多5个搜索关键词/短语。
1472
- 示例:["machine learning materials science", "density functional theory", "condensed matter physics"]
1473
-
1474
- 只返回JSON数组,不要其他文本。
1475
- """
1476
-
1477
- response = client.chat.completions.create(
1478
- model=Config.AI_MODEL,
1479
- messages=[
1480
- {"role": "system", "content": "你是arXiv论文搜索助手,擅长将自然语言查询转换为学术搜索关键词。"},
1481
- {"role": "user", "content": ai_prompt},
1482
- ],
1483
- max_tokens=200,
1484
- temperature=0.3,
1485
- )
1486
-
1487
- ai_response = response.choices[0].message.content
1488
- try:
1489
- search_terms = json.loads(ai_response)
1490
- if isinstance(search_terms, list) and len(search_terms) > 0:
1491
- click.echo(f"AI解析的搜索词: {', '.join(search_terms[:3])}")
1492
- if len(search_terms) > 3:
1493
- click.echo(f" 以及 {len(search_terms) - 3} 个其他关键词")
1494
- except:
1495
- # 如果AI响应不是有效JSON,使用原始查询
1496
- pass
1497
-
1498
- except Exception as e:
1499
- click.echo(f"AI解析失败,使用原始查询: {e}")
1500
-
1501
- # 使用增强搜索引擎
1502
- with crawler.db.get_session() as session:
1503
- # 创建搜索过滤器
1504
- filter_config = SearchFilter(
1505
- query=query,
1506
- categories=list(categories) if categories else None,
1507
- exclude_categories=list(exclude_categories) if exclude_categories else None,
1508
- primary_category=primary_category,
1509
- authors=list(authors) if authors else None,
1510
- author_match=author_match,
1511
- date_from=date_from,
1512
- date_to=date_to,
1513
- days_back=days_back,
1514
- summarized_only=summarized_only,
1515
- downloaded_only=downloaded_only,
1516
- limit=limit,
1517
- sort_by=sort_by,
1518
- sort_order=sort_order,
1519
- match_all=match_all,
1520
- )
1521
-
1522
- # 创建搜索引擎
1523
- search_engine = SearchEngine(session)
1524
-
1525
- # 执行搜索
1526
- click.echo(f"正在搜索...")
1527
- papers = search_engine.search_papers(filter_config)
1528
-
1529
- if not papers:
1530
- click.echo("未找到匹配的论文。")
1531
- return
1532
-
1533
- click.echo(f"找到 {len(papers)} 篇论文:")
1534
-
1535
- # 显示简要结果
1536
- for i, paper in enumerate(papers[:5], 1): # 只显示前5篇作为预览
1537
- authors_list = json.loads(paper.authors) if paper.authors else []
1538
- author_names = [a.get("name", "") for a in authors_list[:2]]
1539
- if len(authors_list) > 2:
1540
- author_names.append("等")
1541
-
1542
- click.echo(f"\n{i}. {paper.title}")
1543
- click.echo(f" 作者: {', '.join(author_names)}")
1544
- click.echo(f" arXiv ID: {paper.arxiv_id}")
1545
- click.echo(f" 分类: {paper.categories}")
1546
- click.echo(f" 发布日期: {paper.published.strftime('%Y-%m-%d') if paper.published else 'N/A'}")
1547
- click.echo(f" 总结状态: {'已总结' if paper.summarized else '未总结'}")
1548
-
1549
- if len(papers) > 5:
1550
- click.echo(f"\n... 以及 {len(papers) - 5} 篇更多论文")
1551
-
1552
- # 生成搜索报告
1553
- click.echo("\n正在生成搜索报告...")
1554
- files = generate_search_report(directory, query, search_terms, papers, paper_limit=limit)
1555
-
1556
- click.echo(f"报告生成完成:")
1557
- for f in files:
1558
- click.echo(f" - {f}")
1559
- click.echo(f"\n详细论文信息、中文翻译和PDF链接请查看生成的Markdown报告。")
1560
-
1561
-
1562
- @cli.command()
1563
- @click.argument("directory", type=click.Path(exists=True, file_okay=False), default=".")
1564
- @click.option("--limit", default=10, help="显示的搜索查询数量(默认:10)")
1565
- def search_history(directory, limit):
1566
- """显示搜索历史(按使用频率排序)"""
1567
- directory = Path(directory).resolve()
1568
-
1569
- if not setup_environment(directory):
1570
- sys.exit(1)
1571
-
1572
- print_banner()
1573
-
1574
- crawler = ArXivCrawler()
1575
-
1576
- click.echo("\n" + "=" * 50)
1577
- click.echo("搜索历史")
1578
- click.echo("=" * 50)
1579
-
1580
- with crawler.db.get_session() as session:
1581
- # 创建搜索引擎
1582
- search_engine = SearchEngine(session)
1583
-
1584
- # 获取搜索历史
1585
- click.echo(f"正在获取搜索历史...")
1586
- history = search_engine.get_search_history(limit=limit)
1587
-
1588
- if not history:
1589
- click.echo("暂无搜索历史。")
1590
- return
1591
-
1592
- click.echo(f"\n找到 {len(history)} 个搜索查询:")
1593
- click.echo("-" * 50)
1594
-
1595
- for i, item in enumerate(history, 1):
1596
- last_used = item["last_used"].strftime("%Y-%m-%d") if item["last_used"] else "N/A"
1597
- click.echo(f"\n{i}. 查询: {item['query']}")
1598
- click.echo(f" 使用次数: {item['count']}")
1599
- click.echo(f" 最后使用: {last_used}")
1600
- if item["last_paper_id"]:
1601
- click.echo(f" 最后论文ID: {item['last_paper_id']}")
1602
-
1603
- click.echo(f"\n💡 提示: 使用 'pulse search \"查询内容\" .' 重用搜索")
1604
- click.echo(f" 或 'pulse search-advanced \"查询内容\" . --categories 分类' 进行高级搜索")
1605
-
1606
-
1607
1402
  if __name__ == "__main__":
1608
1403
  cli()