PyPI - arxiv-pulse - Versions diffs - 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl - Mend

arxiv-pulse 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

arxiv_pulse/.ENV.TEMPLATE +93 -41
arxiv_pulse/__version__.py +2 -2
arxiv_pulse/arxiv_crawler.py +65 -23
arxiv_pulse/cli.py +228 -433
arxiv_pulse/config.py +6 -8
arxiv_pulse/models.py +17 -9
arxiv_pulse/output_manager.py +38 -54
arxiv_pulse/report_generator.py +3 -46
arxiv_pulse/search_engine.py +105 -53
arxiv_pulse/summarizer.py +0 -1
{arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/METADATA +61 -124
arxiv_pulse-0.6.1.dist-info/RECORD +17 -0
arxiv_pulse-0.5.0.dist-info/RECORD +0 -17
{arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/WHEEL +0 -0
{arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/entry_points.txt +0 -0
{arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/licenses/LICENSE +0 -0
{arxiv_pulse-0.5.0.dist-info → arxiv_pulse-0.6.1.dist-info}/top_level.txt +0 -0

arxiv_pulse/.ENV.TEMPLATE CHANGED Viewed

@@ -1,72 +1,124 @@
 # arXiv Pulse 配置文件模板
 # 将此文件复制为 .env 并进行配置
+# 所有配置项均可通过环境变量覆盖
-# ========================
-# AI API 配置 (支持 OpenAI 格式)
-# ========================
-# 使用 OpenAI 格式的 API，支持 DeepSeek、Paratera AI 等
-# 示例：Paratera AI: https://llmapi.paratera.com
-# 示例：DeepSeek: https://api.deepseek.com
+# ============================================================================
+# 1. AI API 配置 (支持所有 OpenAI 兼容服务)
+# ============================================================================
+# 支持 DeepSeek、Paratera AI、OpenAI 等所有 OpenAI 兼容服务
+# 示例: Paratera AI: https://llmapi.paratera.com
+# 示例: DeepSeek: https://api.deepseek.com
+# 示例: OpenAI: https://api.openai.com
+# AI API 密钥 (必需，用于论文总结和翻译)
 AI_API_KEY=your_api_key_here
+# AI 模型名称 (默认: DeepSeek-V3.2-Thinking)
 AI_MODEL=DeepSeek-V3.2-Thinking
+# AI API 基础 URL (默认: Paratera AI)
 AI_BASE_URL=https://llmapi.paratera.com
-# ========================
-# 数据库配置
-# ========================
+# ============================================================================
+# 2. 数据库配置
+# ============================================================================
+# 数据库连接 URL (默认: SQLite)
 DATABASE_URL=sqlite:///data/arxiv_papers.db
-# ========================
-# 爬虫配置
-# ========================
-MAX_RESULTS_INITIAL=100    # init命令每个查询的论文数
-MAX_RESULTS_DAILY=20       # sync命令每个查询的论文数
+# ============================================================================
+# 3. 爬虫配置
+# ============================================================================
+# 初始同步每个查询的最大论文数 (默认: 10000)
+# 注意: 实际取 MIN(MAX_RESULTS_INITIAL, ARXIV_MAX_RESULTS, 30000)
+MAX_RESULTS_INITIAL=10000
+# 每日同步每个查询的最大论文数 (默认: 500)
+# 日常更新通常只需要最新论文，因此设置较小值
+MAX_RESULTS_DAILY=500
+# 爬虫延迟 (秒，避免频繁请求 arXiv API) (默认: 1.0)
+# 注意: 当前代码中未使用此配置，arXiv 客户端使用固定延迟 3.0 秒
+CRAWL_DELAY=1.0
+# arXiv API 最大返回论文数限制 (默认: 30000)
+# 注意: arXiv API 最大限制为 30000 篇，但实际查询可能受性能影响
+ARXIV_MAX_RESULTS=30000
+# arXiv API 排序方式 (默认: submittedDate)
+# 可选: submittedDate, lastUpdatedDate, relevance
+ARXIV_SORT_BY=submittedDate
+# arXiv API 排序顺序 (默认: descending)
+# 可选: descending, ascending
+ARXIV_SORT_ORDER=descending
-# ========================
-# 搜索查询配置
-# ========================
-# 分号分隔，允许查询中包含逗号
+# ============================================================================
+# 4. 搜索查询配置
+# ============================================================================
+# 分号分隔的搜索查询列表，允许查询中包含逗号
 # 默认监控凝聚态物理、DFT、机器学习、力场等研究领域
+# 通过交互式配置向导 (pulse init .) 可以自动生成优化查询
 SEARCH_QUERIES=condensed matter physics AND cat:cond-mat.*; (ti:"density functional" OR abs:"density functional") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"machine learning" OR abs:"machine learning") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci OR cat:physics.chem-ph); (ti:"force field" OR abs:"force field") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"first principles" OR abs:"first principles" OR ti:"ab initio" OR abs:"ab initio") AND (cat:physics.comp-ph OR cat:cond-mat.mtrl-sci); (ti:"molecular dynamics" OR abs:"molecular dynamics") AND (cat:physics.comp-ph OR cat:cond-mat.soft OR cat:physics.chem-ph); (ti:"quantum chemistry" OR abs:"quantum chemistry") AND (cat:physics.chem-ph OR cat:physics.comp-ph); cat:cond-mat.mtrl-sci AND (ti:"computational" OR abs:"computational" OR ti:"simulation" OR abs:"simulation")
-# ========================
-# 报告配置
-# ========================
+# ============================================================================
+# 5. 报告配置
+# ============================================================================
+# 报告输出目录 (默认: reports)
 REPORT_DIR=reports
-# SUMMARY_MODEL 现在由 AI_MODEL 控制，不再需要单独设置
-SUMMARY_MAX_TOKENS=2000    # 总结和翻译的最大token数
-SUMMARY_SENTENCES_LIMIT=3
-TOKEN_PRICE_PER_MILLION=3.0
+# 报告中包含的最大论文数 (默认: 50)
 REPORT_MAX_PAPERS=50
+# AI 总结和翻译的最大 token 数 (默认: 2000)
+SUMMARY_MAX_TOKENS=2000
+# AI API 每百万 token 价格 (人民币，用于费用估算) (默认: 3.0)
+TOKEN_PRICE_PER_MILLION=3.0
+# ============================================================================
+# 6. 同步配置
+# ============================================================================
-# ========================
-# 同步配置
-# ========================
-YEARS_BACK=3               # 同步回溯的年数
-IMPORTANT_PAPERS_FILE=important_papers.txt
+# 初始同步回溯的年数 (默认: 3 年)
+YEARS_BACK=3
+# 重要论文列表文件路径 (默认: data/important_papers.txt)
+IMPORTANT_PAPERS_FILE=data/important_papers.txt
-# ========================
-# 可选配置
-# ========================
-# 日志级别: DEBUG, INFO, WARNING, ERROR (默认: INFO)
+# ============================================================================
+# 7. 日志配置
+# ============================================================================
+# 日志级别 (默认: INFO)
+# 可选: DEBUG, INFO, WARNING, ERROR
 LOG_LEVEL=INFO
-# 爬虫延迟（秒，避免频繁请求 arXiv API）
-CRAWL_DELAY=1.0
-# ========================
+# ============================================================================
 # 使用说明
-# ========================
+# ============================================================================
 # 1. 将此文件复制为 .env
-# 2. 设置您的 AI API 密钥
+# 2. 设置您的 AI API 密钥 (AI_API_KEY)
 # 3. 根据需要调整其他配置
-# 4. 运行 pulse init . 初始化目录
+# 4. 运行 pulse init . 初始化目录 (推荐使用交互式配置向导)
 # 5. 运行 pulse sync . 同步论文
-# 6. 使用 pulse search "查询内容" . 搜索论文
+# 6. 使用 pulse search "查询内容" . 搜索论文
+# 7. 使用 pulse recent . 生成最近论文报告
+# 注意: 通过交互式配置向导 (pulse init .) 可以自动优化配置，
+#       包括研究领域选择、智能参数推荐等。
+# 关于 MAX_RESULTS_INITIAL 和 MAX_RESULTS_DAILY 的区别:
+# - MAX_RESULTS_INITIAL: 首次爬取某个查询时使用的限制，通常需要获取较多历史论文
+# - MAX_RESULTS_DAILY: 日常更新已有查询时使用的限制，通常只需要获取最新论文
+# 两者有不同的使用场景，因此需要分别配置。

arxiv_pulse/__version__.py CHANGED Viewed

@@ -29,5 +29,5 @@ try:
 except importlib.metadata.PackageNotFoundError:
     # 包未安装时使用默认版本
-    __version__ = "0.5.0"
-    __version_info__ = (0, 5, 0)
+    __version__ = "0.6.1"
+    __version_info__ = (0, 6, 1)

arxiv_pulse/arxiv_crawler.py CHANGED Viewed

@@ -43,11 +43,26 @@ class ArXivCrawler:
             cutoff_date: Optional UTC datetime cutoff; papers older than this will be skipped
                          and iteration will stop early due to descending date order.
         """
+        # Map sort_by string to arxiv enum
+        sort_by_map = {
+            "submittedDate": arxiv.SortCriterion.SubmittedDate,
+            "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
+            "relevance": arxiv.SortCriterion.Relevance,
+        }
+        sort_by = sort_by_map.get(Config.ARXIV_SORT_BY, arxiv.SortCriterion.SubmittedDate)
+        # Map sort_order string to arxiv enum
+        sort_order_map = {
+            "descending": arxiv.SortOrder.Descending,
+            "ascending": arxiv.SortOrder.Ascending,
+        }
+        sort_order = sort_order_map.get(Config.ARXIV_SORT_ORDER, arxiv.SortOrder.Descending)
         search = arxiv.Search(
             query=query,
             max_results=max_results,
-            sort_by=arxiv.SortCriterion.SubmittedDate,
-            sort_order=arxiv.SortOrder.Descending,
+            sort_by=sort_by,
+            sort_order=sort_order,
         )
         results = []
@@ -208,31 +223,51 @@ class ArXivCrawler:
             )
             return latest_paper.published if latest_paper else None  # type: ignore
-    def sync_query(self, query: str, years_back: int = 3) -> Dict[str, Any]:
-        """Sync papers for a specific query, fetching missing papers from recent years"""
-        output.do(f"同步查询: {query}")
+    def sync_query(self, query: str, years_back: int = 3, force: bool = False) -> Dict[str, Any]:
+        """Sync papers for a specific query, fetching missing papers from recent years
-        # Get latest paper date in database for this query
-        latest_date = self.get_latest_paper_date_for_query(query)
+        Args:
+            query: arXiv search query
+            years_back: Number of years to look back
+            force: If True, ignore all max results limits (MAX_RESULTS_INITIAL,
+                   MAX_RESULTS_DAILY, ARXIV_MAX_RESULTS) but still skip existing papers
+        """
+        output.do(f"同步查询: {query}" + (" (强制模式)" if force else ""))
-        if latest_date:
-            # If we have papers, fetch from latest date onward
-            start_date = latest_date.replace(tzinfo=timezone.utc)
-            # 减去一天以确保获取所有可能的新论文，避免因时间精度问题错过论文
-            start_date = start_date - timedelta(days=1)
-            output.debug(f"获取论文从 {start_date.strftime('%Y-%m-%d')} 到现在")
-        else:
-            # If no papers, fetch from years_back years ago
+        if force:
+            # Force mode: always start from years_back years ago, ignore all limits
             start_date = datetime.now(timezone.utc) - timedelta(days=365 * years_back)
-            output.debug(f"获取最近 {years_back} 年的论文 ({start_date.strftime('%Y-%m-%d')} 到现在)")
+            output.debug(f"强制同步: 获取最近 {years_back} 年的所有论文 ({start_date.strftime('%Y-%m-%d')} 到现在)")
+            # Use a very large number to bypass all limits (arxiv API may have its own limits)
+            max_results = 30000  # Ignore MAX_RESULTS_INITIAL, MAX_RESULTS_DAILY, and ARXIV_MAX_RESULTS
+        else:
+            # Normal mode: get latest paper date in database for this query
+            latest_date = self.get_latest_paper_date_for_query(query)
+            if latest_date:
+                # If we have papers, fetch from latest date onward
+                start_date = latest_date.replace(tzinfo=timezone.utc)
+                # 减去一天以确保获取所有可能的新论文，避免因时间精度问题错过论文
+                start_date = start_date - timedelta(days=1)
+                output.debug(f"获取论文从 {start_date.strftime('%Y-%m-%d')} 到现在")
+                # Use daily limit, but respect arXiv API maximum
+                max_results = min(Config.MAX_RESULTS_DAILY, Config.ARXIV_MAX_RESULTS)
+            else:
+                # If no papers, fetch from years_back years ago
+                start_date = datetime.now(timezone.utc) - timedelta(days=365 * years_back)
+                output.debug(f"获取最近 {years_back} 年的论文 ({start_date.strftime('%Y-%m-%d')} 到现在)")
+                # Use initial limit, but respect arXiv API maximum
+                max_results = min(Config.MAX_RESULTS_INITIAL, Config.ARXIV_MAX_RESULTS)
         # Search arXiv without date filter (use cutoff_date for early stopping)
         try:
             papers = self.search_arxiv(
                 query,
-                max_results=Config.ARXIV_MAX_RESULTS,
+                max_results=max_results,
                 cutoff_date=start_date,
             )
+            # Always filter out existing papers (even in force mode)
             new_papers = self.filter_new_papers(papers)
             saved = self.save_papers(new_papers, query)
@@ -245,29 +280,36 @@ class ArXivCrawler:
                 "total_found": len(papers),
                 "new_papers": len(saved),
                 "saved_papers": saved,
+                "force_mode": force,
             }
         except Exception as e:
             output.error(f"同步查询失败: {query}", details={"exception": str(e)})
-            return {"query": query, "error": str(e), "new_papers": 0}
+            return {"query": query, "error": str(e), "new_papers": 0, "force_mode": force}
-    def sync_all_queries(self, years_back: int = 3) -> Dict[str, Any]:
-        """Sync all configured search queries"""
-        output.do(f"同步所有查询 (回溯 {years_back} 年)")
+    def sync_all_queries(self, years_back: int = 3, force: bool = False) -> Dict[str, Any]:
+        """Sync all configured search queries
+        Args:
+            years_back: Number of years to look back
+            force: If True, ignore all max results limits but still skip existing papers
+        """
+        output.do(f"同步所有查询 (回溯 {years_back} 年)" + (" (强制模式)" if force else ""))
         all_results = []
         total_new = 0
         for query in self.config.SEARCH_QUERIES:
-            result = self.sync_query(query, years_back)
+            result = self.sync_query(query, years_back, force)
             all_results.append(result)
             total_new += result.get("new_papers", 0)
-        output.done(f"同步完成: 共 {total_new} 篇新论文")
+        output.done(f"同步完成: 共 {total_new} 篇论文")
         return {
             "total_new_papers": total_new,
             "query_results": all_results,
             "years_back": years_back,
+            "force_mode": force,
         }
     def sync_important_papers(self) -> Dict[str, Any]:

arxiv-pulse 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

arxiv-pulse 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl