academic-search-mcp 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_search_mcp-0.1.3.dist-info/METADATA +243 -0
- academic_search_mcp-0.1.3.dist-info/RECORD +24 -0
- academic_search_mcp-0.1.3.dist-info/WHEEL +4 -0
- academic_search_mcp-0.1.3.dist-info/entry_points.txt +2 -0
- academic_search_mcp-0.1.3.dist-info/licenses/LICENSE +21 -0
- paper_search_mcp/__init__.py +0 -0
- paper_search_mcp/academic_platforms/__init__.py +0 -0
- paper_search_mcp/academic_platforms/arxiv.py +147 -0
- paper_search_mcp/academic_platforms/biorxiv.py +156 -0
- paper_search_mcp/academic_platforms/core.py +284 -0
- paper_search_mcp/academic_platforms/crossref.py +375 -0
- paper_search_mcp/academic_platforms/cyberleninka.py +396 -0
- paper_search_mcp/academic_platforms/google_scholar.py +249 -0
- paper_search_mcp/academic_platforms/hub.py +0 -0
- paper_search_mcp/academic_platforms/iacr.py +548 -0
- paper_search_mcp/academic_platforms/medrxiv.py +156 -0
- paper_search_mcp/academic_platforms/openalex.py +497 -0
- paper_search_mcp/academic_platforms/pubmed.py +159 -0
- paper_search_mcp/academic_platforms/sci_hub.py +178 -0
- paper_search_mcp/academic_platforms/semantic.py +492 -0
- paper_search_mcp/academic_platforms/ssrn.py +385 -0
- paper_search_mcp/paper.py +69 -0
- paper_search_mcp/pdf_utils.py +67 -0
- paper_search_mcp/server.py +514 -0
@@ -0,0 +1,243 @@
Metadata-Version: 2.4
Name: academic-search-mcp
Version: 0.1.3
Summary: An MCP server for searching and downloading academic papers from multiple sources.
Author-email: "P.S Zhang" <pengsongzhang96@gmail.com>
License-File: LICENSE
Requires-Python: >=3.10
Requires-Dist: beautifulsoup4>=4.12.0
Requires-Dist: curl-cffi>=0.5.0
Requires-Dist: fastmcp
Requires-Dist: feedparser
Requires-Dist: httpx[socks]>=0.28.1
Requires-Dist: lxml>=4.9.0
Requires-Dist: mcp[cli]>=1.6.0
Requires-Dist: pypdf2>=3.0.0
Requires-Dist: requests
Description-Content-Type: text/markdown

# Academic Search MCP

A Model Context Protocol (MCP) server for searching and downloading academic papers from multiple sources. Designed for seamless integration with large language models like Claude Desktop.

> **Fork Notice**: This is an extended fork of [openags/academic-search-mcp](https://github.com/openags/academic-search-mcp) with additional platforms (CORE, SSRN, CyberLeninka) and improvements.

---

## Table of Contents

- [Overview](#overview)
- [Features](#features)
- [Installation](#installation)
- [Quick Start](#quick-start)
- [Install Package](#install-package)
- [Configure Claude Desktop](#configure-claude-desktop)
- [For Development](#for-development)
- [Setup Environment](#setup-environment)
- [Install Dependencies](#install-dependencies)
- [Contributing](#contributing)
- [Demo](#demo)
- [License](#license)
- [TODO](#todo)

---

## Overview

`academic-search-mcp` is a Python-based MCP server that enables users to search and download academic papers from various platforms. It provides tools for searching papers (e.g., `search_arxiv`) and downloading PDFs (e.g., `download_arxiv`), making it ideal for researchers and AI-driven workflows. Built with the MCP Python SDK, it integrates seamlessly with LLM clients like Claude Desktop.

---

## Features

- **Multi-Source Support**: Search and download papers from arXiv, PubMed, bioRxiv, medRxiv, Google Scholar, IACR ePrint Archive, Semantic Scholar, CrossRef, OpenAlex, CORE, SSRN, and CyberLeninka.
- **Date Filtering**: All sources support `date_from` and `date_to` parameters (YYYY-MM-DD format) to filter papers by publication date.
- **Citation Counts**: OpenAlex, Semantic Scholar, CrossRef, and Google Scholar include citation counts in search results.
- **Citation Graph**: OpenAlex tools to explore references (papers a work cites) and citations (papers citing a work), sorted by impact.
- **Open Access PDFs**: OpenAlex includes PDF URLs for open access papers.
- **Token-Optimized Output**: Configurable `abstract_limit` parameter to control abstract length (default: 200 chars, use -1 for full, 0 to omit).
- **Standardized Output**: Papers are returned in a consistent, compact dictionary format via the `Paper` class.
- **Asynchronous Tools**: Efficiently handles network requests using `httpx`.
- **MCP Integration**: Compatible with MCP clients for LLM context enhancement.
- **Extensible Design**: Easily add new academic platforms by extending the `academic_platforms` module (see the sketch below).
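
As a rough illustration of that extension point, here is a minimal sketch of a new platform searcher. It reuses the `PaperSource` base class and `Paper` container that the bundled platforms (e.g. `academic_platforms/arxiv.py`) already define; `ExampleSearcher` and the endpoint URL are hypothetical placeholders, not part of this package.

```python
from typing import List
from datetime import datetime
import requests

from paper_search_mcp.paper import Paper
from paper_search_mcp.academic_platforms.arxiv import PaperSource

EXAMPLE_API_URL = "https://example.org/api/search"  # hypothetical endpoint


class ExampleSearcher(PaperSource):
    """Sketch of a searcher for a hypothetical platform."""

    def search(self, query: str, max_results: int = 10, **kwargs) -> List[Paper]:
        response = requests.get(EXAMPLE_API_URL, params={"q": query, "limit": max_results})
        response.raise_for_status()
        papers = []
        for item in response.json().get("results", []):
            papers.append(Paper(
                paper_id=item.get("id", ""),
                title=item.get("title", ""),
                authors=item.get("authors", []),
                abstract=item.get("abstract", ""),
                url=item.get("url", ""),
                pdf_url=item.get("pdf_url", ""),
                published_date=datetime.now(),  # parse the platform's real date field in practice
                updated_date=datetime.now(),
                source="example",
                categories=[],
                keywords=[],
                doi=item.get("doi", ""),
            ))
        return papers
```

A searcher like this would then be exposed through `paper_search_mcp/server.py`, alongside the existing `search_*` and `download_*` tools.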

### Search Parameters

All search tools support these common parameters:

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `query` | str | required | Search query string |
| `max_results` | int | 10 | Maximum number of papers to return |
| `abstract_limit` | int | 200 | Max chars for abstract (0=omit, -1=full) |
| `date_from` | str | None | Start date in YYYY-MM-DD format |
| `date_to` | str | None | End date in YYYY-MM-DD format |

> **Note**: Google Scholar only supports year-level filtering (month/day are ignored).
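
For example, the same options can be exercised directly against one of the bundled platform classes (a minimal sketch; the MCP tools in `paper_search_mcp/server.py` expose the same parameters, including `abstract_limit`, to clients):

```python
from paper_search_mcp.academic_platforms.arxiv import ArxivSearcher

searcher = ArxivSearcher()
papers = searcher.search(
    "retrieval augmented generation",  # query
    max_results=5,
    date_from="2024-01-01",            # YYYY-MM-DD, as in the table above
    date_to="2024-12-31",
)
for paper in papers:
    print(paper.published_date.date(), paper.title, paper.paper_id)
```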

---

## Installation

### Quick Start with uvx

The easiest way to use `academic-search-mcp`:

```bash
uvx academic-search-mcp
```

Or install globally:

```bash
uv tool install academic-search-mcp
```

### Configure Claude Desktop

Add this configuration to `~/Library/Application Support/Claude/claude_desktop_config.json` (Mac) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows):

```json
{
  "mcpServers": {
    "academic_search": {
      "command": "uvx",
      "args": ["academic-search-mcp"]
    }
  }
}
```

### Alternative: Install with pip/uv

```bash
pip install academic-search-mcp
# or
uv add academic-search-mcp
```

**Configure Claude Desktop (alternative, running from source)**:
Add this configuration to `~/Library/Application Support/Claude/claude_desktop_config.json` (Mac) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows):

```json
{
  "mcpServers": {
    "paper_search_server": {
      "command": "uv",
      "args": [
        "run",
        "--directory",
        "/path/to/your/academic-search-mcp",
        "-m",
        "paper_search_mcp.server"
      ],
      "env": {
        "SEMANTIC_SCHOLAR_API_KEY": "", // Optional: For enhanced Semantic Scholar features
        "CORE_API_KEY": "" // Optional: for CORE repository access
      }
    }
  }
}
```

> Note: Replace `/path/to/your/academic-search-mcp` with your actual installation path.

### For Development

For developers who want to modify the code or contribute:

1. **Setup Environment**:

   ```bash
   # Install uv if not installed
   curl -LsSf https://astral.sh/uv/install.sh | sh

   # Clone repository
   git clone https://github.com/openags/academic-search-mcp.git
   cd academic-search-mcp

   # Create and activate virtual environment
   uv venv
   source .venv/bin/activate # On Windows: .venv\Scripts\activate
   ```

2. **Install Dependencies**:

   ```bash
   # Install project in editable mode
   uv add -e .

   # Add development dependencies (optional)
   uv add pytest flake8
   ```

---

## Contributing

We welcome contributions! Here's how to get started:

1. **Fork the Repository**:
   Click "Fork" on GitHub.

2. **Clone and Set Up**:

   ```bash
   git clone https://github.com/yourusername/academic-search-mcp.git
   cd academic-search-mcp
   pip install -e ".[dev]" # Install dev dependencies (if added to pyproject.toml)
   ```

3. **Make Changes**:

   - Add new platforms in `academic_platforms/`.
   - Update tests in `tests/`.

4. **Submit a Pull Request**:
   Push changes and create a PR on GitHub.

---

## Demo

<img src="docs/images/demo.png" alt="Demo" width="800">

## TODO

### Planned Academic Platforms

- [x] arXiv
- [x] PubMed
- [x] bioRxiv
- [x] medRxiv
- [x] Google Scholar
- [x] IACR ePrint Archive
- [x] Semantic Scholar
- [x] CrossRef
- [x] OpenAlex
- [x] CORE (200M+ open access papers)
- [x] SSRN (social sciences, law, business preprints)
- [x] CyberLeninka (Russian academic papers, VAK/RSCI/SCOPUS filters)
- [ ] PubMed Central (PMC)
- [ ] Science Direct
- [ ] Springer Link
- [ ] IEEE Xplore
- [ ] ACM Digital Library
- [ ] Web of Science
- [ ] Scopus
- [ ] JSTOR
- [ ] ResearchGate

---

## License

This project is licensed under the MIT License. See the LICENSE file for details.

---

Happy researching with `academic-search-mcp`!

## Credits

Based on [openags/paper-search-mcp](https://github.com/openags/paper-search-mcp) by P.S Zhang.
@@ -0,0 +1,24 @@
paper_search_mcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
paper_search_mcp/paper.py,sha256=Flrn3ORhsojiEdEldUtKPvGF1RivXhl84zzq8mqAeFI,2969
paper_search_mcp/pdf_utils.py,sha256=sylqOQTFyOSlYnEzUMpSIe4VkY2kfgaQw_xd_EBYw2g,1909
paper_search_mcp/server.py,sha256=C542TF00oOUHF38F_5OU43D9RmIWQZSk7UiFHcXukWA,21663
paper_search_mcp/academic_platforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
paper_search_mcp/academic_platforms/arxiv.py,sha256=5SsFudqH1PIXIEE_8saCQHcf75bqCL6ApRUltLpp9Ow,5911
paper_search_mcp/academic_platforms/biorxiv.py,sha256=4k1Bg2BW-RBJiZ9jRVVmCEOge_4MtEDtXq2tMaPV0cg,6799
paper_search_mcp/academic_platforms/core.py,sha256=6xDq3NmlVh1NIEFnTRLPNayodkztrS7CPUC-jupd-Lw,9632
paper_search_mcp/academic_platforms/crossref.py,sha256=Zxj4U6SejaCa5o7whRmjjHVdd1U1H-DVtRP6DWzPwjk,14773
paper_search_mcp/academic_platforms/cyberleninka.py,sha256=88p9RZxjBRn5jAaOhZLr3EpP5ibMzmd0vCh1jD6PPEs,13421
paper_search_mcp/academic_platforms/google_scholar.py,sha256=B8VqgauJy3RJ8nR9woe107CXM-DrHQPapQAg_f948yg,9269
paper_search_mcp/academic_platforms/hub.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
paper_search_mcp/academic_platforms/iacr.py,sha256=Vem7q18NZRm5WXsDHsqRefyRIpl4PCceGGWYXhbXB2s,21135
paper_search_mcp/academic_platforms/medrxiv.py,sha256=tsbeO5RK-apz2mKKJruFCS19-BRG4jikwO4XXwqiktM,6772
paper_search_mcp/academic_platforms/openalex.py,sha256=lsWB_XlYufbQTNbR2ZT7tg7b74_FwYvxBcDt-YYBuYE,17792
paper_search_mcp/academic_platforms/pubmed.py,sha256=oS-JRHNI7lcCqxUGTlSVKp2i_QKuClwVUc6cA33URhE,6480
paper_search_mcp/academic_platforms/sci_hub.py,sha256=oma3M_gUseDByh-0Awi8Sxr0g3yojrb8XoD9iV0Exo8,7334
paper_search_mcp/academic_platforms/semantic.py,sha256=nk7nzrlsnrDNrHNUuRIfIBQfagfAT750J5HtdLputHQ,20594
paper_search_mcp/academic_platforms/ssrn.py,sha256=ntf22HRBZwNY6ctG5rdXjD5iT7CaML8k_xBbCn_qjbg,13694
academic_search_mcp-0.1.3.dist-info/METADATA,sha256=uZcbHayXO9tURHo3Yl7P50e4j3v4p20a7GoPbtlLTe4,7203
academic_search_mcp-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
academic_search_mcp-0.1.3.dist-info/entry_points.txt,sha256=RO1wFwD6a0WO_mZY8HZBYDtITfQ1dhnTR1BZlCrkRLc,69
academic_search_mcp-0.1.3.dist-info/licenses/LICENSE,sha256=TwRnWq1drFhdcy43SdxndU0mcfBUYBnhSJTJ4hhjfwQ,1085
academic_search_mcp-0.1.3.dist-info/RECORD,,

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 OPENAGS

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

File without changes

File without changes

@@ -0,0 +1,147 @@
# paper_search_mcp/academic_platforms/arxiv.py
from typing import List
from datetime import datetime
import requests
import feedparser
from ..paper import Paper
from PyPDF2 import PdfReader
import os

class PaperSource:
    """Abstract base class for paper sources"""
    def search(self, query: str, **kwargs) -> List[Paper]:
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

class ArxivSearcher(PaperSource):
    """Searcher for arXiv papers"""
    BASE_URL = "http://export.arxiv.org/api/query"

    def search(self, query: str, max_results: int = 10,
               date_from: str = None, date_to: str = None) -> List[Paper]:
        """Search arXiv papers.

        Args:
            query: Search query string
            max_results: Maximum number of results
            date_from: Start date in YYYY-MM-DD format (optional)
            date_to: End date in YYYY-MM-DD format (optional)
        """
        # Build query with date filter if specified
        # Format: submittedDate:[YYYYMMDDHHMM TO YYYYMMDDHHMM]
        # Note: Must use field prefix (all:, ti:, abs:) with date filter
        search_query = query
        if date_from or date_to:
            start = date_from.replace('-', '') + '0000' if date_from else '190001010000'
            end = date_to.replace('-', '') + '2359' if date_to else '209912312359'
            # Wrap query in all: if not already using a field prefix
            if not any(query.startswith(p) for p in ['ti:', 'abs:', 'au:', 'cat:', 'all:']):
                search_query = f'all:"{query}" AND submittedDate:[{start} TO {end}]'
            else:
                search_query = f'{query} AND submittedDate:[{start} TO {end}]'
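        # Example (illustrative): query="machine learning", date_from="2024-01-01", date_to=None
        #   -> all:"machine learning" AND submittedDate:[202401010000 TO 209912312359]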

        params = {
            'search_query': search_query,
            'max_results': max_results,
            'sortBy': 'submittedDate',
            'sortOrder': 'descending'
        }
        response = requests.get(self.BASE_URL, params=params)
        feed = feedparser.parse(response.content)
        papers = []
        for entry in feed.entries:
            try:
                authors = [author.name for author in entry.authors]
                published = datetime.strptime(entry.published, '%Y-%m-%dT%H:%M:%SZ')
                updated = datetime.strptime(entry.updated, '%Y-%m-%dT%H:%M:%SZ')
                pdf_url = next((link.href for link in entry.links if link.type == 'application/pdf'), '')
                papers.append(Paper(
                    paper_id=entry.id.split('/')[-1],
                    title=entry.title,
                    authors=authors,
                    abstract=entry.summary,
                    url=entry.id,
                    pdf_url=pdf_url,
                    published_date=published,
                    updated_date=updated,
                    source='arxiv',
                    categories=[tag.term for tag in entry.tags],
                    keywords=[],
                    doi=entry.get('doi', '')
                ))
            except Exception as e:
                print(f"Error parsing arXiv entry: {e}")
        return papers

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(pdf_url)
        os.makedirs(save_path, exist_ok=True)
        output_file = f"{save_path}/{paper_id}.pdf"
        with open(output_file, 'wb') as f:
            f.write(response.content)
        return output_file

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Read a paper and convert it to text format.

        Args:
            paper_id: arXiv paper ID
            save_path: Directory where the PDF is/will be saved

        Returns:
            str: The extracted text content of the paper
        """
        from ..pdf_utils import extract_text_from_pdf

        # First ensure we have the PDF
        pdf_path = f"{save_path}/{paper_id}.pdf"
        if not os.path.exists(pdf_path):
            pdf_path = self.download_pdf(paper_id, save_path)

        return extract_text_from_pdf(pdf_path)

if __name__ == "__main__":
    # Test the ArxivSearcher functionality
    searcher = ArxivSearcher()

    # Test the search functionality
    print("Testing search functionality...")
    query = "machine learning"
    max_results = 5
    papers = []  # ensure defined even if the search below fails
    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"Found {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"{i}. {paper.title} (ID: {paper.paper_id})")
    except Exception as e:
        print(f"Error during search: {e}")

    # Test the PDF download functionality
    if papers:
        print("\nTesting PDF download functionality...")
        paper_id = papers[0].paper_id
        save_path = "./downloads"  # make sure this directory exists
        try:
            os.makedirs(save_path, exist_ok=True)
            pdf_path = searcher.download_pdf(paper_id, save_path)
            print(f"PDF downloaded successfully: {pdf_path}")
        except Exception as e:
            print(f"Error during PDF download: {e}")

    # Test the paper reading functionality
    if papers:
        print("\nTesting paper reading functionality...")
        paper_id = papers[0].paper_id
        try:
            text_content = searcher.read_paper(paper_id)
            print(f"\nFirst 500 characters of the paper content:")
            print(text_content[:500] + "...")
            print(f"\nTotal length of extracted text: {len(text_content)} characters")
        except Exception as e:
            print(f"Error during paper reading: {e}")

@@ -0,0 +1,156 @@
from typing import List
import requests
import os
from datetime import datetime, timedelta
from ..paper import Paper
from PyPDF2 import PdfReader

class PaperSource:
    """Abstract base class for paper sources"""
    def search(self, query: str, **kwargs) -> List[Paper]:
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

class BioRxivSearcher(PaperSource):
    """Searcher for bioRxiv papers"""
    BASE_URL = "https://api.biorxiv.org/details/biorxiv"

    def __init__(self):
        self.session = requests.Session()
        self.session.proxies = {'http': None, 'https': None}
        self.timeout = 30
        self.max_retries = 3

    def search(self, query: str, max_results: int = 10, days: int = 30,
               date_from: str = None, date_to: str = None) -> List[Paper]:
        """
        Search for papers on bioRxiv by category within a date range.

        Args:
            query: Category name to search for (e.g., "cell biology").
            max_results: Maximum number of papers to return.
            days: Number of days to look back for papers (used if date_from/date_to not specified).
            date_from: Start date in YYYY-MM-DD format (optional, overrides days).
            date_to: End date in YYYY-MM-DD format (optional, defaults to today).

        Returns:
            List of Paper objects matching the category within the specified date range.
        """
        # Use date_from/date_to if provided, otherwise fall back to days parameter
        if date_from or date_to:
            end_date = date_to if date_to else datetime.now().strftime('%Y-%m-%d')
            start_date = date_from if date_from else '1900-01-01'
        else:
            # Calculate date range: last N days
            end_date = datetime.now().strftime('%Y-%m-%d')
            start_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
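            # e.g. days=30 on 2025-06-30 (illustrative) -> start_date="2025-05-31", end_date="2025-06-30"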

        # Format category: lowercase and replace spaces with underscores
        category = query.lower().replace(' ', '_')

        papers = []
        cursor = 0
        while len(papers) < max_results:
            url = f"{self.BASE_URL}/{start_date}/{end_date}/{cursor}"
            if category:
                url += f"?category={category}"
            tries = 0
            while tries < self.max_retries:
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    response.raise_for_status()
                    data = response.json()
                    collection = data.get('collection', [])
                    for item in collection:
                        try:
                            date = datetime.strptime(item['date'], '%Y-%m-%d')
                            papers.append(Paper(
                                paper_id=item['doi'],
                                title=item['title'],
                                authors=item['authors'].split('; '),
                                abstract=item['abstract'],
                                url=f"https://www.biorxiv.org/content/{item['doi']}v{item.get('version', '1')}",
                                pdf_url=f"https://www.biorxiv.org/content/{item['doi']}v{item.get('version', '1')}.full.pdf",
                                published_date=date,
                                updated_date=date,
                                source="biorxiv",
                                categories=[item['category']],
                                keywords=[],
                                doi=item['doi']
                            ))
                        except Exception as e:
                            print(f"Error parsing bioRxiv entry: {e}")
                    if len(collection) < 100:
                        break  # No more results
                    cursor += 100
                    break  # Exit retry loop on success
                except requests.exceptions.RequestException as e:
                    tries += 1
                    if tries == self.max_retries:
                        print(f"Failed to connect to bioRxiv API after {self.max_retries} attempts: {e}")
                        break
                    print(f"Attempt {tries} failed, retrying...")
            else:
                continue
            break

        return papers[:max_results]

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Download a PDF for a given paper ID from bioRxiv.

        Args:
            paper_id: The DOI of the paper.
            save_path: Directory to save the PDF.

        Returns:
            Path to the downloaded PDF file.
        """
        if not paper_id:
            raise ValueError("Invalid paper_id: paper_id is empty")

        pdf_url = f"https://www.biorxiv.org/content/{paper_id}v1.full.pdf"
        tries = 0
        while tries < self.max_retries:
            try:
                # Add User-Agent to avoid potential 403 errors
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                response = self.session.get(pdf_url, timeout=self.timeout, headers=headers)
                response.raise_for_status()
                os.makedirs(save_path, exist_ok=True)
                output_file = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
                with open(output_file, 'wb') as f:
                    f.write(response.content)
                return output_file
            except requests.exceptions.RequestException as e:
                tries += 1
                if tries == self.max_retries:
                    raise Exception(f"Failed to download PDF after {self.max_retries} attempts: {e}")
                print(f"Attempt {tries} failed, retrying...")

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """
        Read a paper and convert it to text format.

        Args:
            paper_id: bioRxiv DOI
            save_path: Directory where the PDF is/will be saved

        Returns:
            str: The extracted text content of the paper
        """
        from ..pdf_utils import extract_text_from_pdf

        pdf_path = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
        if not os.path.exists(pdf_path):
            pdf_path = self.download_pdf(paper_id, save_path)

        return extract_text_from_pdf(pdf_path)
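
if __name__ == "__main__":
    # Illustrative usage sketch, mirroring the self-test block in arxiv.py above;
    # assumes network access to the bioRxiv API.
    searcher = BioRxivSearcher()
    for paper in searcher.search("cell biology", max_results=3, days=30):
        print(paper.published_date.date(), paper.title)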