academic-search-mcp 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- academic_search_mcp-0.1.3.dist-info/METADATA +243 -0
- academic_search_mcp-0.1.3.dist-info/RECORD +24 -0
- academic_search_mcp-0.1.3.dist-info/WHEEL +4 -0
- academic_search_mcp-0.1.3.dist-info/entry_points.txt +2 -0
- academic_search_mcp-0.1.3.dist-info/licenses/LICENSE +21 -0
- paper_search_mcp/__init__.py +0 -0
- paper_search_mcp/academic_platforms/__init__.py +0 -0
- paper_search_mcp/academic_platforms/arxiv.py +147 -0
- paper_search_mcp/academic_platforms/biorxiv.py +156 -0
- paper_search_mcp/academic_platforms/core.py +284 -0
- paper_search_mcp/academic_platforms/crossref.py +375 -0
- paper_search_mcp/academic_platforms/cyberleninka.py +396 -0
- paper_search_mcp/academic_platforms/google_scholar.py +249 -0
- paper_search_mcp/academic_platforms/hub.py +0 -0
- paper_search_mcp/academic_platforms/iacr.py +548 -0
- paper_search_mcp/academic_platforms/medrxiv.py +156 -0
- paper_search_mcp/academic_platforms/openalex.py +497 -0
- paper_search_mcp/academic_platforms/pubmed.py +159 -0
- paper_search_mcp/academic_platforms/sci_hub.py +178 -0
- paper_search_mcp/academic_platforms/semantic.py +492 -0
- paper_search_mcp/academic_platforms/ssrn.py +385 -0
- paper_search_mcp/paper.py +69 -0
- paper_search_mcp/pdf_utils.py +67 -0
- paper_search_mcp/server.py +514 -0
paper_search_mcp/academic_platforms/pubmed.py
@@ -0,0 +1,159 @@
+# paper_search_mcp/academic_platforms/pubmed.py
+from typing import List
+import requests
+from xml.etree import ElementTree as ET
+from datetime import datetime
+from ..paper import Paper
+import os
+
+class PaperSource:
+    """Abstract base class for paper sources"""
+    def search(self, query: str, **kwargs) -> List[Paper]:
+        raise NotImplementedError
+
+    def download_pdf(self, paper_id: str, save_path: str) -> str:
+        raise NotImplementedError
+
+    def read_paper(self, paper_id: str, save_path: str) -> str:
+        raise NotImplementedError
+
+class PubMedSearcher(PaperSource):
+    """Searcher for PubMed papers"""
+    SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+    FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+    def search(self, query: str, max_results: int = 10,
+               date_from: str = None, date_to: str = None) -> List[Paper]:
+        """Search PubMed papers.
+
+        Args:
+            query: Search query string
+            max_results: Maximum number of results
+            date_from: Start date in YYYY-MM-DD format (optional)
+            date_to: End date in YYYY-MM-DD format (optional)
+        """
+        search_params = {
+            'db': 'pubmed',
+            'term': query,
+            'retmax': max_results,
+            'retmode': 'xml'
+        }
+
+        # Add date filtering if specified
+        # PubMed uses YYYY/MM/DD format
+        if date_from or date_to:
+            search_params['datetype'] = 'pdat'  # publication date
+            if date_from:
+                search_params['mindate'] = date_from.replace('-', '/')
+            if date_to:
+                search_params['maxdate'] = date_to.replace('-', '/')
+        search_response = requests.get(self.SEARCH_URL, params=search_params)
+        search_root = ET.fromstring(search_response.content)
+        ids = [id.text for id in search_root.findall('.//Id')]
+
+        fetch_params = {
+            'db': 'pubmed',
+            'id': ','.join(ids),
+            'retmode': 'xml'
+        }
+        fetch_response = requests.get(self.FETCH_URL, params=fetch_params)
+        fetch_root = ET.fromstring(fetch_response.content)
+
+        papers = []
+        for article in fetch_root.findall('.//PubmedArticle'):
+            try:
+                pmid = article.find('.//PMID').text
+                title = article.find('.//ArticleTitle').text
+                authors = [f"{author.find('LastName').text} {author.find('Initials').text}"
+                           for author in article.findall('.//Author')]
+                abstract = article.find('.//AbstractText').text if article.find('.//AbstractText') is not None else ''
+                pub_date = article.find('.//PubDate/Year').text
+                published = datetime.strptime(pub_date, '%Y')
+                doi = article.find('.//ELocationID[@EIdType="doi"]').text if article.find('.//ELocationID[@EIdType="doi"]') is not None else ''
+                papers.append(Paper(
+                    paper_id=pmid,
+                    title=title,
+                    authors=authors,
+                    abstract=abstract,
+                    url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
+                    pdf_url='',  # PubMed provides no direct PDF
+                    published_date=published,
+                    updated_date=published,
+                    source='pubmed',
+                    categories=[],
+                    keywords=[],
+                    doi=doi
+                ))
+            except Exception as e:
+                print(f"Error parsing PubMed article: {e}")
+        return papers
+
+    def download_pdf(self, paper_id: str, save_path: str) -> str:
+        """Attempt to download a paper's PDF from PubMed.
+
+        Args:
+            paper_id: PubMed ID (PMID)
+            save_path: Directory to save the PDF
+
+        Returns:
+            str: Error message indicating PDF download is not supported
+
+        Raises:
+            NotImplementedError: Always raised, as PubMed doesn't provide direct PDF access
+        """
+        message = ("PubMed does not provide direct PDF downloads. "
+                   "Please use the paper's DOI or URL to access the publisher's website.")
+        raise NotImplementedError(message)
+
+    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+        """Attempt to read and extract text from a PubMed paper.
+
+        Args:
+            paper_id: PubMed ID (PMID)
+            save_path: Directory for potential PDF storage (unused)
+
+        Returns:
+            str: Error message indicating PDF reading is not supported
+        """
+        message = ("PubMed papers cannot be read directly through this tool. "
+                   "Only metadata and abstracts are available through PubMed's API. "
+                   "Please use the paper's DOI or URL to access the full text on the publisher's website.")
+        return message
+
+if __name__ == "__main__":
+    # Exercise PubMedSearcher functionality
+    searcher = PubMedSearcher()
+    papers = []  # initialized so the checks below still work if the search fails
+    # Test search functionality
+    print("Testing search functionality...")
+    query = "machine learning"
+    max_results = 5
+    try:
+        papers = searcher.search(query, max_results=max_results)
+        print(f"Found {len(papers)} papers for query '{query}':")
+        for i, paper in enumerate(papers, 1):
+            print(f"{i}. {paper.title}")
+            print(f"   Authors: {', '.join(paper.authors)}")
+            print(f"   DOI: {paper.doi}")
+            print(f"   URL: {paper.url}\n")
+    except Exception as e:
+        print(f"Error during search: {e}")
+
+    # Test PDF download (expected to report that downloads are unsupported)
+    if papers:
+        print("\nTesting PDF download functionality...")
+        paper_id = papers[0].paper_id
+        try:
+            pdf_path = searcher.download_pdf(paper_id, "./downloads")
+        except NotImplementedError as e:
+            print(f"Expected error: {e}")
+
+    # Test paper reading (expected to report that full text is unsupported)
+    if papers:
+        print("\nTesting paper reading functionality...")
+        paper_id = papers[0].paper_id
+        try:
+            message = searcher.read_paper(paper_id)
+            print(f"Response: {message}")
+        except Exception as e:
+            print(f"Error during paper reading: {e}")
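A brief usage sketch (not part of the package diff) showing how the date filter in PubMedSearcher.search might be called; the import path assumes the module layout in the file list above, and the query and dates are illustrative only.

    from paper_search_mcp.academic_platforms.pubmed import PubMedSearcher

    searcher = PubMedSearcher()
    # date_from/date_to are converted to PubMed's YYYY/MM/DD form and sent as
    # mindate/maxdate with datetype=pdat, per the search() implementation above.
    papers = searcher.search("machine learning", max_results=5,
                             date_from="2023-01-01", date_to="2023-12-31")
    for paper in papers:
        print(paper.title, paper.doi)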
paper_search_mcp/academic_platforms/sci_hub.py
@@ -0,0 +1,178 @@
+"""Sci-Hub downloader integration.
+
+Simple wrapper adapted from scihub.py for downloading PDFs via Sci-Hub.
+"""
+from pathlib import Path
+import re
+import hashlib
+import logging
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+
+
+class SciHubFetcher:
+    """Simple Sci-Hub PDF downloader."""
+
+    def __init__(self, base_url: str = "https://sci-hub.se", output_dir: str = "./downloads"):
+        """Initialize with Sci-Hub URL and output directory."""
+        self.base_url = base_url.rstrip("/")
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.session = requests.Session()
+        self.session.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+        }
+
+    def download_pdf(self, identifier: str) -> Optional[str]:
+        """Download a PDF from Sci-Hub using a DOI, PMID, or URL.
+
+        Args:
+            identifier: DOI, PMID, or URL to the paper
+
+        Returns:
+            Path to saved PDF or None on failure
+        """
+        if not identifier.strip():
+            return None
+
+        try:
+            # Get direct URL to PDF
+            pdf_url = self._get_direct_url(identifier)
+            if not pdf_url:
+                logging.error(f"Could not find PDF URL for identifier: {identifier}")
+                return None
+
+            # Download the PDF
+            response = self.session.get(pdf_url, verify=False, timeout=30)
+
+            if response.status_code != 200:
+                logging.error(f"Failed to download PDF, status {response.status_code}")
+                return None
+
+            if response.headers.get('Content-Type') != 'application/pdf':
+                logging.error("Response is not a PDF")
+                return None
+
+            # Generate filename and save
+            filename = self._generate_filename(response, identifier)
+            file_path = self.output_dir / filename
+
+            with open(file_path, 'wb') as f:
+                f.write(response.content)
+
+            return str(file_path)
+
+        except Exception as e:
+            logging.error(f"Error downloading PDF for {identifier}: {e}")
+            return None
+
+    def _get_direct_url(self, identifier: str) -> Optional[str]:
+        """Get the direct PDF URL from Sci-Hub."""
+        try:
+            # If it's already a direct PDF URL, return it
+            if identifier.endswith('.pdf'):
+                return identifier
+
+            # Search on Sci-Hub
+            search_url = f"{self.base_url}/{identifier}"
+            response = self.session.get(search_url, verify=False, timeout=20)
+
+            if response.status_code != 200:
+                return None
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Check for article not found
+            if "article not found" in response.text.lower():
+                logging.warning("Article not found on Sci-Hub")
+                return None
+
+            # Look for embed tag with PDF (most common in modern Sci-Hub)
+            embed = soup.find('embed', {'type': 'application/pdf'})
+            logging.debug(f"Found embed tag: {embed}")
+            if embed:
+                src = embed.get('src') if hasattr(embed, 'get') else None
+                logging.debug(f"Embed src: {src}")
+                if src and isinstance(src, str):
+                    if src.startswith('//'):
+                        pdf_url = 'https:' + src
+                        logging.debug(f"Returning PDF URL: {pdf_url}")
+                        return pdf_url
+                    elif src.startswith('/'):
+                        pdf_url = self.base_url + src
+                        logging.debug(f"Returning PDF URL: {pdf_url}")
+                        return pdf_url
+                    else:
+                        logging.debug(f"Returning PDF URL: {src}")
+                        return src
+
+            # Look for iframe with PDF (fallback)
+            iframe = soup.find('iframe')
+            if iframe:
+                src = iframe.get('src') if hasattr(iframe, 'get') else None
+                if src and isinstance(src, str):
+                    if src.startswith('//'):
+                        return 'https:' + src
+                    elif src.startswith('/'):
+                        return self.base_url + src
+                    else:
+                        return src
+
+            # Look for download button with onclick
+            for button in soup.find_all('button'):
+                onclick = button.get('onclick', '') if hasattr(button, 'get') else ''
+                if isinstance(onclick, str) and 'pdf' in onclick.lower():
+                    # Extract URL from onclick JavaScript
+                    url_match = re.search(r"location\.href='([^']+)'", onclick)
+                    if url_match:
+                        url = url_match.group(1)
+                        if url.startswith('//'):
+                            return 'https:' + url
+                        elif url.startswith('/'):
+                            return self.base_url + url
+                        else:
+                            return url
+
+            # Look for direct download links
+            for link in soup.find_all('a'):
+                href = link.get('href', '') if hasattr(link, 'get') else ''
+                if isinstance(href, str) and href and ('pdf' in href.lower() or href.endswith('.pdf')):
+                    if href.startswith('//'):
+                        return 'https:' + href
+                    elif href.startswith('/'):
+                        return self.base_url + href
+                    elif href.startswith('http'):
+                        return href
+
+            return None
+
+        except Exception as e:
+            logging.error(f"Error getting direct URL for {identifier}: {e}")
+            return None
+
+    def _generate_filename(self, response: requests.Response, identifier: str) -> str:
+        """Generate a unique filename for the PDF."""
+        # Try to get filename from URL
+        url_parts = response.url.split('/')
+        if url_parts:
+            name = url_parts[-1]
+            # Remove view parameters
+            name = re.sub(r'#view=(.+)', '', name)
+            if name.endswith('.pdf'):
+                # Generate hash for uniqueness
+                pdf_hash = hashlib.md5(response.content).hexdigest()[:8]
+                base_name = name[:-4]  # Remove .pdf
+                return f"{pdf_hash}_{base_name}.pdf"
+
+        # Fallback: use identifier
+        clean_identifier = re.sub(r'[^\w\-_.]', '_', identifier)
+        pdf_hash = hashlib.md5(response.content).hexdigest()[:8]
+        return f"{pdf_hash}_{clean_identifier}.pdf"
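A minimal usage sketch (not part of the package diff) for SciHubFetcher; the DOI below is a placeholder, and results depend on what the configured Sci-Hub mirror returns.

    from paper_search_mcp.academic_platforms.sci_hub import SciHubFetcher

    fetcher = SciHubFetcher(base_url="https://sci-hub.se", output_dir="./downloads")
    # download_pdf accepts a DOI, PMID, or URL and returns the saved file path,
    # or None if no PDF could be located or the response was not a PDF.
    path = fetcher.download_pdf("10.1000/xyz123")  # placeholder DOI
    if path:
        print(f"Saved PDF to {path}")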