academic_search_mcp-0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
+ # paper_search_mcp/sources/pubmed.py
+ from typing import List, Optional
+ import requests
+ from xml.etree import ElementTree as ET
+ from datetime import datetime
+ from ..paper import Paper
+
+ class PaperSource:
+     """Abstract base class for paper sources"""
+     def search(self, query: str, **kwargs) -> List[Paper]:
+         raise NotImplementedError
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+     def read_paper(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+ class PubMedSearcher(PaperSource):
+     """Searcher for PubMed papers"""
+     SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+     FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+     def search(self, query: str, max_results: int = 10,
+                date_from: Optional[str] = None, date_to: Optional[str] = None) -> List[Paper]:
+         """Search PubMed papers.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results
+             date_from: Start date in YYYY-MM-DD format (optional)
+             date_to: End date in YYYY-MM-DD format (optional)
+         """
+         search_params = {
+             'db': 'pubmed',
+             'term': query,
+             'retmax': max_results,
+             'retmode': 'xml'
+         }
+
+         # Add date filtering if specified; PubMed expects YYYY/MM/DD
+         if date_from or date_to:
+             search_params['datetype'] = 'pdat'  # publication date
+             if date_from:
+                 search_params['mindate'] = date_from.replace('-', '/')
+             if date_to:
+                 search_params['maxdate'] = date_to.replace('-', '/')
+
+         # Step 1: ESearch returns the PMIDs matching the query
+         search_response = requests.get(self.SEARCH_URL, params=search_params)
+         search_root = ET.fromstring(search_response.content)
+         ids = [id_node.text for id_node in search_root.findall('.//Id')]
+         if not ids:
+             return []
+
+         # Step 2: EFetch returns full metadata for those PMIDs
+         fetch_params = {
+             'db': 'pubmed',
+             'id': ','.join(ids),
+             'retmode': 'xml'
+         }
+         fetch_response = requests.get(self.FETCH_URL, params=fetch_params)
+         fetch_root = ET.fromstring(fetch_response.content)
+
+         papers = []
+         for article in fetch_root.findall('.//PubmedArticle'):
+             try:
+                 pmid = article.find('.//PMID').text
+                 title = article.find('.//ArticleTitle').text
+                 # Skip authors that lack LastName/Initials (e.g. collective author names)
+                 authors = []
+                 for author in article.findall('.//Author'):
+                     last_name = author.find('LastName')
+                     initials = author.find('Initials')
+                     if last_name is not None and initials is not None:
+                         authors.append(f"{last_name.text} {initials.text}")
+                 abstract = article.find('.//AbstractText').text if article.find('.//AbstractText') is not None else ''
+                 pub_date = article.find('.//PubDate/Year').text
+                 published = datetime.strptime(pub_date, '%Y')
+                 doi = article.find('.//ELocationID[@EIdType="doi"]').text if article.find('.//ELocationID[@EIdType="doi"]') is not None else ''
+                 papers.append(Paper(
+                     paper_id=pmid,
+                     title=title,
+                     authors=authors,
+                     abstract=abstract,
+                     url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
+                     pdf_url='',  # PubMed has no direct PDF
+                     published_date=published,
+                     updated_date=published,
+                     source='pubmed',
+                     categories=[],
+                     keywords=[],
+                     doi=doi
+                 ))
+             except Exception as e:
+                 print(f"Error parsing PubMed article: {e}")
+         return papers
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         """Attempt to download a paper's PDF from PubMed.
+
+         Args:
+             paper_id: PubMed ID (PMID)
+             save_path: Directory to save the PDF
+
+         Raises:
+             NotImplementedError: Always, since PubMed does not provide direct PDF access
+         """
+         message = ("PubMed does not provide direct PDF downloads. "
+                    "Please use the paper's DOI or URL to access the publisher's website.")
+         raise NotImplementedError(message)
+
+     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """Attempt to read and extract text from a PubMed paper.
+
+         Args:
+             paper_id: PubMed ID (PMID)
+             save_path: Directory for potential PDF storage (unused)
+
+         Returns:
+             str: Message explaining that full-text reading is not supported
+         """
+         message = ("PubMed papers cannot be read directly through this tool. "
+                    "Only metadata and abstracts are available through PubMed's API. "
+                    "Please use the paper's DOI or URL to access the full text on the publisher's website.")
+         return message
+
+ if __name__ == "__main__":
+     # Exercise PubMedSearcher functionality
+     searcher = PubMedSearcher()
+     papers = []
+
+     # Test search functionality
+     print("Testing search functionality...")
+     query = "machine learning"
+     max_results = 5
+     try:
+         papers = searcher.search(query, max_results=max_results)
+         print(f"Found {len(papers)} papers for query '{query}':")
+         for i, paper in enumerate(papers, 1):
+             print(f"{i}. {paper.title}")
+             print(f"   Authors: {', '.join(paper.authors)}")
+             print(f"   DOI: {paper.doi}")
+             print(f"   URL: {paper.url}\n")
+     except Exception as e:
+         print(f"Error during search: {e}")
+
+     # Test PDF download (expected to raise a not-supported error)
+     if papers:
+         print("\nTesting PDF download functionality...")
+         paper_id = papers[0].paper_id
+         try:
+             searcher.download_pdf(paper_id, "./downloads")
+         except NotImplementedError as e:
+             print(f"Expected error: {e}")
+
+     # Test paper reading (returns a not-supported message)
+     if papers:
+         print("\nTesting paper reading functionality...")
+         paper_id = papers[0].paper_id
+         try:
+             message = searcher.read_paper(paper_id)
+             print(f"Response: {message}")
+         except Exception as e:
+             print(f"Error during paper reading: {e}")
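
For reference (not part of the packaged file), a minimal sketch of a date-filtered search, which the bundled `__main__` block does not exercise; it assumes the package is importable as `paper_search_mcp`, per the path comment at the top of the file:

    from paper_search_mcp.sources.pubmed import PubMedSearcher

    searcher = PubMedSearcher()
    # Dates are passed as YYYY-MM-DD; search() converts them to PubMed's YYYY/MM/DD
    papers = searcher.search("machine learning", max_results=5,
                             date_from="2020-01-01", date_to="2021-12-31")
    for paper in papers:
        print(paper.title, paper.doi)
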
@@ -0,0 +1,159 @@
+ """Sci-Hub downloader integration.
+
+ Simple wrapper adapted from scihub.py for downloading PDFs via Sci-Hub.
+ """
+ from pathlib import Path
+ import re
+ import hashlib
+ import logging
+ from typing import Optional
+
+ import requests
+ from bs4 import BeautifulSoup
+
+
+ class SciHubFetcher:
+     """Simple Sci-Hub PDF downloader."""
+
+     def __init__(self, base_url: str = "https://sci-hub.se", output_dir: str = "./downloads"):
+         """Initialize with Sci-Hub URL and output directory."""
+         self.base_url = base_url.rstrip("/")
+         self.output_dir = Path(output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+         self.session = requests.Session()
+         # Merge browser-like headers into the session defaults
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate',
+             'DNT': '1',
+             'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1',
+         })
+
+     def download_pdf(self, identifier: str) -> Optional[str]:
+         """Download a PDF from Sci-Hub using a DOI, PMID, or URL.
+
+         Args:
+             identifier: DOI, PMID, or URL to the paper
+
+         Returns:
+             Path to saved PDF or None on failure
+         """
+         if not identifier.strip():
+             return None
+
+         try:
+             # Get direct URL to PDF
+             pdf_url = self._get_direct_url(identifier)
+             if not pdf_url:
+                 logging.error(f"Could not find PDF URL for identifier: {identifier}")
+                 return None
+
+             # Download the PDF (verify=False because Sci-Hub mirrors often serve broken TLS)
+             response = self.session.get(pdf_url, verify=False, timeout=30)
+
+             if response.status_code != 200:
+                 logging.error(f"Failed to download PDF, status {response.status_code}")
+                 return None
+
+             if 'application/pdf' not in response.headers.get('Content-Type', ''):
+                 logging.error("Response is not a PDF")
+                 return None
+
+             # Generate filename and save
+             filename = self._generate_filename(response, identifier)
+             file_path = self.output_dir / filename
+
+             with open(file_path, 'wb') as f:
+                 f.write(response.content)
+
+             return str(file_path)
+
+         except Exception as e:
+             logging.error(f"Error downloading PDF for {identifier}: {e}")
+             return None
+
+     def _resolve_url(self, url: str) -> str:
+         """Resolve protocol-relative and root-relative URLs against the Sci-Hub base."""
+         if url.startswith('//'):
+             return 'https:' + url
+         if url.startswith('/'):
+             return self.base_url + url
+         return url
+
+     def _get_direct_url(self, identifier: str) -> Optional[str]:
+         """Get the direct PDF URL from Sci-Hub."""
+         try:
+             # If it's already a direct PDF URL, return it
+             if identifier.endswith('.pdf'):
+                 return identifier
+
+             # Search on Sci-Hub
+             search_url = f"{self.base_url}/{identifier}"
+             response = self.session.get(search_url, verify=False, timeout=20)
+
+             if response.status_code != 200:
+                 return None
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Check for article not found
+             if "article not found" in response.text.lower():
+                 logging.warning("Article not found on Sci-Hub")
+                 return None
+
+             # Look for embed tag with PDF (most common in modern Sci-Hub)
+             embed = soup.find('embed', {'type': 'application/pdf'})
+             if embed is not None:
+                 src = embed.get('src')
+                 if isinstance(src, str) and src:
+                     logging.debug(f"Found PDF embed src: {src}")
+                     return self._resolve_url(src)
+
+             # Look for iframe with PDF (fallback)
+             iframe = soup.find('iframe')
+             if iframe is not None:
+                 src = iframe.get('src')
+                 if isinstance(src, str) and src:
+                     return self._resolve_url(src)
+
+             # Look for download button with an onclick handler
+             for button in soup.find_all('button'):
+                 onclick = button.get('onclick', '')
+                 if isinstance(onclick, str) and 'pdf' in onclick.lower():
+                     # Extract the URL from the onclick JavaScript
+                     url_match = re.search(r"location\.href='([^']+)'", onclick)
+                     if url_match:
+                         return self._resolve_url(url_match.group(1))
+
+             # Look for direct download links
+             for link in soup.find_all('a'):
+                 href = link.get('href', '')
+                 if isinstance(href, str) and ('pdf' in href.lower() or href.endswith('.pdf')):
+                     if href.startswith(('//', '/', 'http')):
+                         return self._resolve_url(href)
+
+             return None
+
+         except Exception as e:
+             logging.error(f"Error getting direct URL for {identifier}: {e}")
+             return None
+
+     def _generate_filename(self, response: requests.Response, identifier: str) -> str:
+         """Generate a unique filename for the PDF."""
+         # Try to derive the filename from the final URL
+         name = response.url.split('/')[-1]
+         # Remove viewer fragments such as "#view=FitH"
+         name = re.sub(r'#view=(.+)', '', name)
+         if name.endswith('.pdf'):
+             # Prefix a short content hash for uniqueness
+             pdf_hash = hashlib.md5(response.content).hexdigest()[:8]
+             base_name = name[:-4]  # Remove .pdf
+             return f"{pdf_hash}_{base_name}.pdf"
+
+         # Fallback: build the name from the identifier
+         clean_identifier = re.sub(r'[^\w\-_.]', '_', identifier)
+         pdf_hash = hashlib.md5(response.content).hexdigest()[:8]
+         return f"{pdf_hash}_{clean_identifier}.pdf"
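
The Sci-Hub module ships no `__main__` demo. For reference (not part of the packaged file), a minimal usage sketch; the module path `paper_search_mcp.sources.scihub` is assumed by analogy with the PubMed source, and the DOI is a placeholder:

    from paper_search_mcp.sources.scihub import SciHubFetcher  # assumed module path

    fetcher = SciHubFetcher(output_dir="./downloads")
    # Accepts a DOI, PMID, or URL; returns the saved file path or None on failure
    pdf_path = fetcher.download_pdf("10.1000/example-doi")  # placeholder DOI
    if pdf_path:
        print(f"Saved to {pdf_path}")
    else:
        print("Download failed or PDF not found")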