academic-search-mcp 0.1.3 (academic_search_mcp-0.1.3-py3-none-any.whl)
This diff shows the content of package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in those registries.
- academic_search_mcp-0.1.3.dist-info/METADATA +243 -0
- academic_search_mcp-0.1.3.dist-info/RECORD +24 -0
- academic_search_mcp-0.1.3.dist-info/WHEEL +4 -0
- academic_search_mcp-0.1.3.dist-info/entry_points.txt +2 -0
- academic_search_mcp-0.1.3.dist-info/licenses/LICENSE +21 -0
- paper_search_mcp/__init__.py +0 -0
- paper_search_mcp/academic_platforms/__init__.py +0 -0
- paper_search_mcp/academic_platforms/arxiv.py +147 -0
- paper_search_mcp/academic_platforms/biorxiv.py +156 -0
- paper_search_mcp/academic_platforms/core.py +284 -0
- paper_search_mcp/academic_platforms/crossref.py +375 -0
- paper_search_mcp/academic_platforms/cyberleninka.py +396 -0
- paper_search_mcp/academic_platforms/google_scholar.py +249 -0
- paper_search_mcp/academic_platforms/hub.py +0 -0
- paper_search_mcp/academic_platforms/iacr.py +548 -0
- paper_search_mcp/academic_platforms/medrxiv.py +156 -0
- paper_search_mcp/academic_platforms/openalex.py +497 -0
- paper_search_mcp/academic_platforms/pubmed.py +159 -0
- paper_search_mcp/academic_platforms/sci_hub.py +178 -0
- paper_search_mcp/academic_platforms/semantic.py +492 -0
- paper_search_mcp/academic_platforms/ssrn.py +385 -0
- paper_search_mcp/paper.py +69 -0
- paper_search_mcp/pdf_utils.py +67 -0
- paper_search_mcp/server.py +514 -0
@@ -0,0 +1,156 @@ paper_search_mcp/academic_platforms/medrxiv.py
from typing import List
import requests
import os
from datetime import datetime, timedelta
from ..paper import Paper
from PyPDF2 import PdfReader


class PaperSource:
    """Abstract base class for paper sources"""

    def search(self, query: str, **kwargs) -> List[Paper]:
        raise NotImplementedError

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError

    def read_paper(self, paper_id: str, save_path: str) -> str:
        raise NotImplementedError


class MedRxivSearcher(PaperSource):
    """Searcher for medRxiv papers"""

    BASE_URL = "https://api.biorxiv.org/details/medrxiv"

    def __init__(self):
        self.session = requests.Session()
        self.session.proxies = {'http': None, 'https': None}
        self.timeout = 30
        self.max_retries = 3

    def search(self, query: str, max_results: int = 10, days: int = 30,
               date_from: str = None, date_to: str = None) -> List[Paper]:
        """
        Search for papers on medRxiv by category within a date range.

        Args:
            query: Category name to search for (e.g., "cardiovascular medicine").
            max_results: Maximum number of papers to return.
            days: Number of days to look back for papers (used if date_from/date_to not specified).
            date_from: Start date in YYYY-MM-DD format (optional, overrides days).
            date_to: End date in YYYY-MM-DD format (optional, defaults to today).

        Returns:
            List of Paper objects matching the category within the specified date range.
        """
        # Calculate date range: use explicit dates if provided, otherwise fall back to days
        if date_from or date_to:
            end_date = date_to if date_to else datetime.now().strftime('%Y-%m-%d')
            start_date = date_from if date_from else '2000-01-01'
        else:
            end_date = datetime.now().strftime('%Y-%m-%d')
            start_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')

        # Format category: lowercase and replace spaces with underscores
        category = query.lower().replace(' ', '_')

        papers = []
        cursor = 0
        while len(papers) < max_results:
            url = f"{self.BASE_URL}/{start_date}/{end_date}/{cursor}"
            if category:
                url += f"?category={category}"

            # Fetch one page, retrying on transient network errors
            page = None
            tries = 0
            while tries < self.max_retries:
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    response.raise_for_status()
                    page = response.json().get('collection', [])
                    break  # Exit retry loop on success
                except requests.exceptions.RequestException as e:
                    tries += 1
                    if tries == self.max_retries:
                        print(f"Failed to connect to medRxiv API after {self.max_retries} attempts: {e}")
                    else:
                        print(f"Attempt {tries} failed, retrying...")

            if page is None:
                break  # All retries failed

            for item in page:
                try:
                    date = datetime.strptime(item['date'], '%Y-%m-%d')
                    papers.append(Paper(
                        paper_id=item['doi'],
                        title=item['title'],
                        authors=item['authors'].split('; '),
                        abstract=item['abstract'],
                        url=f"https://www.medrxiv.org/content/{item['doi']}v{item.get('version', '1')}",
                        pdf_url=f"https://www.medrxiv.org/content/{item['doi']}v{item.get('version', '1')}.full.pdf",
                        published_date=date,
                        updated_date=date,
                        source="medrxiv",
                        categories=[item['category']],
                        keywords=[],
                        doi=item['doi']
                    ))
                except Exception as e:
                    print(f"Error parsing medRxiv entry: {e}")

            if len(page) < 100:
                break  # No more results
            cursor += 100  # The API pages by 100; advance to the next page

        return papers[:max_results]

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """
        Download a PDF for a given paper ID from medRxiv.

        Args:
            paper_id: The DOI of the paper.
            save_path: Directory to save the PDF.

        Returns:
            Path to the downloaded PDF file.
        """
        if not paper_id:
            raise ValueError("Invalid paper_id: paper_id is empty")

        pdf_url = f"https://www.medrxiv.org/content/{paper_id}v1.full.pdf"
        tries = 0
        while tries < self.max_retries:
            try:
                # Add User-Agent to avoid potential 403 errors
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                response = self.session.get(pdf_url, timeout=self.timeout, headers=headers)
                response.raise_for_status()
                os.makedirs(save_path, exist_ok=True)
                output_file = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
                with open(output_file, 'wb') as f:
                    f.write(response.content)
                return output_file
            except requests.exceptions.RequestException as e:
                tries += 1
                if tries == self.max_retries:
                    raise Exception(f"Failed to download PDF after {self.max_retries} attempts: {e}")
                print(f"Attempt {tries} failed, retrying...")

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """
        Read a paper and convert it to text format.

        Args:
            paper_id: medRxiv DOI
            save_path: Directory where the PDF is/will be saved

        Returns:
            str: The extracted text content of the paper
        """
        from ..pdf_utils import extract_text_from_pdf

        pdf_path = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
        if not os.path.exists(pdf_path):
            pdf_path = self.download_pdf(paper_id, save_path)

        return extract_text_from_pdf(pdf_path)
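
For orientation, a minimal usage sketch of the searcher above (not part of the wheel; the category string comes from the search() docstring, the ./downloads path is illustrative, and live network access is assumed):

# Hypothetical smoke test of MedRxivSearcher; import path follows the RECORD listing above.
from paper_search_mcp.academic_platforms.medrxiv import MedRxivSearcher

searcher = MedRxivSearcher()
# Category example taken from the search() docstring
papers = searcher.search("cardiovascular medicine", max_results=5, days=14)
for p in papers:
    print(p.published_date.date(), p.doi, p.title)
if papers:
    print("Saved:", searcher.download_pdf(papers[0].paper_id, "./downloads"))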
@@ -0,0 +1,497 @@ paper_search_mcp/academic_platforms/openalex.py
# paper_search_mcp/academic_platforms/openalex.py
from typing import List, Optional, Dict, Any
from datetime import datetime
import os
import requests
import logging
from ..paper import Paper

logger = logging.getLogger(__name__)


class OpenAlexSearcher:
    """Searcher for OpenAlex - a fully open index of scholarly works.

    OpenAlex aggregates data from CrossRef, PubMed, arXiv, institutional
    repositories, and more. It indexes 240M+ works with 50k added daily.
    """

    BASE_URL = "https://api.openalex.org"

    # Polite pool email for faster rate limits
    USER_EMAIL = "paper-search-mcp@example.org"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'application/json',
            'User-Agent': f'paper-search-mcp/0.1 (mailto:{self.USER_EMAIL})'
        })

    def search(
        self,
        query: str,
        max_results: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None
    ) -> List[Paper]:
        """Search OpenAlex for academic works.

        Args:
            query: Search query string
            max_results: Maximum number of results (default: 10)
            date_from: Start date in YYYY-MM-DD format (optional)
            date_to: End date in YYYY-MM-DD format (optional)

        Returns:
            List of Paper objects
        """
        try:
            # Build filter string
            filters = [f'title_and_abstract.search:{query}']

            if date_from:
                filters.append(f'from_publication_date:{date_from}')
            if date_to:
                filters.append(f'to_publication_date:{date_to}')

            params = {
                'filter': ','.join(filters),
                'per_page': min(max_results, 200),  # OpenAlex max is 200
                'mailto': self.USER_EMAIL,
                'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,type,cited_by_count,topics'
            }

            response = self.session.get(f'{self.BASE_URL}/works', params=params)
            response.raise_for_status()
            data = response.json()

            papers = []
            for item in data.get('results', []):
                paper = self._parse_work(item)
                if paper:
                    papers.append(paper)

            return papers[:max_results]

        except requests.RequestException as e:
            logger.error(f"OpenAlex search error: {e}")
            return []
        except Exception as e:
            logger.error(f"Unexpected error in OpenAlex search: {e}")
            return []

    def _parse_work(self, item: Dict[str, Any]) -> Optional[Paper]:
        """Parse an OpenAlex work object into a Paper."""
        try:
            # Extract OpenAlex ID (short form)
            openalex_id = item.get('id', '').replace('https://openalex.org/', '')

            # Extract DOI (remove URL prefix if present)
            doi = item.get('doi', '') or ''
            if doi.startswith('https://doi.org/'):
                doi = doi[16:]

            # Extract title
            title = item.get('title', '') or ''

            # Extract authors from authorships
            authors = []
            for authorship in item.get('authorships', []):
                author = authorship.get('author', {})
                name = author.get('display_name', '')
                if name:
                    authors.append(name)

            # Reconstruct abstract from inverted index
            abstract = self._reconstruct_abstract(item.get('abstract_inverted_index'))

            # Extract publication date
            pub_date_str = item.get('publication_date', '')
            published_date = None
            if pub_date_str:
                try:
                    published_date = datetime.strptime(pub_date_str, '%Y-%m-%d')
                except ValueError:
                    try:
                        published_date = datetime.strptime(pub_date_str[:4], '%Y')
                    except ValueError:
                        pass

            # Extract PDF URL from open_access or primary_location
            # (either field may be null in the API response, hence the `or {}`)
            pdf_url = ''
            open_access = item.get('open_access', {}) or {}
            if open_access.get('is_oa'):
                pdf_url = open_access.get('oa_url', '')

            if not pdf_url:
                primary_loc = item.get('primary_location', {}) or {}
                pdf_url = primary_loc.get('pdf_url', '') or ''

            # Extract categories from topics
            categories = []
            for topic in item.get('topics', [])[:3]:  # Limit to top 3
                if topic.get('display_name'):
                    categories.append(topic['display_name'])

            # Work type as category if no topics
            if not categories and item.get('type'):
                categories = [item['type']]

            return Paper(
                paper_id=openalex_id,
                title=title,
                authors=authors,
                abstract=abstract,
                doi=doi,
                published_date=published_date,
                pdf_url=pdf_url,
                url=f"https://openalex.org/{openalex_id}",
                source='openalex',
                categories=categories,
                keywords=[],
                citations=item.get('cited_by_count', 0) or 0
            )

        except Exception as e:
            logger.warning(f"Failed to parse OpenAlex work: {e}")
            return None

    def _reconstruct_abstract(self, inverted_index: Optional[Dict[str, List[int]]]) -> str:
        """Reconstruct abstract text from OpenAlex inverted index format.

        OpenAlex stores abstracts as a word-to-positions map, e.g.
        {"Deep": [0], "learning": [1], "is": [2, 4], "fun": [3]}
        reconstructs to "Deep learning is fun is".
        """
        if not inverted_index:
            return ''

        try:
            # Build list of (position, word) tuples
            words = []
            for word, positions in inverted_index.items():
                for pos in positions:
                    words.append((pos, word))

            # Sort by position and join
            words.sort(key=lambda x: x[0])
            return ' '.join(word for _, word in words)
        except Exception:
            return ''

    def get_work_by_doi(self, doi: str) -> Optional[Paper]:
        """Get a specific work by DOI.

        Args:
            doi: Digital Object Identifier (e.g., '10.1038/nature12373')

        Returns:
            Paper object if found, None otherwise
        """
        try:
            # Clean DOI - remove URL prefix if present
            if doi.startswith('https://doi.org/'):
                doi = doi[16:]
            elif doi.startswith('http://doi.org/'):
                doi = doi[15:]
            elif doi.startswith('doi:'):
                doi = doi[4:]

            url = f'{self.BASE_URL}/works/https://doi.org/{doi}'
            params = {'mailto': self.USER_EMAIL}

            response = self.session.get(url, params=params)

            if response.status_code == 404:
                return None

            response.raise_for_status()
            return self._parse_work(response.json())

        except Exception as e:
            logger.error(f"Error fetching work by DOI {doi}: {e}")
            return None

    def get_work_by_id(self, openalex_id: str) -> Optional[Paper]:
        """Get a specific work by OpenAlex ID.

        Args:
            openalex_id: OpenAlex ID (e.g., 'W2741809807')

        Returns:
            Paper object if found, None otherwise
        """
        try:
            # Ensure proper ID format
            if not openalex_id.startswith('W'):
                openalex_id = f'W{openalex_id}'

            url = f'{self.BASE_URL}/works/{openalex_id}'
            params = {'mailto': self.USER_EMAIL}

            response = self.session.get(url, params=params)

            if response.status_code == 404:
                return None

            response.raise_for_status()
            return self._parse_work(response.json())

        except Exception as e:
            logger.error(f"Error fetching OpenAlex work {openalex_id}: {e}")
            return None

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Download PDF if available (open access only).

        Args:
            paper_id: OpenAlex ID
            save_path: Directory to save the PDF

        Returns:
            Path to downloaded PDF

        Raises:
            NotImplementedError: If PDF not available
        """
        paper = self.get_work_by_id(paper_id)
        if not paper or not paper.pdf_url:
            raise NotImplementedError(
                "PDF not available. This work may not be open access. "
                "Try accessing via DOI or publisher URL."
            )

        response = self.session.get(paper.pdf_url)
        response.raise_for_status()

        os.makedirs(save_path, exist_ok=True)
        output_file = f"{save_path}/{paper_id}.pdf"
        with open(output_file, 'wb') as f:
            f.write(response.content)

        return output_file

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Read paper content (requires open access PDF).

        Args:
            paper_id: OpenAlex ID
            save_path: Directory for PDF storage

        Returns:
            Extracted text content
        """
        from ..pdf_utils import extract_text_from_pdf

        try:
            pdf_path = f"{save_path}/{paper_id}.pdf"
            if not os.path.exists(pdf_path):
                pdf_path = self.download_pdf(paper_id, save_path)

            return extract_text_from_pdf(pdf_path)

        except Exception as e:
            return f"Could not read paper: {e}"

    def get_references(self, paper_id: str, max_results: int = 25) -> List[Paper]:
        """Get papers that this work cites (outgoing references).

        Args:
            paper_id: OpenAlex work ID (e.g., 'W2741809807')
            max_results: Maximum number of references to return (default: 25)

        Returns:
            List of Paper objects for referenced works
        """
        try:
            # Ensure proper ID format
            if not paper_id.startswith('W'):
                paper_id = f'W{paper_id}'

            # Get the work to extract referenced_works
            url = f'{self.BASE_URL}/works/{paper_id}'
            params = {'mailto': self.USER_EMAIL}
            response = self.session.get(url, params=params)
            response.raise_for_status()
            data = response.json()

            referenced_ids = data.get('referenced_works', [])
            if not referenced_ids:
                return []

            # Extract IDs and fetch details
            ref_ids = [r.replace('https://openalex.org/', '') for r in referenced_ids[:max_results]]

            # Batch fetch referenced works
            filter_str = '|'.join(ref_ids)
            params = {
                'filter': f'openalex:{filter_str}',
                'per_page': max_results,
                'sort': 'cited_by_count:desc',
                'mailto': self.USER_EMAIL,
                'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,cited_by_count,topics'
            }
            response = self.session.get(f'{self.BASE_URL}/works', params=params)
            response.raise_for_status()

            papers = []
            for item in response.json().get('results', []):
                paper = self._parse_work(item)
                if paper:
                    papers.append(paper)

            return papers

        except Exception as e:
            logger.error(f"Error fetching references for {paper_id}: {e}")
            return []

    def get_citing_papers(self, paper_id: str, max_results: int = 25) -> List[Paper]:
        """Get papers that cite this work (incoming citations).

        Args:
            paper_id: OpenAlex work ID (e.g., 'W2741809807')
            max_results: Maximum number of citing papers to return (default: 25)

        Returns:
            List of Paper objects for citing works
        """
        try:
            # Ensure proper ID format
            if not paper_id.startswith('W'):
                paper_id = f'W{paper_id}'

            params = {
                'filter': f'cites:{paper_id}',
                'per_page': min(max_results, 200),
                'sort': 'cited_by_count:desc',
                'mailto': self.USER_EMAIL,
                'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,cited_by_count,topics'
            }

            response = self.session.get(f'{self.BASE_URL}/works', params=params)
            response.raise_for_status()
            data = response.json()

            papers = []
            for item in data.get('results', []):
                paper = self._parse_work(item)
                if paper:
                    papers.append(paper)

            return papers[:max_results]

        except Exception as e:
            logger.error(f"Error fetching citing papers for {paper_id}: {e}")
            return []

    def search_authors(self, name: str, max_results: int = 10) -> List[Dict]:
        """Search for authors by name.

        Args:
            name: Author name to search for
            max_results: Maximum number of authors to return (default: 10)

        Returns:
            List of author metadata dictionaries
        """
        try:
            params = {
                'search': name,
                'per_page': min(max_results, 200),
                'mailto': self.USER_EMAIL
            }

            response = self.session.get(f'{self.BASE_URL}/authors', params=params)
            response.raise_for_status()
            data = response.json()

            authors = []
            for a in data.get('results', []):
                author_id = a.get('id', '').replace('https://openalex.org/', '')
                affiliations = [aff.get('display_name') for aff in a.get('affiliations', []) if aff.get('display_name')]

                author = {
                    'id': author_id,
                    'name': a.get('display_name', ''),
                    'works_count': a.get('works_count', 0),
                    'citations': a.get('cited_by_count', 0),
                    'affiliations': affiliations[:3] if affiliations else None,
                    'orcid': a.get('orcid', '').replace('https://orcid.org/', '') if a.get('orcid') else None
                }
                # Filter out None values
                authors.append({k: v for k, v in author.items() if v is not None})

            return authors[:max_results]

        except Exception as e:
            logger.error(f"Error searching authors: {e}")
            return []

    def get_author_papers(
        self, author_id: str, max_results: int = 25,
        date_from: Optional[str] = None, date_to: Optional[str] = None
    ) -> List[Paper]:
        """Get papers by an author.

        Args:
            author_id: OpenAlex author ID (e.g., 'A5015666723')
            max_results: Maximum number of papers to return (default: 25)
            date_from: Start date in YYYY-MM-DD format (optional)
            date_to: End date in YYYY-MM-DD format (optional)

        Returns:
            List of Paper objects sorted by citation count
        """
        try:
            # Ensure proper ID format
            if not author_id.startswith('A'):
                author_id = f'A{author_id}'

            # Build filter
            filters = [f'author.id:{author_id}']
            if date_from:
                filters.append(f'from_publication_date:{date_from}')
            if date_to:
                filters.append(f'to_publication_date:{date_to}')

            params = {
                'filter': ','.join(filters),
                'per_page': min(max_results, 200),
                'sort': 'cited_by_count:desc',
                'mailto': self.USER_EMAIL,
                'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,cited_by_count,topics'
            }

            response = self.session.get(f'{self.BASE_URL}/works', params=params)
            response.raise_for_status()
            data = response.json()

            papers = []
            for item in data.get('results', []):
                paper = self._parse_work(item)
                if paper:
                    papers.append(paper)

            return papers[:max_results]

        except Exception as e:
            logger.error(f"Error fetching papers for author {author_id}: {e}")
            return []


if __name__ == "__main__":
    # Test OpenAlex searcher
    searcher = OpenAlexSearcher()

    print("Testing OpenAlex search...")
    papers = searcher.search("transformer attention mechanism", max_results=3)

    for i, paper in enumerate(papers, 1):
        print(f"\n{i}. {paper.title[:60]}...")
        print(f"   ID: {paper.paper_id}")
        print(f"   DOI: {paper.doi}")
        print(f"   PDF: {paper.pdf_url or '(not available)'}")
        print(f"   Citations: {paper.citations}")