academic-search-mcp 0.1.3__py3-none-any.whl

@@ -0,0 +1,156 @@
+ from typing import List, Optional
+ import requests
+ import os
+ from datetime import datetime, timedelta
+ from ..paper import Paper
+
+
+ class PaperSource:
+     """Abstract base class for paper sources"""
+
+     def search(self, query: str, **kwargs) -> List[Paper]:
+         raise NotImplementedError
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+     def read_paper(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+
+ class MedRxivSearcher(PaperSource):
+     """Searcher for medRxiv papers"""
+
+     BASE_URL = "https://api.biorxiv.org/details/medrxiv"
+
+     def __init__(self):
+         self.session = requests.Session()
+         self.session.proxies = {'http': None, 'https': None}
+         self.timeout = 30
+         self.max_retries = 3
+
+     def search(self, query: str, max_results: int = 10, days: int = 30,
+                date_from: Optional[str] = None, date_to: Optional[str] = None) -> List[Paper]:
+         """
+         Search for papers on medRxiv by category within a date range.
+
+         Args:
+             query: Category name to search for (e.g., "cardiovascular medicine").
+             max_results: Maximum number of papers to return.
+             days: Number of days to look back (used if date_from/date_to are not specified).
+             date_from: Start date in YYYY-MM-DD format (optional, overrides days).
+             date_to: End date in YYYY-MM-DD format (optional, defaults to today).
+
+         Returns:
+             List of Paper objects matching the category within the specified date range.
+         """
+         # Calculate the date range: use explicit dates if provided, otherwise fall back to days
+         if date_from or date_to:
+             end_date = date_to if date_to else datetime.now().strftime('%Y-%m-%d')
+             start_date = date_from if date_from else '2000-01-01'
+         else:
+             end_date = datetime.now().strftime('%Y-%m-%d')
+             start_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
+
+         # Format the category: lowercase, spaces replaced with underscores
+         category = query.lower().replace(' ', '_')
+
+         papers = []
+         cursor = 0
+         while len(papers) < max_results:
+             url = f"{self.BASE_URL}/{start_date}/{end_date}/{cursor}"
+             if category:
+                 url += f"?category={category}"
+
+             # Fetch one page of up to 100 results, retrying on transient errors
+             collection = None
+             for attempt in range(1, self.max_retries + 1):
+                 try:
+                     response = self.session.get(url, timeout=self.timeout)
+                     response.raise_for_status()
+                     collection = response.json().get('collection', [])
+                     break
+                 except requests.exceptions.RequestException as e:
+                     if attempt == self.max_retries:
+                         print(f"Failed to connect to medRxiv API after {self.max_retries} attempts: {e}")
+                     else:
+                         print(f"Attempt {attempt} failed, retrying...")
+             if collection is None:
+                 break  # All retries failed; return what we have so far
+
+             for item in collection:
+                 try:
+                     date = datetime.strptime(item['date'], '%Y-%m-%d')
+                     papers.append(Paper(
+                         paper_id=item['doi'],
+                         title=item['title'],
+                         authors=item['authors'].split('; '),
+                         abstract=item['abstract'],
+                         url=f"https://www.medrxiv.org/content/{item['doi']}v{item.get('version', '1')}",
+                         pdf_url=f"https://www.medrxiv.org/content/{item['doi']}v{item.get('version', '1')}.full.pdf",
+                         published_date=date,
+                         updated_date=date,
+                         source="medrxiv",
+                         categories=[item['category']],
+                         keywords=[],
+                         doi=item['doi']
+                     ))
+                 except Exception as e:
+                     print(f"Error parsing medRxiv entry: {e}")
+
+             if len(collection) < 100:
+                 break  # No more results
+             cursor += 100  # Advance to the next page
+
+         return papers[:max_results]
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         """
+         Download a PDF for a given paper ID from medRxiv.
+
+         Args:
+             paper_id: The DOI of the paper.
+             save_path: Directory to save the PDF.
+
+         Returns:
+             Path to the downloaded PDF file.
+         """
+         if not paper_id:
+             raise ValueError("Invalid paper_id: paper_id is empty")
+
+         pdf_url = f"https://www.medrxiv.org/content/{paper_id}v1.full.pdf"
+         # Add a browser User-Agent to avoid potential 403 errors
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         }
+         tries = 0
+         while tries < self.max_retries:
+             try:
+                 response = self.session.get(pdf_url, timeout=self.timeout, headers=headers)
+                 response.raise_for_status()
+                 os.makedirs(save_path, exist_ok=True)
+                 output_file = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
+                 with open(output_file, 'wb') as f:
+                     f.write(response.content)
+                 return output_file
+             except requests.exceptions.RequestException as e:
+                 tries += 1
+                 if tries == self.max_retries:
+                     raise Exception(f"Failed to download PDF after {self.max_retries} attempts: {e}")
+                 print(f"Attempt {tries} failed, retrying...")
+
+     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """
+         Read a paper and convert it to text format.
+
+         Args:
+             paper_id: medRxiv DOI
+             save_path: Directory where the PDF is/will be saved
+
+         Returns:
+             str: The extracted text content of the paper
+         """
+         from ..pdf_utils import extract_text_from_pdf
+
+         pdf_path = f"{save_path}/{paper_id.replace('/', '_')}.pdf"
+         if not os.path.exists(pdf_path):
+             pdf_path = self.download_pdf(paper_id, save_path)
+
+         return extract_text_from_pdf(pdf_path)
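+
+
+ if __name__ == "__main__":
+     # Minimal smoke test (illustrative, not part of the original module):
+     # hits the live medRxiv API, so it assumes network access, and the
+     # category string below is only an example.
+     searcher = MedRxivSearcher()
+     results = searcher.search("infectious diseases", max_results=3, days=14)
+     for i, paper in enumerate(results, 1):
+         print(f"{i}. {paper.title}")
+         print(f"   DOI: {paper.doi}")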
@@ -0,0 +1,497 @@
+ # paper_search_mcp/academic_platforms/openalex.py
+ from typing import List, Optional, Dict, Any
+ from datetime import datetime
+ import logging
+ import os
+ import requests
+ from ..paper import Paper
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenAlexSearcher:
+     """Searcher for OpenAlex - a fully open index of scholarly works.
+
+     OpenAlex aggregates data from CrossRef, PubMed, arXiv, institutional
+     repositories, and more. It indexes 240M+ works with 50k added daily.
+     """
+
+     BASE_URL = "https://api.openalex.org"
+
+     # Polite pool email for faster rate limits
+     USER_EMAIL = "paper-search-mcp@example.org"
+
+     def __init__(self):
+         self.session = requests.Session()
+         self.session.headers.update({
+             'Accept': 'application/json',
+             'User-Agent': f'paper-search-mcp/0.1 (mailto:{self.USER_EMAIL})'
+         })
+
+     def search(
+         self,
+         query: str,
+         max_results: int = 10,
+         date_from: Optional[str] = None,
+         date_to: Optional[str] = None
+     ) -> List[Paper]:
+         """Search OpenAlex for academic works.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results (default: 10)
+             date_from: Start date in YYYY-MM-DD format (optional)
+             date_to: End date in YYYY-MM-DD format (optional)
+
+         Returns:
+             List of Paper objects
+         """
+         try:
+             # Build filter string
+             filters = [f'title_and_abstract.search:{query}']
+
+             if date_from:
+                 filters.append(f'from_publication_date:{date_from}')
+             if date_to:
+                 filters.append(f'to_publication_date:{date_to}')
+
+             params = {
+                 'filter': ','.join(filters),
+                 'per_page': min(max_results, 200),  # OpenAlex max is 200
+                 'mailto': self.USER_EMAIL,
+                 'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,type,cited_by_count,topics'
+             }
+
+             response = self.session.get(f'{self.BASE_URL}/works', params=params)
+             response.raise_for_status()
+             data = response.json()
+
+             papers = []
+             for item in data.get('results', []):
+                 paper = self._parse_work(item)
+                 if paper:
+                     papers.append(paper)
+
+             return papers[:max_results]
+
+         except requests.RequestException as e:
+             logger.error(f"OpenAlex search error: {e}")
+             return []
+         except Exception as e:
+             logger.error(f"Unexpected error in OpenAlex search: {e}")
+             return []
+
+     def _parse_work(self, item: Dict[str, Any]) -> Optional[Paper]:
+         """Parse an OpenAlex work object into a Paper."""
+         try:
+             # Extract OpenAlex ID (short form)
+             openalex_id = item.get('id', '').replace('https://openalex.org/', '')
+
+             # Extract DOI (remove URL prefix if present)
+             doi = item.get('doi', '') or ''
+             if doi.startswith('https://doi.org/'):
+                 doi = doi[16:]
+
+             # Extract title
+             title = item.get('title', '') or ''
+
+             # Extract authors from authorships
+             authors = []
+             for authorship in item.get('authorships', []):
+                 author = authorship.get('author', {})
+                 name = author.get('display_name', '')
+                 if name:
+                     authors.append(name)
+
+             # Reconstruct abstract from inverted index
+             abstract = self._reconstruct_abstract(item.get('abstract_inverted_index'))
+
+             # Extract publication date
+             pub_date_str = item.get('publication_date', '')
+             published_date = None
+             if pub_date_str:
+                 try:
+                     published_date = datetime.strptime(pub_date_str, '%Y-%m-%d')
+                 except ValueError:
+                     try:
+                         published_date = datetime.strptime(pub_date_str[:4], '%Y')
+                     except ValueError:
+                         pass
+
+             # Extract PDF URL from open_access or primary_location
+             pdf_url = ''
+             open_access = item.get('open_access', {})
+             if open_access.get('is_oa'):
+                 pdf_url = open_access.get('oa_url', '')
+
+             if not pdf_url:
+                 primary_loc = item.get('primary_location', {}) or {}
+                 pdf_url = primary_loc.get('pdf_url', '') or ''
+
+             # Extract categories from topics
+             categories = []
+             for topic in item.get('topics', [])[:3]:  # Limit to top 3
+                 if topic.get('display_name'):
+                     categories.append(topic['display_name'])
+
+             # Fall back to the work type if there are no topics
+             if not categories and item.get('type'):
+                 categories = [item['type']]
+
+             return Paper(
+                 paper_id=openalex_id,
+                 title=title,
+                 authors=authors,
+                 abstract=abstract,
+                 doi=doi,
+                 published_date=published_date,
+                 pdf_url=pdf_url,
+                 url=f"https://openalex.org/{openalex_id}",
+                 source='openalex',
+                 categories=categories,
+                 keywords=[],
+                 citations=item.get('cited_by_count', 0) or 0
+             )
+
+         except Exception as e:
+             logger.warning(f"Failed to parse OpenAlex work: {e}")
+             return None
+
+     def _reconstruct_abstract(self, inverted_index: Optional[Dict[str, List[int]]]) -> str:
+         """Reconstruct abstract text from OpenAlex inverted index format."""
+         if not inverted_index:
+             return ''
+
+         try:
+             # Build a list of (position, word) tuples
+             words = []
+             for word, positions in inverted_index.items():
+                 for pos in positions:
+                     words.append((pos, word))
+
+             # Sort by position and join
+             words.sort(key=lambda x: x[0])
+             return ' '.join(word for _, word in words)
+         except Exception:
+             return ''
+
+     def get_work_by_doi(self, doi: str) -> Optional[Paper]:
+         """Get a specific work by DOI.
+
+         Args:
+             doi: Digital Object Identifier (e.g., '10.1038/nature12373')
+
+         Returns:
+             Paper object if found, None otherwise
+         """
+         try:
+             # Clean the DOI - remove any URL or scheme prefix
+             if doi.startswith('https://doi.org/'):
+                 doi = doi[16:]
+             elif doi.startswith('http://doi.org/'):
+                 doi = doi[15:]
+             elif doi.startswith('doi:'):
+                 doi = doi[4:]
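+             # e.g. 'https://doi.org/10.1038/nature12373', 'doi:10.1038/nature12373',
+             # and a bare '10.1038/nature12373' all normalize to the bare form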
+
+             url = f'{self.BASE_URL}/works/https://doi.org/{doi}'
+             params = {'mailto': self.USER_EMAIL}
+
+             response = self.session.get(url, params=params)
+
+             if response.status_code == 404:
+                 return None
+
+             response.raise_for_status()
+             return self._parse_work(response.json())
+
+         except Exception as e:
+             logger.error(f"Error fetching work by DOI {doi}: {e}")
+             return None
+
+     def get_work_by_id(self, openalex_id: str) -> Optional[Paper]:
+         """Get a specific work by OpenAlex ID.
+
+         Args:
+             openalex_id: OpenAlex ID (e.g., 'W2741809807')
+
+         Returns:
+             Paper object if found, None otherwise
+         """
+         try:
+             # Ensure proper ID format
+             if not openalex_id.startswith('W'):
+                 openalex_id = f'W{openalex_id}'
+
+             url = f'{self.BASE_URL}/works/{openalex_id}'
+             params = {'mailto': self.USER_EMAIL}
+
+             response = self.session.get(url, params=params)
+
+             if response.status_code == 404:
+                 return None
+
+             response.raise_for_status()
+             return self._parse_work(response.json())
+
+         except Exception as e:
+             logger.error(f"Error fetching OpenAlex work {openalex_id}: {e}")
+             return None
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         """Download PDF if available (open access only).
+
+         Args:
+             paper_id: OpenAlex ID
+             save_path: Directory to save the PDF
+
+         Returns:
+             Path to downloaded PDF
+
+         Raises:
+             NotImplementedError: If PDF not available
+         """
+         paper = self.get_work_by_id(paper_id)
+         if not paper or not paper.pdf_url:
+             raise NotImplementedError(
+                 "PDF not available. This work may not be open access. "
+                 "Try accessing via DOI or publisher URL."
+             )
+
+         response = self.session.get(paper.pdf_url)
+         response.raise_for_status()
+
+         os.makedirs(save_path, exist_ok=True)
+         output_file = f"{save_path}/{paper_id}.pdf"
+         with open(output_file, 'wb') as f:
+             f.write(response.content)
+
+         return output_file
+
+     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """Read paper content (requires an open access PDF).
+
+         Args:
+             paper_id: OpenAlex ID
+             save_path: Directory for PDF storage
+
+         Returns:
+             Extracted text content
+         """
+         from ..pdf_utils import extract_text_from_pdf
+
+         try:
+             pdf_path = f"{save_path}/{paper_id}.pdf"
+             if not os.path.exists(pdf_path):
+                 pdf_path = self.download_pdf(paper_id, save_path)
+
+             return extract_text_from_pdf(pdf_path)
+
+         except Exception as e:
+             return f"Could not read paper: {e}"
+
+     def get_references(self, paper_id: str, max_results: int = 25) -> List[Paper]:
+         """Get papers that this work cites (outgoing references).
+
+         Args:
+             paper_id: OpenAlex work ID (e.g., 'W2741809807')
+             max_results: Maximum number of references to return (default: 25)
+
+         Returns:
+             List of Paper objects for referenced works
+         """
+         try:
+             # Ensure proper ID format
+             if not paper_id.startswith('W'):
+                 paper_id = f'W{paper_id}'
+
+             # Get the work to extract referenced_works
+             url = f'{self.BASE_URL}/works/{paper_id}'
+             params = {'mailto': self.USER_EMAIL}
+             response = self.session.get(url, params=params)
+             response.raise_for_status()
+             data = response.json()
+
+             referenced_ids = data.get('referenced_works', [])
+             if not referenced_ids:
+                 return []
+
+             # Extract short IDs and fetch details
+             ref_ids = [r.replace('https://openalex.org/', '') for r in referenced_ids[:max_results]]
+
+             # Batch fetch the referenced works
+             filter_str = '|'.join(ref_ids)
+             params = {
+                 'filter': f'openalex:{filter_str}',
+                 'per_page': min(max_results, 200),
+                 'sort': 'cited_by_count:desc',
+                 'mailto': self.USER_EMAIL,
+                 'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,cited_by_count,topics'
+             }
+             response = self.session.get(f'{self.BASE_URL}/works', params=params)
+             response.raise_for_status()
+
+             papers = []
+             for item in response.json().get('results', []):
+                 paper = self._parse_work(item)
+                 if paper:
+                     papers.append(paper)
+
+             return papers
+
+         except Exception as e:
+             logger.error(f"Error fetching references for {paper_id}: {e}")
+             return []
+
+     def get_citing_papers(self, paper_id: str, max_results: int = 25) -> List[Paper]:
+         """Get papers that cite this work (incoming citations).
+
+         Args:
+             paper_id: OpenAlex work ID (e.g., 'W2741809807')
+             max_results: Maximum number of citing papers to return (default: 25)
+
+         Returns:
+             List of Paper objects for citing works
+         """
+         try:
+             # Ensure proper ID format
+             if not paper_id.startswith('W'):
+                 paper_id = f'W{paper_id}'
+
+             params = {
+                 'filter': f'cites:{paper_id}',
+                 'per_page': min(max_results, 200),
+                 'sort': 'cited_by_count:desc',
+                 'mailto': self.USER_EMAIL,
+                 'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,cited_by_count,topics'
+             }
+
+             response = self.session.get(f'{self.BASE_URL}/works', params=params)
+             response.raise_for_status()
+             data = response.json()
+
+             papers = []
+             for item in data.get('results', []):
+                 paper = self._parse_work(item)
+                 if paper:
+                     papers.append(paper)
+
+             return papers[:max_results]
+
+         except Exception as e:
+             logger.error(f"Error fetching citing papers for {paper_id}: {e}")
+             return []
+
+     def search_authors(self, name: str, max_results: int = 10) -> List[Dict]:
+         """Search for authors by name.
+
+         Args:
+             name: Author name to search for
+             max_results: Maximum number of authors to return (default: 10)
+
+         Returns:
+             List of author metadata dictionaries
+         """
+         try:
+             params = {
+                 'search': name,
+                 'per_page': min(max_results, 200),
+                 'mailto': self.USER_EMAIL
+             }
+
+             response = self.session.get(f'{self.BASE_URL}/authors', params=params)
+             response.raise_for_status()
+             data = response.json()
+
+             authors = []
+             for a in data.get('results', []):
+                 author_id = a.get('id', '').replace('https://openalex.org/', '')
+                 affiliations = [aff.get('display_name') for aff in a.get('affiliations', []) if aff.get('display_name')]
+
+                 authors.append({
+                     'id': author_id,
+                     'name': a.get('display_name', ''),
+                     'works_count': a.get('works_count', 0),
+                     'citations': a.get('cited_by_count', 0),
+                     'affiliations': affiliations[:3] if affiliations else None,
+                     'orcid': a.get('orcid', '').replace('https://orcid.org/', '') if a.get('orcid') else None
+                 })
+
+             # Drop keys whose values are None
+             for author in authors:
+                 for key in list(author.keys()):
+                     if author[key] is None:
+                         del author[key]
+
+             return authors[:max_results]
+
+         except Exception as e:
+             logger.error(f"Error searching authors: {e}")
+             return []
+
+     def get_author_papers(
+         self, author_id: str, max_results: int = 25,
+         date_from: Optional[str] = None, date_to: Optional[str] = None
+     ) -> List[Paper]:
+         """Get papers by an author.
+
+         Args:
+             author_id: OpenAlex author ID (e.g., 'A5015666723')
+             max_results: Maximum number of papers to return (default: 25)
+             date_from: Start date in YYYY-MM-DD format (optional)
+             date_to: End date in YYYY-MM-DD format (optional)
+
+         Returns:
+             List of Paper objects sorted by citation count
+         """
+         try:
+             # Ensure proper ID format
+             if not author_id.startswith('A'):
+                 author_id = f'A{author_id}'
+
+             # Build filter
+             filters = [f'author.id:{author_id}']
+             if date_from:
+                 filters.append(f'from_publication_date:{date_from}')
+             if date_to:
+                 filters.append(f'to_publication_date:{date_to}')
+
+             params = {
+                 'filter': ','.join(filters),
+                 'per_page': min(max_results, 200),
+                 'sort': 'cited_by_count:desc',
+                 'mailto': self.USER_EMAIL,
+                 'select': 'id,title,authorships,abstract_inverted_index,doi,publication_date,open_access,primary_location,cited_by_count,topics'
+             }
+
+             response = self.session.get(f'{self.BASE_URL}/works', params=params)
+             response.raise_for_status()
+             data = response.json()
+
+             papers = []
+             for item in data.get('results', []):
+                 paper = self._parse_work(item)
+                 if paper:
+                     papers.append(paper)
+
+             return papers[:max_results]
+
+         except Exception as e:
+             logger.error(f"Error fetching papers for author {author_id}: {e}")
+             return []
+
+
+ if __name__ == "__main__":
+     # Test the OpenAlex searcher
+     searcher = OpenAlexSearcher()
+
+     print("Testing OpenAlex search...")
+     papers = searcher.search("transformer attention mechanism", max_results=3)
+
+     for i, paper in enumerate(papers, 1):
+         print(f"\n{i}. {paper.title[:60]}...")
+         print(f"   ID: {paper.paper_id}")
+         print(f"   DOI: {paper.doi}")
+         print(f"   PDF: {paper.pdf_url or '(not available)'}")
+         print(f"   Citations: {paper.citations}")