academic-search-mcp 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,492 @@
+ from typing import List, Optional, Union
+ from datetime import datetime
+ import requests
+ import time
+ import random
+ from ..paper import Paper
+ import logging
+ import os
+ import re
+
+ logger = logging.getLogger(__name__)
+
+
+ class PaperSource:
+     """Abstract base class for paper sources"""
+
+     def search(self, query: str, **kwargs) -> List[Paper]:
+         raise NotImplementedError
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+     def read_paper(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+
+ class SemanticSearcher(PaperSource):
+     """Semantic Scholar paper search implementation"""
+
+     SEMANTIC_SEARCH_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
+     SEMANTIC_BASE_URL = "https://api.semanticscholar.org/graph/v1"
+     BROWSERS = [
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+     ]
+
+     def __init__(self):
+         self._setup_session()
+
+     def _setup_session(self):
+         """Initialize session with random user agent"""
+         self.session = requests.Session()
+         self.session.headers.update(
+             {
+                 "User-Agent": random.choice(self.BROWSERS),
+                 "Accept": "text/html,application/xhtml+xml",
+                 "Accept-Language": "en-US,en;q=0.9",
+             }
+         )
+
+     def _parse_date(self, date_str: str) -> Optional[datetime]:
+         """Parse date from Semantic Scholar format (e.g., '2025-06-02')"""
+         try:
+             return datetime.strptime(date_str.strip(), "%Y-%m-%d")
+         except ValueError:
+             logger.warning(f"Could not parse date: {date_str}")
+             return None
+
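+     # For illustration: _parse_date("2025-06-02") yields datetime(2025, 6, 2),
+     # while a malformed string logs a warning and yields None.
+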
+     def _extract_url_from_disclaimer(self, disclaimer: str) -> str:
+         """Extract a usable URL from disclaimer text"""
+         # Match common URL patterns
+         url_patterns = [
+             r'https?://[^\s,)]+',                 # generic HTTP/HTTPS URL
+             r'https?://arxiv\.org/abs/[^\s,)]+',  # arXiv abstract link
+             r'https?://[^\s,)]*\.pdf',            # direct PDF link
+         ]
+
+         all_urls = []
+         for pattern in url_patterns:
+             matches = re.findall(pattern, disclaimer)
+             all_urls.extend(matches)
+
+         if not all_urls:
+             return ""
+
+         # Prefer DOI links, then any link that is not an unpaywall.org link;
+         # arXiv abstract links are rewritten to their PDF form.
+         doi_urls = [url for url in all_urls if 'doi.org' in url]
+         if doi_urls:
+             return doi_urls[0]
+
+         non_unpaywall_urls = [url for url in all_urls if 'unpaywall.org' not in url]
+         if non_unpaywall_urls:
+             url = non_unpaywall_urls[0]
+             if 'arxiv.org/abs/' in url:
+                 return url.replace('/abs/', '/pdf/')
+             return url
+
+         # Fallback: every match was an unpaywall.org link, so take the first one
+         url = all_urls[0]
+         if 'arxiv.org/abs/' in url:
+             return url.replace('/abs/', '/pdf/')
+         return url
+
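+     # Worked example (the disclaimer text is hypothetical): given
+     # "Notice: see https://api.unpaywall.org/v2/x and https://arxiv.org/abs/2106.15928",
+     # the DOI check finds nothing, the non-unpaywall filter keeps the arXiv link,
+     # and the /abs/ -> /pdf/ rewrite returns "https://arxiv.org/pdf/2106.15928".
+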
+     def _parse_paper(self, item) -> Optional[Paper]:
+         """Parse a single paper entry from a Semantic Scholar API response"""
+         try:
+             authors = [author['name'] for author in item.get('authors', [])]
+
+             # Parse the publication date (may be missing or null in the response)
+             published_date = self._parse_date(item.get('publicationDate') or '')
+
+             # Safely get the PDF URL, with extraction from the disclaimer as fallback
+             pdf_url = ""
+             if item.get('openAccessPdf'):
+                 open_access_pdf = item['openAccessPdf']
+                 # First try the URL field directly
+                 if open_access_pdf.get('url'):
+                     pdf_url = open_access_pdf['url']
+                 # If the URL is empty but a disclaimer is present, try extracting from it
+                 elif open_access_pdf.get('disclaimer'):
+                     pdf_url = self._extract_url_from_disclaimer(open_access_pdf['disclaimer'])
+
+             # Safely get DOI
+             doi = ""
+             if item.get('externalIds') and item['externalIds'].get('DOI'):
+                 doi = item['externalIds']['DOI']
+
+             # Safely get categories
+             categories = item.get('fieldsOfStudy') or []
+
+             return Paper(
+                 paper_id=item['paperId'],
+                 title=item['title'],
+                 authors=authors,
+                 abstract=item.get('abstract') or '',
+                 url=item.get('url', ''),
+                 pdf_url=pdf_url,
+                 published_date=published_date,
+                 source="semantic",
+                 categories=categories,
+                 doi=doi,
+                 citations=item.get('citationCount', 0),
+             )
+
+         except Exception as e:
+             logger.warning(f"Failed to parse Semantic paper: {e}")
+             return None
+
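+     # For reference, a sketch of the Paper model consumed above. The real class
+     # lives in ..paper; this mirrors only the fields used in this module and is
+     # not the actual definition:
+     #
+     #     @dataclass
+     #     class Paper:
+     #         paper_id: str
+     #         title: str
+     #         authors: List[str]
+     #         abstract: str
+     #         url: str
+     #         pdf_url: str
+     #         published_date: Optional[datetime]
+     #         source: str
+     #         categories: List[str]
+     #         doi: str
+     #         citations: int
+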
+     @staticmethod
+     def get_api_key() -> Optional[str]:
+         """
+         Get the Semantic Scholar API key from environment variables.
+         Returns None if no API key is set or if it is empty, enabling unauthenticated access.
+         """
+         api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
+         if not api_key or api_key.strip() == "":
+             logger.warning("SEMANTIC_SCHOLAR_API_KEY is unset or empty. Using unauthenticated access with lower rate limits.")
+             return None
+         return api_key.strip()
+
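+     # e.g. `export SEMANTIC_SCHOLAR_API_KEY=...` in the process environment
+     # enables authenticated requests; without it the public rate limits apply.
+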
+     def request_api(self, path: str, params: dict) -> Union[requests.Response, dict]:
+         """
+         Make a request to the Semantic Scholar API with an optional API key.
+
+         Returns the Response on success, or an error dict on failure.
+         """
+         max_retries = 3
+         retry_delay = 2  # seconds
+
+         for attempt in range(max_retries):
+             try:
+                 api_key = self.get_api_key()
+                 headers = {"x-api-key": api_key} if api_key else {}
+                 url = f"{self.SEMANTIC_BASE_URL}/{path}"
+                 response = self.session.get(url, params=params, headers=headers)
+
+                 # Handle 429 (rate limiting) with exponential backoff
+                 if response.status_code == 429:
+                     if attempt < max_retries - 1:
+                         wait_time = retry_delay * (2 ** attempt)  # exponential backoff
+                         logger.warning(f"Rate limited (429). Waiting {wait_time} seconds before retry {attempt + 1}/{max_retries}")
+                         time.sleep(wait_time)
+                         continue
+                     else:
+                         logger.error(f"Rate limited (429) after {max_retries} attempts. Please wait before making more requests.")
+                         return {"error": "rate_limited", "status_code": 429, "message": "Too many requests. Please wait before retrying."}
+
+                 response.raise_for_status()
+                 return response
+
+             except requests.exceptions.HTTPError as e:
+                 # 429s are intercepted above, so any HTTPError here is a non-429 failure
+                 logger.error(f"HTTP error requesting API: {e}")
+                 return {"error": "http_error", "status_code": e.response.status_code, "message": str(e)}
+             except Exception as e:
+                 logger.error(f"Error requesting API: {e}")
+                 return {"error": "general_error", "message": str(e)}
+
+         return {"error": "max_retries_exceeded", "message": "Maximum retry attempts exceeded"}
+
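+     # With retry_delay=2 and max_retries=3, a persistent 429 produces waits of
+     # 2s (attempt 0) and 4s (attempt 1) before the error dict is returned on
+     # the final attempt.
+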
+     def search(self, query: str, year: Optional[str] = None, max_results: int = 10,
+                date_from: Optional[str] = None, date_to: Optional[str] = None) -> List[Paper]:
+         """
+         Search Semantic Scholar
+
+         Args:
+             query: Search query string
+             year (Optional[str]): Filter by publication year. Supports several formats:
+                 - Single year: "2019"
+                 - Year range: "2016-2020"
+                 - Since year: "2010-"
+                 - Until year: "-2015"
+             max_results: Maximum number of results to return
+             date_from: Start date in YYYY-MM-DD format (optional, overrides year)
+             date_to: End date in YYYY-MM-DD format (optional, overrides year)
+
+         Returns:
+             List[Paper]: List of paper objects
+         """
+         papers = []
+
+         try:
+             fields = ["title", "abstract", "year", "citationCount", "authors", "url",
+                       "publicationDate", "externalIds", "fieldsOfStudy", "openAccessPdf"]
+             # Construct search parameters
+             params = {
+                 "query": query,
+                 "limit": max_results,
+                 "fields": ",".join(fields),
+             }
+             # Date filtering: date_from/date_to take precedence over year
+             if date_from or date_to:
+                 start = date_from if date_from else ""
+                 end = date_to if date_to else ""
+                 params["publicationDateOrYear"] = f"{start}:{end}"
+             elif year:
+                 params["year"] = year
+             # Make request
+             response = self.request_api("paper/search", params)
+
+             # Check for errors
+             if isinstance(response, dict) and "error" in response:
+                 error_msg = response.get("message", "Unknown error")
+                 if response.get("error") == "rate_limited":
+                     logger.error(f"Rate limited by Semantic Scholar API: {error_msg}")
+                 else:
+                     logger.error(f"Semantic Scholar API error: {error_msg}")
+                 return papers
+
+             # Check response status code
+             if not hasattr(response, 'status_code') or response.status_code != 200:
+                 status_code = getattr(response, 'status_code', 'unknown')
+                 logger.error(f"Semantic Scholar search failed with status {status_code}")
+                 return papers
+
+             data = response.json()
+             results = data.get('data', [])
+
+             if not results:
+                 logger.info("No results found for the query")
+                 return papers
+
+             # Process each result
+             for i, item in enumerate(results):
+                 if len(papers) >= max_results:
+                     break
+
+                 logger.info(f"Processing paper {i+1}/{min(len(results), max_results)}")
+                 paper = self._parse_paper(item)
+                 if paper:
+                     papers.append(paper)
+
+         except Exception as e:
+             logger.error(f"Semantic Scholar search error: {e}")
+
+         return papers[:max_results]
+
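+     # For example, search("secret sharing", date_from="2023-01-01", date_to="2023-06-30")
+     # sends publicationDateOrYear="2023-01-01:2023-06-30", while
+     # search("secret sharing", year="2016-2020") sends year="2016-2020".
+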
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         """
+         Download PDF from Semantic Scholar
+
+         Args:
+             paper_id (str): Paper identifier in one of the following formats:
+                 - Semantic Scholar ID (e.g., "649def34f8be52c8b66281af98ae884c09aef38b")
+                 - DOI:<doi> (e.g., "DOI:10.18653/v1/N18-3011")
+                 - ARXIV:<id> (e.g., "ARXIV:2106.15928")
+                 - MAG:<id> (e.g., "MAG:112218234")
+                 - ACL:<id> (e.g., "ACL:W12-3903")
+                 - PMID:<id> (e.g., "PMID:19872477")
+                 - PMCID:<id> (e.g., "PMCID:2323736")
+                 - URL:<url> (e.g., "URL:https://arxiv.org/abs/2106.15928v1")
+             save_path: Directory in which to save the PDF
+
+         Returns:
+             str: Path to the downloaded file, or an error message
+         """
+         try:
+             paper = self.get_paper_details(paper_id)
+             if not paper or not paper.pdf_url:
+                 return f"Error: Could not find PDF URL for paper {paper_id}"
+             pdf_url = paper.pdf_url
+             pdf_response = requests.get(pdf_url, timeout=30)
+             pdf_response.raise_for_status()
+
+             # Create download directory if it doesn't exist
+             os.makedirs(save_path, exist_ok=True)
+
+             filename = f"semantic_{paper_id.replace('/', '_')}.pdf"
+             pdf_path = os.path.join(save_path, filename)
+
+             with open(pdf_path, "wb") as f:
+                 f.write(pdf_response.content)
+             return pdf_path
+         except Exception as e:
+             logger.error(f"PDF download error: {e}")
+             return f"Error downloading PDF: {e}"
+
+     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """
+         Download and extract text from a Semantic Scholar paper PDF
+
+         Args:
+             paper_id (str): Paper identifier in one of the following formats:
+                 - Semantic Scholar ID (e.g., "649def34f8be52c8b66281af98ae884c09aef38b")
+                 - DOI:<doi> (e.g., "DOI:10.18653/v1/N18-3011")
+                 - ARXIV:<id> (e.g., "ARXIV:2106.15928")
+                 - MAG:<id> (e.g., "MAG:112218234")
+                 - ACL:<id> (e.g., "ACL:W12-3903")
+                 - PMID:<id> (e.g., "PMID:19872477")
+                 - PMCID:<id> (e.g., "PMCID:2323736")
+                 - URL:<url> (e.g., "URL:https://arxiv.org/abs/2106.15928v1")
+             save_path: Directory to save the downloaded PDF
+
+         Returns:
+             str: Extracted text from the PDF, or an error message
+         """
+         from ..pdf_utils import extract_text_from_pdf
+
+         try:
+             # First get paper details to obtain the PDF URL
+             paper = self.get_paper_details(paper_id)
+             if not paper or not paper.pdf_url:
+                 return f"Error: Could not find PDF URL for paper {paper_id}"
+
+             # Download the PDF
+             pdf_response = requests.get(paper.pdf_url, timeout=30)
+             pdf_response.raise_for_status()
+
+             # Create download directory if it doesn't exist
+             os.makedirs(save_path, exist_ok=True)
+
+             # Save the PDF
+             filename = f"semantic_{paper_id.replace('/', '_')}.pdf"
+             pdf_path = os.path.join(save_path, filename)
+
+             with open(pdf_path, "wb") as f:
+                 f.write(pdf_response.content)
+
+             # Extract text using pdftotext (with PyPDF2 fallback)
+             text = extract_text_from_pdf(pdf_path)
+
+             if not text.strip():
+                 return (
+                     f"PDF downloaded to {pdf_path}, but unable to extract readable text"
+                 )
+
+             # Add paper metadata at the beginning
+             metadata = f"Title: {paper.title}\n"
+             metadata += f"Authors: {', '.join(paper.authors)}\n"
+             metadata += f"Published Date: {paper.published_date}\n"
+             metadata += f"URL: {paper.url}\n"
+             metadata += f"PDF downloaded to: {pdf_path}\n"
+             metadata += "=" * 80 + "\n\n"
+
+             return metadata + text.strip()
+
+         except requests.RequestException as e:
+             logger.error(f"Error downloading PDF: {e}")
+             return f"Error downloading PDF: {e}"
+         except Exception as e:
+             logger.error(f"Read paper error: {e}")
+             return f"Error reading paper: {e}"
+
+     def get_paper_details(self, paper_id: str) -> Optional[Paper]:
+         """
+         Fetch detailed information for a specific Semantic Scholar paper
+
+         Args:
+             paper_id (str): Paper identifier in one of the following formats:
+                 - Semantic Scholar ID (e.g., "649def34f8be52c8b66281af98ae884c09aef38b")
+                 - DOI:<doi> (e.g., "DOI:10.18653/v1/N18-3011")
+                 - ARXIV:<id> (e.g., "ARXIV:2106.15928")
+                 - MAG:<id> (e.g., "MAG:112218234")
+                 - ACL:<id> (e.g., "ACL:W12-3903")
+                 - PMID:<id> (e.g., "PMID:19872477")
+                 - PMCID:<id> (e.g., "PMCID:2323736")
+                 - URL:<url> (e.g., "URL:https://arxiv.org/abs/2106.15928v1")
+
+         Returns:
+             Paper: Detailed paper object with full metadata, or None on failure
+         """
+         try:
+             fields = ["title", "abstract", "year", "citationCount", "authors", "url",
+                       "publicationDate", "externalIds", "fieldsOfStudy", "openAccessPdf"]
+             params = {
+                 "fields": ",".join(fields),
+             }
+
+             response = self.request_api(f"paper/{paper_id}", params)
+
+             # Check for errors
+             if isinstance(response, dict) and "error" in response:
+                 error_msg = response.get("message", "Unknown error")
+                 if response.get("error") == "rate_limited":
+                     logger.error(f"Rate limited by Semantic Scholar API: {error_msg}")
+                 else:
+                     logger.error(f"Semantic Scholar API error: {error_msg}")
+                 return None
+
+             # Check response status code
+             if not hasattr(response, 'status_code') or response.status_code != 200:
+                 status_code = getattr(response, 'status_code', 'unknown')
+                 logger.error(f"Semantic Scholar paper details fetch failed with status {status_code}")
+                 return None
+
+             result = response.json()
+             return self._parse_paper(result)
+         except Exception as e:
+             logger.error(f"Error fetching paper details for {paper_id}: {e}")
+             return None
+
+
+ if __name__ == "__main__":
+     # Test the Semantic searcher
+     searcher = SemanticSearcher()
+
+     print("Testing Semantic search functionality...")
+     query = "secret sharing"
+     max_results = 2
+
+     print("\n" + "=" * 60)
+     print("1. Testing search with detailed information")
+     print("=" * 60)
+     try:
+         papers = searcher.search(query, year=None, max_results=max_results)
+         print(f"\nFound {len(papers)} papers for query '{query}' (with details):")
+         for i, paper in enumerate(papers, 1):
+             print(f"\n{i}. {paper.title}")
+             print(f"   Paper ID: {paper.paper_id}")
+             print(f"   Authors: {', '.join(paper.authors)}")
+             print(f"   Categories: {', '.join(paper.categories)}")
+             print(f"   URL: {paper.url}")
+             if paper.pdf_url:
+                 print(f"   PDF: {paper.pdf_url}")
+             if paper.published_date:
+                 print(f"   Published Date: {paper.published_date}")
+             if paper.abstract:
+                 print(f"   Abstract: {paper.abstract[:200]}...")
+     except Exception as e:
+         print(f"Error during detailed search: {e}")
+
+     print("\n" + "=" * 60)
+     print("2. Testing manual paper details fetching")
+     print("=" * 60)
+     test_paper_id = "5bbfdf2e62f0508c65ba6de9c72fe2066fd98138"
+     try:
+         paper_details = searcher.get_paper_details(test_paper_id)
+         if paper_details:
+             print(f"\nManual fetch for paper {test_paper_id}:")
+             print(f"Title: {paper_details.title}")
+             print(f"Authors: {', '.join(paper_details.authors)}")
+             print(f"Categories: {', '.join(paper_details.categories)}")
+             print(f"URL: {paper_details.url}")
+             if paper_details.pdf_url:
+                 print(f"PDF: {paper_details.pdf_url}")
+             if paper_details.published_date:
+                 print(f"Published Date: {paper_details.published_date}")
+             print(f"DOI: {paper_details.doi}")
+             print(f"Citations: {paper_details.citations}")
+             print(f"Abstract: {paper_details.abstract[:200]}...")
+         else:
+             print(f"Could not fetch details for paper {test_paper_id}")
+     except Exception as e:
+         print(f"Error fetching paper details: {e}")
+
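+     # A minimal sketch of the remaining API surface, left commented out because
+     # it hits the network and writes files (the arXiv ID is only an example):
+     # pdf_path = searcher.download_pdf("ARXIV:2106.15928", "./downloads")
+     # print(f"PDF saved to: {pdf_path}")
+     # text = searcher.read_paper("ARXIV:2106.15928", "./downloads")
+     # print(text[:500])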