academic-search-mcp 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,385 @@
1
+ """SSRN integration for preprints and early-stage research.
2
+
3
+ SSRN is a repository specializing in preprints from the social sciences, law, business, and the humanities.
4
+ Note: SSRN doesn't have a public API, so we use web scraping with proper rate limiting.
5
+ """
6
+ from typing import List, Optional, Dict
7
+ from datetime import datetime
8
+ from curl_cffi import requests
9
+ from bs4 import BeautifulSoup
10
+ import re
11
+ import os
12
+ import logging
13
+ import time
14
+
15
+ from ..paper import Paper
16
+ from ..pdf_utils import extract_text_from_pdf
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class SSRNSearcher:
22
+ """Searcher for SSRN preprints and early research.
23
+
24
+ SSRN covers:
25
+ - Economics and finance
26
+ - Law and legal studies
27
+ - Business (management, marketing, accounting)
28
+ - Social sciences
29
+ - Humanities
30
+ - Computer science
31
+ """
32
+
33
+ BASE_URL = "https://papers.ssrn.com"
34
+ ABSTRACT_URL = f"{BASE_URL}/sol3/papers.cfm"
35
+ SEARCH_URL = f"{BASE_URL}/sol3/JELJOUR_Results.cfm"
36
+
37
+ def __init__(self):
38
+ """Initialize SSRN searcher."""
39
+ self.impersonate = "chrome"
40
+ self.last_request_time = 0
41
+
42
+ def _rate_limit(self, delay: float = 1.0):
43
+ """Apply rate limiting to avoid being blocked."""
44
+ elapsed = time.time() - self.last_request_time
45
+ if elapsed < delay:
46
+ time.sleep(delay - elapsed)
47
+ self.last_request_time = time.time()
48
+
49
+ def search(
50
+ self,
51
+ query: str,
52
+ max_results: int = 10,
53
+ date_from: Optional[str] = None,
54
+ date_to: Optional[str] = None,
55
+ **kwargs
56
+ ) -> List[Paper]:
57
+ """Search SSRN for papers.
58
+
59
+ Args:
60
+ query: Search query string
61
+ max_results: Maximum number of papers (max: 100)
62
+ date_from: Start date YYYY-MM-DD (e.g., '2024-01-01')
63
+ date_to: End date YYYY-MM-DD (e.g., '2024-12-31')
64
+ **kwargs: Additional parameters (topic, author_id)
65
+
66
+ Returns:
67
+ List of Paper objects
68
+ """
69
+ papers = []
70
+
71
+ try:
72
+ self._rate_limit()
73
+
74
+ # Use the search URL with query
75
+ url = f"{self.BASE_URL}/sol3/displayAbstractSearch.cfm"
76
+ params = {
77
+ "txtKey_Words": query,
78
+ "search": "Search"
79
+ }
80
+
81
+ response = requests.get(url, params=params, timeout=30, impersonate=self.impersonate)
82
+
83
+ if response.status_code != 200:
84
+ logger.error(f"SSRN search failed with status {response.status_code}")
85
+ return papers
86
+
87
+ soup = BeautifulSoup(response.content, 'lxml')
88
+
89
+ # Parse results
90
+ results = self._parse_search_results(soup)
91
+
92
+ # Apply date filters
93
+ if date_from or date_to:
94
+ year_from = int(date_from[:4]) if date_from else 0
95
+ year_to = int(date_to[:4]) if date_to else 9999
96
+
97
+ filtered = []
98
+ for paper in results:
99
+ paper_year = paper.published_date.year if paper.published_date and paper.published_date != datetime.min else 0
100
+ if year_from <= paper_year <= year_to:
101
+ filtered.append(paper)
102
+ results = filtered
103
+
104
+ papers = results[:max_results]
105
+ logger.info(f"SSRN search: found {len(papers)} papers for '{query}'")
106
+
107
+ except Exception as e:
108
+ logger.error(f"SSRN search error: {e}")
109
+
110
+ return papers
111
+
112
+ def get_paper_by_id(self, paper_id: str) -> Optional[Paper]:
113
+ """Get a specific paper by its SSRN ID.
114
+
115
+ Args:
116
+ paper_id: SSRN paper ID (abstract_id)
117
+
118
+ Returns:
119
+ Paper object or None
120
+ """
121
+ try:
122
+ self._rate_limit()
123
+
124
+ url = self.ABSTRACT_URL
125
+ params = {"abstract_id": paper_id}
126
+ response = requests.get(url, params=params, timeout=30, impersonate=self.impersonate)
127
+
128
+ if response.status_code == 404:
129
+ return None
130
+ response.raise_for_status()
131
+
132
+ return self._parse_paper_page(response.content, paper_id)
133
+
134
+ except Exception as e:
135
+ logger.error(f"Error fetching SSRN paper {paper_id}: {e}")
136
+ return None
137
+
138
+ def search_by_doi(self, doi: str) -> Optional[Paper]:
139
+ """Search for paper by DOI.
140
+
141
+ Args:
142
+ doi: Digital Object Identifier
143
+
144
+ Returns:
145
+ Paper object or None
146
+ """
147
+ clean_doi = doi.replace("https://doi.org/", "").replace("doi:", "").strip()
148
+
149
+ # SSRN DOIs typically contain "ssrn" - extract the ID
150
+ if "ssrn" in clean_doi.lower():
151
+ match = re.search(r'(\d{6,})', clean_doi)
152
+ if match:
153
+ return self.get_paper_by_id(match.group(1))
154
+
155
+ # Try searching
156
+ results = self.search(clean_doi, max_results=1)
157
+ return results[0] if results else None
158
+
159
+ def download_pdf(self, paper_id: str, save_path: str = "./downloads") -> str:
160
+ """Download PDF from SSRN.
161
+
162
+ Args:
163
+ paper_id: SSRN paper ID
164
+ save_path: Directory to save
165
+
166
+ Returns:
167
+ Path to PDF or error message
168
+ """
169
+ try:
170
+ os.makedirs(save_path, exist_ok=True)
171
+
172
+ self._rate_limit()
173
+
174
+ # SSRN download URL format
175
+ download_url = f"{self.BASE_URL}/sol3/Delivery.cfm/SSRN_ID{paper_id}.pdf?abstractid={paper_id}&mirid=1"
176
+
177
+ response = requests.get(download_url, timeout=60, allow_redirects=True, impersonate=self.impersonate)
178
+
179
+ if response.status_code != 200:
180
+ return f"PDF download not available for {paper_id}"
181
+
182
+ content_type = response.headers.get("Content-Type", "")
183
+
184
+ if "pdf" in content_type.lower():
185
+ filename = f"ssrn_{paper_id}.pdf"
186
+ file_path = os.path.join(save_path, filename)
187
+
188
+ with open(file_path, 'wb') as f:
189
+ f.write(response.content)
190
+
191
+ return file_path
192
+
193
+ return f"PDF not available for {paper_id} (may require login)"
194
+
195
+ except Exception as e:
196
+ logger.error(f"Error downloading SSRN PDF: {e}")
197
+ return f"Failed to download PDF: {e}"
198
+
199
+ def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
200
+ """Read and extract text from a SSRN paper PDF.
201
+
202
+ Args:
203
+ paper_id: SSRN paper ID
204
+ save_path: Directory for PDF storage
205
+
206
+ Returns:
207
+ Extracted text or error message
208
+ """
209
+ pdf_path = self.download_pdf(paper_id, save_path)
210
+ if not os.path.exists(pdf_path):
211
+ # If PDF not available, return abstract
212
+ paper = self.get_paper_by_id(paper_id)
213
+ if paper and paper.abstract:
214
+ return f"[Abstract only - PDF requires login]\n\n{paper.abstract}"
215
+ return pdf_path # Return error message
216
+
217
+ text = extract_text_from_pdf(pdf_path)
218
+ return text if text else "Failed to extract text from PDF"
219
+
220
+ def _parse_search_results(self, soup: BeautifulSoup) -> List[Paper]:
221
+ """Parse search results page into Paper objects."""
222
+ papers = []
223
+
224
+ # Find paper entries - SSRN uses various formats
225
+ # Try finding by abstract links
226
+ for link in soup.find_all("a", href=re.compile(r"abstract_id=\d+")):
227
+ try:
228
+ href = link.get("href", "")
229
+ match = re.search(r"abstract_id=(\d+)", href)
230
+ if not match:
231
+ continue
232
+
233
+ paper_id = match.group(1)
234
+ title = link.get_text(strip=True)
235
+
236
+ if not title or len(title) < 5:
237
+ continue
238
+
239
+ # Try to find parent container for more info
240
+ parent = link.find_parent("div") or link.find_parent("tr")
241
+
242
+ authors = []
243
+ abstract = ""
244
+ published_date = datetime.min
245
+
246
+ if parent:
247
+ # Look for author info
248
+ author_elem = parent.find("span", {"class": re.compile(r"author", re.I)})
249
+ if author_elem:
250
+ author_text = author_elem.get_text(strip=True)
251
+ authors = [a.strip() for a in re.split(r'[,;]', author_text) if a.strip()]
252
+
253
+ # Look for date
254
+ date_match = re.search(r'(\w+\s+\d{1,2},?\s+\d{4}|\d{4})', parent.get_text())
255
+ if date_match:
256
+ try:
257
+ published_date = datetime.strptime(date_match.group(1), "%B %d, %Y")
258
+ except ValueError:
259
+ try:
260
+ published_date = datetime.strptime(date_match.group(1), "%Y")
261
+ except ValueError:
262
+ pass
263
+
264
+ url = f"{self.ABSTRACT_URL}?abstract_id={paper_id}"
265
+
266
+ papers.append(Paper(
267
+ paper_id=paper_id,
268
+ title=title,
269
+ authors=authors,
270
+ abstract=abstract,
271
+ doi=f"10.2139/ssrn.{paper_id}",
272
+ published_date=published_date,
273
+ pdf_url="",
274
+ url=url,
275
+ source="ssrn",
276
+ categories=[],
277
+ keywords=[],
278
+ citations=0,
279
+ references=[],
280
+ extra={"ssrn_id": paper_id}
281
+ ))
282
+
283
+ except Exception as e:
284
+ logger.warning(f"Error parsing SSRN result: {e}")
285
+ continue
286
+
287
+ return papers
288
+
289
+ def _parse_paper_page(self, content: bytes, paper_id: str) -> Optional[Paper]:
290
+ """Parse a paper detail page."""
291
+ try:
292
+ soup = BeautifulSoup(content, 'lxml')
293
+
294
+ # Title - try multiple selectors
295
+ title = ""
296
+ for selector in [
297
+ ("h1", {}),
298
+ ("meta", {"property": "og:title"}),
299
+ ("meta", {"name": "citation_title"}),
300
+ ("div", {"class": "title"})
301
+ ]:
302
+ elem = soup.find(selector[0], selector[1])
303
+ if elem:
304
+ title = elem.get("content") if elem.name == "meta" else elem.get_text(strip=True)
305
+ if title:
306
+ break
307
+
308
+ if not title:
309
+ return None
310
+
311
+ # Authors
312
+ authors = []
313
+ # Try meta tags first
314
+ for meta in soup.find_all("meta", {"name": "citation_author"}):
315
+ name = meta.get("content", "")
316
+ if name:
317
+ authors.append(name)
318
+
319
+ # Fallback to page content
320
+ if not authors:
321
+ author_section = soup.find("div", {"class": re.compile(r"author", re.I)})
322
+ if author_section:
323
+ for link in author_section.find_all("a"):
324
+ name = link.get_text(strip=True)
325
+ if name and len(name) > 2:
326
+ authors.append(name)
327
+
328
+ # Abstract
329
+ abstract = ""
330
+ abstract_elem = soup.find("div", {"class": "abstract-text"})
331
+ if not abstract_elem:
332
+ abstract_elem = soup.find("meta", {"name": "description"})
333
+ if abstract_elem:
334
+ abstract = abstract_elem.get("content") if abstract_elem.name == "meta" else abstract_elem.get_text(strip=True)
335
+
336
+ # Date
337
+ published_date = datetime.min
338
+ date_meta = soup.find("meta", {"name": "citation_publication_date"})
339
+ if date_meta:
340
+ date_str = date_meta.get("content", "")
341
+ try:
342
+ published_date = datetime.strptime(date_str, "%Y/%m/%d")
343
+ except ValueError:
344
+ try:
345
+ published_date = datetime.strptime(date_str, "%Y-%m-%d")
346
+ except ValueError:
347
+ pass
348
+
349
+ # Keywords
350
+ keywords = []
351
+ keywords_meta = soup.find("meta", {"name": "citation_keywords"})
352
+ if keywords_meta:
353
+ kw_text = keywords_meta.get("content", "")
354
+ keywords = [k.strip() for k in kw_text.split(",") if k.strip()]
355
+
356
+ # URL and DOI
357
+ url = f"{self.ABSTRACT_URL}?abstract_id={paper_id}"
358
+ doi = f"10.2139/ssrn.{paper_id}"
359
+
360
+ # PDF URL
361
+ pdf_url = ""
362
+ pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
363
+ if pdf_meta:
364
+ pdf_url = pdf_meta.get("content", "")
365
+
366
+ return Paper(
367
+ paper_id=paper_id,
368
+ title=title,
369
+ authors=authors,
370
+ abstract=abstract[:5000] if abstract else "",
371
+ doi=doi,
372
+ published_date=published_date,
373
+ pdf_url=pdf_url,
374
+ url=url,
375
+ source="ssrn",
376
+ categories=[],
377
+ keywords=keywords[:10],
378
+ citations=0,
379
+ references=[],
380
+ extra={"ssrn_id": paper_id}
381
+ )
382
+
383
+ except Exception as e:
384
+ logger.error(f"Error parsing SSRN page: {e}")
385
+ return None
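A minimal usage sketch of the SSRNSearcher defined above. The import path is assumed from the `from ..paper import Paper` relative import and the `paper_search_mcp/paper.py` file shown below; the query, abstract_id, and download directory are illustrative, and actual results depend on SSRN's HTML, which this scraper parses on a best-effort basis.

    from paper_search_mcp.academic_platforms.ssrn import SSRNSearcher  # assumed module path

    searcher = SSRNSearcher()

    # Keyword search, filtered to papers published in 2024.
    papers = searcher.search("corporate governance", max_results=5,
                             date_from="2024-01-01", date_to="2024-12-31")
    for p in papers:
        print(p.paper_id, p.title)

    # Fetch full metadata for a single abstract_id, then try the PDF.
    paper = searcher.get_paper_by_id("1234567")  # hypothetical SSRN ID
    if paper:
        path_or_error = searcher.download_pdf(paper.paper_id, save_path="./downloads")
        print(path_or_error)  # either a local PDF path or an explanatory message

Note that download_pdf returns an error string rather than raising when the PDF is gated behind a login, and read_paper falls back to the abstract in that case.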
@@ -0,0 +1,69 @@
1
+ # paper_search_mcp/paper.py
2
+ from dataclasses import dataclass
3
+ from datetime import datetime
4
+ from typing import List, Dict, Optional
5
+
6
+ @dataclass
7
+ class Paper:
8
+ """Standardized paper format with core fields for academic sources"""
9
+ # Core fields (required, but empty or default values are allowed)
10
+ paper_id: str # Unique identifier (e.g., arXiv ID, PMID, DOI)
11
+ title: str # Paper title
12
+ authors: List[str] # List of author names
13
+ abstract: str # Abstract text
14
+ doi: str # Digital Object Identifier
15
+ published_date: datetime # Publication date
16
+ pdf_url: str # Direct PDF link
17
+ url: str # URL to paper page
18
+ source: str # Source platform (e.g., 'arxiv', 'pubmed')
19
+
20
+ # Optional fields
21
+ updated_date: Optional[datetime] = None # Last updated date
22
+ categories: Optional[List[str]] = None # Subject categories
23
+ keywords: Optional[List[str]] = None # Keywords
24
+ citations: int = 0 # Citation count
25
+ references: Optional[List[str]] = None # List of reference IDs/DOIs
26
+ extra: Optional[Dict] = None # Source-specific extra metadata
27
+
28
+ def __post_init__(self):
29
+ """Post-initialization to handle default values"""
30
+ if self.authors is None:
31
+ self.authors = []
32
+ if self.categories is None:
33
+ self.categories = []
34
+ if self.keywords is None:
35
+ self.keywords = []
36
+ if self.references is None:
37
+ self.references = []
38
+ if self.extra is None:
39
+ self.extra = {}
40
+
41
+ def to_dict(self, abstract_limit: int = 200) -> Dict:
42
+ """Convert paper to dictionary format for serialization.
43
+
44
+ Args:
45
+ abstract_limit: Max chars for abstract. 0 = omit, -1 = full (default: 200)
46
+ """
47
+ # Process abstract based on limit
48
+ if abstract_limit == 0:
49
+ abstract = None
50
+ elif abstract_limit > 0 and self.abstract and len(self.abstract) > abstract_limit:
51
+ abstract = self.abstract[:abstract_limit] + '...'
52
+ else:
53
+ abstract = self.abstract
54
+
55
+ result = {
56
+ 'id': self.paper_id,
57
+ 'source': self.source or None,
58
+ 'title': self.title,
59
+ 'authors': self.authors if self.authors else None,
60
+ 'abstract': abstract,
61
+ 'date': self.published_date.strftime('%Y-%m-%d') if self.published_date and self.published_date != datetime.min else None,
62
+ 'doi': self.doi or None,
63
+ 'pdf': self.pdf_url or None, # Include when available (e.g., open access)
64
+ 'categories': self.categories if self.categories else None,
65
+ 'citations': self.citations if self.citations else None,
66
+ }
67
+
68
+ # Remove None/empty values
69
+ return {k: v for k, v in result.items() if v is not None and v != ''}
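A short sketch of how Paper.to_dict trims its output, using the dataclass above; the sample values and the import path are invented for illustration.

    from datetime import datetime
    from paper_search_mcp.paper import Paper  # assumed import path

    p = Paper(
        paper_id="4000000", title="Example preprint", authors=["A. Author"],
        abstract="x" * 500, doi="10.2139/ssrn.4000000",
        published_date=datetime(2024, 3, 1), pdf_url="",
        url="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4000000",
        source="ssrn",
    )

    print(p.to_dict())                   # abstract truncated to 200 chars + '...'
    print(p.to_dict(abstract_limit=0))   # abstract key omitted entirely
    print(p.to_dict(abstract_limit=-1))  # full abstract kept

Empty or falsy fields (the blank pdf_url, zero citations) are dropped from the resulting dict, which keeps serialized results compact.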
@@ -0,0 +1,67 @@
1
+ """PDF text extraction utilities with pdftotext (poppler) support."""
2
+
3
+ import subprocess
4
+ import shutil
5
+ from typing import Optional
6
+
7
+
8
+ def extract_text_from_pdf(pdf_path: str) -> str:
9
+ """Extract text from a PDF file.
10
+
11
+ Uses pdftotext (poppler) for best quality extraction of academic papers,
12
+ with a fallback to pypdf (or PyPDF2) if pdftotext is not available.
13
+
14
+ Args:
15
+ pdf_path: Path to the PDF file
16
+
17
+ Returns:
18
+ Extracted text content
19
+ """
20
+ # Try pdftotext first (better quality for academic papers)
21
+ text = _extract_with_pdftotext(pdf_path)
22
+ if text is not None:
23
+ return text
24
+
25
+ # Fallback to pypdf / PyPDF2
26
+ return _extract_with_pypdf(pdf_path)
27
+
28
+
29
+ def _extract_with_pdftotext(pdf_path: str) -> Optional[str]:
30
+ """Extract text using pdftotext (poppler).
31
+
32
+ Returns None if pdftotext is not available.
33
+ """
34
+ if not shutil.which('pdftotext'):
35
+ return None
36
+
37
+ try:
38
+ # -layout preserves the original physical layout
39
+ # -enc UTF-8 ensures proper encoding
40
+ result = subprocess.run(
41
+ ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_path, '-'],
42
+ capture_output=True,
43
+ text=True,
44
+ timeout=60
45
+ )
46
+ if result.returncode == 0:
47
+ return result.stdout.strip()
48
+ return None
49
+ except (subprocess.TimeoutExpired, subprocess.SubprocessError):
50
+ return None
51
+
52
+
53
+ def _extract_with_pypdf(pdf_path: str) -> str:
54
+ """Extract text using PyPDF2 as fallback."""
55
+ try:
56
+ from pypdf import PdfReader
57
+ except ImportError:
58
+ from PyPDF2 import PdfReader
59
+
60
+ try:
61
+ reader = PdfReader(pdf_path)
62
+ text = ""
63
+ for page in reader.pages:
64
+ text += (page.extract_text() or "") + "\n"
65
+ return text.strip()
66
+ except Exception as e:
67
+ return f"Error extracting text: {e}"