academic-search-mcp 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_search_mcp-0.1.3.dist-info/METADATA +243 -0
- academic_search_mcp-0.1.3.dist-info/RECORD +24 -0
- academic_search_mcp-0.1.3.dist-info/WHEEL +4 -0
- academic_search_mcp-0.1.3.dist-info/entry_points.txt +2 -0
- academic_search_mcp-0.1.3.dist-info/licenses/LICENSE +21 -0
- paper_search_mcp/__init__.py +0 -0
- paper_search_mcp/academic_platforms/__init__.py +0 -0
- paper_search_mcp/academic_platforms/arxiv.py +147 -0
- paper_search_mcp/academic_platforms/biorxiv.py +156 -0
- paper_search_mcp/academic_platforms/core.py +284 -0
- paper_search_mcp/academic_platforms/crossref.py +375 -0
- paper_search_mcp/academic_platforms/cyberleninka.py +396 -0
- paper_search_mcp/academic_platforms/google_scholar.py +249 -0
- paper_search_mcp/academic_platforms/hub.py +0 -0
- paper_search_mcp/academic_platforms/iacr.py +548 -0
- paper_search_mcp/academic_platforms/medrxiv.py +156 -0
- paper_search_mcp/academic_platforms/openalex.py +497 -0
- paper_search_mcp/academic_platforms/pubmed.py +159 -0
- paper_search_mcp/academic_platforms/sci_hub.py +178 -0
- paper_search_mcp/academic_platforms/semantic.py +492 -0
- paper_search_mcp/academic_platforms/ssrn.py +385 -0
- paper_search_mcp/paper.py +69 -0
- paper_search_mcp/pdf_utils.py +67 -0
- paper_search_mcp/server.py +514 -0
paper_search_mcp/academic_platforms/ssrn.py
@@ -0,0 +1,385 @@
"""SSRN integration for preprints and early-stage research.

SSRN is a repository specializing in preprints from social sciences, law, business, and humanities.
Note: SSRN doesn't have a public API, so we use web scraping with proper rate limiting.
"""
from typing import List, Optional, Dict
from datetime import datetime
from curl_cffi import requests
from bs4 import BeautifulSoup
import re
import os
import logging
import time

from ..paper import Paper
from ..pdf_utils import extract_text_from_pdf

logger = logging.getLogger(__name__)


class SSRNSearcher:
    """Searcher for SSRN preprints and early research.

    SSRN covers:
    - Economics and finance
    - Law and legal studies
    - Business (management, marketing, accounting)
    - Social sciences
    - Humanities
    - Computer science
    """

    BASE_URL = "https://papers.ssrn.com"
    ABSTRACT_URL = f"{BASE_URL}/sol3/papers.cfm"
    SEARCH_URL = f"{BASE_URL}/sol3/JELJOUR_Results.cfm"

    def __init__(self):
        """Initialize SSRN searcher."""
        self.impersonate = "chrome"
        self.last_request_time = 0

    def _rate_limit(self, delay: float = 1.0):
        """Apply rate limiting to avoid being blocked."""
        elapsed = time.time() - self.last_request_time
        if elapsed < delay:
            time.sleep(delay - elapsed)
        self.last_request_time = time.time()

    def search(
        self,
        query: str,
        max_results: int = 10,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None,
        **kwargs
    ) -> List[Paper]:
        """Search SSRN for papers.

        Args:
            query: Search query string
            max_results: Maximum number of papers (max: 100)
            date_from: Start date YYYY-MM-DD (e.g., '2024-01-01')
            date_to: End date YYYY-MM-DD (e.g., '2024-12-31')
            **kwargs: Additional parameters (topic, author_id)

        Returns:
            List of Paper objects
        """
        papers = []

        try:
            self._rate_limit()

            # Use the search URL with query
            url = f"{self.BASE_URL}/sol3/displayAbstractSearch.cfm"
            params = {
                "txtKey_Words": query,
                "search": "Search"
            }

            response = requests.get(url, params=params, timeout=30, impersonate=self.impersonate)

            if response.status_code != 200:
                logger.error(f"SSRN search failed with status {response.status_code}")
                return papers

            soup = BeautifulSoup(response.content, 'lxml')

            # Parse results
            results = self._parse_search_results(soup)

            # Apply date filters
            if date_from or date_to:
                year_from = int(date_from[:4]) if date_from else 0
                year_to = int(date_to[:4]) if date_to else 9999

                filtered = []
                for paper in results:
                    paper_year = paper.published_date.year if paper.published_date and paper.published_date != datetime.min else 0
                    if year_from <= paper_year <= year_to:
                        filtered.append(paper)
                results = filtered

            papers = results[:max_results]
            logger.info(f"SSRN search: found {len(papers)} papers for '{query}'")

        except Exception as e:
            logger.error(f"SSRN search error: {e}")

        return papers

    def get_paper_by_id(self, paper_id: str) -> Optional[Paper]:
        """Get a specific paper by its SSRN ID.

        Args:
            paper_id: SSRN paper ID (abstract_id)

        Returns:
            Paper object or None
        """
        try:
            self._rate_limit()

            url = self.ABSTRACT_URL
            params = {"abstract_id": paper_id}
            response = requests.get(url, params=params, timeout=30, impersonate=self.impersonate)

            if response.status_code == 404:
                return None
            response.raise_for_status()

            return self._parse_paper_page(response.content, paper_id)

        except Exception as e:
            logger.error(f"Error fetching SSRN paper {paper_id}: {e}")
            return None

    def search_by_doi(self, doi: str) -> Optional[Paper]:
        """Search for paper by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            Paper object or None
        """
        clean_doi = doi.replace("https://doi.org/", "").replace("doi:", "").strip()

        # SSRN DOIs typically contain "ssrn" - extract the ID
        if "ssrn" in clean_doi.lower():
            match = re.search(r'(\d{6,})', clean_doi)
            if match:
                return self.get_paper_by_id(match.group(1))

        # Try searching
        results = self.search(clean_doi, max_results=1)
        return results[0] if results else None

    def download_pdf(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Download PDF from SSRN.

        Args:
            paper_id: SSRN paper ID
            save_path: Directory to save

        Returns:
            Path to PDF or error message
        """
        try:
            os.makedirs(save_path, exist_ok=True)

            self._rate_limit()

            # SSRN download URL format
            download_url = f"{self.BASE_URL}/sol3/Delivery.cfm/SSRN_ID{paper_id}.pdf?abstractid={paper_id}&mirid=1"

            response = requests.get(download_url, timeout=60, allow_redirects=True, impersonate=self.impersonate)

            if response.status_code != 200:
                return f"PDF download not available for {paper_id}"

            content_type = response.headers.get("Content-Type", "")

            if "pdf" in content_type.lower():
                filename = f"ssrn_{paper_id}.pdf"
                file_path = os.path.join(save_path, filename)

                with open(file_path, 'wb') as f:
                    f.write(response.content)

                return file_path

            return f"PDF not available for {paper_id} (may require login)"

        except Exception as e:
            logger.error(f"Error downloading SSRN PDF: {e}")
            return f"Failed to download PDF: {e}"

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Read and extract text from a SSRN paper PDF.

        Args:
            paper_id: SSRN paper ID
            save_path: Directory for PDF storage

        Returns:
            Extracted text or error message
        """
        pdf_path = self.download_pdf(paper_id, save_path)
        if not os.path.exists(pdf_path):
            # If PDF not available, return abstract
            paper = self.get_paper_by_id(paper_id)
            if paper and paper.abstract:
                return f"[Abstract only - PDF requires login]\n\n{paper.abstract}"
            return pdf_path  # Return error message

        text = extract_text_from_pdf(pdf_path)
        return text if text else "Failed to extract text from PDF"

    def _parse_search_results(self, soup: BeautifulSoup) -> List[Paper]:
        """Parse search results page into Paper objects."""
        papers = []

        # Find paper entries - SSRN uses various formats
        # Try finding by abstract links
        for link in soup.find_all("a", href=re.compile(r"abstract_id=\d+")):
            try:
                href = link.get("href", "")
                match = re.search(r"abstract_id=(\d+)", href)
                if not match:
                    continue

                paper_id = match.group(1)
                title = link.get_text(strip=True)

                if not title or len(title) < 5:
                    continue

                # Try to find parent container for more info
                parent = link.find_parent("div") or link.find_parent("tr")

                authors = []
                abstract = ""
                published_date = datetime.min

                if parent:
                    # Look for author info
                    author_elem = parent.find("span", {"class": re.compile(r"author", re.I)})
                    if author_elem:
                        author_text = author_elem.get_text(strip=True)
                        authors = [a.strip() for a in re.split(r'[,;]', author_text) if a.strip()]

                    # Look for date
                    date_match = re.search(r'(\w+\s+\d{1,2},?\s+\d{4}|\d{4})', parent.get_text())
                    if date_match:
                        try:
                            published_date = datetime.strptime(date_match.group(1), "%B %d, %Y")
                        except:
                            try:
                                published_date = datetime.strptime(date_match.group(1), "%Y")
                            except:
                                pass

                url = f"{self.ABSTRACT_URL}?abstract_id={paper_id}"

                papers.append(Paper(
                    paper_id=paper_id,
                    title=title,
                    authors=authors,
                    abstract=abstract,
                    doi=f"10.2139/ssrn.{paper_id}",
                    published_date=published_date,
                    pdf_url="",
                    url=url,
                    source="ssrn",
                    categories=[],
                    keywords=[],
                    citations=0,
                    references=[],
                    extra={"ssrn_id": paper_id}
                ))

            except Exception as e:
                logger.warning(f"Error parsing SSRN result: {e}")
                continue

        return papers

    def _parse_paper_page(self, content: bytes, paper_id: str) -> Optional[Paper]:
        """Parse a paper detail page."""
        try:
            soup = BeautifulSoup(content, 'lxml')

            # Title - try multiple selectors
            title = ""
            for selector in [
                ("h1", {}),
                ("meta", {"property": "og:title"}),
                ("meta", {"name": "citation_title"}),
                ("div", {"class": "title"})
            ]:
                elem = soup.find(selector[0], selector[1])
                if elem:
                    title = elem.get("content") if elem.name == "meta" else elem.get_text(strip=True)
                    if title:
                        break

            if not title:
                return None

            # Authors
            authors = []
            # Try meta tags first
            for meta in soup.find_all("meta", {"name": "citation_author"}):
                name = meta.get("content", "")
                if name:
                    authors.append(name)

            # Fallback to page content
            if not authors:
                author_section = soup.find("div", {"class": re.compile(r"author", re.I)})
                if author_section:
                    for link in author_section.find_all("a"):
                        name = link.get_text(strip=True)
                        if name and len(name) > 2:
                            authors.append(name)

            # Abstract
            abstract = ""
            abstract_elem = soup.find("div", {"class": "abstract-text"})
            if not abstract_elem:
                abstract_elem = soup.find("meta", {"name": "description"})
            if abstract_elem:
                abstract = abstract_elem.get("content") if abstract_elem.name == "meta" else abstract_elem.get_text(strip=True)

            # Date
            published_date = datetime.min
            date_meta = soup.find("meta", {"name": "citation_publication_date"})
            if date_meta:
                date_str = date_meta.get("content", "")
                try:
                    published_date = datetime.strptime(date_str, "%Y/%m/%d")
                except:
                    try:
                        published_date = datetime.strptime(date_str, "%Y-%m-%d")
                    except:
                        pass

            # Keywords
            keywords = []
            keywords_meta = soup.find("meta", {"name": "citation_keywords"})
            if keywords_meta:
                kw_text = keywords_meta.get("content", "")
                keywords = [k.strip() for k in kw_text.split(",") if k.strip()]

            # URL and DOI
            url = f"{self.ABSTRACT_URL}?abstract_id={paper_id}"
            doi = f"10.2139/ssrn.{paper_id}"

            # PDF URL
            pdf_url = ""
            pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
            if pdf_meta:
                pdf_url = pdf_meta.get("content", "")

            return Paper(
                paper_id=paper_id,
                title=title,
                authors=authors,
                abstract=abstract[:5000] if abstract else "",
                doi=doi,
                published_date=published_date,
                pdf_url=pdf_url,
                url=url,
                source="ssrn",
                categories=[],
                keywords=keywords[:10],
                citations=0,
                references=[],
                extra={"ssrn_id": paper_id}
            )

        except Exception as e:
            logger.error(f"Error parsing SSRN page: {e}")
            return None
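
A minimal usage sketch for the searcher above (the query string and SSRN ID are placeholders, and it assumes the package plus its curl_cffi and bs4 dependencies are installed):

from paper_search_mcp.academic_platforms.ssrn import SSRNSearcher

searcher = SSRNSearcher()

# Keyword search, optionally narrowed to a publication-year window
papers = searcher.search("behavioral finance", max_results=5, date_from="2023-01-01")
for p in papers:
    print(p.title, p.doi)

# Fetch one record by its abstract_id, then read its text; read_paper falls back
# to the abstract when the PDF requires a login
paper = searcher.get_paper_by_id("1234567")
text = searcher.read_paper("1234567", save_path="./downloads")
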
paper_search_mcp/paper.py
@@ -0,0 +1,69 @@
# paper_search_mcp/paper.py
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Optional

@dataclass
class Paper:
    """Standardized paper format with core fields for academic sources"""
    # Core fields (required, though empty or default values are allowed)
    paper_id: str             # Unique identifier (e.g., arXiv ID, PMID, DOI)
    title: str                # Paper title
    authors: List[str]        # List of author names
    abstract: str             # Abstract text
    doi: str                  # Digital Object Identifier
    published_date: datetime  # Publication date
    pdf_url: str              # Direct PDF link
    url: str                  # URL to paper page
    source: str               # Source platform (e.g., 'arxiv', 'pubmed')

    # Optional fields
    updated_date: Optional[datetime] = None  # Last updated date
    categories: List[str] = None             # Subject categories
    keywords: List[str] = None               # Keywords
    citations: int = 0                       # Citation count
    references: Optional[List[str]] = None   # List of reference IDs/DOIs
    extra: Optional[Dict] = None             # Source-specific extra metadata

    def __post_init__(self):
        """Post-initialization to handle default values"""
        if self.authors is None:
            self.authors = []
        if self.categories is None:
            self.categories = []
        if self.keywords is None:
            self.keywords = []
        if self.references is None:
            self.references = []
        if self.extra is None:
            self.extra = {}

    def to_dict(self, abstract_limit: int = 200) -> Dict:
        """Convert paper to dictionary format for serialization.

        Args:
            abstract_limit: Max chars for abstract. 0 = omit, -1 = full (default: 200)
        """
        # Process abstract based on limit
        if abstract_limit == 0:
            abstract = None
        elif abstract_limit > 0 and self.abstract and len(self.abstract) > abstract_limit:
            abstract = self.abstract[:abstract_limit] + '...'
        else:
            abstract = self.abstract

        result = {
            'id': self.paper_id,
            'source': self.source or None,
            'title': self.title,
            'authors': self.authors if self.authors else None,
            'abstract': abstract,
            'date': self.published_date.strftime('%Y-%m-%d') if self.published_date else None,
            'doi': self.doi or None,
            'pdf': self.pdf_url or None,  # Include when available (e.g., open access)
            'categories': self.categories if self.categories else None,
            'citations': self.citations if self.citations else None,
        }

        # Remove None/empty values
        return {k: v for k, v in result.items() if v is not None and v != ''}
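
A short sketch of how the Paper dataclass above serializes (all field values here are illustrative, not real records):

from datetime import datetime
from paper_search_mcp.paper import Paper

p = Paper(
    paper_id="4812345",  # placeholder SSRN-style ID
    title="Example Working Paper",
    authors=["A. Author", "B. Author"],
    abstract="A long abstract ... " * 50,
    doi="10.2139/ssrn.4812345",
    published_date=datetime(2024, 3, 1),
    pdf_url="",
    url="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4812345",
    source="ssrn",
)

# Default: abstract truncated to 200 chars; empty fields (pdf, categories) are dropped
print(p.to_dict())
# Omit the abstract entirely, or keep it in full
print(p.to_dict(abstract_limit=0))
print(p.to_dict(abstract_limit=-1))
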
paper_search_mcp/pdf_utils.py
@@ -0,0 +1,67 @@
"""PDF text extraction utilities with pdftotext (poppler) support."""

import subprocess
import shutil
from typing import Optional


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file.

    Uses pdftotext (poppler) for best quality extraction of academic papers,
    with fallback to PyPDF2 if pdftotext is not available.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Extracted text content
    """
    # Try pdftotext first (better quality for academic papers)
    text = _extract_with_pdftotext(pdf_path)
    if text is not None:
        return text

    # Fallback to PyPDF2
    return _extract_with_pypdf(pdf_path)


def _extract_with_pdftotext(pdf_path: str) -> Optional[str]:
    """Extract text using pdftotext (poppler).

    Returns None if pdftotext is not available.
    """
    if not shutil.which('pdftotext'):
        return None

    try:
        # -layout preserves the original physical layout
        # -enc UTF-8 ensures proper encoding
        result = subprocess.run(
            ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_path, '-'],
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode == 0:
            return result.stdout.strip()
        return None
    except (subprocess.TimeoutExpired, subprocess.SubprocessError):
        return None


def _extract_with_pypdf(pdf_path: str) -> str:
    """Extract text using PyPDF2 as fallback."""
    try:
        from pypdf import PdfReader
    except ImportError:
        from PyPDF2 import PdfReader

    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"
        return text.strip()
    except Exception as e:
        return f"Error extracting text: {e}"
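
For reference, a brief sketch of the extraction helper in use (the path is a placeholder; pdftotext is preferred when poppler is on PATH, otherwise pypdf/PyPDF2 is used):

from paper_search_mcp.pdf_utils import extract_text_from_pdf

text = extract_text_from_pdf("./downloads/ssrn_1234567.pdf")  # placeholder path
print(text[:500])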