academic-search-mcp 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,559 @@
+ from typing import List, Optional
+ from datetime import datetime
+ import requests
+ from bs4 import BeautifulSoup
+ import random
+ from ..paper import Paper
+ import logging
+ import os
+
+ logger = logging.getLogger(__name__)
+
+
+ class PaperSource:
+     """Abstract base class for paper sources"""
+
+     def search(self, query: str, **kwargs) -> List[Paper]:
+         raise NotImplementedError
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+     def read_paper(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+
+ class IACRSearcher(PaperSource):
+     """IACR ePrint Archive paper search implementation"""
+
+     IACR_SEARCH_URL = "https://eprint.iacr.org/search"
+     IACR_BASE_URL = "https://eprint.iacr.org"
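+     # A desktop User-Agent is chosen at random per session so requests
+     # resemble an ordinary browser rather than a script.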
+     BROWSERS = [
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+     ]
+
+     def __init__(self):
+         self._setup_session()
+
+     def _setup_session(self):
+         """Initialize session with random user agent"""
+         self.session = requests.Session()
+         self.session.headers.update(
+             {
+                 "User-Agent": random.choice(self.BROWSERS),
+                 "Accept": "text/html,application/xhtml+xml",
+                 "Accept-Language": "en-US,en;q=0.9",
+             }
+         )
+
+     def _parse_date(self, date_str: str) -> Optional[datetime]:
+         """Parse date from IACR format (e.g., '2025-06-02')"""
+         try:
+             return datetime.strptime(date_str.strip(), "%Y-%m-%d")
+         except ValueError:
+             logger.warning(f"Could not parse date: {date_str}")
+             return None
+
+     def _parse_paper(self, item, fetch_details: bool = True) -> Optional[Paper]:
+         """Parse single paper entry from IACR HTML and optionally fetch detailed info"""
+         try:
+             # Extract paper ID from the search result
+             header_div = item.find("div", class_="d-flex")
+             if not header_div:
+                 return None
+
+             # Get paper ID from the link
+             paper_link = header_div.find("a", class_="paperlink")
+             if not paper_link:
+                 return None
+
+             paper_id = paper_link.get_text(strip=True)  # e.g., "2025/1014"
+
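+             # When fetch_details is True, every search hit triggers one
+             # extra HTTP request for its detail page (slower but complete).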
+             if fetch_details:
+                 # Fetch detailed information for this paper
+                 logger.info(f"Fetching detailed info for paper {paper_id}")
+                 detailed_paper = self.get_paper_details(paper_id)
+                 if detailed_paper:
+                     return detailed_paper
+                 else:
+                     logger.warning(
+                         f"Could not fetch details for {paper_id}, falling back to search result parsing"
+                     )
+
+             # Fallback: parse from search results if detailed fetch fails or is disabled
+             paper_url = self.IACR_BASE_URL + paper_link["href"]
+
+             # Get PDF URL
+             pdf_link = header_div.find("a", href=True, string="(PDF)")
+             pdf_url = self.IACR_BASE_URL + pdf_link["href"] if pdf_link else ""
+
+             # Get last updated date
+             last_updated_elem = header_div.find("small", class_="ms-auto")
+             updated_date = None
+             if last_updated_elem:
+                 date_text = last_updated_elem.get_text(strip=True)
+                 if "Last updated:" in date_text:
+                     date_str = date_text.replace("Last updated:", "").strip()
+                     updated_date = self._parse_date(date_str)
+
+             # Get content from the second div
+             content_div = item.find("div", class_="ms-md-4")
+             if not content_div:
+                 return None
+
+             # Extract title
+             title_elem = content_div.find("strong")
+             title = title_elem.get_text(strip=True) if title_elem else ""
+
+             # Extract authors
+             authors_elem = content_div.find("span", class_="fst-italic")
+             authors = []
+             if authors_elem:
+                 authors_text = authors_elem.get_text(strip=True)
+                 authors = [author.strip() for author in authors_text.split(",")]
+
+             # Extract category
+             category_elem = content_div.find("small", class_="badge")
+             categories = []
+             if category_elem:
+                 category_text = category_elem.get_text(strip=True)
+                 categories = [category_text]
+
+             # Extract abstract
+             abstract_elem = content_div.find("p", class_="search-abstract")
+             abstract = abstract_elem.get_text(strip=True) if abstract_elem else ""
+
+             # Create paper object with search result data
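+             # datetime(1900, 1, 1) is a sentinel meaning "date unknown"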
+             published_date = updated_date if updated_date else datetime(1900, 1, 1)
+
+             return Paper(
+                 paper_id=paper_id,
+                 title=title,
+                 authors=authors,
+                 abstract=abstract,
+                 url=paper_url,
+                 pdf_url=pdf_url,
+                 published_date=published_date,
+                 updated_date=updated_date,
+                 source="iacr",
+                 categories=categories,
+                 keywords=[],
+                 doi="",
+                 citations=0,
+             )
+
+         except Exception as e:
+             logger.warning(f"Failed to parse IACR paper: {e}")
+             return None
+
+     def _parse_date_filter(self, date_str: str) -> Optional[datetime]:
+         """Parse a date filter string in YYYY-MM-DD format"""
+         if not date_str:
+             return None
+         try:
+             return datetime.strptime(date_str.strip(), "%Y-%m-%d")
+         except ValueError:
+             logger.warning(f"Invalid date filter format: {date_str}, expected YYYY-MM-DD")
+             return None
+
+     def _is_within_date_range(
+         self, paper: Paper, date_from: Optional[datetime], date_to: Optional[datetime]
+     ) -> bool:
+         """Check if paper's published_date is within the specified date range"""
+         if not paper.published_date:
+             return False
+
+         # Use date only for comparison (ignore time component)
+         paper_date = paper.published_date.date() if hasattr(paper.published_date, 'date') else paper.published_date
+
+         if date_from:
+             from_date = date_from.date() if hasattr(date_from, 'date') else date_from
+             if paper_date < from_date:
+                 return False
+
+         if date_to:
+             to_date = date_to.date() if hasattr(date_to, 'date') else date_to
+             if paper_date > to_date:
+                 return False
+
+         return True
+
+     def search(
+         self, query: str, max_results: int = 10, fetch_details: bool = True,
+         date_from: Optional[str] = None, date_to: Optional[str] = None
+     ) -> List[Paper]:
+         """
+         Search IACR ePrint Archive
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return
+             fetch_details: Whether to fetch detailed information for each paper (slower but more complete)
+             date_from: Start date in YYYY-MM-DD format (optional)
+             date_to: End date in YYYY-MM-DD format (optional)
+
+         Returns:
+             List[Paper]: List of paper objects
+         """
+         papers = []
+
+         # Parse date filters
+         date_from_dt = self._parse_date_filter(date_from)
+         date_to_dt = self._parse_date_filter(date_to)
+         has_date_filter = date_from_dt is not None or date_to_dt is not None
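+         # Date filtering is applied client-side after parsing; only the
+         # free-text query is sent to the search endpoint.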
+
+         try:
+             # Construct search parameters
+             params = {"q": query}
+
+             # Make request
+             response = self.session.get(self.IACR_SEARCH_URL, params=params, timeout=30)
+
+             if response.status_code != 200:
+                 logger.error(f"IACR search failed with status {response.status_code}")
+                 return papers
+
+             # Parse results
+             soup = BeautifulSoup(response.text, "html.parser")
+
+             # Find all paper entries - they are divs with class "mb-4"
+             results = soup.find_all("div", class_="mb-4")
+
+             if not results:
+                 logger.info("No results found for the query")
+                 return papers
+
+             # Process each result
+             # When date filtering is active, we may need to process more results
+             # to find enough papers within the date range
+             for i, item in enumerate(results):
+                 if len(papers) >= max_results:
+                     break
+
+                 logger.info(f"Processing paper {i+1}/{len(results)}")
+                 paper = self._parse_paper(item, fetch_details=fetch_details)
+                 if paper:
+                     # Apply date filter if specified
+                     if has_date_filter:
+                         if self._is_within_date_range(paper, date_from_dt, date_to_dt):
+                             papers.append(paper)
+                         else:
+                             logger.debug(f"Paper {paper.paper_id} excluded by date filter")
+                     else:
+                         papers.append(paper)
+
+         except Exception as e:
+             logger.error(f"IACR search error: {e}")
+
+         return papers[:max_results]
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         """
+         Download PDF from IACR ePrint Archive
+
+         Args:
+             paper_id: IACR paper ID (e.g., "2025/1014")
+             save_path: Directory to save the PDF in
+
+         Returns:
+             str: Path to downloaded file or error message
+         """
+         try:
+             pdf_url = f"{self.IACR_BASE_URL}/{paper_id}.pdf"
+
+             response = self.session.get(pdf_url, timeout=30)
+
+             if response.status_code == 200:
+                 os.makedirs(save_path, exist_ok=True)
+                 filename = os.path.join(save_path, f"iacr_{paper_id.replace('/', '_')}.pdf")
+                 with open(filename, "wb") as f:
+                     f.write(response.content)
+                 return filename
+             else:
+                 return f"Failed to download PDF: HTTP {response.status_code}"
+
+         except Exception as e:
+             logger.error(f"PDF download error: {e}")
+             return f"Error downloading PDF: {e}"
+
+     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """
+         Download and extract text from IACR paper PDF
+
+         Args:
+             paper_id: IACR paper ID
+             save_path: Directory to save downloaded PDF
+
+         Returns:
+             str: Extracted text from the PDF or error message
+         """
+         from ..pdf_utils import extract_text_from_pdf
+
+         try:
+             # First get paper details to get the PDF URL
+             paper = self.get_paper_details(paper_id)
+             if not paper or not paper.pdf_url:
+                 return f"Error: Could not find PDF URL for paper {paper_id}"
+
+             # Download the PDF
+             pdf_response = requests.get(paper.pdf_url, timeout=30)
+             pdf_response.raise_for_status()
+
+             # Create download directory if it doesn't exist
+             os.makedirs(save_path, exist_ok=True)
+
+             # Save the PDF
+             filename = f"iacr_{paper_id.replace('/', '_')}.pdf"
+             pdf_path = os.path.join(save_path, filename)
+
+             with open(pdf_path, "wb") as f:
+                 f.write(pdf_response.content)
+
+             # Extract text using pdftotext (with PyPDF2 fallback)
+             text = extract_text_from_pdf(pdf_path)
+
+             if not text.strip():
+                 return (
+                     f"PDF downloaded to {pdf_path}, but unable to extract readable text"
+                 )
+
+             # Add paper metadata at the beginning
+             metadata = f"Title: {paper.title}\n"
+             metadata += f"Authors: {', '.join(paper.authors)}\n"
+             metadata += f"Published Date: {paper.published_date}\n"
+             metadata += f"URL: {paper.url}\n"
+             metadata += f"PDF downloaded to: {pdf_path}\n"
+             metadata += "=" * 80 + "\n\n"
+
+             return metadata + text.strip()
+
+         except requests.RequestException as e:
+             logger.error(f"Error downloading PDF: {e}")
+             return f"Error downloading PDF: {e}"
+         except Exception as e:
+             logger.error(f"Read paper error: {e}")
+             return f"Error reading paper: {e}"
+
+     def get_paper_details(self, paper_id: str) -> Optional[Paper]:
+         """
+         Fetch detailed information for a specific IACR paper
+
+         Args:
+             paper_id: IACR paper ID (e.g., "2009/101") or full URL
+
+         Returns:
+             Paper: Detailed paper object with full metadata
+         """
+         try:
+             # Handle both paper ID and full URL
+             if paper_id.startswith("http"):
+                 paper_url = paper_id
+                 # Extract paper ID from URL
+                 parts = paper_url.split("/")
+                 if len(parts) >= 2:
+                     paper_id = f"{parts[-2]}/{parts[-1]}"
+             else:
+                 paper_url = f"{self.IACR_BASE_URL}/{paper_id}"
+
+             # Make request
+             response = self.session.get(paper_url, timeout=30)
+
+             if response.status_code != 200:
+                 logger.error(
+                     f"Failed to fetch paper details: HTTP {response.status_code}"
+                 )
+                 return None
+
+             # Parse the page
+             soup = BeautifulSoup(response.text, "html.parser")
+
+             # Extract title from h3 element
+             title = ""
+             title_elem = soup.find("h3", class_="mb-3")
+             if title_elem:
+                 title = title_elem.get_text(strip=True)
+
+             # Extract authors from the italic paragraph
+             authors = []
+             author_elem = soup.find("p", class_="fst-italic")
+             if author_elem:
+                 author_text = author_elem.get_text(strip=True)
+                 # Split by " and " to get individual authors
+                 authors = [
+                     author.strip()
+                     for author in author_text.replace(" and ", ",").split(",")
+                 ]
+
+             # Extract abstract from the paragraph with white-space: pre-wrap style
+             abstract = ""
+             abstract_p = soup.find("p", style="white-space: pre-wrap;")
+             if abstract_p:
+                 abstract = abstract_p.get_text(strip=True)
+
+             # Extract metadata using a simpler, safer approach
+             publication_info = ""
+             keywords = []
+             history_entries = []
+             last_updated = None
+
+             # Extract publication info
+             page_text = soup.get_text()
+             lines = page_text.split("\n")
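+             # Publication info and history are recovered by scanning this
+             # flattened text line by line; keywords come from a CSS selector.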
+
+             # Find publication info
+             for i, line in enumerate(lines):
+                 if "Publication info" in line and i + 1 < len(lines):
+                     publication_info = lines[i + 1].strip()
+                     break
+
+             # Find keywords using CSS selector for keyword badges
+             try:
+                 keyword_elements = soup.select("a.badge.bg-secondary.keyword")
+                 keywords = [elem.get_text(strip=True) for elem in keyword_elements]
+             except Exception:
+                 keywords = []
+
+             # Find history entries
+             history_found = False
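+             # Small state machine: start collecting after the bare "History"
+             # heading; stop at the "Short URL" or "License" line.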
+             for i, line in enumerate(lines):
+                 if "History" in line and ":" not in line:
+                     history_found = True
+                     continue
+                 elif (
+                     history_found
+                     and ":" in line
+                     and not line.strip().startswith("Short URL")
+                 ):
+                     history_entries.append(line.strip())
+                     # Try to extract the last updated date from the first history entry
+                     if not last_updated:
+                         date_str = line.split(":")[0].strip()
+                         try:
+                             last_updated = datetime.strptime(date_str, "%Y-%m-%d")
+                         except ValueError:
+                             pass
+                 elif history_found and (
+                     line.strip().startswith("Short URL")
+                     or line.strip().startswith("License")
+                 ):
+                     break
+
+             # Combine history entries
+             history = "; ".join(history_entries) if history_entries else ""
+
+             # Construct PDF URL
+             pdf_url = f"{self.IACR_BASE_URL}/{paper_id}.pdf"
+
+             # Use last updated date or current date as published date
+             published_date = last_updated if last_updated else datetime.now()
+
+             return Paper(
+                 paper_id=paper_id,
+                 title=title,
+                 authors=authors,
+                 abstract=abstract,
+                 url=paper_url,
+                 pdf_url=pdf_url,
+                 published_date=published_date,
+                 updated_date=last_updated,
+                 source="iacr",
+                 categories=[],
+                 keywords=keywords,
+                 doi="",
+                 citations=0,
+                 extra={"publication_info": publication_info, "history": history},
+             )
+
+         except Exception as e:
+             logger.error(f"Error fetching paper details for {paper_id}: {e}")
+             return None
+
+
+ if __name__ == "__main__":
+     # Test IACR searcher
+     searcher = IACRSearcher()
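+     # Note: these smoke tests hit the live eprint.iacr.org site and
+     # therefore require network access.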
+
+     print("Testing IACR search functionality...")
+     query = "secret sharing"
+     max_results = 2
+
+     print("\n" + "=" * 60)
+     print("1. Testing search with detailed information (slower but complete)")
+     print("=" * 60)
+     try:
+         papers = searcher.search(query, max_results=max_results, fetch_details=True)
+         print(f"\nFound {len(papers)} papers for query '{query}' (with details):")
+         for i, paper in enumerate(papers, 1):
+             print(f"\n{i}. {paper.title}")
+             print(f" Paper ID: {paper.paper_id}")
+             print(f" Authors: {', '.join(paper.authors)}")
+             print(f" Categories: {', '.join(paper.categories)}")
+             print(f" Keywords: {', '.join(paper.keywords)}")
+             print(f" Last Updated: {paper.updated_date}")
+             print(f" URL: {paper.url}")
+             print(f" PDF: {paper.pdf_url}")
+             if paper.abstract:
+                 print(f" Abstract: {paper.abstract[:200]}...")
+             if paper.extra:
+                 pub_info = paper.extra.get("publication_info", "")
+                 if pub_info:
+                     print(f" Publication Info: {pub_info}")
+     except Exception as e:
+         print(f"Error during detailed search: {e}")
+
+     print("\n" + "=" * 60)
+     print("2. Testing search with compact information only (faster)")
+     print("=" * 60)
+     try:
+         papers_compact = searcher.search(
+             query, max_results=max_results, fetch_details=False
+         )
+         print(f"\nFound {len(papers_compact)} papers for query '{query}' (compact):")
+         for i, paper in enumerate(papers_compact, 1):
+             print(f"\n{i}. {paper.title}")
+             print(f" Paper ID: {paper.paper_id}")
+             print(f" Authors: {', '.join(paper.authors)}")
+             print(f" Categories: {', '.join(paper.categories)}")
+             print(f" Keywords: {', '.join(paper.keywords)} (from search)")
+             if paper.abstract:
+                 print(f" Abstract: {paper.abstract[:150]}...")
+     except Exception as e:
+         print(f"Error during compact search: {e}")
+
+     print("\n" + "=" * 60)
+     print("3. Testing manual paper details fetching")
+     print("=" * 60)
+     test_paper_id = "2009/101"
+     try:
+         paper_details = searcher.get_paper_details(test_paper_id)
+         if paper_details:
+             print(f"\nManual fetch for paper {test_paper_id}:")
+             print(f"Title: {paper_details.title}")
+             print(f"Authors: {', '.join(paper_details.authors)}")
+             print(f"Keywords: {', '.join(paper_details.keywords)}")
+             print(
+                 f"Publication Info: {paper_details.extra.get('publication_info', 'N/A') if paper_details.extra else 'N/A'}"
+             )
+             print(
+                 f"History: {paper_details.extra.get('history', 'N/A') if paper_details.extra else 'N/A'}"
+             )
+             print(f"Abstract: {paper_details.abstract[:200]}...")
+         else:
+             print(f"Could not fetch details for paper {test_paper_id}")
+     except Exception as e:
+         print(f"Error fetching paper details: {e}")