academic-search-mcp 0.1.3__py3-none-any.whl → 0.1.4b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

academic_search_mcp-0.1.3.dist-info/METADATA → academic_search_mcp-0.1.4b1.dist-info/METADATA
@@ -1,8 +1,9 @@
  Metadata-Version: 2.4
  Name: academic-search-mcp
- Version: 0.1.3
+ Version: 0.1.4b1
  Summary: A MCP server for searching and downloading academic papers from multiple sources.
  Author-email: "P.S Zhang" <pengsongzhang96@gmail.com>
+ Maintainer-email: "Zadneprovskiy A. A." <zadneprovskiy_econ@mail.ru>
  License-File: LICENSE
  Requires-Python: >=3.10
  Requires-Dist: beautifulsoup4>=4.12.0
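
One practical consequence of the version bump: 0.1.4b1 is a PEP 440 pre-release, so pip skips it during normal dependency resolution. Installing this exact build requires an explicit pin or the --pre flag:

    pip install academic-search-mcp==0.1.4b1
    # or opt in to any pre-release:
    pip install --pre academic-search-mcp
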
academic_search_mcp-0.1.3.dist-info/RECORD → academic_search_mcp-0.1.4b1.dist-info/RECORD
@@ -1,13 +1,13 @@
  paper_search_mcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  paper_search_mcp/paper.py,sha256=Flrn3ORhsojiEdEldUtKPvGF1RivXhl84zzq8mqAeFI,2969
- paper_search_mcp/pdf_utils.py,sha256=sylqOQTFyOSlYnEzUMpSIe4VkY2kfgaQw_xd_EBYw2g,1909
+ paper_search_mcp/pdf_utils.py,sha256=kcutoE_HGrtaoheY9cFN0GW4LVkG-HnN54Mc15LPD44,2009
  paper_search_mcp/server.py,sha256=C542TF00oOUHF38F_5OU43D9RmIWQZSk7UiFHcXukWA,21663
  paper_search_mcp/academic_platforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  paper_search_mcp/academic_platforms/arxiv.py,sha256=5SsFudqH1PIXIEE_8saCQHcf75bqCL6ApRUltLpp9Ow,5911
  paper_search_mcp/academic_platforms/biorxiv.py,sha256=4k1Bg2BW-RBJiZ9jRVVmCEOge_4MtEDtXq2tMaPV0cg,6799
  paper_search_mcp/academic_platforms/core.py,sha256=6xDq3NmlVh1NIEFnTRLPNayodkztrS7CPUC-jupd-Lw,9632
  paper_search_mcp/academic_platforms/crossref.py,sha256=Zxj4U6SejaCa5o7whRmjjHVdd1U1H-DVtRP6DWzPwjk,14773
- paper_search_mcp/academic_platforms/cyberleninka.py,sha256=88p9RZxjBRn5jAaOhZLr3EpP5ibMzmd0vCh1jD6PPEs,13421
+ paper_search_mcp/academic_platforms/cyberleninka.py,sha256=X6ka5QqE7pagDPcq0dZq0EXoDeKBzjq7XJYAQw3AA_g,16656
  paper_search_mcp/academic_platforms/google_scholar.py,sha256=B8VqgauJy3RJ8nR9woe107CXM-DrHQPapQAg_f948yg,9269
  paper_search_mcp/academic_platforms/hub.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  paper_search_mcp/academic_platforms/iacr.py,sha256=Vem7q18NZRm5WXsDHsqRefyRIpl4PCceGGWYXhbXB2s,21135
@@ -17,8 +17,8 @@ paper_search_mcp/academic_platforms/pubmed.py,sha256=oS-JRHNI7lcCqxUGTlSVKp2i_QK
  paper_search_mcp/academic_platforms/sci_hub.py,sha256=oma3M_gUseDByh-0Awi8Sxr0g3yojrb8XoD9iV0Exo8,7334
  paper_search_mcp/academic_platforms/semantic.py,sha256=nk7nzrlsnrDNrHNUuRIfIBQfagfAT750J5HtdLputHQ,20594
  paper_search_mcp/academic_platforms/ssrn.py,sha256=ntf22HRBZwNY6ctG5rdXjD5iT7CaML8k_xBbCn_qjbg,13694
- academic_search_mcp-0.1.3.dist-info/METADATA,sha256=uZcbHayXO9tURHo3Yl7P50e4j3v4p20a7GoPbtlLTe4,7203
- academic_search_mcp-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- academic_search_mcp-0.1.3.dist-info/entry_points.txt,sha256=RO1wFwD6a0WO_mZY8HZBYDtITfQ1dhnTR1BZlCrkRLc,69
- academic_search_mcp-0.1.3.dist-info/licenses/LICENSE,sha256=TwRnWq1drFhdcy43SdxndU0mcfBUYBnhSJTJ4hhjfwQ,1085
- academic_search_mcp-0.1.3.dist-info/RECORD,,
+ academic_search_mcp-0.1.4b1.dist-info/METADATA,sha256=F3uHgcJYLLX58tRDUCGXhCoVkjc00C-qLrklwT_YJPE,7274
+ academic_search_mcp-0.1.4b1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ academic_search_mcp-0.1.4b1.dist-info/entry_points.txt,sha256=RO1wFwD6a0WO_mZY8HZBYDtITfQ1dhnTR1BZlCrkRLc,69
+ academic_search_mcp-0.1.4b1.dist-info/licenses/LICENSE,sha256=TwRnWq1drFhdcy43SdxndU0mcfBUYBnhSJTJ4hhjfwQ,1085
+ academic_search_mcp-0.1.4b1.dist-info/RECORD,,

paper_search_mcp/academic_platforms/cyberleninka.py
@@ -379,7 +379,9 @@ class CyberLeninkaSearcher:
              return f"Failed to download PDF: {e}"
 
      def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
-         """Read and extract text from a CyberLeninka paper PDF.
+         """Read and extract text from a CyberLeninka paper.
+ 
+         Tries PDF extraction first, falls back to scraping article page text.
 
          Args:
              paper_id: CyberLeninka article slug
@@ -388,9 +390,83 @@ class CyberLeninkaSearcher:
          Returns:
              Extracted text or error message
          """
+         # Try PDF extraction first
          pdf_path = self.download_pdf(paper_id, save_path)
-         if not os.path.exists(pdf_path):
-             return pdf_path  # Return error message
+         if os.path.exists(pdf_path):
+             text = extract_text_from_pdf(pdf_path)
+             if text and text.strip():
+                 return text
+ 
+         # Fallback: extract text from article HTML page
+         return self._read_from_html(paper_id)
+ 
+     def _read_from_html(self, paper_id: str) -> str:
+         """Extract article text from the CyberLeninka HTML page."""
+         try:
+             self._rate_limit()
+ 
+             url = f"{self.BASE_URL}/article/n/{paper_id}"
+             response = requests.get(url, timeout=30, impersonate=self.impersonate)
+ 
+             if response.status_code != 200:
+                 return f"Failed to read paper {paper_id}: HTTP {response.status_code}"
 
-         text = extract_text_from_pdf(pdf_path)
-         return text if text else "Failed to extract text from PDF"
+             soup = BeautifulSoup(response.text, 'lxml')
+ 
+             # Build metadata header
+             parts = []
+ 
+             title_meta = soup.find("meta", {"name": "citation_title"})
+             if title_meta and title_meta.get("content"):
+                 parts.append(f"Title: {title_meta['content']}")
+ 
+             authors = [m.get("content", "") for m in soup.find_all("meta", {"name": "citation_author"}) if m.get("content")]
+             if authors:
+                 parts.append(f"Authors: {', '.join(authors)}")
+ 
+             journal_meta = soup.find("meta", {"name": "citation_journal_title"})
+             if journal_meta and journal_meta.get("content"):
+                 parts.append(f"Journal: {journal_meta['content']}")
+ 
+             date_meta = soup.find("meta", {"name": "citation_publication_date"})
+             if date_meta and date_meta.get("content"):
+                 parts.append(f"Year: {date_meta['content']}")
+ 
+             if parts:
+                 parts.append("")  # blank line separator
+ 
+             # Extract article body text
+             # CyberLeninka uses div with class "ocr" for full text
+             body_div = soup.find("div", {"class": "ocr"})
+             if not body_div:
+                 # Try alternative selectors
+                 body_div = soup.find("div", {"itemprop": "articleBody"})
+             if not body_div:
+                 body_div = soup.find("div", {"class": "full-text"})
+ 
+             if body_div:
+                 # Get text with paragraph separation
+                 paragraphs = body_div.find_all(["p", "h2", "h3", "h4"])
+                 if paragraphs:
+                     for p in paragraphs:
+                         text = p.get_text(strip=True)
+                         if text:
+                             parts.append(text)
+                 else:
+                     body_text = body_div.get_text(separator="\n", strip=True)
+                     if body_text:
+                         parts.append(body_text)
+ 
+             if len(parts) <= 4:  # Only metadata, no body text
+                 # Try description as last resort
+                 desc_meta = soup.find("meta", {"name": "description"})
+                 if desc_meta and desc_meta.get("content"):
+                     parts.append(f"Abstract: {desc_meta['content']}")
+                 else:
+                     return f"No text content found for paper {paper_id}"
+ 
+             return "\n".join(parts)
+ 
+         except Exception as e:
+             logger.error(f"Error reading CyberLeninka article HTML: {e}")
+             return f"Failed to read paper: {e}"
paper_search_mcp/pdf_utils.py
@@ -41,12 +41,14 @@ def _extract_with_pdftotext(pdf_path: str) -> Optional[str]:
              ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_path, '-'],
              capture_output=True,
              text=True,
+             encoding='utf-8',
+             errors='replace',
              timeout=60
          )
-         if result.returncode == 0:
+         if result.returncode == 0 and result.stdout:
              return result.stdout.strip()
          return None
-     except (subprocess.TimeoutExpired, subprocess.SubprocessError):
+     except Exception:
          return None
 
 
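
The encoding change is worth spelling out. A self-contained sketch of the same call (the helper name is illustrative): with text=True and no encoding=, subprocess decodes output using the locale's preferred encoding, often cp1251/cp1252 on Windows, which can raise UnicodeDecodeError on the UTF-8 Cyrillic text pdftotext emits here; errors="replace" keeps partial text rather than failing on a single malformed byte.

    import subprocess

    def pdftotext_stdout(pdf_path: str) -> str | None:
        # Decode exactly what `pdftotext -enc UTF-8` writes, locale be damned.
        try:
            result = subprocess.run(
                ["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, "-"],
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=60,
            )
        except (OSError, subprocess.SubprocessError):
            return None
        return result.stdout.strip() if result.returncode == 0 and result.stdout else None
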
@@ -61,7 +63,9 @@ def _extract_with_pypdf(pdf_path: str) -> str:
          reader = PdfReader(pdf_path)
          text = ""
          for page in reader.pages:
-             text += page.extract_text() + "\n"
+             page_text = page.extract_text()
+             if page_text:
+                 text += page_text + "\n"
          return text.strip()
      except Exception as e:
          return f"Error extracting text: {e}"