academic-search-mcp 0.1.3__py3-none-any.whl → 0.1.4b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/METADATA +2 -1
- {academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/RECORD +7 -7
- paper_search_mcp/academic_platforms/cyberleninka.py +81 -5
- paper_search_mcp/pdf_utils.py +7 -3
- {academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/WHEEL +0 -0
- {academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/entry_points.txt +0 -0
- {academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/licenses/LICENSE +0 -0
{academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/METADATA
RENAMED

@@ -1,8 +1,9 @@
 Metadata-Version: 2.4
 Name: academic-search-mcp
-Version: 0.1.3
+Version: 0.1.4b1
 Summary: A MCP server for searching and downloading academic papers from multiple sources.
 Author-email: "P.S Zhang" <pengsongzhang96@gmail.com>
+Maintainer-email: "Zadneprovskiy A. A." <zadneprovskiy_econ@mail.ru>
 License-File: LICENSE
 Requires-Python: >=3.10
 Requires-Dist: beautifulsoup4>=4.12.0
{academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/RECORD
RENAMED

@@ -1,13 +1,13 @@
 paper_search_mcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 paper_search_mcp/paper.py,sha256=Flrn3ORhsojiEdEldUtKPvGF1RivXhl84zzq8mqAeFI,2969
-paper_search_mcp/pdf_utils.py,sha256=…
+paper_search_mcp/pdf_utils.py,sha256=kcutoE_HGrtaoheY9cFN0GW4LVkG-HnN54Mc15LPD44,2009
 paper_search_mcp/server.py,sha256=C542TF00oOUHF38F_5OU43D9RmIWQZSk7UiFHcXukWA,21663
 paper_search_mcp/academic_platforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 paper_search_mcp/academic_platforms/arxiv.py,sha256=5SsFudqH1PIXIEE_8saCQHcf75bqCL6ApRUltLpp9Ow,5911
 paper_search_mcp/academic_platforms/biorxiv.py,sha256=4k1Bg2BW-RBJiZ9jRVVmCEOge_4MtEDtXq2tMaPV0cg,6799
 paper_search_mcp/academic_platforms/core.py,sha256=6xDq3NmlVh1NIEFnTRLPNayodkztrS7CPUC-jupd-Lw,9632
 paper_search_mcp/academic_platforms/crossref.py,sha256=Zxj4U6SejaCa5o7whRmjjHVdd1U1H-DVtRP6DWzPwjk,14773
-paper_search_mcp/academic_platforms/cyberleninka.py,sha256=…
+paper_search_mcp/academic_platforms/cyberleninka.py,sha256=X6ka5QqE7pagDPcq0dZq0EXoDeKBzjq7XJYAQw3AA_g,16656
 paper_search_mcp/academic_platforms/google_scholar.py,sha256=B8VqgauJy3RJ8nR9woe107CXM-DrHQPapQAg_f948yg,9269
 paper_search_mcp/academic_platforms/hub.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 paper_search_mcp/academic_platforms/iacr.py,sha256=Vem7q18NZRm5WXsDHsqRefyRIpl4PCceGGWYXhbXB2s,21135
@@ -17,8 +17,8 @@ paper_search_mcp/academic_platforms/pubmed.py,sha256=oS-JRHNI7lcCqxUGTlSVKp2i_QK
 paper_search_mcp/academic_platforms/sci_hub.py,sha256=oma3M_gUseDByh-0Awi8Sxr0g3yojrb8XoD9iV0Exo8,7334
 paper_search_mcp/academic_platforms/semantic.py,sha256=nk7nzrlsnrDNrHNUuRIfIBQfagfAT750J5HtdLputHQ,20594
 paper_search_mcp/academic_platforms/ssrn.py,sha256=ntf22HRBZwNY6ctG5rdXjD5iT7CaML8k_xBbCn_qjbg,13694
-academic_search_mcp-0.1.3.dist-info/METADATA,sha256=…
-academic_search_mcp-0.1.3.dist-info/WHEEL,sha256=…
-academic_search_mcp-0.1.3.dist-info/entry_points.txt,sha256=…
-academic_search_mcp-0.1.3.dist-info/licenses/LICENSE,sha256=…
-academic_search_mcp-0.1.3.dist-info/RECORD,,
+academic_search_mcp-0.1.4b1.dist-info/METADATA,sha256=F3uHgcJYLLX58tRDUCGXhCoVkjc00C-qLrklwT_YJPE,7274
+academic_search_mcp-0.1.4b1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+academic_search_mcp-0.1.4b1.dist-info/entry_points.txt,sha256=RO1wFwD6a0WO_mZY8HZBYDtITfQ1dhnTR1BZlCrkRLc,69
+academic_search_mcp-0.1.4b1.dist-info/licenses/LICENSE,sha256=TwRnWq1drFhdcy43SdxndU0mcfBUYBnhSJTJ4hhjfwQ,1085
+academic_search_mcp-0.1.4b1.dist-info/RECORD,,
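For context on reading these entries: each RECORD row has the form `path,sha256=digest,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped, per the wheel binary distribution format. A minimal sketch of recomputing such a digest to verify an installed file; the local path in the usage comment is hypothetical:

```python
import base64
import hashlib

def record_digest(path: str) -> str:
    """Compute a wheel RECORD-style digest: urlsafe base64 of the
    file's SHA-256 hash, with trailing '=' padding stripped."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Usage (hypothetical local path): for the 0.1.4b1 wheel this should print
# kcutoE_HGrtaoheY9cFN0GW4LVkG-HnN54Mc15LPD44
print(record_digest("site-packages/paper_search_mcp/pdf_utils.py"))
```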
paper_search_mcp/academic_platforms/cyberleninka.py
CHANGED

@@ -379,7 +379,9 @@ class CyberLeninkaSearcher:
             return f"Failed to download PDF: {e}"
 
     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
-        """Read and extract text from a CyberLeninka paper
+        """Read and extract text from a CyberLeninka paper.
+
+        Tries PDF extraction first, falls back to scraping article page text.
 
         Args:
             paper_id: CyberLeninka article slug
@@ -388,9 +390,83 @@ class CyberLeninkaSearcher:
         Returns:
             Extracted text or error message
         """
+        # Try PDF extraction first
         pdf_path = self.download_pdf(paper_id, save_path)
-        if …
-        …
+        if os.path.exists(pdf_path):
+            text = extract_text_from_pdf(pdf_path)
+            if text and text.strip():
+                return text
+
+        # Fallback: extract text from article HTML page
+        return self._read_from_html(paper_id)
+
+    def _read_from_html(self, paper_id: str) -> str:
+        """Extract article text from the CyberLeninka HTML page."""
+        try:
+            self._rate_limit()
+
+            url = f"{self.BASE_URL}/article/n/{paper_id}"
+            response = requests.get(url, timeout=30, impersonate=self.impersonate)
+
+            if response.status_code != 200:
+                return f"Failed to read paper {paper_id}: HTTP {response.status_code}"
 
-        …
-        …
+            soup = BeautifulSoup(response.text, 'lxml')
+
+            # Build metadata header
+            parts = []
+
+            title_meta = soup.find("meta", {"name": "citation_title"})
+            if title_meta and title_meta.get("content"):
+                parts.append(f"Title: {title_meta['content']}")
+
+            authors = [m.get("content", "") for m in soup.find_all("meta", {"name": "citation_author"}) if m.get("content")]
+            if authors:
+                parts.append(f"Authors: {', '.join(authors)}")
+
+            journal_meta = soup.find("meta", {"name": "citation_journal_title"})
+            if journal_meta and journal_meta.get("content"):
+                parts.append(f"Journal: {journal_meta['content']}")
+
+            date_meta = soup.find("meta", {"name": "citation_publication_date"})
+            if date_meta and date_meta.get("content"):
+                parts.append(f"Year: {date_meta['content']}")
+
+            if parts:
+                parts.append("")  # blank line separator
+
+            # Extract article body text
+            # CyberLeninka uses div with class "ocr" for full text
+            body_div = soup.find("div", {"class": "ocr"})
+            if not body_div:
+                # Try alternative selectors
+                body_div = soup.find("div", {"itemprop": "articleBody"})
+            if not body_div:
+                body_div = soup.find("div", {"class": "full-text"})
+
+            if body_div:
+                # Get text with paragraph separation
+                paragraphs = body_div.find_all(["p", "h2", "h3", "h4"])
+                if paragraphs:
+                    for p in paragraphs:
+                        text = p.get_text(strip=True)
+                        if text:
+                            parts.append(text)
+                else:
+                    body_text = body_div.get_text(separator="\n", strip=True)
+                    if body_text:
+                        parts.append(body_text)
+
+            if len(parts) <= 4:  # Only metadata, no body text
+                # Try description as last resort
+                desc_meta = soup.find("meta", {"name": "description"})
+                if desc_meta and desc_meta.get("content"):
+                    parts.append(f"Abstract: {desc_meta['content']}")
+                else:
+                    return f"No text content found for paper {paper_id}"
+
+            return "\n".join(parts)
+
+        except Exception as e:
+            logger.error(f"Error reading CyberLeninka article HTML: {e}")
+            return f"Failed to read paper: {e}"
paper_search_mcp/pdf_utils.py
CHANGED

@@ -41,12 +41,14 @@ def _extract_with_pdftotext(pdf_path: str) -> Optional[str]:
             ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_path, '-'],
             capture_output=True,
             text=True,
+            encoding='utf-8',
+            errors='replace',
             timeout=60
         )
-        if result.returncode == 0:
+        if result.returncode == 0 and result.stdout:
             return result.stdout.strip()
         return None
-    except …
+    except Exception:
         return None
 
 
@@ -61,7 +63,9 @@ def _extract_with_pypdf(pdf_path: str) -> str:
         reader = PdfReader(pdf_path)
         text = ""
         for page in reader.pages:
-            …
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + "\n"
         return text.strip()
     except Exception as e:
         return f"Error extracting text: {e}"
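Two robustness fixes here: the pdftotext change pins subprocess decoding to UTF-8 with `errors='replace'`, so malformed byte sequences in the extractor's output degrade to replacement characters instead of raising `UnicodeDecodeError` under a locale with a different default codec, and the pypdf change skips pages whose `extract_text()` yields nothing (image-only scans). A minimal sketch of the combined fallback chain under those assumptions; the wrapper and helper names here are illustrative, not the package's API:

```python
import subprocess
from typing import Optional

def pdftotext_extract(pdf_path: str) -> Optional[str]:
    """Try the poppler pdftotext CLI; return None if unavailable or empty."""
    try:
        result = subprocess.run(
            ["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, "-"],
            capture_output=True,
            text=True,
            encoding="utf-8",   # don't depend on the locale's default codec
            errors="replace",   # malformed bytes become U+FFFD, not a crash
            timeout=60,
        )
        if result.returncode == 0 and result.stdout:
            return result.stdout.strip()
        return None
    except Exception:  # pdftotext missing, timeout, etc.
        return None

def pypdf_extract(pdf_path: str) -> str:
    """Pure-Python fallback; skips pages that yield no text."""
    from pypdf import PdfReader
    text = ""
    for page in PdfReader(pdf_path).pages:
        page_text = page.extract_text()
        if page_text:  # image-only pages can come back empty
            text += page_text + "\n"
    return text.strip()

def extract_text(pdf_path: str) -> str:
    # Prefer the faster, layout-aware CLI; fall back to pypdf.
    return pdftotext_extract(pdf_path) or pypdf_extract(pdf_path)
```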
{academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/WHEEL
RENAMED
File without changes

{academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/entry_points.txt
RENAMED
File without changes

{academic_search_mcp-0.1.3.dist-info → academic_search_mcp-0.1.4b1.dist-info}/licenses/LICENSE
RENAMED
File without changes