PyPI - academia-mcp - Versions diffs - 1.10.9__tar.gz → 1.11.1__tar.gz - Mend

academia-mcp 1.10.9__tar.gz → 1.11.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

{academia_mcp-1.10.9 → academia_mcp-1.11.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academia-mcp
-Version: 1.10.9
+Version: 1.11.1
 Summary: MCP server that provides different tools to search for scientific publications
 Author-email: Ilya Gusev <phoenixilya@gmail.com>
 Project-URL: Homepage, https://github.com/IlyaGusev/academia_mcp

{academia_mcp-1.10.9 → academia_mcp-1.11.1}/academia_mcp/server.py RENAMED Viewed

@@ -63,35 +63,34 @@ def find_free_port() -> int:
     raise RuntimeError("No free port in range 5000-6000 found")
-def run(
-    host: str = "0.0.0.0",
-    port: Optional[int] = None,
-    mount_path: str = "/",
+def create_server(
     streamable_http_path: str = "/mcp",
-    transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http",
+    mount_path: str = "/",
+    stateless_http: bool = True,
     disable_web_search_tools: bool = False,
     disable_llm_tools: bool = False,
-) -> None:
-    configure_uvicorn_style_logging()
+    port: Optional[int] = None,
+    host: str = "0.0.0.0",
+) -> FastMCP:
     server = FastMCP(
         "Academia MCP",
-        stateless_http=True,
+        stateless_http=stateless_http,
         streamable_http_path=streamable_http_path,
         mount_path=mount_path,
     )
     logger = logging.getLogger(__name__)
-    server.add_tool(arxiv_search)
-    server.add_tool(arxiv_download)
-    server.add_tool(s2_get_citations)
-    server.add_tool(s2_get_references)
+    server.add_tool(arxiv_search, structured_output=True)
+    server.add_tool(arxiv_download, structured_output=True)
+    server.add_tool(visit_webpage, structured_output=True)
+    server.add_tool(s2_get_citations, structured_output=True)
+    server.add_tool(s2_get_references, structured_output=True)
+    server.add_tool(s2_get_info, structured_output=True)
     server.add_tool(s2_corpus_id_from_arxiv_id)
-    server.add_tool(s2_get_info)
     server.add_tool(hf_datasets_search)
     server.add_tool(anthology_search)
     server.add_tool(get_latex_template)
     server.add_tool(get_latex_templates_list)
-    server.add_tool(visit_webpage)
     server.add_tool(show_image)
     server.add_tool(yt_transcript)
@@ -106,20 +105,20 @@ def run(
     if not disable_web_search_tools:
         if settings.TAVILY_API_KEY:
-            server.add_tool(tavily_web_search)
+            server.add_tool(tavily_web_search, structured_output=True)
         if settings.EXA_API_KEY:
-            server.add_tool(exa_web_search)
+            server.add_tool(exa_web_search, structured_output=True)
         if settings.BRAVE_API_KEY:
-            server.add_tool(brave_web_search)
+            server.add_tool(brave_web_search, structured_output=True)
         if settings.EXA_API_KEY or settings.BRAVE_API_KEY or settings.TAVILY_API_KEY:
-            server.add_tool(web_search)
+            server.add_tool(web_search, structured_output=True)
         else:
             logger.warning("No web search tools keys are set, web_search will not be available!")
     if not disable_llm_tools and settings.OPENROUTER_API_KEY:
-        server.add_tool(extract_bitflip_info)
-        server.add_tool(generate_research_proposals)
-        server.add_tool(score_research_proposals)
+        server.add_tool(extract_bitflip_info, structured_output=True)
+        server.add_tool(generate_research_proposals, structured_output=True)
+        server.add_tool(score_research_proposals, structured_output=True)
         server.add_tool(document_qa)
         server.add_tool(describe_image)
         if settings.WORKSPACE_DIR:
@@ -140,6 +139,27 @@ def run(
     server.settings.port = port
     server.settings.host = host
+    return server
+def run(
+    host: str = "0.0.0.0",
+    port: Optional[int] = None,
+    mount_path: str = "/",
+    streamable_http_path: str = "/mcp",
+    transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http",
+    disable_web_search_tools: bool = False,
+    disable_llm_tools: bool = False,
+) -> None:
+    configure_uvicorn_style_logging()
+    server = create_server(
+        streamable_http_path=streamable_http_path,
+        mount_path=mount_path,
+        disable_web_search_tools=disable_web_search_tools,
+        disable_llm_tools=disable_llm_tools,
+        port=port,
+        host=host,
+    )
     if transport == "streamable-http":
         # Enable CORS for browser-based clients

{academia_mcp-1.10.9 → academia_mcp-1.11.1}/academia_mcp/tools/arxiv_download.py RENAMED Viewed

@@ -3,19 +3,17 @@
 # https://github.com/bytedance/pasa/blob/main/utils.py
 import re
-import json
 import tempfile
 from pathlib import Path
-from typing import Any, List, Optional, Dict
-from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
-import requests
 import bs4
+import requests
 from markdownify import MarkdownConverter  # type: ignore
+from pydantic import BaseModel, Field
+from academia_mcp.pdf import download_pdf, parse_pdf_file
 from academia_mcp.utils import get_with_retries
-from academia_mcp.pdf import parse_pdf_file, download_pdf
 HTML_URL = "https://arxiv.org/html/{paper_id}"
 ABS_URL = "https://arxiv.org/abs/{paper_id}"
@@ -28,12 +26,24 @@ SECTION_STOP_WORDS = (
 )
-@dataclass
-class TOCEntry:
+class DownloadResponse(BaseModel):  # type: ignore
+    title: str = Field(description="Title of the paper")
+    abstract: str = Field(description="Abstract of the paper")
+    toc: str = Field(description="Table of Contents", default="")
+    sections: Optional[List[str]] = Field(description="Sections of the paper", default=None)
+    references: Optional[List[Dict[str, Any]]] = Field(
+        description="Parsed references from the paper", default=None
+    )
+    original_format: str = Field(
+        description="Original format of the paper (pdf or html)", default="html"
+    )
+class TOCEntry(BaseModel):  # type: ignore
     level: int
     title: str
     html_id: Optional[str] = None
-    subsections: List["TOCEntry"] = field(default_factory=list)
+    subsections: List["TOCEntry"] = Field(default_factory=list)
     def linearize(self) -> List["TOCEntry"]:
         entries = [self]
@@ -196,7 +206,7 @@ def _parse_citation_metadata(metas: List[str]) -> Dict[str, Any]:
     return result
-def _extract_citations(soup_biblist: bs4.element.Tag) -> List[Dict[str, Any]]:
+def _extract_references(soup_biblist: bs4.element.Tag) -> List[Dict[str, Any]]:
     extracted = []
     for li in soup_biblist.find_all("li", recursive=False):
         metas = [x.text.strip() for x in li.find_all("span", class_="ltx_bibblock")]
@@ -214,17 +224,17 @@ def _parse_html(paper_id: str) -> Dict[str, Any]:
     article = soup.article
     assert article and isinstance(article, bs4.element.Tag)
-    citations = []
+    references = []
     biblist_tag = article.find(class_="ltx_biblist")
     if biblist_tag and isinstance(biblist_tag, bs4.element.Tag):
-        citations = _extract_citations(biblist_tag)
+        references = _extract_references(biblist_tag)
     toc = _generate_toc(article)
     sections = _build_by_toc(toc, article, url)
     return {
         "toc": toc.to_str(),
         "sections": sections,
-        "citations": citations,
+        "references": references,
         "original_format": "html",
     }
@@ -255,36 +265,24 @@ def _parse_pdf(paper_id: str) -> Dict[str, Any]:
     return {
         "toc": "\n".join([f"Page {page_number}" for page_number in range(1, len(pages) + 1)]),
         "sections": pages,
-        "citations": [],
+        "references": [],
         "original_format": "pdf",
     }
 def arxiv_download(
     paper_id: str,
-    include_citations: Optional[bool] = False,
+    include_references: Optional[bool] = False,
     mode: Optional[str] = "html",
-) -> str:
+) -> DownloadResponse:
     """
     Downloads a paper from Arxiv and converts it to text.
     Use mode = "html" by default.
     Fall back to mode = "pdf" if there are any problems with the HTML version.
-    Returns a JSON with a following structure:
-    {
-        "title": "...",
-        "abstract": "...",
-        "toc": "...",
-        "sections": ["...", ...],
-        "citations": [...]
-    }
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-    For example, `abstract = json.loads(arxiv_download("2409.06820v1"))`
-    The "toc" key contains Table of Contents, that sometimes has indexing for sections.
     Args:
         paper_id: ID of the paper on Arxiv. For instance: 2409.06820v1
-        include_citations: include "citations" in the result or not. False by default.
+        include_references: include "references" in the result or not. False by default.
         mode: Which version of paper to use. Options: ["html", "pdf"]. "html" by default.
     """
@@ -297,7 +295,6 @@ def arxiv_download(
     else:
         content = _parse_pdf(paper_id)
-    if not include_citations and "citations" in content:
-        content.pop("citations")
-    return json.dumps({**abs_meta, **content}, ensure_ascii=False)
+    if not include_references and "references" in content:
+        content.pop("references")
+    return DownloadResponse(**{**abs_meta, **content})

{academia_mcp-1.10.9 → academia_mcp-1.11.1}/academia_mcp/tools/arxiv_search.py RENAMED Viewed

@@ -2,12 +2,12 @@
 # https://github.com/jonatasgrosman/findpapers/blob/master/findpapers/searchers/arxiv_searcher.py
 # https://info.arxiv.org/help/api/user-manual.html
-import json
 import re
-from typing import Optional, List, Dict, Any, Union
-from datetime import datetime, date
+from datetime import date, datetime
+from typing import Any, Dict, List, Optional, Union
 import xmltodict
+from pydantic import BaseModel, Field
 from academia_mcp.utils import get_with_retries
@@ -17,6 +17,25 @@ SORT_BY_OPTIONS = ("relevance", "lastUpdatedDate", "submittedDate")
 SORT_ORDER_OPTIONS = ("ascending", "descending")
+class ArxivSearchEntry(BaseModel):  # type: ignore
+    id: str = Field(description="Paper ID")
+    title: str = Field(description="Paper title")
+    authors: str = Field(description="Authors of the paper")
+    published: str = Field(description="Published date of the paper")
+    updated: str = Field(description="Updated date of the paper")
+    categories: str = Field(description="Categories of the paper")
+    comment: str = Field(description="Comment of the paper")
+    index: int = Field(description="Index of the paper", default=0)
+    abstract: Optional[str] = Field(description="Abstract of the paper", default=None)
+class ArxivSearchResponse(BaseModel):  # type: ignore
+    total_count: int = Field(description="Total number of results")
+    returned_count: int = Field(description="Number of results returned")
+    offset: int = Field(description="Offset for pagination")
+    results: List[ArxivSearchEntry] = Field(description="Search entries")
 def _format_text_field(text: str) -> str:
     return " ".join([line.strip() for line in text.split() if line.strip()])
@@ -48,17 +67,17 @@ def _format_date(date: str) -> str:
     return dt.strftime("%B %d, %Y")
-def _clean_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
-    return {
-        "id": entry["id"].split("/")[-1],
-        "title": _format_text_field(entry["title"]),
-        "authors": _format_authors(entry["author"]),
-        "abstract": _format_text_field(entry["summary"]),
-        "published": _format_date(entry["published"]),
-        "updated": _format_date(entry["updated"]),
-        "categories": _format_categories(entry.get("category", {})),
-        "comment": _format_text_field(entry.get("arxiv:comment", {}).get("#text", "")),
-    }
+def _clean_entry(entry: Dict[str, Any]) -> ArxivSearchEntry:
+    return ArxivSearchEntry(
+        id=entry["id"].split("/")[-1],
+        title=_format_text_field(entry["title"]),
+        authors=_format_authors(entry["author"]),
+        abstract=_format_text_field(entry["summary"]),
+        published=_format_date(entry["published"]),
+        updated=_format_date(entry["updated"]),
+        categories=_format_categories(entry.get("category", {})),
+        comment=_format_text_field(entry.get("arxiv:comment", {}).get("#text", "")),
+    )
 def _convert_to_yyyymmddtttt(date_str: str) -> str:
@@ -105,22 +124,19 @@ def _format_entries(
     start_index: int,
     include_abstracts: bool,
     total_results: int,
-) -> str:
+) -> ArxivSearchResponse:
     clean_entries: List[Dict[str, Any]] = []
     for entry_num, entry in enumerate(entries):
         clean_entry = _clean_entry(entry)
         if not include_abstracts:
-            clean_entry.pop("abstract")
-        clean_entry["index"] = start_index + entry_num
+            clean_entry.abstract = None
+        clean_entry.index = start_index + entry_num
         clean_entries.append(clean_entry)
-    return json.dumps(
-        {
-            "total_count": total_results,
-            "returned_count": len(entries),
-            "offset": start_index,
-            "results": clean_entries,
-        },
-        ensure_ascii=False,
+    return ArxivSearchResponse(
+        total_count=total_results,
+        returned_count=len(entries),
+        offset=start_index,
+        results=clean_entries,
     )
@@ -133,7 +149,7 @@ def arxiv_search(
     sort_by: Optional[str] = "relevance",
     sort_order: Optional[str] = "descending",
     include_abstracts: Optional[bool] = False,
-) -> str:
+) -> ArxivSearchResponse:
     """
     Search arXiv papers with field-specific queries.
@@ -158,12 +174,6 @@ def arxiv_search(
         all:role OR all:playing OR all:"language model"
         (au:vaswani OR au:"del maestro") ANDNOT ti:attention
-    Returns a JSON object serialized to a string. The structure is:
-    {"total_count": ..., "returned_count": ..., "offset": ..., "results": [...]}
-    Every item in the "results" has the following fields:
-    ("index", "id", "title", "authors", "abstract", "published", "updated", "categories", "comment")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
     Args:
         query: The search query, required.
         offset: The offset to scroll search results. 10 items will be skipped if offset=10. 0 by default.
@@ -211,10 +221,9 @@ def arxiv_search(
     entries = feed.get("entry", [])
     if isinstance(entries, dict):
         entries = [entries]
-    formatted_entries: str = _format_entries(
+    return _format_entries(
         entries,
         start_index=start_index,
         total_results=total_results,
         include_abstracts=include_abstracts,
     )
-    return formatted_entries

{academia_mcp-1.10.9 → academia_mcp-1.11.1}/academia_mcp/tools/bitflip.py RENAMED Viewed

@@ -1,17 +1,18 @@
+# Based on
 # https://arxiv.org/abs/2504.12976
 # https://web.stanford.edu/class/cs197c/slides/02-literature-search.pdf
 import json
 import random
-from typing import List, Optional, Any, Dict
+from typing import Any, Dict, List, Optional
-from pydantic import BaseModel
 from datasets import load_dataset  # type: ignore
+from pydantic import BaseModel, Field
-from academia_mcp.tools.arxiv_download import arxiv_download
-from academia_mcp.utils import extract_json, encode_prompt
-from academia_mcp.llm import llm_acall, ChatMessage
+from academia_mcp.llm import ChatMessage, llm_acall
 from academia_mcp.settings import settings
+from academia_mcp.tools.arxiv_download import arxiv_download
+from academia_mcp.utils import encode_prompt, extract_json
 class ProposalDataset:
@@ -128,7 +129,7 @@ Return only the JSON list of proposals in this exact format:
         "spark": "4-6 word summary",
         "abstract": "An abstract that summarizes the proposal in conference format (approximately 250 words).",
         "experiments": ["...", "..."],
-        "risks_and_limitations": "A list of potential risks and limitations of the proposal."
+        "risks_and_limitations": ["...", "..."]
     },
     ...
 ]
@@ -177,12 +178,12 @@ Return only scores for all proposals in this exact format (no extra text):
 class BitFlipInfo(BaseModel):  # type: ignore
-    bit: str
-    flip: str
-    spark: str
+    bit: str = Field(description="Technical limitation or conventional approach")
+    flip: str = Field(description="Innovative approach or solution")
+    spark: str = Field(description="4-6 word summary")
-async def extract_bitflip_info(arxiv_id: str) -> str:
+async def extract_bitflip_info(arxiv_id: str) -> BitFlipInfo:
     """
     Extracts the Bit-Flip information from the arXiv paper.
@@ -190,20 +191,12 @@ async def extract_bitflip_info(arxiv_id: str) -> str:
     questioning existing constraints or reapplying techniques to new domains/scales.
     The "Bit" is the prevailing belief, and the "Flip" is the counterargument.
-    Returns a JSON object in this format:
-    {
-        "bit": "Technical limitation or conventional approach, in at least two sentences",
-        "flip": "Innovative approach or solution, in at least two sentences",
-        "spark": "4-6 word summary of the core idea"
-    }
-    Use `json.loads` to deserialize the result if you want to get specific fields.
     Args:
         arxiv_id: The arXiv ID of the paper to extract the Bit-Flip information from.
     """
     model_name = settings.BITFLIP_MODEL_NAME
     paper = arxiv_download(arxiv_id)
-    abstract = json.loads(paper)["abstract"]
+    abstract = paper.abstract
     prompt = encode_prompt(EXTRACT_PROMPT, abstract=abstract)
     content = await llm_acall(
         model_name=model_name,
@@ -212,12 +205,31 @@ async def extract_bitflip_info(arxiv_id: str) -> str:
     )
     result = extract_json(content)
     bitflip_info: BitFlipInfo = BitFlipInfo.model_validate(result)
-    return str(bitflip_info.model_dump_json())
+    return bitflip_info
+class ResearchProposal(BaseModel):  # type: ignore
+    proposal_id: int = Field(default=0, description="ID of the proposal")
+    flip: str = Field(description="Innovative approach or solution, in at least two sentences")
+    spark: str = Field(description="4-6 word summary")
+    abstract: str = Field(
+        description="An abstract that summarizes the proposal in conference format."
+    )
+    experiments: List[str] = Field(
+        description="A list of experiments that would be conducted to validate the proposal."
+    )
+    risks_and_limitations: List[str] = Field(
+        description="A list of potential risks and limitations of the proposal."
+    )
+class GenerateResearchProposalResponse(BaseModel):  # type: ignore
+    proposals: List[ResearchProposal] = Field(description="A list of research proposals")
 async def generate_research_proposals(
     bit: str, num_proposals: int = 3, additional_context: str = ""
-) -> str:
+) -> GenerateResearchProposalResponse:
     """
     Proposes improvement ideas for the Bit.
@@ -225,20 +237,6 @@ async def generate_research_proposals(
         bit: The Bit to propose improvement ideas for. The bit is a technical limitation or conventional approach of some paper.
         num_proposals: The number of proposals to generate.
         additional_context: Additional context to use when proposing the improvement idea.
-    Returns a JSON string with a research proposal in this format:
-    [
-        {
-            "proposal_id": ...,
-            "flip": "Innovative approach or solution, in at least two sentences",
-            "spark": "4-6 word summary",
-            "abstract": "An abstract that summarizes the proposal in conference format (approximately 250 words).",
-            "experiments": ["...", "..."],
-            "risks_and_limitations": "A list of potential risks and limitations of the proposal."
-        },
-        ...
-    ]
-    Use `json.loads` to deserialize the result if you want to get specific items.
     """
     model_name = settings.BITFLIP_MODEL_NAME
     max_completion_tokens = int(settings.BITFLIP_MAX_COMPLETION_TOKENS)
@@ -262,46 +260,51 @@ async def generate_research_proposals(
         temperature=1.0,
     )
     result = extract_json(content)
-    for proposal in result:
-        proposal["proposal_id"] = random.randint(0, 1000000)
-    return json.dumps(result, ensure_ascii=False)
+    return GenerateResearchProposalResponse(
+        proposals=[ResearchProposal.model_validate(proposal) for proposal in result]
+    )
+class ScoredProposal(BaseModel):  # type: ignore
+    proposal_id: int = Field(default=0, description="ID of the proposal")
+    spark: str = Field(description="4-6 word summary")
+    strengths: List[str] = Field(description="A list of strengths of the proposal")
+    weaknesses: List[str] = Field(description="A list of weaknesses of the proposal")
+    novelty: int = Field(description="Novelty rating from 1 to 4")
+    clarity: int = Field(description="Clarity rating from 1 to 4")
+    significance: int = Field(description="Significance rating from 1 to 4")
+    feasibility: int = Field(description="Feasibility rating from 1 to 4")
+    soundness: int = Field(description="Soundness rating from 1 to 4")
+    overall: int = Field(description="Overall rating from 1 to 10")
-async def score_research_proposals(proposals: str | List[str | Dict[str, Any] | Any]) -> str:
+class ScoreResearchProposalsResponse(BaseModel):  # type: ignore
+    proposals: List[ScoredProposal] = Field(description="List of scored proposals")
+async def score_research_proposals(
+    proposals: str | List[str | Dict[str, Any] | Any],
+) -> ScoreResearchProposalsResponse:
     """
     Scores a list of research proposals.
     Use proposals obtained with the `generate_research_proposal` tool.
-    Returns a JSON string with a list of scores in this format:
-    [
-        {
-            "proposal_id": 0,
-            "spark": "...",
-            "strengths": ["...", "..."],
-            "weaknesses": ["...", "..."],
-            "novelty": 2,
-            "clarity": 2,
-            "significance": 2,
-            "feasibility": 2,
-            "soundness": 2,
-            "overall": 5
-        },
-        ...
-    ]
-    Use `json.loads` to deserialize the result if you want to get specific fields.
     Args:
         proposals: A list of JSON strings with research proposals.
     """
     model_name = settings.BITFLIP_MODEL_NAME
     if isinstance(proposals, str):
         proposals = json.loads(proposals)
-        assert isinstance(proposals, list), "Proposals should be a list of JSON strings"
-    prompt = encode_prompt(SCORE_PROMPT, proposals=[str(p) for p in proposals])
+        assert isinstance(proposals, list), "Proposals should be a list"
+    if isinstance(proposals, list):
+        proposals = [str(p) for p in proposals]
+    prompt = encode_prompt(SCORE_PROMPT, proposals=proposals)
     content = await llm_acall(
         model_name=model_name,
         messages=[ChatMessage(role="user", content=prompt)],
         temperature=0.0,
     )
     scores = extract_json(content)
-    return json.dumps(scores, ensure_ascii=False)
+    return ScoreResearchProposalsResponse(
+        proposals=[ScoredProposal.model_validate(score) for score in scores]
+    )

academia-mcp 1.10.9__tar.gz → 1.11.1__tar.gz

academia-mcp 1.10.9__tar.gz → 1.11.1__tar.gz