academia-mcp 1.11.0__tar.gz → 1.11.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/PKG-INFO +1 -1
  2. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/server.py +12 -12
  3. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/arxiv_download.py +30 -33
  4. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/arxiv_search.py +6 -6
  5. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/bitflip.py +63 -60
  6. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/s2.py +50 -40
  7. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/visit_webpage.py +21 -13
  8. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/web_search.py +42 -35
  9. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/utils.py +2 -0
  10. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp.egg-info/PKG-INFO +1 -1
  11. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/pyproject.toml +2 -1
  12. academia_mcp-1.11.2/tests/test_arxiv_download.py +35 -0
  13. academia_mcp-1.11.2/tests/test_bitflip.py +52 -0
  14. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_document_qa.py +1 -1
  15. academia_mcp-1.11.2/tests/test_s2.py +42 -0
  16. academia_mcp-1.11.2/tests/test_server.py +81 -0
  17. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_visit_webpage.py +16 -10
  18. academia_mcp-1.11.2/tests/test_web_search.py +59 -0
  19. academia_mcp-1.11.0/tests/test_arxiv_download.py +0 -25
  20. academia_mcp-1.11.0/tests/test_bitflip.py +0 -54
  21. academia_mcp-1.11.0/tests/test_s2.py +0 -44
  22. academia_mcp-1.11.0/tests/test_server.py +0 -35
  23. academia_mcp-1.11.0/tests/test_web_search.py +0 -55
  24. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/LICENSE +0 -0
  25. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/README.md +0 -0
  26. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/__init__.py +0 -0
  27. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/__main__.py +0 -0
  28. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/files.py +0 -0
  29. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/latex_templates/agents4science_2025/agents4science_2025.sty +0 -0
  30. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/latex_templates/agents4science_2025/agents4science_2025.tex +0 -0
  31. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/llm.py +0 -0
  32. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/pdf.py +0 -0
  33. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/py.typed +0 -0
  34. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/settings.py +0 -0
  35. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/__init__.py +0 -0
  36. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/anthology_search.py +0 -0
  37. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/document_qa.py +0 -0
  38. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/hf_datasets_search.py +0 -0
  39. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/latex.py +0 -0
  40. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/py.typed +0 -0
  41. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/review.py +0 -0
  42. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/show_image.py +0 -0
  43. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/speech_to_text.py +0 -0
  44. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/yt_transcript.py +0 -0
  45. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp.egg-info/SOURCES.txt +0 -0
  46. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp.egg-info/dependency_links.txt +0 -0
  47. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp.egg-info/entry_points.txt +0 -0
  48. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp.egg-info/requires.txt +0 -0
  49. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp.egg-info/top_level.txt +0 -0
  50. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/setup.cfg +0 -0
  51. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_anthology_search.py +0 -0
  52. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_arxiv_search.py +0 -0
  53. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_extract_json.py +0 -0
  54. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_hf_dataset_search.py +0 -0
  55. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_latex.py +0 -0
  56. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_review.py +0 -0
  57. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_show_image.py +0 -0
  58. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_speech_to_text.py +0 -0
  59. {academia_mcp-1.11.0 → academia_mcp-1.11.2}/tests/test_yt_transcript.py +0 -0
{academia_mcp-1.11.0 → academia_mcp-1.11.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academia-mcp
- Version: 1.11.0
+ Version: 1.11.2
  Summary: MCP server that provides different tools to search for scientific publications
  Author-email: Ilya Gusev <phoenixilya@gmail.com>
  Project-URL: Homepage, https://github.com/IlyaGusev/academia_mcp
{academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/server.py
@@ -81,16 +81,16 @@ def create_server(
      logger = logging.getLogger(__name__)

      server.add_tool(arxiv_search, structured_output=True)
-     server.add_tool(arxiv_download)
-     server.add_tool(s2_get_citations)
-     server.add_tool(s2_get_references)
+     server.add_tool(arxiv_download, structured_output=True)
+     server.add_tool(visit_webpage, structured_output=True)
+     server.add_tool(s2_get_citations, structured_output=True)
+     server.add_tool(s2_get_references, structured_output=True)
+     server.add_tool(s2_get_info, structured_output=True)
      server.add_tool(s2_corpus_id_from_arxiv_id)
-     server.add_tool(s2_get_info)
      server.add_tool(hf_datasets_search)
      server.add_tool(anthology_search)
      server.add_tool(get_latex_template)
      server.add_tool(get_latex_templates_list)
-     server.add_tool(visit_webpage)
      server.add_tool(show_image)
      server.add_tool(yt_transcript)

@@ -105,20 +105,20 @@ def create_server(

      if not disable_web_search_tools:
          if settings.TAVILY_API_KEY:
-             server.add_tool(tavily_web_search)
+             server.add_tool(tavily_web_search, structured_output=True)
          if settings.EXA_API_KEY:
-             server.add_tool(exa_web_search)
+             server.add_tool(exa_web_search, structured_output=True)
          if settings.BRAVE_API_KEY:
-             server.add_tool(brave_web_search)
+             server.add_tool(brave_web_search, structured_output=True)
          if settings.EXA_API_KEY or settings.BRAVE_API_KEY or settings.TAVILY_API_KEY:
-             server.add_tool(web_search)
+             server.add_tool(web_search, structured_output=True)
          else:
              logger.warning("No web search tools keys are set, web_search will not be available!")

      if not disable_llm_tools and settings.OPENROUTER_API_KEY:
-         server.add_tool(extract_bitflip_info)
-         server.add_tool(generate_research_proposals)
-         server.add_tool(score_research_proposals)
+         server.add_tool(extract_bitflip_info, structured_output=True)
+         server.add_tool(generate_research_proposals, structured_output=True)
+         server.add_tool(score_research_proposals, structured_output=True)
      server.add_tool(document_qa)
      server.add_tool(describe_image)
      if settings.WORKSPACE_DIR:
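The change above registers tools with structured_output=True, so each tool's Pydantic return type (e.g. DownloadResponse in the next file) is exposed as an output schema instead of a plain string result. A minimal sketch of the registration pattern, assuming the FastMCP class from the MCP Python SDK (the import path and server name are assumptions; only add_tool(..., structured_output=True) appears in this diff):

    # Sketch under assumptions: FastMCP from the MCP Python SDK.
    from mcp.server.fastmcp import FastMCP

    from academia_mcp.tools.arxiv_download import arxiv_download

    server = FastMCP("academia-mcp")  # hypothetical server name
    # With structured_output=True, the tool's Pydantic return annotation becomes
    # an output schema, so clients receive structured content, not a JSON string.
    server.add_tool(arxiv_download, structured_output=True)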
{academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/arxiv_download.py
@@ -3,19 +3,17 @@
  # https://github.com/bytedance/pasa/blob/main/utils.py

  import re
- import json
  import tempfile
  from pathlib import Path
- from typing import Any, List, Optional, Dict
- from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional

- import requests
  import bs4
+ import requests
  from markdownify import MarkdownConverter  # type: ignore
+ from pydantic import BaseModel, Field

+ from academia_mcp.pdf import download_pdf, parse_pdf_file
  from academia_mcp.utils import get_with_retries
- from academia_mcp.pdf import parse_pdf_file, download_pdf
-

  HTML_URL = "https://arxiv.org/html/{paper_id}"
  ABS_URL = "https://arxiv.org/abs/{paper_id}"
@@ -28,12 +26,24 @@ SECTION_STOP_WORDS = (
  )


- @dataclass
- class TOCEntry:
+ class DownloadResponse(BaseModel):  # type: ignore
+     title: str = Field(description="Title of the paper")
+     abstract: str = Field(description="Abstract of the paper")
+     toc: str = Field(description="Table of Contents", default="")
+     sections: Optional[List[str]] = Field(description="Sections of the paper", default=None)
+     references: Optional[List[Dict[str, Any]]] = Field(
+         description="Parsed references from the paper", default=None
+     )
+     original_format: str = Field(
+         description="Original format of the paper (pdf or html)", default="html"
+     )
+
+
+ class TOCEntry(BaseModel):  # type: ignore
      level: int
      title: str
      html_id: Optional[str] = None
-     subsections: List["TOCEntry"] = field(default_factory=list)
+     subsections: List["TOCEntry"] = Field(default_factory=list)

      def linearize(self) -> List["TOCEntry"]:
          entries = [self]
@@ -196,7 +206,7 @@ def _parse_citation_metadata(metas: List[str]) -> Dict[str, Any]:
      return result


- def _extract_citations(soup_biblist: bs4.element.Tag) -> List[Dict[str, Any]]:
+ def _extract_references(soup_biblist: bs4.element.Tag) -> List[Dict[str, Any]]:
      extracted = []
      for li in soup_biblist.find_all("li", recursive=False):
          metas = [x.text.strip() for x in li.find_all("span", class_="ltx_bibblock")]
@@ -214,17 +224,17 @@ def _parse_html(paper_id: str) -> Dict[str, Any]:
      article = soup.article
      assert article and isinstance(article, bs4.element.Tag)

-     citations = []
+     references = []
      biblist_tag = article.find(class_="ltx_biblist")
      if biblist_tag and isinstance(biblist_tag, bs4.element.Tag):
-         citations = _extract_citations(biblist_tag)
+         references = _extract_references(biblist_tag)

      toc = _generate_toc(article)
      sections = _build_by_toc(toc, article, url)
      return {
          "toc": toc.to_str(),
          "sections": sections,
-         "citations": citations,
+         "references": references,
          "original_format": "html",
      }

@@ -255,36 +265,24 @@ def _parse_pdf(paper_id: str) -> Dict[str, Any]:
      return {
          "toc": "\n".join([f"Page {page_number}" for page_number in range(1, len(pages) + 1)]),
          "sections": pages,
-         "citations": [],
+         "references": [],
          "original_format": "pdf",
      }


  def arxiv_download(
      paper_id: str,
-     include_citations: Optional[bool] = False,
+     include_references: Optional[bool] = False,
      mode: Optional[str] = "html",
- ) -> str:
+ ) -> DownloadResponse:
      """
      Downloads a paper from Arxiv and converts it to text.
      Use mode = "html" by default.
      Fall back to mode = "pdf" if there are any problems with the HTML version.

-     Returns a JSON with a following structure:
-     {
-         "title": "...",
-         "abstract": "...",
-         "toc": "...",
-         "sections": ["...", ...],
-         "citations": [...]
-     }
-     Use `json.loads` to deserialize the result if you want to get specific fields.
-     For example, `abstract = json.loads(arxiv_download("2409.06820v1"))`
-     The "toc" key contains Table of Contents, that sometimes has indexing for sections.
-
      Args:
          paper_id: ID of the paper on Arxiv. For instance: 2409.06820v1
-         include_citations: include "citations" in the result or not. False by default.
+         include_references: include "references" in the result or not. False by default.
          mode: Which version of paper to use. Options: ["html", "pdf"]. "html" by default.
      """

@@ -297,7 +295,6 @@ def arxiv_download(
      else:
          content = _parse_pdf(paper_id)

-     if not include_citations and "citations" in content:
-         content.pop("citations")
-
-     return json.dumps({**abs_meta, **content}, ensure_ascii=False)
+     if not include_references and "references" in content:
+         content.pop("references")
+     return DownloadResponse(**{**abs_meta, **content})
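With the DownloadResponse model above, callers get typed attribute access instead of json.loads. A minimal consumption sketch (caller code is illustrative, not part of the package; the paper ID comes from the docstring example, and field names come from the model in this diff):

    from academia_mcp.tools.arxiv_download import arxiv_download

    paper = arxiv_download("2409.06820v1")  # DownloadResponse, not a JSON string
    print(paper.title)
    print(paper.abstract)
    print(paper.toc)  # Table of Contents; defaults to "" in the model

    # references are only populated when explicitly requested
    with_refs = arxiv_download("2409.06820v1", include_references=True)
    if with_refs.references:
        print(len(with_refs.references))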
{academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/arxiv_search.py
@@ -3,8 +3,8 @@
  # https://info.arxiv.org/help/api/user-manual.html

  import re
- from typing import Optional, List, Dict, Any, Union
- from datetime import datetime, date
+ from datetime import date, datetime
+ from typing import Any, Dict, List, Optional, Union

  import xmltodict
  from pydantic import BaseModel, Field
@@ -30,10 +30,10 @@ class ArxivSearchEntry(BaseModel):  # type: ignore


  class ArxivSearchResponse(BaseModel):  # type: ignore
-     total_count: int = Field(description="The total number of results")
-     returned_count: int = Field(description="The number of results returned")
-     offset: int = Field(description="The offset of the results")
-     results: List[ArxivSearchEntry] = Field(description="The results, search entries")
+     total_count: int = Field(description="Total number of results")
+     returned_count: int = Field(description="Number of results returned")
+     offset: int = Field(description="Offset for pagination")
+     results: List[ArxivSearchEntry] = Field(description="Search entries")


  def _format_text_field(text: str) -> str:
{academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/bitflip.py
@@ -1,17 +1,18 @@
+ # Based on
  # https://arxiv.org/abs/2504.12976
  # https://web.stanford.edu/class/cs197c/slides/02-literature-search.pdf

  import json
  import random
- from typing import List, Optional, Any, Dict
+ from typing import Any, Dict, List, Optional

- from pydantic import BaseModel
  from datasets import load_dataset  # type: ignore
+ from pydantic import BaseModel, Field

- from academia_mcp.tools.arxiv_download import arxiv_download
- from academia_mcp.utils import extract_json, encode_prompt
- from academia_mcp.llm import llm_acall, ChatMessage
+ from academia_mcp.llm import ChatMessage, llm_acall
  from academia_mcp.settings import settings
+ from academia_mcp.tools.arxiv_download import arxiv_download
+ from academia_mcp.utils import encode_prompt, extract_json


  class ProposalDataset:
@@ -128,7 +129,7 @@ Return only the JSON list of proposals in this exact format:
      "spark": "4-6 word summary",
      "abstract": "An abstract that summarizes the proposal in conference format (approximately 250 words).",
      "experiments": ["...", "..."],
-     "risks_and_limitations": "A list of potential risks and limitations of the proposal."
+     "risks_and_limitations": ["...", "..."]
  },
  ...
]
@@ -177,12 +178,12 @@


  class BitFlipInfo(BaseModel):  # type: ignore
-     bit: str
-     flip: str
-     spark: str
+     bit: str = Field(description="Technical limitation or conventional approach")
+     flip: str = Field(description="Innovative approach or solution")
+     spark: str = Field(description="4-6 word summary")


- async def extract_bitflip_info(arxiv_id: str) -> str:
+ async def extract_bitflip_info(arxiv_id: str) -> BitFlipInfo:
      """
      Extracts the Bit-Flip information from the arXiv paper.

@@ -190,20 +191,12 @@ async def extract_bitflip_info(arxiv_id: str) -> str:
      questioning existing constraints or reapplying techniques to new domains/scales.
      The "Bit" is the prevailing belief, and the "Flip" is the counterargument.

-     Returns a JSON object in this format:
-     {
-         "bit": "Technical limitation or conventional approach, in at least two sentences",
-         "flip": "Innovative approach or solution, in at least two sentences",
-         "spark": "4-6 word summary of the core idea"
-     }
-     Use `json.loads` to deserialize the result if you want to get specific fields.
-
      Args:
          arxiv_id: The arXiv ID of the paper to extract the Bit-Flip information from.
      """
      model_name = settings.BITFLIP_MODEL_NAME
      paper = arxiv_download(arxiv_id)
-     abstract = json.loads(paper)["abstract"]
+     abstract = paper.abstract
      prompt = encode_prompt(EXTRACT_PROMPT, abstract=abstract)
      content = await llm_acall(
          model_name=model_name,
@@ -212,12 +205,31 @@ async def extract_bitflip_info(arxiv_id: str) -> str:
      )
      result = extract_json(content)
      bitflip_info: BitFlipInfo = BitFlipInfo.model_validate(result)
-     return str(bitflip_info.model_dump_json())
+     return bitflip_info
+
+
+ class ResearchProposal(BaseModel):  # type: ignore
+     proposal_id: int = Field(default=0, description="ID of the proposal")
+     flip: str = Field(description="Innovative approach or solution, in at least two sentences")
+     spark: str = Field(description="4-6 word summary")
+     abstract: str = Field(
+         description="An abstract that summarizes the proposal in conference format."
+     )
+     experiments: List[str] = Field(
+         description="A list of experiments that would be conducted to validate the proposal."
+     )
+     risks_and_limitations: List[str] = Field(
+         description="A list of potential risks and limitations of the proposal."
+     )
+
+
+ class GenerateResearchProposalResponse(BaseModel):  # type: ignore
+     proposals: List[ResearchProposal] = Field(description="A list of research proposals")


  async def generate_research_proposals(
      bit: str, num_proposals: int = 3, additional_context: str = ""
- ) -> str:
+ ) -> GenerateResearchProposalResponse:
      """
      Proposes improvement ideas for the Bit.

@@ -225,20 +237,6 @@ async def generate_research_proposals(
          bit: The Bit to propose improvement ideas for. The bit is a technical limitation or conventional approach of some paper.
          num_proposals: The number of proposals to generate.
          additional_context: Additional context to use when proposing the improvement idea.
-
-     Returns a JSON string with a research proposal in this format:
-     [
-         {
-             "proposal_id": ...,
-             "flip": "Innovative approach or solution, in at least two sentences",
-             "spark": "4-6 word summary",
-             "abstract": "An abstract that summarizes the proposal in conference format (approximately 250 words).",
-             "experiments": ["...", "..."],
-             "risks_and_limitations": "A list of potential risks and limitations of the proposal."
-         },
-         ...
-     ]
-     Use `json.loads` to deserialize the result if you want to get specific items.
      """
      model_name = settings.BITFLIP_MODEL_NAME
      max_completion_tokens = int(settings.BITFLIP_MAX_COMPLETION_TOKENS)
@@ -262,46 +260,51 @@ async def generate_research_proposals(
          temperature=1.0,
      )
      result = extract_json(content)
-     for proposal in result:
-         proposal["proposal_id"] = random.randint(0, 1000000)
-     return json.dumps(result, ensure_ascii=False)
+     return GenerateResearchProposalResponse(
+         proposals=[ResearchProposal.model_validate(proposal) for proposal in result]
+     )
+
+
+ class ScoredProposal(BaseModel):  # type: ignore
+     proposal_id: int = Field(default=0, description="ID of the proposal")
+     spark: str = Field(description="4-6 word summary")
+     strengths: List[str] = Field(description="A list of strengths of the proposal")
+     weaknesses: List[str] = Field(description="A list of weaknesses of the proposal")
+     novelty: int = Field(description="Novelty rating from 1 to 4")
+     clarity: int = Field(description="Clarity rating from 1 to 4")
+     significance: int = Field(description="Significance rating from 1 to 4")
+     feasibility: int = Field(description="Feasibility rating from 1 to 4")
+     soundness: int = Field(description="Soundness rating from 1 to 4")
+     overall: int = Field(description="Overall rating from 1 to 10")


- async def score_research_proposals(proposals: str | List[str | Dict[str, Any] | Any]) -> str:
+ class ScoreResearchProposalsResponse(BaseModel):  # type: ignore
+     proposals: List[ScoredProposal] = Field(description="List of scored proposals")
+
+
+ async def score_research_proposals(
+     proposals: str | List[str | Dict[str, Any] | Any],
+ ) -> ScoreResearchProposalsResponse:
      """
      Scores a list of research proposals.
      Use proposals obtained with the `generate_research_proposal` tool.

-     Returns a JSON string with a list of scores in this format:
-     [
-         {
-             "proposal_id": 0,
-             "spark": "...",
-             "strengths": ["...", "..."],
-             "weaknesses": ["...", "..."],
-             "novelty": 2,
-             "clarity": 2,
-             "significance": 2,
-             "feasibility": 2,
-             "soundness": 2,
-             "overall": 5
-         },
-         ...
-     ]
-     Use `json.loads` to deserialize the result if you want to get specific fields.
-
      Args:
          proposals: A list of JSON strings with research proposals.
      """
      model_name = settings.BITFLIP_MODEL_NAME
      if isinstance(proposals, str):
          proposals = json.loads(proposals)
-     assert isinstance(proposals, list), "Proposals should be a list of JSON strings"
-     prompt = encode_prompt(SCORE_PROMPT, proposals=[str(p) for p in proposals])
+     assert isinstance(proposals, list), "Proposals should be a list"
+     if isinstance(proposals, list):
+         proposals = [str(p) for p in proposals]
+     prompt = encode_prompt(SCORE_PROMPT, proposals=proposals)
      content = await llm_acall(
          model_name=model_name,
          messages=[ChatMessage(role="user", content=prompt)],
          temperature=0.0,
      )
      scores = extract_json(content)
-     return json.dumps(scores, ensure_ascii=False)
+     return ScoreResearchProposalsResponse(
+         proposals=[ScoredProposal.model_validate(score) for score in scores]
+     )
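The three bitflip tools now return Pydantic models end to end, so they compose without intermediate json.loads/json.dumps. A hedged usage sketch (caller code only; function and field names are taken from the diff above, the arXiv ID is illustrative):

    import asyncio

    from academia_mcp.tools.bitflip import (
        extract_bitflip_info,
        generate_research_proposals,
        score_research_proposals,
    )

    async def main() -> None:
        info = await extract_bitflip_info("2409.06820v1")  # BitFlipInfo
        generated = await generate_research_proposals(info.bit, num_proposals=3)
        # score_research_proposals still accepts a list; items are stringified internally
        scored = await score_research_proposals(
            [p.model_dump() for p in generated.proposals]
        )
        for s in scored.proposals:
            print(s.spark, s.overall)

    asyncio.run(main())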
{academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/s2.py
@@ -1,9 +1,10 @@
  # Based on
  # https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_citations

- import json
  from typing import Optional, List, Dict, Any

+ from pydantic import BaseModel, Field
+
  from academia_mcp.utils import get_with_retries


@@ -13,42 +14,58 @@ REFERENCES_URL_TEMPLATE = "https://api.semanticscholar.org/graph/v1/paper/{paper
  FIELDS = "title,authors,externalIds,venue,citationCount,publicationDate"


+ class S2PaperInfo(BaseModel):  # type: ignore
+     arxiv_id: Optional[str] = Field(description="ArXiv ID of the paper", default=None)
+     external_ids: Optional[Dict[str, Any]] = Field(
+         description="External IDs of the paper.", default=None
+     )
+     title: str = Field(description="Paper title")
+     authors: List[str] = Field(description="Authors of the paper")
+     venue: str = Field(description="Paper venue")
+     citation_count: Optional[int] = Field(description="Paper citation count", default=None)
+     publication_date: Optional[str] = Field(description="Paper publication date", default=None)
+
+
+ class S2SearchResponse(BaseModel):  # type: ignore
+     total_count: int = Field(description="Total number of results.")
+     returned_count: int = Field(description="Number of results returned.")
+     offset: int = Field(description="Offset of the results.")
+     results: List[S2PaperInfo] = Field(description="Search entries")
+
+
  def _format_authors(authors: List[Dict[str, Any]]) -> List[str]:
      return [a["name"] for a in authors]


- def _clean_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
+ def _clean_entry(entry: Dict[str, Any]) -> S2PaperInfo:
      entry = entry["citingPaper"] if "citingPaper" in entry else entry["citedPaper"]
      external_ids = entry.get("externalIds")
      if not external_ids:
          external_ids = dict()
      external_ids.pop("CorpusId", None)
      arxiv_id = external_ids.pop("ArXiv", None)
-     return {
-         "arxiv_id": arxiv_id,
-         "external_ids": external_ids if external_ids else None,
-         "title": entry["title"],
-         "authors": _format_authors(entry["authors"]),
-         "venue": entry.get("venue", ""),
-         "citation_count": entry.get("citationCount", 0),
-         "publication_date": entry.get("publicationDate", ""),
-     }
+     return S2PaperInfo(
+         arxiv_id=arxiv_id,
+         external_ids=external_ids if external_ids else None,
+         title=entry["title"],
+         authors=_format_authors(entry["authors"]),
+         venue=entry.get("venue", ""),
+         citation_count=entry.get("citationCount"),
+         publication_date=entry.get("publicationDate"),
+     )


  def _format_entries(
      entries: List[Dict[str, Any]],
      start_index: int,
      total_results: int,
- ) -> str:
+ ) -> S2SearchResponse:
      clean_entries = [_clean_entry(e) for e in entries]
-     return json.dumps(
-         {
-             "total_count": total_results,
-             "returned_count": len(entries),
-             "offset": start_index,
-             "results": clean_entries,
-         },
-         ensure_ascii=False,
+     return S2SearchResponse(
+         total_count=total_results,
+         returned_count=len(entries),
+         offset=start_index,
+         results=clean_entries,
      )


@@ -56,16 +73,10 @@ def s2_get_citations(
      arxiv_id: str,
      offset: Optional[int] = 0,
      limit: Optional[int] = 50,
- ) -> str:
+ ) -> S2SearchResponse:
      """
      Get all papers that cited a given arXiv paper based on Semantic Scholar info.

-     Returns a JSON object serialized to a string. The structure is:
-     {"total_count": ..., "returned_count": ..., "offset": ..., "results": [...]}
-     Every item in the "results" has the following fields:
-     ("arxiv_id", "external_ids", "title", "authors", "venue", "citation_count", "publication_date")
-     Use `json.loads` to deserialize the result if you want to get specific fields.
-
      Args:
          arxiv_id: The ID of a given arXiv paper.
          offset: The offset to scroll through citations. 10 items will be skipped if offset=10. 0 by default.
@@ -98,16 +109,10 @@ def s2_get_references(
      arxiv_id: str,
      offset: Optional[int] = 0,
      limit: Optional[int] = 50,
- ) -> str:
+ ) -> S2SearchResponse:
      """
      Get all papers that were cited by a given arXiv paper (references) based on Semantic Scholar info.

-     Returns a JSON object serialized to a string. The structure is:
-     {"total_count": ..., "returned_count": ..., "offset": ..., "results": [...]}
-     Every item in the "results" has the following fields:
-     ("arxiv_id", "external_ids", "title", "authors", "venue", "citation_count", "publication_date")
-     Use `json.loads` to deserialize the result if you want to get specific fields.
-
      Args:
          arxiv_id: The ID of a given arXiv paper.
          offset: The offset to scroll through citations. 10 items will be skipped if offset=10. 0 by default.
@@ -144,14 +149,10 @@ def s2_corpus_id_from_arxiv_id(arxiv_id: str) -> int:
      return int(result["externalIds"]["CorpusId"])


- def s2_get_info(arxiv_id: str) -> str:
+ def s2_get_info(arxiv_id: str) -> S2PaperInfo:
      """
      Get the S2 info for a given arXiv ID.

-     Returns a JSON object serialized to a string. The structure is:
-     {"title": ..., "authors": ..., "externalIds": ..., "venue": ..., "citationCount": ..., "publicationDate": ...}
-     Use `json.loads` to deserialize the result if you want to get specific fields.
-
      Args:
          arxiv_id: The ID of a given arXiv paper.
      """
@@ -160,4 +161,13 @@ def s2_get_info(arxiv_id: str) -> str:
      arxiv_id = arxiv_id.split("v")[0]
      paper_url = PAPER_URL_TEMPLATE.format(paper_id=f"arxiv:{arxiv_id}", fields=FIELDS)
      response = get_with_retries(paper_url)
-     return json.dumps(response.json(), ensure_ascii=False)
+     json_data = response.json()
+     return S2PaperInfo(
+         arxiv_id=json_data.get("externalIds", {}).get("ArXiv"),
+         external_ids=json_data.get("externalIds", {}),
+         title=json_data["title"],
+         authors=_format_authors(json_data["authors"]),
+         venue=json_data.get("venue", ""),
+         citation_count=int(json_data.get("citationCount", 0)),
+         publication_date=str(json_data.get("publicationDate", "")),
+     )
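A hedged consumption sketch for the new S2 return types (caller code only; field names come from the S2PaperInfo and S2SearchResponse models above, and the arXiv ID is illustrative):

    from academia_mcp.tools.s2 import s2_get_citations, s2_get_info

    info = s2_get_info("2409.06820")  # S2PaperInfo
    print(info.title, info.citation_count)

    citations = s2_get_citations("2409.06820", offset=0, limit=5)  # S2SearchResponse
    print(citations.total_count, citations.returned_count)
    for entry in citations.results:
        print(entry.title, entry.arxiv_id)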
{academia_mcp-1.11.0 → academia_mcp-1.11.2}/academia_mcp/tools/visit_webpage.py
@@ -1,12 +1,11 @@
  import re
- import json
- from typing import Optional, Dict, Any, cast
+ from typing import Any, Dict, List, Optional

  from markdownify import markdownify  # type: ignore
+ from pydantic import BaseModel, Field

- from academia_mcp.utils import get_with_retries, post_with_retries
  from academia_mcp.settings import settings
- from academia_mcp.utils import sanitize_output
+ from academia_mcp.utils import get_with_retries, post_with_retries, sanitize_output

  EXA_CONTENTS_URL = "https://api.exa.ai/contents"
  TAVILY_EXTRACT_URL = "https://api.tavily.com/extract"
@@ -14,6 +13,16 @@ AVAILABLE_PROVIDERS = ("basic", "exa", "tavily")
  ERROR_MESSAGE = "Failed to get content from the page. Try to use another provider."


+ class VisitWebpageResponse(BaseModel):  # type: ignore
+     id: str = Field(description="ID of the webpage, usually the URL")
+     provider: str = Field(description="Provider used to get the content")
+     text: Optional[str] = Field(description="Text content of the webpage", default=None)
+     images: List[str] = Field(description="Images of the webpage", default_factory=list)
+     error: Optional[str] = Field(
+         description="Error message if the webpage is not found", default=None
+     )
+
+
  def _exa_visit_webpage(url: str) -> Dict[str, Any]:
      key = settings.EXA_API_KEY or ""
      assert key, "Error: EXA_API_KEY is not set and no api_key was provided"
@@ -25,7 +34,7 @@ def _exa_visit_webpage(url: str) -> Dict[str, Any]:
      results = response.json()["results"]
      if not results:
          return {"error": ERROR_MESSAGE}
-     return cast(Dict[str, Any], results[0])
+     return {"text": results[0]["text"]}


  def _tavily_visit_webpage(url: str) -> Dict[str, Any]:
@@ -61,13 +70,9 @@ def _basic_visit_webpage(url: str) -> Dict[str, Any]:
          return {"error": str(e) + "\n" + ERROR_MESSAGE}


- def visit_webpage(url: str, provider: Optional[str] = "basic") -> str:
+ def visit_webpage(url: str, provider: Optional[str] = "basic") -> VisitWebpageResponse:
      """
      Visit a webpage and return the content.
-
-     Returns a JSON object serialized to a string. The structure is: {"id": "...", "text": "..."}.
-     If there are errors, the structure is: {"id": "...", "error": "..."}.
-     Use `json.loads` to deserialize the result if you want to get specific fields.
      Try to use both "tavily" and "basic" providers. They might work differently for the same URL.

      Args:
@@ -85,6 +90,9 @@ def visit_webpage(url: str, provider: Optional[str] = "basic") -> str:
      else:
          result = _basic_visit_webpage(url)

-     result["id"] = url
-     result["provider"] = provider
-     return sanitize_output(json.dumps(result, ensure_ascii=False))
+     result = VisitWebpageResponse(id=url, provider=provider, **result)
+     if result.text:
+         result.text = sanitize_output(result.text)
+     if result.error:
+         result.error = sanitize_output(result.error)
+     return result
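A hedged consumption sketch for the typed webpage result (caller code only; fields come from the VisitWebpageResponse model above, and the URL is illustrative):

    from academia_mcp.tools.visit_webpage import visit_webpage

    page = visit_webpage("https://arxiv.org/abs/2409.06820", provider="basic")
    if page.error:  # errors are a typed field now, not a JSON key
        print(page.error)
    else:
        print((page.text or "")[:200])  # text is Optional and may be None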