academia-mcp 1.10.9__py3-none-any.whl → 1.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academia_mcp/server.py +41 -21
- academia_mcp/tools/arxiv_download.py +30 -33
- academia_mcp/tools/arxiv_search.py +43 -34
- academia_mcp/tools/bitflip.py +63 -60
- academia_mcp/tools/s2.py +50 -40
- academia_mcp/tools/show_image.py +21 -6
- academia_mcp/tools/visit_webpage.py +25 -14
- academia_mcp/tools/web_search.py +42 -35
- academia_mcp/utils.py +2 -0
- {academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/METADATA +1 -1
- {academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/RECORD +15 -15
- {academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/WHEEL +0 -0
- {academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/entry_points.txt +0 -0
- {academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/licenses/LICENSE +0 -0
- {academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/top_level.txt +0 -0
academia_mcp/server.py
CHANGED
@@ -63,35 +63,34 @@ def find_free_port() -> int:
     raise RuntimeError("No free port in range 5000-6000 found")
 
 
-def run(
-    host: str = "0.0.0.0",
-    port: Optional[int] = None,
-    mount_path: str = "/",
+def create_server(
     streamable_http_path: str = "/mcp",
-    transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http",
+    mount_path: str = "/",
+    stateless_http: bool = True,
     disable_web_search_tools: bool = False,
     disable_llm_tools: bool = False,
-) -> None:
-    configure_uvicorn_style_logging()
+    port: Optional[int] = None,
+    host: str = "0.0.0.0",
+) -> FastMCP:
     server = FastMCP(
         "Academia MCP",
-        stateless_http=True,
+        stateless_http=stateless_http,
         streamable_http_path=streamable_http_path,
         mount_path=mount_path,
     )
     logger = logging.getLogger(__name__)
 
-    server.add_tool(arxiv_search)
-    server.add_tool(arxiv_download)
-    server.add_tool(s2_get_citations)
-    server.add_tool(s2_get_references)
+    server.add_tool(arxiv_search, structured_output=True)
+    server.add_tool(arxiv_download, structured_output=True)
+    server.add_tool(visit_webpage, structured_output=True)
+    server.add_tool(s2_get_citations, structured_output=True)
+    server.add_tool(s2_get_references, structured_output=True)
+    server.add_tool(s2_get_info, structured_output=True)
     server.add_tool(s2_corpus_id_from_arxiv_id)
-    server.add_tool(s2_get_info)
     server.add_tool(hf_datasets_search)
     server.add_tool(anthology_search)
     server.add_tool(get_latex_template)
     server.add_tool(get_latex_templates_list)
-    server.add_tool(visit_webpage)
     server.add_tool(show_image)
     server.add_tool(yt_transcript)
 
@@ -106,20 +105,20 @@ def run(
 
     if not disable_web_search_tools:
         if settings.TAVILY_API_KEY:
-            server.add_tool(tavily_web_search)
+            server.add_tool(tavily_web_search, structured_output=True)
         if settings.EXA_API_KEY:
-            server.add_tool(exa_web_search)
+            server.add_tool(exa_web_search, structured_output=True)
         if settings.BRAVE_API_KEY:
-            server.add_tool(brave_web_search)
+            server.add_tool(brave_web_search, structured_output=True)
         if settings.EXA_API_KEY or settings.BRAVE_API_KEY or settings.TAVILY_API_KEY:
-            server.add_tool(web_search)
+            server.add_tool(web_search, structured_output=True)
         else:
            logger.warning("No web search tools keys are set, web_search will not be available!")
 
    if not disable_llm_tools and settings.OPENROUTER_API_KEY:
-        server.add_tool(extract_bitflip_info)
-        server.add_tool(generate_research_proposals)
-        server.add_tool(score_research_proposals)
+        server.add_tool(extract_bitflip_info, structured_output=True)
+        server.add_tool(generate_research_proposals, structured_output=True)
+        server.add_tool(score_research_proposals, structured_output=True)
        server.add_tool(document_qa)
        server.add_tool(describe_image)
        if settings.WORKSPACE_DIR:
@@ -140,6 +139,27 @@ def run(
 
     server.settings.port = port
     server.settings.host = host
+    return server
+
+
+def run(
+    host: str = "0.0.0.0",
+    port: Optional[int] = None,
+    mount_path: str = "/",
+    streamable_http_path: str = "/mcp",
+    transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http",
+    disable_web_search_tools: bool = False,
+    disable_llm_tools: bool = False,
+) -> None:
+    configure_uvicorn_style_logging()
+    server = create_server(
+        streamable_http_path=streamable_http_path,
+        mount_path=mount_path,
+        disable_web_search_tools=disable_web_search_tools,
+        disable_llm_tools=disable_llm_tools,
+        port=port,
+        host=host,
+    )
 
     if transport == "streamable-http":
         # Enable CORS for browser-based clients
academia_mcp/tools/arxiv_download.py
CHANGED
@@ -3,19 +3,17 @@
 # https://github.com/bytedance/pasa/blob/main/utils.py
 
 import re
-import json
 import tempfile
 from pathlib import Path
-from typing import Any, List, Optional
-from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
 
-import requests
 import bs4
+import requests
 from markdownify import MarkdownConverter  # type: ignore
+from pydantic import BaseModel, Field
 
+from academia_mcp.pdf import download_pdf, parse_pdf_file
 from academia_mcp.utils import get_with_retries
-from academia_mcp.pdf import parse_pdf_file, download_pdf
-
 
 HTML_URL = "https://arxiv.org/html/{paper_id}"
 ABS_URL = "https://arxiv.org/abs/{paper_id}"
@@ -28,12 +26,24 @@ SECTION_STOP_WORDS = (
 )
 
 
-@dataclass
-class TOCEntry:
+class DownloadResponse(BaseModel):  # type: ignore
+    title: str = Field(description="Title of the paper")
+    abstract: str = Field(description="Abstract of the paper")
+    toc: str = Field(description="Table of Contents", default="")
+    sections: Optional[List[str]] = Field(description="Sections of the paper", default=None)
+    references: Optional[List[Dict[str, Any]]] = Field(
+        description="Parsed references from the paper", default=None
+    )
+    original_format: str = Field(
+        description="Original format of the paper (pdf or html)", default="html"
+    )
+
+
+class TOCEntry(BaseModel):  # type: ignore
     level: int
     title: str
     html_id: Optional[str] = None
-    subsections: List["TOCEntry"] = field(default_factory=list)
+    subsections: List["TOCEntry"] = Field(default_factory=list)
 
     def linearize(self) -> List["TOCEntry"]:
         entries = [self]
@@ -196,7 +206,7 @@ def _parse_citation_metadata(metas: List[str]) -> Dict[str, Any]:
     return result
 
 
-def
+def _extract_references(soup_biblist: bs4.element.Tag) -> List[Dict[str, Any]]:
     extracted = []
     for li in soup_biblist.find_all("li", recursive=False):
         metas = [x.text.strip() for x in li.find_all("span", class_="ltx_bibblock")]
@@ -214,17 +224,17 @@ def _parse_html(paper_id: str) -> Dict[str, Any]:
     article = soup.article
     assert article and isinstance(article, bs4.element.Tag)
 
-    citations = []
+    references = []
     biblist_tag = article.find(class_="ltx_biblist")
     if biblist_tag and isinstance(biblist_tag, bs4.element.Tag):
-
+        references = _extract_references(biblist_tag)
 
     toc = _generate_toc(article)
     sections = _build_by_toc(toc, article, url)
     return {
         "toc": toc.to_str(),
         "sections": sections,
-        "citations": citations,
+        "references": references,
         "original_format": "html",
     }
 
@@ -255,36 +265,24 @@ def _parse_pdf(paper_id: str) -> Dict[str, Any]:
     return {
         "toc": "\n".join([f"Page {page_number}" for page_number in range(1, len(pages) + 1)]),
         "sections": pages,
-        "citations": [],
+        "references": [],
         "original_format": "pdf",
     }
 
 
 def arxiv_download(
     paper_id: str,
-
+    include_references: Optional[bool] = False,
     mode: Optional[str] = "html",
-) -> str:
+) -> DownloadResponse:
     """
     Downloads a paper from Arxiv and converts it to text.
     Use mode = "html" by default.
     Fall back to mode = "pdf" if there are any problems with the HTML version.
 
-    Returns a JSON with a following structure:
-    {
-        "title": "...",
-        "abstract": "...",
-        "toc": "...",
-        "sections": ["...", ...],
-        "citations": [...]
-    }
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-    For example, `abstract = json.loads(arxiv_download("2409.06820v1"))`
-    The "toc" key contains Table of Contents, that sometimes has indexing for sections.
-
     Args:
         paper_id: ID of the paper on Arxiv. For instance: 2409.06820v1
-
+        include_references: include "references" in the result or not. False by default.
         mode: Which version of paper to use. Options: ["html", "pdf"]. "html" by default.
     """
 
@@ -297,7 +295,6 @@ def arxiv_download(
     else:
         content = _parse_pdf(paper_id)
 
-    if not
-        content.pop("citations")
-
-    return json.dumps({**abs_meta, **content}, ensure_ascii=False)
+    if not include_references and "references" in content:
+        content.pop("references")
+    return DownloadResponse(**{**abs_meta, **content})
academia_mcp/tools/arxiv_search.py
CHANGED
@@ -2,12 +2,12 @@
 # https://github.com/jonatasgrosman/findpapers/blob/master/findpapers/searchers/arxiv_searcher.py
 # https://info.arxiv.org/help/api/user-manual.html
 
-import json
 import re
-from
-from
+from datetime import date, datetime
+from typing import Any, Dict, List, Optional, Union
 
 import xmltodict
+from pydantic import BaseModel, Field
 
 from academia_mcp.utils import get_with_retries
 
@@ -17,6 +17,25 @@ SORT_BY_OPTIONS = ("relevance", "lastUpdatedDate", "submittedDate")
 SORT_ORDER_OPTIONS = ("ascending", "descending")
 
 
+class ArxivSearchEntry(BaseModel):  # type: ignore
+    id: str = Field(description="Paper ID")
+    title: str = Field(description="Paper title")
+    authors: str = Field(description="Authors of the paper")
+    published: str = Field(description="Published date of the paper")
+    updated: str = Field(description="Updated date of the paper")
+    categories: str = Field(description="Categories of the paper")
+    comment: str = Field(description="Comment of the paper")
+    index: int = Field(description="Index of the paper", default=0)
+    abstract: Optional[str] = Field(description="Abstract of the paper", default=None)
+
+
+class ArxivSearchResponse(BaseModel):  # type: ignore
+    total_count: int = Field(description="Total number of results")
+    returned_count: int = Field(description="Number of results returned")
+    offset: int = Field(description="Offset for pagination")
+    results: List[ArxivSearchEntry] = Field(description="Search entries")
+
+
 def _format_text_field(text: str) -> str:
     return " ".join([line.strip() for line in text.split() if line.strip()])
 
@@ -48,17 +67,17 @@ def _format_date(date: str) -> str:
     return dt.strftime("%B %d, %Y")
 
 
-def _clean_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
-    return {
-        "id": entry["id"].split("/")[-1],
-        "title": _format_text_field(entry["title"]),
-        "authors": _format_authors(entry["author"]),
-        "abstract": _format_text_field(entry["summary"]),
-        "published": _format_date(entry["published"]),
-        "updated": _format_date(entry["updated"]),
-        "categories": _format_categories(entry.get("category", {})),
-        "comment": _format_text_field(entry.get("arxiv:comment", {}).get("#text", "")),
-    }
+def _clean_entry(entry: Dict[str, Any]) -> ArxivSearchEntry:
+    return ArxivSearchEntry(
+        id=entry["id"].split("/")[-1],
+        title=_format_text_field(entry["title"]),
+        authors=_format_authors(entry["author"]),
+        abstract=_format_text_field(entry["summary"]),
+        published=_format_date(entry["published"]),
+        updated=_format_date(entry["updated"]),
+        categories=_format_categories(entry.get("category", {})),
+        comment=_format_text_field(entry.get("arxiv:comment", {}).get("#text", "")),
+    )
 
 
 def _convert_to_yyyymmddtttt(date_str: str) -> str:
@@ -105,22 +124,19 @@ def _format_entries(
     start_index: int,
     include_abstracts: bool,
     total_results: int,
-) -> str:
+) -> ArxivSearchResponse:
     clean_entries: List[Dict[str, Any]] = []
     for entry_num, entry in enumerate(entries):
         clean_entry = _clean_entry(entry)
         if not include_abstracts:
-            clean_entry.pop("abstract")
-        clean_entry["index"] = start_index + entry_num
+            clean_entry.abstract = None
+        clean_entry.index = start_index + entry_num
         clean_entries.append(clean_entry)
-    return json.dumps(
-        {
-            "total_count": total_results,
-            "returned_count": len(entries),
-            "offset": start_index,
-            "results": clean_entries,
-        },
-        ensure_ascii=False,
+    return ArxivSearchResponse(
+        total_count=total_results,
+        returned_count=len(entries),
+        offset=start_index,
+        results=clean_entries,
     )
 
 
@@ -133,7 +149,7 @@ def arxiv_search(
     sort_by: Optional[str] = "relevance",
     sort_order: Optional[str] = "descending",
     include_abstracts: Optional[bool] = False,
-) -> str:
+) -> ArxivSearchResponse:
     """
     Search arXiv papers with field-specific queries.
 
@@ -158,12 +174,6 @@
         all:role OR all:playing OR all:"language model"
         (au:vaswani OR au:"del maestro") ANDNOT ti:attention
 
-    Returns a JSON object serialized to a string. The structure is:
-    {"total_count": ..., "returned_count": ..., "offset": ..., "results": [...]}
-    Every item in the "results" has the following fields:
-    ("index", "id", "title", "authors", "abstract", "published", "updated", "categories", "comment")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         query: The search query, required.
         offset: The offset to scroll search results. 10 items will be skipped if offset=10. 0 by default.
@@ -211,10 +221,9 @@
     entries = feed.get("entry", [])
     if isinstance(entries, dict):
         entries = [entries]
-    formatted_entries = _format_entries(
+    return _format_entries(
         entries,
         start_index=start_index,
         total_results=total_results,
        include_abstracts=include_abstracts,
     )
-    return formatted_entries
academia_mcp/tools/bitflip.py
CHANGED
@@ -1,17 +1,18 @@
+# Based on
 # https://arxiv.org/abs/2504.12976
 # https://web.stanford.edu/class/cs197c/slides/02-literature-search.pdf
 
 import json
 import random
-from typing import
+from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel
 from datasets import load_dataset  # type: ignore
+from pydantic import BaseModel, Field
 
-from academia_mcp.tools.arxiv_download import arxiv_download
-from academia_mcp.utils import extract_json, encode_prompt
-from academia_mcp.llm import llm_acall, ChatMessage
+from academia_mcp.llm import ChatMessage, llm_acall
 from academia_mcp.settings import settings
+from academia_mcp.tools.arxiv_download import arxiv_download
+from academia_mcp.utils import encode_prompt, extract_json
 
 
 class ProposalDataset:
@@ -128,7 +129,7 @@ Return only the JSON list of proposals in this exact format:
         "spark": "4-6 word summary",
         "abstract": "An abstract that summarizes the proposal in conference format (approximately 250 words).",
         "experiments": ["...", "..."],
-        "risks_and_limitations": "A list of potential risks and limitations of the proposal."
+        "risks_and_limitations": ["...", "..."]
     },
     ...
 ]
@@ -177,12 +178,12 @@ Return only scores for all proposals in this exact format (no extra text):
 
 
 class BitFlipInfo(BaseModel):  # type: ignore
-    bit: str
-    flip: str
-    spark: str
+    bit: str = Field(description="Technical limitation or conventional approach")
+    flip: str = Field(description="Innovative approach or solution")
+    spark: str = Field(description="4-6 word summary")
 
 
-async def extract_bitflip_info(arxiv_id: str) -> str:
+async def extract_bitflip_info(arxiv_id: str) -> BitFlipInfo:
     """
     Extracts the Bit-Flip information from the arXiv paper.
 
@@ -190,20 +191,12 @@ async def extract_bitflip_info(arxiv_id: str) -> str:
     questioning existing constraints or reapplying techniques to new domains/scales.
     The "Bit" is the prevailing belief, and the "Flip" is the counterargument.
 
-    Returns a JSON object in this format:
-    {
-        "bit": "Technical limitation or conventional approach, in at least two sentences",
-        "flip": "Innovative approach or solution, in at least two sentences",
-        "spark": "4-6 word summary of the core idea"
-    }
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         arxiv_id: The arXiv ID of the paper to extract the Bit-Flip information from.
     """
     model_name = settings.BITFLIP_MODEL_NAME
     paper = arxiv_download(arxiv_id)
-    abstract = json.loads(paper)["abstract"]
+    abstract = paper.abstract
     prompt = encode_prompt(EXTRACT_PROMPT, abstract=abstract)
     content = await llm_acall(
         model_name=model_name,
@@ -212,12 +205,31 @@ async def extract_bitflip_info(arxiv_id: str) -> str:
     )
     result = extract_json(content)
     bitflip_info: BitFlipInfo = BitFlipInfo.model_validate(result)
-    return
+    return bitflip_info
+
+
+class ResearchProposal(BaseModel):  # type: ignore
+    proposal_id: int = Field(default=0, description="ID of the proposal")
+    flip: str = Field(description="Innovative approach or solution, in at least two sentences")
+    spark: str = Field(description="4-6 word summary")
+    abstract: str = Field(
+        description="An abstract that summarizes the proposal in conference format."
+    )
+    experiments: List[str] = Field(
+        description="A list of experiments that would be conducted to validate the proposal."
+    )
+    risks_and_limitations: List[str] = Field(
+        description="A list of potential risks and limitations of the proposal."
+    )
+
+
+class GenerateResearchProposalResponse(BaseModel):  # type: ignore
+    proposals: List[ResearchProposal] = Field(description="A list of research proposals")
 
 
 async def generate_research_proposals(
     bit: str, num_proposals: int = 3, additional_context: str = ""
-) -> str:
+) -> GenerateResearchProposalResponse:
     """
     Proposes improvement ideas for the Bit.
 
@@ -225,20 +237,6 @@ async def generate_research_proposals(
         bit: The Bit to propose improvement ideas for. The bit is a technical limitation or conventional approach of some paper.
         num_proposals: The number of proposals to generate.
         additional_context: Additional context to use when proposing the improvement idea.
-
-    Returns a JSON string with a research proposal in this format:
-    [
-        {
-            "proposal_id": ...,
-            "flip": "Innovative approach or solution, in at least two sentences",
-            "spark": "4-6 word summary",
-            "abstract": "An abstract that summarizes the proposal in conference format (approximately 250 words).",
-            "experiments": ["...", "..."],
-            "risks_and_limitations": "A list of potential risks and limitations of the proposal."
-        },
-        ...
-    ]
-    Use `json.loads` to deserialize the result if you want to get specific items.
     """
     model_name = settings.BITFLIP_MODEL_NAME
     max_completion_tokens = int(settings.BITFLIP_MAX_COMPLETION_TOKENS)
@@ -262,46 +260,51 @@ async def generate_research_proposals(
         temperature=1.0,
     )
     result = extract_json(content)
-
-
-
+    return GenerateResearchProposalResponse(
+        proposals=[ResearchProposal.model_validate(proposal) for proposal in result]
+    )
+
+
+class ScoredProposal(BaseModel):  # type: ignore
+    proposal_id: int = Field(default=0, description="ID of the proposal")
+    spark: str = Field(description="4-6 word summary")
+    strengths: List[str] = Field(description="A list of strengths of the proposal")
+    weaknesses: List[str] = Field(description="A list of weaknesses of the proposal")
+    novelty: int = Field(description="Novelty rating from 1 to 4")
+    clarity: int = Field(description="Clarity rating from 1 to 4")
+    significance: int = Field(description="Significance rating from 1 to 4")
+    feasibility: int = Field(description="Feasibility rating from 1 to 4")
+    soundness: int = Field(description="Soundness rating from 1 to 4")
+    overall: int = Field(description="Overall rating from 1 to 10")
 
 
-
+class ScoreResearchProposalsResponse(BaseModel):  # type: ignore
+    proposals: List[ScoredProposal] = Field(description="List of scored proposals")
+
+
+async def score_research_proposals(
+    proposals: str | List[str | Dict[str, Any] | Any],
+) -> ScoreResearchProposalsResponse:
     """
     Scores a list of research proposals.
     Use proposals obtained with the `generate_research_proposal` tool.
 
-    Returns a JSON string with a list of scores in this format:
-    [
-        {
-            "proposal_id": 0,
-            "spark": "...",
-            "strengths": ["...", "..."],
-            "weaknesses": ["...", "..."],
-            "novelty": 2,
-            "clarity": 2,
-            "significance": 2,
-            "feasibility": 2,
-            "soundness": 2,
-            "overall": 5
-        },
-        ...
-    ]
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         proposals: A list of JSON strings with research proposals.
     """
     model_name = settings.BITFLIP_MODEL_NAME
     if isinstance(proposals, str):
         proposals = json.loads(proposals)
-    assert isinstance(proposals, list), "Proposals should be a list
-
+    assert isinstance(proposals, list), "Proposals should be a list"
+    if isinstance(proposals, list):
+        proposals = [str(p) for p in proposals]
+    prompt = encode_prompt(SCORE_PROMPT, proposals=proposals)
     content = await llm_acall(
         model_name=model_name,
         messages=[ChatMessage(role="user", content=prompt)],
         temperature=0.0,
     )
     scores = extract_json(content)
-    return
+    return ScoreResearchProposalsResponse(
+        proposals=[ScoredProposal.model_validate(score) for score in scores]
    )
academia_mcp/tools/s2.py
CHANGED
@@ -1,9 +1,10 @@
 # Based on
 # https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_citations
 
-import json
 from typing import Optional, List, Dict, Any
 
+from pydantic import BaseModel, Field
+
 from academia_mcp.utils import get_with_retries
 
 
@@ -13,42 +14,58 @@ REFERENCES_URL_TEMPLATE = "https://api.semanticscholar.org/graph/v1/paper/{paper
 FIELDS = "title,authors,externalIds,venue,citationCount,publicationDate"
 
 
+class S2PaperInfo(BaseModel):  # type: ignore
+    arxiv_id: Optional[str] = Field(description="ArXiv ID of the paper", default=None)
+    external_ids: Optional[Dict[str, Any]] = Field(
+        description="External IDs of the paper.", default=None
+    )
+    title: str = Field(description="Paper title")
+    authors: List[str] = Field(description="Authors of the paper")
+    venue: str = Field(description="Paper venue")
+    citation_count: Optional[int] = Field(description="Paper citation count", default=None)
+    publication_date: Optional[str] = Field(description="Paper publication date", default=None)
+
+
+class S2SearchResponse(BaseModel):  # type: ignore
+    total_count: int = Field(description="Total number of results.")
+    returned_count: int = Field(description="Number of results returned.")
+    offset: int = Field(description="Offset of the results.")
+    results: List[S2PaperInfo] = Field(description="Search entries")
+
+
 def _format_authors(authors: List[Dict[str, Any]]) -> List[str]:
     return [a["name"] for a in authors]
 
 
-def _clean_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
+def _clean_entry(entry: Dict[str, Any]) -> S2PaperInfo:
     entry = entry["citingPaper"] if "citingPaper" in entry else entry["citedPaper"]
     external_ids = entry.get("externalIds")
     if not external_ids:
         external_ids = dict()
     external_ids.pop("CorpusId", None)
     arxiv_id = external_ids.pop("ArXiv", None)
-    return {
-        "arxiv_id": arxiv_id,
-        "external_ids": external_ids,
-        "title": entry["title"],
-        "authors": _format_authors(entry["authors"]),
-        "venue": entry.get("venue", ""),
-        "citation_count": entry.get("citationCount"),
-        "publication_date": entry.get("publicationDate"),
-    }
+    return S2PaperInfo(
+        arxiv_id=arxiv_id,
+        external_ids=external_ids if external_ids else None,
+        title=entry["title"],
+        authors=_format_authors(entry["authors"]),
+        venue=entry.get("venue", ""),
+        citation_count=entry.get("citationCount"),
+        publication_date=entry.get("publicationDate"),
+    )
 
 
 def _format_entries(
     entries: List[Dict[str, Any]],
     start_index: int,
     total_results: int,
-) -> str:
+) -> S2SearchResponse:
     clean_entries = [_clean_entry(e) for e in entries]
-    return json.dumps(
-        {
-            "total_count": total_results,
-            "returned_count": len(entries),
-            "offset": start_index,
-            "results": clean_entries,
-        },
-        ensure_ascii=False,
+    return S2SearchResponse(
+        total_count=total_results,
+        returned_count=len(entries),
+        offset=start_index,
+        results=clean_entries,
     )
 
 
@@ -56,16 +73,10 @@ def s2_get_citations(
     arxiv_id: str,
     offset: Optional[int] = 0,
     limit: Optional[int] = 50,
-) -> str:
+) -> S2SearchResponse:
     """
     Get all papers that cited a given arXiv paper based on Semantic Scholar info.
 
-    Returns a JSON object serialized to a string. The structure is:
-    {"total_count": ..., "returned_count": ..., "offset": ..., "results": [...]}
-    Every item in the "results" has the following fields:
-    ("arxiv_id", "external_ids", "title", "authors", "venue", "citation_count", "publication_date")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         arxiv_id: The ID of a given arXiv paper.
         offset: The offset to scroll through citations. 10 items will be skipped if offset=10. 0 by default.
@@ -98,16 +109,10 @@ def s2_get_references(
     arxiv_id: str,
     offset: Optional[int] = 0,
     limit: Optional[int] = 50,
-) -> str:
+) -> S2SearchResponse:
     """
     Get all papers that were cited by a given arXiv paper (references) based on Semantic Scholar info.
 
-    Returns a JSON object serialized to a string. The structure is:
-    {"total_count": ..., "returned_count": ..., "offset": ..., "results": [...]}
-    Every item in the "results" has the following fields:
-    ("arxiv_id", "external_ids", "title", "authors", "venue", "citation_count", "publication_date")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         arxiv_id: The ID of a given arXiv paper.
         offset: The offset to scroll through citations. 10 items will be skipped if offset=10. 0 by default.
@@ -144,14 +149,10 @@ def s2_corpus_id_from_arxiv_id(arxiv_id: str) -> int:
     return int(result["externalIds"]["CorpusId"])
 
 
-def s2_get_info(arxiv_id: str) -> str:
+def s2_get_info(arxiv_id: str) -> S2PaperInfo:
     """
     Get the S2 info for a given arXiv ID.
 
-    Returns a JSON object serialized to a string. The structure is:
-    {"title": ..., "authors": ..., "externalIds": ..., "venue": ..., "citationCount": ..., "publicationDate": ...}
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         arxiv_id: The ID of a given arXiv paper.
     """
@@ -160,4 +161,13 @@ def s2_get_info(arxiv_id: str) -> str:
     arxiv_id = arxiv_id.split("v")[0]
     paper_url = PAPER_URL_TEMPLATE.format(paper_id=f"arxiv:{arxiv_id}", fields=FIELDS)
     response = get_with_retries(paper_url)
-    return json.dumps(response.json(), ensure_ascii=False)
+    json_data = response.json()
+    return S2PaperInfo(
+        arxiv_id=json_data.get("externalIds", {}).get("ArXiv"),
+        external_ids=json_data.get("externalIds", {}),
+        title=json_data["title"],
+        authors=_format_authors(json_data["authors"]),
+        venue=json_data.get("venue", ""),
+        citation_count=int(json_data.get("citationCount", 0)),
+        publication_date=str(json_data.get("publicationDate", "")),
    )
academia_mcp/tools/show_image.py
CHANGED
@@ -30,7 +30,20 @@ DESCRIBE_PROMPTS = {
     4. Any immediate tactical opportunities or threats
     5. Suggested next moves with brief explanations"""
     ),
-    "text":
+    "text": dedent(
+        """You are performing OCR and transcription.
+        Extract ALL text and numbers from the image verbatim.
+        - Preserve original casing, punctuation, symbols, mathematical notation, and whitespace layout when possible.
+        - If layout is multi-column or tabular, reconstruct lines top-to-bottom, left-to-right; use line breaks between blocks.
+        - For any uncertain or low-confidence characters, mark with a '?' and include a note.
+        - After the raw extraction, provide a clean, normalized version (fixing obvious OCR artifacts) as a separate section.
+        Return two sections:
+        [RAW TRANSCRIPTION]
+        ...
+        [NORMALIZED]
+        ...
+        """
+    ),
 }
 
 
@@ -44,10 +57,8 @@ def show_image(path: str) -> Dict[str, str]:
     ```
     Do not print it ever, just return as the last expression.
 
-    Returns an dictionary with a single "image" key.
-
     Args:
-
+        path: Path to file inside current work directory or web URL
     """
     if path.startswith("http"):
         response = httpx.get(path, timeout=10)
@@ -80,7 +91,7 @@ async def describe_image(
     - "general": General description of the image
     - "detailed": Detailed analysis of the image
     - "chess": Analysis of a chess position
-    - "text": Extract and describe text from the image
+    - "text": Extract and describe text or numbers from the image
     - "custom": Custom description based on user prompt
     """
     image_base64 = show_image(path)["image_base64"]
@@ -93,12 +104,16 @@
         {"type": "text", "text": prompt},
         {
             "type": "image_url",
-            "image_url": {"url": f"data:image/
+            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
         },
     ]
     model_name = settings.DESCRIBE_IMAGE_MODEL_NAME
+    llm_kwargs = {}
+    if description_type in {"text", "chess"}:
+        llm_kwargs["temperature"] = 0.0
     response = await llm_acall(
         model_name=model_name,
         messages=[ChatMessage(role="user", content=content)],
+        **llm_kwargs,
     )
     return response
academia_mcp/tools/visit_webpage.py
CHANGED
@@ -1,12 +1,11 @@
 import re
-import
-from typing import Optional, Dict, Any, cast
+from typing import Any, Dict, List, Optional
 
 from markdownify import markdownify  # type: ignore
+from pydantic import BaseModel, Field
 
-from academia_mcp.utils import get_with_retries, post_with_retries
 from academia_mcp.settings import settings
-from academia_mcp.utils import sanitize_output
+from academia_mcp.utils import get_with_retries, post_with_retries, sanitize_output
 
 EXA_CONTENTS_URL = "https://api.exa.ai/contents"
 TAVILY_EXTRACT_URL = "https://api.tavily.com/extract"
@@ -14,6 +13,16 @@ AVAILABLE_PROVIDERS = ("basic", "exa", "tavily")
 ERROR_MESSAGE = "Failed to get content from the page. Try to use another provider."
 
 
+class VisitWebpageResponse(BaseModel):  # type: ignore
+    id: str = Field(description="ID of the webpage, usually the URL")
+    provider: str = Field(description="Provider used to get the content")
+    text: Optional[str] = Field(description="Text content of the webpage", default=None)
+    images: List[str] = Field(description="Images of the webpage", default_factory=list)
+    error: Optional[str] = Field(
+        description="Error message if the webpage is not found", default=None
+    )
+
+
 def _exa_visit_webpage(url: str) -> Dict[str, Any]:
     key = settings.EXA_API_KEY or ""
     assert key, "Error: EXA_API_KEY is not set and no api_key was provided"
@@ -25,7 +34,7 @@ def _exa_visit_webpage(url: str) -> Dict[str, Any]:
     results = response.json()["results"]
     if not results:
         return {"error": ERROR_MESSAGE}
-    return
+    return {"text": results[0]["text"]}
 
 
 def _tavily_visit_webpage(url: str) -> Dict[str, Any]:
@@ -33,12 +42,15 @@ def _tavily_visit_webpage(url: str) -> Dict[str, Any]:
     assert key, "Error: TAVILY_API_KEY is not set and no api_key was provided"
     payload = {
         "urls": [url],
+        "extract_depth": "advanced",
+        "include_images": True,
     }
     response = post_with_retries(TAVILY_EXTRACT_URL, payload=payload, api_key=key)
     results = response.json()["results"]
     if not results:
         return {"error": ERROR_MESSAGE}
-
+    result = results[0]
+    return {"text": result["raw_content"], "images": result["images"]}
 
 
 def _basic_visit_webpage(url: str) -> Dict[str, Any]:
@@ -58,13 +70,9 @@ def _basic_visit_webpage(url: str) -> Dict[str, Any]:
         return {"error": str(e) + "\n" + ERROR_MESSAGE}
 
 
-def visit_webpage(url: str, provider: Optional[str] = "basic") -> str:
+def visit_webpage(url: str, provider: Optional[str] = "basic") -> VisitWebpageResponse:
     """
     Visit a webpage and return the content.
-
-    Returns a JSON object serialized to a string. The structure is: {"id": "...", "text": "..."}.
-    If there are errors, the structure is: {"id": "...", "error": "..."}.
-    Use `json.loads` to deserialize the result if you want to get specific fields.
     Try to use both "tavily" and "basic" providers. They might work differently for the same URL.
 
     Args:
@@ -82,6 +90,9 @@ def visit_webpage(url: str, provider: Optional[str] = "basic") -> str:
     else:
         result = _basic_visit_webpage(url)
 
-    result
-    result
-
+    result = VisitWebpageResponse(id=url, provider=provider, **result)
+    if result.text:
+        result.text = sanitize_output(result.text)
+    if result.error:
+        result.error = sanitize_output(result.error)
+    return result
academia_mcp/tools/web_search.py
CHANGED
@@ -1,10 +1,9 @@
-import
-from typing import Optional, List, Tuple
+from typing import List, Optional, Tuple
 
-from academia_mcp.utils import get_with_retries, post_with_retries
-from academia_mcp.settings import settings
-from academia_mcp.utils import sanitize_output
+from pydantic import BaseModel, Field
 
+from academia_mcp.settings import settings
+from academia_mcp.utils import get_with_retries, post_with_retries, sanitize_output
 
 EXA_SEARCH_URL = "https://api.exa.ai/search"
 TAVILY_SEARCH_URL = "https://api.tavily.com/search"
@@ -30,20 +29,27 @@ def _parse_domains(query: str) -> Tuple[str, List[str]]:
     return query, include_domains
 
 
+class WebSearchEntry(BaseModel):  # type: ignore
+    id: str = Field(description="ID of the search entry, usually the URL")
+    title: str = Field(description="Title of the web page")
+    content: str = Field(description="Content of the web page")
+
+
+class WebSearchResponse(BaseModel):  # type: ignore
+    results: List[WebSearchEntry] = Field(description="Results of the search")
+    search_provider: str = Field(description="Provider used to get the results")
+
+
 def web_search(
     query: str,
     limit: Optional[int] = 20,
     provider: Optional[str] = "tavily",
     include_domains: Optional[List[str]] = None,
-) -> str:
+) -> WebSearchResponse:
     """
     Search the web using Exa Search, Brave Search or Tavily and return normalized results.
     If the specified provider is not available, the function will try to use the next available provider.
 
-    Returns a JSON object serialized to a string. The structure is: {"results": [...]}
-    Every item in the "results" has at least the following fields: ("title", "url")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         query: The search query, required.
         limit: The maximum number of items to return. 20 by default, maximum 25.
@@ -81,27 +87,23 @@ def web_search(
             provider = p
             break
 
-    result =
+    result: Optional[WebSearchResponse] = None
     if provider == "exa":
-        result =
+        result = exa_web_search(query, limit, include_domains=include_domains)
     elif provider == "brave":
-        result =
+        result = brave_web_search(query, limit)
     elif provider == "tavily":
-        result =
-    result
-    return
+        result = tavily_web_search(query, limit, include_domains=include_domains)
+    assert result is not None, "Error: No provider was available"
+    return result
 
 
 def tavily_web_search(
     query: str, limit: Optional[int] = 20, include_domains: Optional[List[str]] = None
-) -> str:
+) -> WebSearchResponse:
     """
     Search the web using Tavily and return results.
 
-    Returns a JSON object serialized to a string. The structure is: {"results": [...]}
-    Every item in the "results" has at least the following fields: ("title", "url")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         query: The search query, required.
         limit: The maximum number of items to return. 20 by default, maximum 25.
@@ -131,23 +133,23 @@ def tavily_web_search(
     results = response.json()["results"]
     for result in results:
         content = " ".join(result["content"].split(" ")[:40])
-        content = content.strip("., ")
+        content = sanitize_output(content.strip("., "))
         result["content"] = content
         result.pop("raw_content", None)
         result.pop("score", None)
-    return json.dumps({"results": results}, ensure_ascii=False)
+    entries = [
+        WebSearchEntry(id=result["url"], title=result["title"], content=result["content"])
+        for result in results
+    ]
+    return WebSearchResponse(results=entries, search_provider="tavily")
 
 
 def exa_web_search(
     query: str, limit: Optional[int] = 20, include_domains: Optional[List[str]] = None
-) -> str:
+) -> WebSearchResponse:
     """
     Search the web using Exa and return results.
 
-    Returns a JSON object serialized to a string. The structure is: {"results": [...]}
-    Every item in the "results" has at least the following fields: ("title", "url")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         query: The search query, required.
         limit: The maximum number of items to return. 20 by default, maximum 25.
@@ -184,17 +186,18 @@ def exa_web_search(
 
     response = post_with_retries(EXA_SEARCH_URL, payload, key)
     results = response.json()["results"]
-    return json.dumps({"results": results}, ensure_ascii=False)
+    entries = []
+    for result in results:
+        content = " || ".join(result["highlights"])
+        content = sanitize_output(content)
+        entries.append(WebSearchEntry(id=result["url"], title=result["title"], content=content))
+    return WebSearchResponse(results=entries, search_provider="exa")
 
 
-def brave_web_search(query: str, limit: Optional[int] = 20) -> str:
+def brave_web_search(query: str, limit: Optional[int] = 20) -> WebSearchResponse:
     """
     Search the web using Brave and return results.
 
-    Returns a JSON object serialized to a string. The structure is: {"results": [...]}
-    Every item in the "results" has at least the following fields: ("title", "url")
-    Use `json.loads` to deserialize the result if you want to get specific fields.
-
     Args:
         query: The search query, required.
         limit: The maximum number of items to return. 20 by default, maximum 20.
@@ -212,4 +215,8 @@ def brave_web_search(query: str, limit: Optional[int] = 20) -> str:
     }
     response = get_with_retries(BRAVE_SEARCH_URL, key, params=payload)
     results = response.json()["web"]["results"]
-    return json.dumps({"results": results}, ensure_ascii=False)
+    entries = []
+    for result in results:
+        content = sanitize_output(result["description"])
+        entries.append(WebSearchEntry(id=result["url"], title=result["title"], content=content))
+    return WebSearchResponse(results=entries, search_provider="brave")
academia_mcp/utils.py
CHANGED
@@ -171,6 +171,8 @@ def sanitize_output(output: str) -> str:
     """
     See https://github.com/modelcontextprotocol/python-sdk/issues/1144#issuecomment-3076506124
     """
+    if not output:
+        return output
     output = output.replace("\x85", " ")
     output = output.replace("\u0085", " ")
     return output
{academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academia-mcp
-Version: 1.10.9
+Version: 1.11.1
 Summary: MCP server that provides different tools to search for scientific publications
 Author-email: Ilya Gusev <phoenixilya@gmail.com>
 Project-URL: Homepage, https://github.com/IlyaGusev/academia_mcp
{academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/RECORD
CHANGED
@@ -4,30 +4,30 @@ academia_mcp/files.py,sha256=ynIt0XbU1Z7EPWkv_hVX0pGKsLlmjYv-MVJLOfi6yzs,817
 academia_mcp/llm.py,sha256=zpGkuJFf58Ofgys_fi28-47_wJ1a7sIs_yZvI1Si6z0,993
 academia_mcp/pdf.py,sha256=9PlXzHGhb6ay3ldbTdxCcTWvH4TkET3bnb64mgoh9i0,1273
 academia_mcp/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-academia_mcp/server.py,sha256=
+academia_mcp/server.py,sha256=uJNV5YStMJ3sEbx4waOclSw_hN0Gkz5PJ-q8VjX1kfA,6390
 academia_mcp/settings.py,sha256=c5s4dI8V_cWmMED-jKDmHjfdIaBcxwEK4HdHNQ3WUIg,1096
-academia_mcp/utils.py,sha256=
+academia_mcp/utils.py,sha256=ixtYI7qidFJpc8Tzc5sokseCLx4r0yFFgbktKaY-Ixo,4904
 academia_mcp/latex_templates/agents4science_2025/agents4science_2025.sty,sha256=hGcEPCYBJS4vdhWvN_yEaJC4GvT_yDroI94CfY2Oguk,12268
 academia_mcp/latex_templates/agents4science_2025/agents4science_2025.tex,sha256=Tl1QkHXHRopw9VEfWrD3Layr5JP_0gIzVQjL4KXIWqc,15814
 academia_mcp/tools/__init__.py,sha256=Z30vULZwUeUX5nDz5wcv0znhAeBtZRa0dvz7vD8SUYE,1555
 academia_mcp/tools/anthology_search.py,sha256=rhFpJZqGLABgr0raDuH0CARBiAJNJtEI4dlMrKNHfDQ,7669
-academia_mcp/tools/arxiv_download.py,sha256=
-academia_mcp/tools/arxiv_search.py,sha256=
-academia_mcp/tools/bitflip.py,sha256=
+academia_mcp/tools/arxiv_download.py,sha256=4zl9QkWF7uU2BYz4XH7Fu_51htolSYtO7a2v4_ikhxg,10633
+academia_mcp/tools/arxiv_search.py,sha256=p4DeCvV3TUpj58etx0QOxm-GYKAWkP9AGjLv4HUUqUc,8857
+academia_mcp/tools/bitflip.py,sha256=0OPxQRU5VLWrcIlj4Ubjeq4gIq0pR5FDhXDHuLR0ppw,12759
 academia_mcp/tools/document_qa.py,sha256=Wb2nEEVu9UyPp8ktHWeT9wS2JBle8fb9zRjTNVIDdBE,2463
 academia_mcp/tools/hf_datasets_search.py,sha256=KiBkqT4rXjEN4oc1AWZOPnqN_Go90TQogY5-DUm3LQo,2854
 academia_mcp/tools/latex.py,sha256=B1Leqt1FHY6H3DlUgeYse4LMFpf4-K1FQViXl5MKk8A,6144
 academia_mcp/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 academia_mcp/tools/review.py,sha256=Va0lFJJKuk-NvWhKS3UZ-Dnuk7CyuDQ4S1nd70D-ffE,11117
-academia_mcp/tools/s2.py,sha256=
-academia_mcp/tools/show_image.py,sha256=
+academia_mcp/tools/s2.py,sha256=ykJOkpHnyZRlfyDnCIL9m-Rnu5dBecoKxg0SjlzdvLk,6457
+academia_mcp/tools/show_image.py,sha256=DWSnYMTn_dJpGTLL1r_sbX5XsB6p9z-vClApDANz84s,4534
 academia_mcp/tools/speech_to_text.py,sha256=YZzMqdvunzXkpcadP_mYhm6cs4qH1Y_42SfY-7eX4O4,1601
-academia_mcp/tools/visit_webpage.py,sha256=
-academia_mcp/tools/web_search.py,sha256=
+academia_mcp/tools/visit_webpage.py,sha256=uJZx9vBGS8q-J-VH4Pr7T9lNtDsWU83gJhlotcd1ajg,3788
+academia_mcp/tools/web_search.py,sha256=CHgco8DufTFwtVecgDOOMylIY99iUmCdb0oZtpGntx0,8646
 academia_mcp/tools/yt_transcript.py,sha256=ilfOpX14moC1bKHbFmOVvZ8-_NxuQQUoQbV28e9FBaE,1217
-academia_mcp-1.10.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-academia_mcp-1.10.9.dist-info/METADATA,sha256=
-academia_mcp-1.10.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-academia_mcp-1.10.9.dist-info/entry_points.txt,sha256=gxkiKJ74w2FwJpSECpjA3XtCfI5ZfrM6N8cqnwsq4yY,51
-academia_mcp-1.10.9.dist-info/top_level.txt,sha256=CzGpRFsRRJRqWEb1e3SUlcfGqRzOxevZGaJWrtGF8W0,13
-academia_mcp-1.10.9.dist-info/RECORD,,
+academia_mcp-1.11.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+academia_mcp-1.11.1.dist-info/METADATA,sha256=e8HbUnRcj5HSHxC7soI4yRFpO5-xyoL_4kMYTUNPwK0,6356
+academia_mcp-1.11.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academia_mcp-1.11.1.dist-info/entry_points.txt,sha256=gxkiKJ74w2FwJpSECpjA3XtCfI5ZfrM6N8cqnwsq4yY,51
+academia_mcp-1.11.1.dist-info/top_level.txt,sha256=CzGpRFsRRJRqWEb1e3SUlcfGqRzOxevZGaJWrtGF8W0,13
+academia_mcp-1.11.1.dist-info/RECORD,,
{academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/WHEEL
File without changes
{academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/entry_points.txt
File without changes
{academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/licenses/LICENSE
File without changes
{academia_mcp-1.10.9.dist-info → academia_mcp-1.11.1.dist-info}/top_level.txt
File without changes