aiagents4pharma 1.30.0__py3-none-any.whl → 1.30.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2scholars/agents/main_agent.py +18 -10
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +5 -6
- aiagents4pharma/talk2scholars/agents/pdf_agent.py +4 -10
- aiagents4pharma/talk2scholars/agents/zotero_agent.py +9 -7
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +18 -9
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +2 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +9 -15
- aiagents4pharma/talk2scholars/configs/app/frontend/default.yaml +1 -0
- aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/default.yaml +6 -1
- aiagents4pharma/talk2scholars/configs/tools/search/default.yaml +7 -1
- aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/default.yaml +6 -1
- aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +1 -1
- aiagents4pharma/talk2scholars/configs/tools/zotero_write/default.yaml +55 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +7 -1
- aiagents4pharma/talk2scholars/tests/test_llm_main_integration.py +84 -53
- aiagents4pharma/talk2scholars/tests/test_main_agent.py +24 -0
- aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +79 -15
- aiagents4pharma/talk2scholars/tests/test_routing_logic.py +13 -10
- aiagents4pharma/talk2scholars/tests/test_s2_multi.py +27 -4
- aiagents4pharma/talk2scholars/tests/test_s2_search.py +19 -3
- aiagents4pharma/talk2scholars/tests/test_s2_single.py +27 -3
- aiagents4pharma/talk2scholars/tests/test_zotero_agent.py +3 -2
- aiagents4pharma/talk2scholars/tests/test_zotero_human_in_the_loop.py +273 -0
- aiagents4pharma/talk2scholars/tests/test_zotero_path.py +419 -1
- aiagents4pharma/talk2scholars/tests/test_zotero_read.py +25 -18
- aiagents4pharma/talk2scholars/tests/test_zotero_write.py +123 -588
- aiagents4pharma/talk2scholars/tools/paper_download/abstract_downloader.py +2 -0
- aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py +11 -4
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +5 -1
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +73 -26
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +46 -22
- aiagents4pharma/talk2scholars/tools/s2/query_results.py +1 -1
- aiagents4pharma/talk2scholars/tools/s2/search.py +40 -12
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +42 -16
- aiagents4pharma/talk2scholars/tools/zotero/__init__.py +1 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +125 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +35 -20
- aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +198 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +86 -118
- {aiagents4pharma-1.30.0.dist-info → aiagents4pharma-1.30.2.dist-info}/METADATA +4 -3
- {aiagents4pharma-1.30.0.dist-info → aiagents4pharma-1.30.2.dist-info}/RECORD +44 -41
- {aiagents4pharma-1.30.0.dist-info → aiagents4pharma-1.30.2.dist-info}/WHEEL +1 -1
- {aiagents4pharma-1.30.0.dist-info → aiagents4pharma-1.30.2.dist-info/licenses}/LICENSE +0 -0
- {aiagents4pharma-1.30.0.dist-info → aiagents4pharma-1.30.2.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py

@@ -8,6 +8,7 @@ downloads the corresponding PDF.
 By using an abstract base class, this implementation is extendable to other
 APIs like PubMed, IEEE Xplore, etc.
 """
+
 import xml.etree.ElementTree as ET
 from typing import Any, Dict
 import logging
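For orientation, the `AbstractPaperDownloader` base class the docstring refers to lives in `abstract_downloader.py` (also touched in this release). A hedged, illustrative sketch of that layout follows; the method names `fetch_metadata` and `download_pdf` appear in the hunks below, but the exact signatures and return types here are assumptions rather than the package's verbatim interface:

```python
from abc import ABC, abstractmethod
from typing import Any, Dict


class AbstractPaperDownloader(ABC):
    """Base interface that concrete downloaders (arXiv, PubMed, IEEE Xplore, ...) would implement."""

    @abstractmethod
    def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
        """Fetch metadata for a paper ID from the backing API."""

    @abstractmethod
    def download_pdf(self, paper_id: str) -> Dict[str, Any]:
        """Download the paper's PDF and return it alongside metadata."""
```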
@@ -19,6 +20,7 @@ from .abstract_downloader import AbstractPaperDownloader
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 class ArxivPaperDownloader(AbstractPaperDownloader):
     """
     Downloader class for arXiv.
@@ -35,13 +37,13 @@ class ArxivPaperDownloader(AbstractPaperDownloader):
         """
         with hydra.initialize(version_base=None, config_path="../../configs"):
             cfg = hydra.compose(
-                config_name="config",
-                overrides=["tools/download_arxiv_paper=default"]
+                config_name="config", overrides=["tools/download_arxiv_paper=default"]
             )
         self.api_url = cfg.tools.download_arxiv_paper.api_url
         self.request_timeout = cfg.tools.download_arxiv_paper.request_timeout
         self.chunk_size = cfg.tools.download_arxiv_paper.chunk_size
         self.pdf_base_url = cfg.tools.download_arxiv_paper.pdf_base_url
+
     def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
         """
         Fetch metadata from arXiv for a given paper ID.
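For context, `hydra.compose` returns an OmegaConf config whose `tools.download_arxiv_paper` group supplies the four values read in `__init__` above. A self-contained sketch of that shape; the keys mirror the attributes accessed in the hunk, while the values below are placeholders, not the package's actual defaults:

```python
from omegaconf import OmegaConf

# Placeholder values standing in for tools/download_arxiv_paper/default.yaml.
cfg = OmegaConf.create(
    {
        "tools": {
            "download_arxiv_paper": {
                "api_url": "https://export.arxiv.org/api/query",
                "request_timeout": 10,
                "chunk_size": 8192,
                "pdf_base_url": "https://arxiv.org/pdf",
            }
        }
    }
)

# The downloader reads the composed config via attribute access:
print(cfg.tools.download_arxiv_paper.api_url)
```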
@@ -95,11 +97,16 @@ class ArxivPaperDownloader(AbstractPaperDownloader):
         logger.info("Downloading PDF from: %s", pdf_url)
         pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
         pdf_response.raise_for_status()
+        # print (pdf_response)

         # Combine the PDF data from chunks.
         pdf_object = b"".join(
-            chunk
-        )
+            chunk
+            for chunk in pdf_response.iter_content(chunk_size=self.chunk_size)
+            if chunk
+        )
+        # print (pdf_object)
+        print("PDF_URL", pdf_url)

         return {
             "pdf_object": pdf_object,
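The rewritten `b"".join(...)` above assembles the PDF from streamed chunks. A standalone sketch of the same pattern, with a placeholder URL and chunk size (in the package both come from the Hydra config):

```python
import requests

# Example arXiv PDF URL; in the package this is built from pdf_base_url and the paper ID.
pdf_url = "https://arxiv.org/pdf/1706.03762"

response = requests.get(pdf_url, stream=True, timeout=10)
response.raise_for_status()

# Join the non-empty streamed chunks into a single bytes object, as the new code does.
pdf_object = b"".join(
    chunk for chunk in response.iter_content(chunk_size=8192) if chunk
)
print(f"Downloaded {len(pdf_object)} bytes")
```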
aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py

@@ -14,16 +14,19 @@ from langgraph.types import Command
 # Local import from the same package:
 from .arxiv_downloader import ArxivPaperDownloader

+
 class DownloadArxivPaperInput(BaseModel):
     """
     Input schema for the arXiv paper download tool.
     (Optional: if you decide to keep Pydantic validation in the future)
     """
+
     arxiv_id: str = Field(
         description="The arXiv paper ID used to retrieve the paper details and PDF."
-
+    )
     tool_call_id: Annotated[str, InjectedToolCallId]

+
 @tool(args_schema=DownloadArxivPaperInput, parse_docstring=True)
 def download_arxiv_paper(
     arxiv_id: str,
@@ -49,6 +52,7 @@ def download_arxiv_paper(

     # If the downloader fails or the arxiv_id is invalid, this might raise an error
     pdf_data = downloader.download_pdf(arxiv_id)
+    # print (pdf_data)

     content = f"Successfully downloaded PDF for arXiv ID {arxiv_id}"

aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py

@@ -2,8 +2,8 @@
 """
 question_and_answer: Tool for performing Q&A on PDF documents using retrieval augmented generation.

-This module provides functionality to extract text from PDF binary data, split it into
-chunks, retrieve relevant segments via a vector store, and generate an answer to a
+This module provides functionality to extract text from PDF binary data, split it into
+chunks, retrieve relevant segments via a vector store, and generate an answer to a
 user-provided question using a language model chain.
 """

@@ -18,13 +18,15 @@ import hydra
 from langchain.chains.question_answering import load_qa_chain
 from langchain.docstore.document import Document
 from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.vectorstores import Annoy
-from langchain_openai import OpenAIEmbeddings
 from langchain_core.language_models.chat_models import BaseChatModel
-
+from langchain_core.vectorstores import InMemoryVectorStore
 from langchain_core.messages import ToolMessage
 from langchain_core.tools import tool
 from langchain_core.tools.base import InjectedToolCallId
+from langchain_core.embeddings import Embeddings
+from langchain_community.vectorstores import Annoy
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_openai import OpenAIEmbeddings
 from langgraph.types import Command
 from langgraph.prebuilt import InjectedState

@@ -35,10 +37,13 @@ logger.setLevel(logging.INFO)

 # Load configuration using Hydra.
 with hydra.initialize(version_base=None, config_path="../../configs"):
-    cfg = hydra.compose(
+    cfg = hydra.compose(
+        config_name="config", overrides=["tools/question_and_answer=default"]
+    )
     cfg = cfg.tools.question_and_answer
     logger.info("Loaded Question and Answer tool configuration.")

+
 class QuestionAndAnswerInput(BaseModel):
     """
     Input schema for the PDF Question and Answer tool.
@@ -47,12 +52,12 @@ class QuestionAndAnswerInput(BaseModel):
         question (str): The question to ask regarding the PDF content.
         tool_call_id (str): Unique identifier for the tool call, injected automatically.
     """
-
-
-    )
+
+    question: str = Field(description="The question to ask regarding the PDF content.")
     tool_call_id: Annotated[str, InjectedToolCallId]
     state: Annotated[dict, InjectedState]

+
 def extract_text_from_pdf_data(pdf_bytes: bytes) -> str:
     """
     Extract text content from PDF binary data.
@@ -73,7 +78,10 @@ def extract_text_from_pdf_data(pdf_bytes: bytes) -> str:
             text += page_text
     return text

-
+
+def generate_answer(
+    question: str, pdf_bytes: bytes, llm_model: BaseChatModel
+) -> Dict[str, Any]:
     """
     Generate an answer for a question using retrieval augmented generation on PDF content.

@@ -92,9 +100,7 @@ def generate_answer(question: str, pdf_bytes: bytes, llm_model: BaseChatModel) -
     text = extract_text_from_pdf_data(pdf_bytes)
     logger.info("Extracted text from PDF.")
     text_splitter = CharacterTextSplitter(
-        separator="\n",
-        chunk_size=cfg.chunk_size,
-        chunk_overlap=cfg.chunk_overlap
+        separator="\n", chunk_size=cfg.chunk_size, chunk_overlap=cfg.chunk_overlap
     )
     chunks = text_splitter.split_text(text)
     documents: List[Document] = [Document(page_content=chunk) for chunk in chunks]
@@ -102,10 +108,7 @@ def generate_answer(question: str, pdf_bytes: bytes, llm_model: BaseChatModel) -

     embeddings = OpenAIEmbeddings(openai_api_key=cfg.openai_api_key)
     vector_store = Annoy.from_documents(documents, embeddings)
-    search_results = vector_store.similarity_search(
-        question,
-        k=cfg.num_retrievals
-    )
+    search_results = vector_store.similarity_search(question, k=cfg.num_retrievals)
     logger.info("Retrieved %d relevant document chunks.", len(search_results))
     # Use the provided llm_model to build the QA chain.
     qa_chain = load_qa_chain(llm_model, chain_type=cfg.qa_chain_type)
@@ -114,6 +117,49 @@ def generate_answer(question: str, pdf_bytes: bytes, llm_model: BaseChatModel) -
     )
     return answer

+
+def generate_answer2(
+    question: str, pdf_url: str, text_embedding_model: Embeddings
+) -> Dict[str, Any]:
+    """
+    Generate an answer for a question using retrieval augmented generation on PDF content.
+
+    This function extracts text from the PDF data, splits the text into manageable chunks,
+    performs a similarity search to retrieve the most relevant segments, and then uses a
+    question-answering chain (built using the provided llm_model) to generate an answer.
+
+    Args:
+        question (str): The question to be answered.
+        pdf_bytes (bytes): The binary content of the PDF document.
+        llm_model (BaseChatModel): The language model instance to use for answering.
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the answer generated by the language model.
+    """
+    # text = extract_text_from_pdf_data(pdf_bytes)
+    # logger.info("Extracted text from PDF.")
+    logger.log(logging.INFO, "searching the article with the question: %s", question)
+    # Load the article
+    # loader = PyPDFLoader(state['pdf_file_name'])
+    # loader = PyPDFLoader("https://arxiv.org/pdf/2310.08365")
+    loader = PyPDFLoader(pdf_url)
+    # Load the pages of the article
+    pages = []
+    for page in loader.lazy_load():
+        pages.append(page)
+    # Set up text embedding model
+    # text_embedding_model = state['text_embedding_model']
+    # text_embedding_model = OpenAIEmbeddings(openai_api_key=cfg.openai_api_key)
+    logging.info("Loaded text embedding model %s", text_embedding_model)
+    # Create a vector store from the pages
+    vector_store = InMemoryVectorStore.from_documents(pages, text_embedding_model)
+    # Search the article with the question
+    docs = vector_store.similarity_search(question)
+    # Return the content of the pages
+    return "\n".join([doc.page_content for doc in docs])
+    # return answer
+
+
 @tool(args_schema=QuestionAndAnswerInput)
 def question_and_answer_tool(
     question: str,
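The new `generate_answer2` switches the Q&A path from raw PDF bytes plus an Annoy index to loading the PDF by URL with `PyPDFLoader` and searching an `InMemoryVectorStore`. A hedged, self-contained sketch of that flow; the URL comes from the commented-out example in the hunk, and the embedding model choice is an assumption (any `Embeddings` implementation would do, and `OpenAIEmbeddings` needs `OPENAI_API_KEY` set):

```python
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings

# Example PDF URL taken from the commented-out line in the diff above.
pdf_url = "https://arxiv.org/pdf/2310.08365"

# Load the PDF page by page into LangChain Document objects.
pages = list(PyPDFLoader(pdf_url).lazy_load())

# Embed the pages into an in-memory vector store and retrieve the most similar pages.
embeddings = OpenAIEmbeddings()  # assumes OPENAI_API_KEY is set in the environment
vector_store = InMemoryVectorStore.from_documents(pages, embeddings)
docs = vector_store.similarity_search("What problem does the paper address?")

# generate_answer2 returns the concatenated page contents of the retrieved documents.
print("\n".join(doc.page_content for doc in docs)[:500])
```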
@@ -124,7 +170,7 @@ def question_and_answer_tool(
     Answer a question using PDF content stored in the state via retrieval augmented generation.

     This tool retrieves the PDF binary data from the state (under the key "pdf_data"), extracts its
-    textual content, and generates an answer to the specified question. It also extracts the
+    textual content, and generates an answer to the specified question. It also extracts the
     llm_model (of type BaseChatModel) from the state to use for answering.

     Args:
@@ -138,15 +184,15 @@ def question_and_answer_tool(
         Dict[str, Any]: A dictionary containing the generated answer or an error message.
     """
     logger.info("Starting PDF Question and Answer tool using PDF data from state.")
+    # print (state['text_embedding_model'])
+    text_embedding_model = state["text_embedding_model"]
     pdf_state = state.get("pdf_data")
     if not pdf_state:
         error_msg = "No pdf_data found in state."
         logger.error(error_msg)
         return Command(
             update={
-                "messages": [
-                    ToolMessage(content=error_msg, tool_call_id=tool_call_id)
-                ]
+                "messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
             }
         )
     pdf_bytes = pdf_state.get("pdf_object")
@@ -155,16 +201,17 @@ def question_and_answer_tool(
         logger.error(error_msg)
         return Command(
             update={
-                "messages": [
-                    ToolMessage(content=error_msg, tool_call_id=tool_call_id)
-                ]
+                "messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
             }
         )
+    pdf_url = pdf_state.get("pdf_url")
     # Retrieve llm_model from state; use a default if not provided.
     llm_model = state.get("llm_model")
     if not llm_model:
         logger.error("Missing LLM model instance in state.")
         return {"error": "No LLM model found in state."}
-    answer = generate_answer(question, pdf_bytes, llm_model)
-
+    # answer = generate_answer(question, pdf_bytes, llm_model)
+    print(pdf_url)
+    answer = generate_answer2(question, pdf_url, text_embedding_model)
+    # logger.info("Generated answer: %s", answer)
     return answer
aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py

@@ -30,7 +30,7 @@ class MultiPaperRecInput(BaseModel):
         description="List of Semantic Scholar Paper IDs to get recommendations for"
     )
     limit: int = Field(
-        default=
+        default=10,
         description="Maximum total number of recommendations to return",
         ge=1,
         le=500,
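The `limit` field now defaults to 10 while keeping its `ge`/`le` bounds. A small illustration of how such a Pydantic field behaves; the model name here is a hypothetical stand-in, not the package's class:

```python
from pydantic import BaseModel, Field, ValidationError


class RecInput(BaseModel):  # hypothetical stand-in for MultiPaperRecInput
    limit: int = Field(
        default=10,
        description="Maximum total number of recommendations to return",
        ge=1,
        le=500,
    )


print(RecInput().limit)           # -> 10 (the new default)
print(RecInput(limit=500).limit)  # -> 500 (upper bound still allowed)

try:
    RecInput(limit=0)             # violates ge=1
except ValidationError as err:
    print(err.errors()[0]["type"])
```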
@@ -90,23 +90,33 @@ def get_multi_paper_recommendations(
         params["year"] = year

     # Wrap API call in try/except to catch connectivity issues and validate response format
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    response = None
+    for attempt in range(10):
+        try:
+            response = requests.post(
+                endpoint,
+                headers=headers,
+                params=params,
+                data=json.dumps(payload),
+                timeout=cfg.request_timeout,
+            )
+            response.raise_for_status()  # Raises HTTPError for bad responses
+            break  # Exit loop if request is successful
+        except requests.exceptions.RequestException as e:
+            logger.error(
+                "Attempt %d: Failed to connect to Semantic Scholar API for "
+                "multi-paper recommendations: %s",
+                attempt + 1,
+                e,
+            )
+            if attempt == 9:  # Last attempt
+                raise RuntimeError(
+                    "Failed to connect to Semantic Scholar API after 10 attempts."
+                    "Please retry the same query."
+                ) from e
+
+    if response is None:
+        raise RuntimeError("Failed to obtain a response from the Semantic Scholar API.")

     logger.info(
         "API Response Status for multi-paper recommendations: %s", response.status_code
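The block above replaces a single request with a bounded retry loop: up to 10 attempts, logging each failure, and surfacing a `RuntimeError` after the last one. A minimal standalone sketch of the same pattern, with a placeholder endpoint and payload rather than the package's configured values:

```python
import json
import logging

import requests

logger = logging.getLogger(__name__)

# Placeholder endpoint and payload; the tool builds these from its Hydra config and inputs.
endpoint = "https://example.org/recommendations"
payload = {"positivePaperIds": ["PAPER_ID_1", "PAPER_ID_2"]}

response = None
for attempt in range(10):
    try:
        response = requests.post(endpoint, data=json.dumps(payload), timeout=10)
        response.raise_for_status()  # HTTP errors also trigger a retry
        break
    except requests.exceptions.RequestException as exc:
        logger.error("Attempt %d failed: %s", attempt + 1, exc)
        if attempt == 9:  # last attempt: give up and surface the failure
            raise RuntimeError("Failed after 10 attempts. Please retry the same query.") from exc

if response is None:
    raise RuntimeError("Failed to obtain a response.")
```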
@@ -137,11 +147,22 @@ def get_multi_paper_recommendations(
     # Create a dictionary to store the papers
     filtered_papers = {
         paper["paperId"]: {
-            "
+            "semantic_scholar_paper_id": paper["paperId"],
             "Title": paper.get("title", "N/A"),
             "Abstract": paper.get("abstract", "N/A"),
             "Year": paper.get("year", "N/A"),
+            "Publication Date": paper.get("publicationDate", "N/A"),
+            "Venue": paper.get("venue", "N/A"),
+            # "Publication Venue": (paper.get("publicationVenue") or {}).get("name", "N/A"),
+            # "Venue Type": (paper.get("publicationVenue") or {}).get("name", "N/A"),
+            "Journal Name": (paper.get("journal") or {}).get("name", "N/A"),
+            # "Journal Volume": paper.get("journal", {}).get("volume", "N/A"),
+            # "Journal Pages": paper.get("journal", {}).get("pages", "N/A"),
             "Citation Count": paper.get("citationCount", "N/A"),
+            "Authors": [
+                f"{author.get('name', 'N/A')} (ID: {author.get('authorId', 'N/A')})"
+                for author in paper.get("authors", [])
+            ],
             "URL": paper.get("url", "N/A"),
             "arxiv_id": paper.get("externalIds", {}).get("ArXiv", "N/A"),
         }
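One detail worth noting in the new fields: journal metadata is read as `(paper.get("journal") or {}).get("name", "N/A")` rather than `paper.get("journal", {}).get(...)`, presumably because the API can return an explicit `null` for `journal`, in which case the two-argument `get` still yields `None`. A quick illustration:

```python
paper = {"title": "Example paper", "journal": None}

# The default only applies when the key is missing, not when its value is None,
# so this form would raise AttributeError: 'NoneType' object has no attribute 'get'.
# paper.get("journal", {}).get("name", "N/A")

# Falling back through `or {}` handles both a missing key and an explicit null.
journal_name = (paper.get("journal") or {}).get("name", "N/A")
print(journal_name)  # -> N/A
```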
@@ -153,7 +174,10 @@ def get_multi_paper_recommendations(
     top_papers = list(filtered_papers.values())[:3]
     top_papers_info = "\n".join(
         [
-            f"{i+1}. {paper['Title']} ({paper['Year']})"
+            # f"{i+1}. {paper['Title']} ({paper['Year']})"
+            f"{i+1}. {paper['Title']} ({paper['Year']}; "
+            f"semantic_scholar_paper_id: {paper['semantic_scholar_paper_id']}; "
+            f"arXiv ID: {paper['arxiv_id']})"
             for i, paper in enumerate(top_papers)
         ]
     )
@@ -165,10 +189,10 @@ def get_multi_paper_recommendations(
         "Papers are attached as an artifact."
     )
     content += " Here is a summary of the recommendations:\n"
-    content += f"Number of papers found: {len(filtered_papers)}\n"
+    content += f"Number of recommended papers found: {len(filtered_papers)}\n"
     content += f"Query Paper IDs: {', '.join(paper_ids)}\n"
     content += f"Year: {year}\n" if year else ""
-    content += "
+    content += "Here are a few of these papers:\n" + top_papers_info

     return Command(
         update={
aiagents4pharma/talk2scholars/tools/s2/query_results.py

@@ -44,7 +44,7 @@ def query_results(question: str, state: Annotated[dict, InjectedState]) -> str:
         raise NoPapersFoundError(
             "No papers found. A search needs to be performed first."
         )
-    context_key = state.get("last_displayed_papers","pdf_data")
+    context_key = state.get("last_displayed_papers", "pdf_data")
     dic_papers = state.get(context_key)
     df_papers = pd.DataFrame.from_dict(dic_papers, orient="index")
     df_agent = create_pandas_dataframe_agent(
aiagents4pharma/talk2scholars/tools/s2/search.py

@@ -14,6 +14,7 @@ from langchain_core.tools.base import InjectedToolCallId
 from langgraph.types import Command
 from pydantic import BaseModel, Field

+# pylint: disable=R0914,R0912,R0915
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -27,7 +28,7 @@ class SearchInput(BaseModel):
         "Be specific and include relevant academic terms."
     )
     limit: int = Field(
-        default=
+        default=10, description="Maximum number of results to return", ge=1, le=100
     )
     year: Optional[str] = Field(
         default=None,
@@ -75,14 +76,26 @@ def search_tool(
         params["year"] = year

     # Wrap API call in try/except to catch connectivity issues
-
-
-
-
-
-
-
-
+    response = None
+    for attempt in range(10):
+        try:
+            response = requests.get(endpoint, params=params, timeout=10)
+            response.raise_for_status()  # Raises HTTPError for bad responses
+            break  # Exit loop if request is successful
+        except requests.exceptions.RequestException as e:
+            logger.error(
+                "Attempt %d: Failed to connect to Semantic Scholar API: %s",
+                attempt + 1,
+                e,
+            )
+            if attempt == 9:  # Last attempt
+                raise RuntimeError(
+                    "Failed to connect to Semantic Scholar API after 10 attempts."
+                    "Please retry the same query."
+                ) from e
+
+    if response is None:
+        raise RuntimeError("Failed to obtain a response from the Semantic Scholar API.")

     data = response.json()

@@ -108,11 +121,22 @@ def search_tool(
     # Create a dictionary to store the papers
     filtered_papers = {
         paper["paperId"]: {
-            "
+            "semantic_scholar_paper_id": paper["paperId"],
             "Title": paper.get("title", "N/A"),
             "Abstract": paper.get("abstract", "N/A"),
             "Year": paper.get("year", "N/A"),
+            "Publication Date": paper.get("publicationDate", "N/A"),
+            "Venue": paper.get("venue", "N/A"),
+            # "Publication Venue": (paper.get("publicationVenue") or {}).get("name", "N/A"),
+            # "Venue Type": (paper.get("publicationVenue") or {}).get("name", "N/A"),
+            "Journal Name": (paper.get("journal") or {}).get("name", "N/A"),
+            # "Journal Volume": paper.get("journal", {}).get("volume", "N/A"),
+            # "Journal Pages": paper.get("journal", {}).get("pages", "N/A"),
             "Citation Count": paper.get("citationCount", "N/A"),
+            "Authors": [
+                f"{author.get('name', 'N/A')} (ID: {author.get('authorId', 'N/A')})"
+                for author in paper.get("authors", [])
+            ],
             "URL": paper.get("url", "N/A"),
             "arxiv_id": paper.get("externalIds", {}).get("ArXiv", "N/A"),
         }
@@ -126,11 +150,15 @@ def search_tool(
     top_papers = list(filtered_papers.values())[:3]
     top_papers_info = "\n".join(
         [
-            f"{i+1}. {paper['Title']} ({paper['Year']}
+            f"{i+1}. {paper['Title']} ({paper['Year']}; "
+            f"semantic_scholar_paper_id: {paper['semantic_scholar_paper_id']}; "
+            f"arXiv ID: {paper['arxiv_id']})"
             for i, paper in enumerate(top_papers)
         ]
     )

+    logger.info("-----------Filtered %d papers", len(filtered_papers))
+
     content = (
         "Search was successful. Papers are attached as an artifact. "
         "Here is a summary of the search results:\n"
@@ -138,7 +166,7 @@ def search_tool(
     content += f"Number of papers found: {len(filtered_papers)}\n"
     content += f"Query: {query}\n"
     content += f"Year: {year}\n" if year else ""
-    content += "Top papers:\n" + top_papers_info
+    content += "Top 3 papers:\n" + top_papers_info

     return Command(
         update={
aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py

@@ -14,6 +14,7 @@ from langchain_core.tools.base import InjectedToolCallId
 from langgraph.types import Command
 from pydantic import BaseModel, Field

+# pylint: disable=R0914,R0912,R0915
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -44,7 +45,7 @@ class SinglePaperRecInput(BaseModel):
 def get_single_paper_recommendations(
     paper_id: str,
     tool_call_id: Annotated[str, InjectedToolCallId],
-    limit: int =
+    limit: int = 10,
     year: Optional[str] = None,
 ) -> Command[Any]:
     """
@@ -85,16 +86,28 @@ def get_single_paper_recommendations(
         params["year"] = year

     # Wrap API call in try/except to catch connectivity issues and check response format
-
-
-
-
-
-
-
-
-
-
+    response = None
+    for attempt in range(10):
+        try:
+            response = requests.get(
+                endpoint, params=params, timeout=cfg.request_timeout
+            )
+            response.raise_for_status()  # Raises HTTPError for bad responses
+            break  # Exit loop if request is successful
+        except requests.exceptions.RequestException as e:
+            logger.error(
+                "Attempt %d: Failed to connect to Semantic Scholar API for recommendations: %s",
+                attempt + 1,
+                e,
+            )
+            if attempt == 9:  # Last attempt
+                raise RuntimeError(
+                    "Failed to connect to Semantic Scholar API after 10 attempts."
+                    "Please retry the same query."
+                ) from e
+
+    if response is None:
+        raise RuntimeError("Failed to obtain a response from the Semantic Scholar API.")

     logger.info(
         "API Response Status for recommendations of paper %s: %s",
@@ -125,11 +138,22 @@ def get_single_paper_recommendations(
     # Extract paper ID and title from recommendations
     filtered_papers = {
         paper["paperId"]: {
-            "
+            "semantic_scholar_paper_id": paper["paperId"],
             "Title": paper.get("title", "N/A"),
             "Abstract": paper.get("abstract", "N/A"),
             "Year": paper.get("year", "N/A"),
+            "Publication Date": paper.get("publicationDate", "N/A"),
+            "Venue": paper.get("venue", "N/A"),
+            # "Publication Venue": (paper.get("publicationVenue") or {}).get("name", "N/A"),
+            # "Venue Type": (paper.get("publicationVenue") or {}).get("name", "N/A"),
+            "Journal Name": (paper.get("journal") or {}).get("name", "N/A"),
+            # "Journal Volume": paper.get("journal", {}).get("volume", "N/A"),
+            # "Journal Pages": paper.get("journal", {}).get("pages", "N/A"),
             "Citation Count": paper.get("citationCount", "N/A"),
+            "Authors": [
+                f"{author.get('name', 'N/A')} (ID: {author.get('authorId', 'N/A')})"
+                for author in paper.get("authors", [])
+            ],
             "URL": paper.get("url", "N/A"),
             "arxiv_id": paper.get("externalIds", {}).get("ArXiv", "N/A"),
         }
@@ -141,7 +165,10 @@ def get_single_paper_recommendations(
     top_papers = list(filtered_papers.values())[:3]
     top_papers_info = "\n".join(
         [
-            f"{i+1}. {paper['Title']} ({paper['Year']})"
+            # f"{i+1}. {paper['Title']} ({paper['Year']})"
+            f"{i+1}. {paper['Title']} ({paper['Year']}; "
+            f"semantic_scholar_paper_id: {paper['semantic_scholar_paper_id']}; "
+            f"arXiv ID: {paper['arxiv_id']})"
             for i, paper in enumerate(top_papers)
         ]
     )
@@ -153,10 +180,9 @@ def get_single_paper_recommendations(
         "Papers are attached as an artifact. "
         "Here is a summary of the recommendations:\n"
     )
-    content += f"Number of papers found: {len(filtered_papers)}\n"
+    content += f"Number of recommended papers found: {len(filtered_papers)}\n"
     content += f"Query Paper ID: {paper_id}\n"
-    content +=
-    content += "Top papers:\n" + top_papers_info
+    content += "Here are a few of these papers:\n" + top_papers_info

     return Command(
         update={