aiagents4pharma 1.29.0__py3-none-any.whl → 1.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2scholars/agents/__init__.py +1 -0
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +86 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/config.yaml +2 -0
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/__init__.py +3 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +1 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +142 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +154 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +17 -0
- aiagents4pharma/talk2scholars/tools/paper_download/abstract_downloader.py +43 -0
- aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py +108 -0
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +60 -0
- {aiagents4pharma-1.29.0.dist-info → aiagents4pharma-1.30.0.dist-info}/METADATA +1 -1
- {aiagents4pharma-1.29.0.dist-info → aiagents4pharma-1.30.0.dist-info}/RECORD +17 -8
- {aiagents4pharma-1.29.0.dist-info → aiagents4pharma-1.30.0.dist-info}/LICENSE +0 -0
- {aiagents4pharma-1.29.0.dist-info → aiagents4pharma-1.30.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.29.0.dist-info → aiagents4pharma-1.30.0.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/agents/paper_download_agent.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""
+This module defines the paper download agent that connects to the arXiv API to fetch
+paper details and PDFs. It is part of the Talk2Scholars project.
+"""
+
+import logging
+from typing import Any, Dict
+import hydra
+from langchain_core.language_models.chat_models import BaseChatModel
+from langgraph.graph import START, StateGraph
+from langgraph.prebuilt.chat_agent_executor import create_react_agent
+from langgraph.prebuilt.tool_node import ToolNode
+from langgraph.checkpoint.memory import MemorySaver
+from ..state.state_talk2scholars import Talk2Scholars
+from ..tools.paper_download import download_arxiv_paper
+from ..tools.s2.query_results import query_results
+
+# Initialize logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def get_app(uniq_id, llm_model: BaseChatModel):
+    """
+    Initializes and returns the LangGraph application for the Talk2Scholars paper download agent.
+
+    Args:
+        uniq_id (str): A unique identifier for tracking the current session.
+        llm_model (BaseChatModel, optional): The language model to be used by the agent.
+            Defaults to ChatOpenAI(model="gpt-4o-mini", temperature=0.5).
+
+    Returns:
+        StateGraph: A compiled LangGraph application that enables the paper download agent to
+        process user queries and retrieve arXiv papers.
+    """
+
+    # Load Hydra configuration
+    logger.info("Loading Hydra configuration for Talk2Scholars paper download agent")
+    with hydra.initialize(version_base=None, config_path="../configs"):
+        cfg = hydra.compose(
+            config_name="config",
+            overrides=["agents/talk2scholars/paper_download_agent=default"]
+        )
+        cfg = cfg.agents.talk2scholars.paper_download_agent
+
+    # Define tools properly
+    tools = ToolNode(
+        [download_arxiv_paper, query_results]
+    )
+
+    # Define the model
+    logger.info("Using OpenAI model %s", llm_model)
+    model = create_react_agent(
+        llm_model,
+        tools=tools,
+        state_schema=Talk2Scholars,
+        prompt=cfg.prompt,
+        checkpointer=MemorySaver(),
+    )
+
+    def paper_download_agent_node(state: Talk2Scholars) -> Dict[str, Any]:
+        """
+        Processes the current state to fetch the arXiv paper.
+        """
+        logger.info("Creating paper download agent node with thread_id: %s", uniq_id)
+        result = model.invoke(state, {"configurable": {"thread_id": uniq_id}})
+        return result
+
+    # Define new graph
+    workflow = StateGraph(Talk2Scholars)
+
+    # Adding node for paper download agent
+    workflow.add_node("paper_download_agent", paper_download_agent_node)
+
+    # Entering into the agent
+    workflow.add_edge(START, "paper_download_agent")
+
+    # Memory management for states between graph runs
+    checkpointer = MemorySaver()
+
+    # Compile the graph
+    app = workflow.compile(checkpointer=checkpointer)
+
+    # Logging the information and returning the app
+    logger.info("Compiled the graph")
+    return app
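A minimal usage sketch of the new agent module (not part of the package diff): it assumes langchain-openai is installed with an OpenAI key configured, and the model name, thread id, and arXiv ID below are illustrative only.

# Hypothetical wiring of the paper download agent into a short script.
from langchain_openai import ChatOpenAI

from aiagents4pharma.talk2scholars.agents.paper_download_agent import get_app

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.5)  # model choice is an assumption
app = get_app("example-thread-1", llm)

# The compiled graph shares the Talk2Scholars state; the MemorySaver
# checkpointer keys its memory off the thread_id passed in the config.
result = app.invoke(
    {"messages": [("human", "Download arXiv paper 1234.5678")]},
    config={"configurable": {"thread_id": "example-thread-1"}},
)
print(result["messages"][-1].content)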
aiagents4pharma/talk2scholars/configs/config.yaml
@@ -2,10 +2,12 @@ defaults:
   - _self_
   - agents/talk2scholars/main_agent: default
   - agents/talk2scholars/s2_agent: default
+  - agents/talk2scholars/paper_download_agent: default
   - agents/talk2scholars/zotero_agent: default
   - app/frontend: default
   - agents/talk2scholars/pdf_agent: default
   - tools/search: default
+  - tools/download_arxiv_paper: default
   - tools/single_paper_recommendation: default
   - tools/multi_paper_recommendation: default
   - tools/retrieve_semantic_scholar_paper_id: default
aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py
@@ -0,0 +1,142 @@
+"""Unit tests for the paper download agent in Talk2Scholars."""
+
+from unittest import mock
+import pytest
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain_core.language_models.chat_models import BaseChatModel
+from ..agents.paper_download_agent import get_app
+from ..state.state_talk2scholars import Talk2Scholars
+
+
+@pytest.fixture(autouse=True)
+def mock_hydra_fixture():
+    """Mocks Hydra configuration for tests."""
+    with mock.patch("hydra.initialize"), mock.patch("hydra.compose") as mock_compose:
+        cfg_mock = mock.MagicMock()
+        cfg_mock.agents.talk2scholars.s2_agent.temperature = 0
+        cfg_mock.agents.talk2scholars.paper_download_agent.prompt = "Test prompt"
+        mock_compose.return_value = cfg_mock
+        yield mock_compose
+
+
+@pytest.fixture
+def mock_tools_fixture():
+    """Mocks paper download tools to prevent real HTTP calls."""
+    with (
+        mock.patch(
+            "aiagents4pharma.talk2scholars.tools.paper_download."
+            "download_arxiv_input.download_arxiv_paper"
+        ) as mock_download_arxiv_paper,
+        mock.patch(
+            "aiagents4pharma.talk2scholars.tools.s2.query_results.query_results"
+        ) as mock_query_results,
+    ):
+        mock_download_arxiv_paper.return_value = {
+            "pdf_data": {"dummy_key": "dummy_value"}
+        }
+        mock_query_results.return_value = {
+            "result": "Mocked Query Result"
+        }
+        yield [mock_download_arxiv_paper, mock_query_results]
+
+@pytest.mark.usefixtures("mock_hydra_fixture")
+def test_paper_download_agent_initialization():
+    """Ensures the paper download agent initializes properly with a prompt."""
+    thread_id = "test_thread_paper_dl"
+    llm_mock = mock.Mock(spec=BaseChatModel)  # Mock LLM
+
+    with mock.patch(
+        "aiagents4pharma.talk2scholars.agents.paper_download_agent.create_react_agent"
+    ) as mock_create_agent:
+        mock_create_agent.return_value = mock.Mock()
+
+        app = get_app(thread_id, llm_mock)
+        assert app is not None, "The agent app should be successfully created."
+        assert mock_create_agent.called
+
+def test_paper_download_agent_invocation():
+    """Verifies agent processes queries and updates state correctly."""
+    _ = mock_tools_fixture  # Prevents unused-argument warning
+    thread_id = "test_thread_paper_dl"
+    mock_state = Talk2Scholars(
+        messages=[HumanMessage(content="Download paper 1234.5678")]
+    )
+    llm_mock = mock.Mock(spec=BaseChatModel)
+
+    with mock.patch(
+        "aiagents4pharma.talk2scholars.agents.paper_download_agent.create_react_agent"
+    ) as mock_create_agent:
+        mock_agent = mock.Mock()
+        mock_create_agent.return_value = mock_agent
+        mock_agent.invoke.return_value = {
+            "messages": [AIMessage(content="Here is the paper")],
+            "pdf_data": {"file_bytes": b"FAKE_PDF_CONTENTS"},
+        }
+
+
+        app = get_app(thread_id, llm_mock)
+        result = app.invoke(
+            mock_state,
+            config={
+                "configurable": {
+                    "thread_id": thread_id,
+                    "checkpoint_ns": "test_ns",
+                    "checkpoint_id": "test_checkpoint",
+                }
+            },
+        )
+
+        assert "messages" in result
+        assert "pdf_data" in result
+
+
+def test_paper_download_agent_tools_assignment(request):  # Keep fixture name
+    """Checks correct tool assignment (download_arxiv_paper, query_results)."""
+    thread_id = "test_thread_paper_dl"
+    mock_tools = request.getfixturevalue("mock_tools_fixture")
+    llm_mock = mock.Mock(spec=BaseChatModel)
+
+    with (
+        mock.patch(
+            "aiagents4pharma.talk2scholars.agents.paper_download_agent.create_react_agent"
+        ) as mock_create_agent,
+        mock.patch(
+            "aiagents4pharma.talk2scholars.agents.paper_download_agent.ToolNode"
+        ) as mock_toolnode,
+    ):
+        mock_agent = mock.Mock()
+        mock_create_agent.return_value = mock_agent
+        mock_tool_instance = mock.Mock()
+        mock_tool_instance.tools = mock_tools
+        mock_toolnode.return_value = mock_tool_instance
+
+        get_app(thread_id, llm_mock)
+        assert mock_toolnode.called
+        assert len(mock_tool_instance.tools) == 2
+
+
+def test_paper_download_agent_hydra_failure():
+    """Confirms the agent gracefully handles exceptions if Hydra fails."""
+    thread_id = "test_thread_paper_dl"
+    llm_mock = mock.Mock(spec=BaseChatModel)
+
+    with mock.patch("hydra.initialize", side_effect=Exception("Mock Hydra failure")):
+        with pytest.raises(Exception) as exc_info:
+            get_app(thread_id, llm_mock)
+        assert "Mock Hydra failure" in str(exc_info.value)
+
+
+def test_paper_download_agent_model_failure():
+    """Ensures agent handles model-related failures gracefully."""
+    thread_id = "test_thread_paper_dl"
+    llm_mock = mock.Mock(spec=BaseChatModel)
+
+    with mock.patch(
+        "aiagents4pharma.talk2scholars.agents.paper_download_agent.create_react_agent",
+        side_effect=Exception("Mock model failure"),
+    ):
+        with pytest.raises(Exception) as exc_info:
+            get_app(thread_id, llm_mock)
+        assert "Mock model failure" in str(exc_info.value), (
+            "Model initialization failure should raise an exception."
+        )
aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py
@@ -0,0 +1,154 @@
+"""
+Unit tests for arXiv paper downloading functionality, including:
+- AbstractPaperDownloader (base class)
+- ArxivPaperDownloader (arXiv-specific implementation)
+- download_arxiv_paper tool function.
+"""
+
+from unittest.mock import patch, MagicMock
+import pytest
+import requests
+from requests.exceptions import HTTPError
+from langgraph.types import Command
+from langchain_core.messages import ToolMessage
+
+# Import the classes and function under test
+from aiagents4pharma.talk2scholars.tools.paper_download.abstract_downloader import (
+    AbstractPaperDownloader,
+)
+from aiagents4pharma.talk2scholars.tools.paper_download.arxiv_downloader import (
+    ArxivPaperDownloader,
+)
+from aiagents4pharma.talk2scholars.tools.paper_download.download_arxiv_input import (
+    download_arxiv_paper,
+)
+
+@pytest.mark.parametrize("class_obj", [AbstractPaperDownloader])
+
+def test_abstract_downloader_cannot_be_instantiated(class_obj):
+    """
+    Validates that AbstractPaperDownloader is indeed abstract and raises TypeError
+    if anyone attempts to instantiate it directly.
+    """
+    with pytest.raises(TypeError):
+        class_obj()
+
+
+@pytest.fixture(name="arxiv_downloader_fixture")
+@pytest.mark.usefixtures("mock_hydra_config_setup")
+def fixture_arxiv_downloader():
+    """
+    Provides an ArxivPaperDownloader instance with a mocked Hydra config.
+    """
+    return ArxivPaperDownloader()
+
+
+def test_fetch_metadata_success(arxiv_downloader_fixture,):
+    """
+    Ensures fetch_metadata retrieves XML data correctly, given a successful HTTP response.
+    """
+    mock_response = MagicMock()
+    mock_response.text = "<xml>Mock ArXiv Metadata</xml>"
+    mock_response.raise_for_status = MagicMock()
+
+    with patch.object(requests, "get", return_value=mock_response) as mock_get:
+        paper_id = "1234.5678"
+        result = arxiv_downloader_fixture.fetch_metadata(paper_id)
+        mock_get.assert_called_once_with(
+            "http://export.arxiv.org/api/query?search_query=id:1234.5678&start=0&max_results=1",
+            timeout=10,
+        )
+        assert result["xml"] == "<xml>Mock ArXiv Metadata</xml>"
+
+
+def test_fetch_metadata_http_error(arxiv_downloader_fixture):
+    """
+    Validates that fetch_metadata raises HTTPError when the response indicates a failure.
+    """
+    mock_response = MagicMock()
+    mock_response.raise_for_status.side_effect = HTTPError("Mocked HTTP failure")
+
+    with patch.object(requests, "get", return_value=mock_response):
+        with pytest.raises(HTTPError):
+            arxiv_downloader_fixture.fetch_metadata("invalid_id")
+
+
+def test_download_pdf_success(arxiv_downloader_fixture):
+    """
+    Tests that download_pdf fetches the PDF link from metadata and successfully
+    retrieves the binary content.
+    """
+    mock_metadata = {
+        "xml": """
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <link title="pdf" href="http://test.arxiv.org/pdf/1234.5678v1.pdf"/>
+          </entry>
+        </feed>
+        """
+    }
+
+    mock_pdf_response = MagicMock()
+    mock_pdf_response.raise_for_status = MagicMock()
+    mock_pdf_response.iter_content = lambda chunk_size: [b"FAKE_PDF_CONTENT"]
+
+    with patch.object(arxiv_downloader_fixture, "fetch_metadata", return_value=mock_metadata):
+        with patch.object(requests, "get", return_value=mock_pdf_response) as mock_get:
+            result = arxiv_downloader_fixture.download_pdf("1234.5678")
+            assert result["pdf_object"] == b"FAKE_PDF_CONTENT"
+            assert result["pdf_url"] == "http://test.arxiv.org/pdf/1234.5678v1.pdf"
+            assert result["arxiv_id"] == "1234.5678"
+            mock_get.assert_called_once_with(
+                "http://test.arxiv.org/pdf/1234.5678v1.pdf",
+                stream=True,
+                timeout=10,
+            )
+
+
+def test_download_pdf_no_pdf_link(arxiv_downloader_fixture):
+    """
+    Ensures a RuntimeError is raised if no <link> with title="pdf" is found in the XML.
+    """
+    mock_metadata = {"xml": "<feed></feed>"}
+
+    with patch.object(arxiv_downloader_fixture, "fetch_metadata", return_value=mock_metadata):
+        with pytest.raises(RuntimeError, match="Failed to download PDF"):
+            arxiv_downloader_fixture.download_pdf("1234.5678")
+
+
+def test_download_arxiv_paper_tool_success(arxiv_downloader_fixture):
+    """
+    Validates download_arxiv_paper orchestrates the ArxivPaperDownloader correctly,
+    returning a Command with PDF data and success messages.
+    """
+    mock_metadata = {"xml": "<mockxml></mockxml>"}
+    mock_pdf_response = {
+        "pdf_object": b"FAKE_PDF_CONTENT",
+        "pdf_url": "http://test.arxiv.org/mock.pdf",
+        "arxiv_id": "9999.8888",
+    }
+
+    with patch(
+        "aiagents4pharma.talk2scholars.tools.paper_download.download_arxiv_input."
+        "ArxivPaperDownloader",
+        return_value=arxiv_downloader_fixture,
+    ):
+        with patch.object(arxiv_downloader_fixture, "fetch_metadata", return_value=mock_metadata):
+            with patch.object(
+                arxiv_downloader_fixture,
+                "download_pdf",
+                return_value=mock_pdf_response,
+            ):
+                command_result = download_arxiv_paper.invoke(
+                    {"arxiv_id": "9999.8888", "tool_call_id": "test_tool_call"}
+                )
+
+    assert isinstance(command_result, Command)
+    assert "pdf_data" in command_result.update
+    assert command_result.update["pdf_data"] == mock_pdf_response
+
+    messages = command_result.update.get("messages", [])
+    assert len(messages) == 1
+    assert isinstance(messages[0], ToolMessage)
+    assert "Successfully downloaded PDF" in messages[0].content
+    assert "9999.8888" in messages[0].content
aiagents4pharma/talk2scholars/tools/paper_download/__init__.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+"""
+This package provides modules for fetching and downloading academic papers from arXiv.
+"""
+
+# Import modules
+from . import abstract_downloader
+from . import arxiv_downloader
+from . import download_arxiv_input
+from .download_arxiv_input import download_arxiv_paper
+
+__all__ = [
+    "abstract_downloader",
+    "arxiv_downloader",
+    "download_arxiv_input",
+    "download_arxiv_paper",
+]
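As a quick illustration (not part of the diff), the re-export above means the tool can be imported either from the subpackage or from its defining module; both names refer to the same object.

# Both import paths resolve to the same tool object via the __init__ re-export.
from aiagents4pharma.talk2scholars.tools.paper_download import download_arxiv_paper
from aiagents4pharma.talk2scholars.tools.paper_download.download_arxiv_input import (
    download_arxiv_paper as download_arxiv_paper_direct,
)

assert download_arxiv_paper is download_arxiv_paper_direct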
aiagents4pharma/talk2scholars/tools/paper_download/abstract_downloader.py
@@ -0,0 +1,43 @@
+"""
+Abstract Base Class for Paper Downloaders.
+
+This module defines the `AbstractPaperDownloader` class, which serves as a
+base class for downloading scholarly papers from different sources
+(e.g., arXiv, PubMed, IEEE Xplore). Any specific downloader should
+inherit from this class and implement its methods.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+class AbstractPaperDownloader(ABC):
+    """
+    Abstract base class for scholarly paper downloaders.
+
+    This is designed to be extended for different paper sources
+    like arXiv, PubMed, IEEE Xplore, etc. Each implementation
+    must define methods for fetching metadata and downloading PDFs.
+    """
+
+    @abstractmethod
+    def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
+        """
+        Fetch metadata for a given paper ID.
+
+        Args:
+            paper_id (str): The unique identifier for the paper.
+
+        Returns:
+            Dict[str, Any]: The metadata dictionary (format depends on the data source).
+        """
+
+    @abstractmethod
+    def download_pdf(self, paper_id: str) -> bytes:
+        """
+        Download the PDF for a given paper ID.
+
+        Args:
+            paper_id (str): The unique identifier for the paper.
+
+        Returns:
+            bytes: The binary content of the downloaded PDF.
+        """
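The docstring above says each new source should subclass AbstractPaperDownloader and implement both methods. A minimal illustrative subclass follows; the bioRxiv endpoint and response fields are assumptions, not part of the package.

# Illustrative subclass only: the endpoint and the "collection"/"pdf_url"
# response fields are assumptions, not part of aiagents4pharma.
from typing import Any, Dict

import requests

from aiagents4pharma.talk2scholars.tools.paper_download.abstract_downloader import (
    AbstractPaperDownloader,
)

class BiorxivPaperDownloader(AbstractPaperDownloader):
    """Example subclass: both abstract methods must be implemented."""

    BASE_URL = "https://api.biorxiv.org/details/biorxiv"  # assumed endpoint

    def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
        response = requests.get(f"{self.BASE_URL}/{paper_id}", timeout=10)
        response.raise_for_status()
        return {"json": response.json()}

    def download_pdf(self, paper_id: str) -> bytes:
        metadata = self.fetch_metadata(paper_id)
        pdf_url = metadata["json"]["collection"][0]["pdf_url"]  # assumed field
        pdf_response = requests.get(pdf_url, timeout=10)
        pdf_response.raise_for_status()
        return pdf_response.content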
aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py
@@ -0,0 +1,108 @@
+"""
+Arxiv Paper Downloader
+
+This module provides an implementation of `AbstractPaperDownloader` for arXiv.
+It connects to the arXiv API, retrieves metadata for a research paper, and
+downloads the corresponding PDF.
+
+By using an abstract base class, this implementation is extendable to other
+APIs like PubMed, IEEE Xplore, etc.
+"""
+import xml.etree.ElementTree as ET
+from typing import Any, Dict
+import logging
+import hydra
+import requests
+from .abstract_downloader import AbstractPaperDownloader
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class ArxivPaperDownloader(AbstractPaperDownloader):
+    """
+    Downloader class for arXiv.
+
+    This class interfaces with the arXiv API to fetch metadata
+    and retrieve PDFs of academic papers based on their arXiv IDs.
+    """
+
+    def __init__(self):
+        """
+        Initializes the arXiv paper downloader.
+
+        Uses Hydra for configuration management to retrieve API details.
+        """
+        with hydra.initialize(version_base=None, config_path="../../configs"):
+            cfg = hydra.compose(
+                config_name="config",
+                overrides=["tools/download_arxiv_paper=default"]
+            )
+            self.api_url = cfg.tools.download_arxiv_paper.api_url
+            self.request_timeout = cfg.tools.download_arxiv_paper.request_timeout
+            self.chunk_size = cfg.tools.download_arxiv_paper.chunk_size
+            self.pdf_base_url = cfg.tools.download_arxiv_paper.pdf_base_url
+    def fetch_metadata(self, paper_id: str) -> Dict[str, Any]:
+        """
+        Fetch metadata from arXiv for a given paper ID.
+
+        Args:
+            paper_id (str): The arXiv ID of the paper.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing metadata, including the XML response.
+        """
+        logger.info("Fetching metadata from arXiv for paper ID: %s", paper_id)
+        api_url = f"{self.api_url}?search_query=id:{paper_id}&start=0&max_results=1"
+        response = requests.get(api_url, timeout=self.request_timeout)
+        response.raise_for_status()
+        return {"xml": response.text}
+
+    def download_pdf(self, paper_id: str) -> Dict[str, Any]:
+        """
+        Download the PDF of a paper from arXiv.
+
+        This function first retrieves the paper's metadata to locate the PDF link
+        before downloading the file.
+
+        Args:
+            paper_id (str): The arXiv ID of the paper.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing:
+                - `pdf_object`: The binary content of the downloaded PDF.
+                - `pdf_url`: The URL from which the PDF was fetched.
+                - `arxiv_id`: The arXiv ID of the downloaded paper.
+        """
+        metadata = self.fetch_metadata(paper_id)
+
+        # Parse the XML response to locate the PDF link.
+        root = ET.fromstring(metadata["xml"])
+        ns = {"atom": "http://www.w3.org/2005/Atom"}
+        pdf_url = next(
+            (
+                link.attrib.get("href")
+                for entry in root.findall("atom:entry", ns)
+                for link in entry.findall("atom:link", ns)
+                if link.attrib.get("title") == "pdf"
+            ),
+            None,
+        )
+
+        if not pdf_url:
+            raise RuntimeError(f"Failed to download PDF for arXiv ID {paper_id}.")
+
+        logger.info("Downloading PDF from: %s", pdf_url)
+        pdf_response = requests.get(pdf_url, stream=True, timeout=self.request_timeout)
+        pdf_response.raise_for_status()
+
+        # Combine the PDF data from chunks.
+        pdf_object = b"".join(
+            chunk for chunk in pdf_response.iter_content(chunk_size=self.chunk_size) if chunk
+        )
+
+        return {
+            "pdf_object": pdf_object,
+            "pdf_url": pdf_url,
+            "arxiv_id": paper_id,
+        }
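The constructor above reads four fields from the tools/download_arxiv_paper Hydra group, whose default.yaml is not shown in this diff. A sketch of the expected shape: the key names come from the code, api_url and the 10-second timeout match the unit tests, and the chunk_size and pdf_base_url values are assumptions.

# Sketch of the config the downloader expects; assumed values are marked.
from omegaconf import OmegaConf

assumed_cfg = OmegaConf.create(
    {
        "tools": {
            "download_arxiv_paper": {
                "api_url": "http://export.arxiv.org/api/query",
                "request_timeout": 10,  # seconds, as in the tests
                "chunk_size": 1024,  # bytes per streamed chunk (assumed)
                "pdf_base_url": "https://arxiv.org/pdf",  # assumed
            }
        }
    }
)
print(OmegaConf.to_yaml(assumed_cfg))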
aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
@@ -0,0 +1,60 @@
+# File: aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
+"""
+This module defines the `download_arxiv_paper` tool, which leverages the
+`ArxivPaperDownloader` class to fetch and download academic papers from arXiv
+based on their unique arXiv ID.
+"""
+from typing import Annotated, Any
+from pydantic import BaseModel, Field
+from langchain_core.tools import tool
+from langchain_core.messages import ToolMessage
+from langchain_core.tools.base import InjectedToolCallId
+from langgraph.types import Command
+
+# Local import from the same package:
+from .arxiv_downloader import ArxivPaperDownloader
+
+class DownloadArxivPaperInput(BaseModel):
+    """
+    Input schema for the arXiv paper download tool.
+    (Optional: if you decide to keep Pydantic validation in the future)
+    """
+    arxiv_id: str = Field(
+        description="The arXiv paper ID used to retrieve the paper details and PDF."
+    )
+    tool_call_id: Annotated[str, InjectedToolCallId]
+
+@tool(args_schema=DownloadArxivPaperInput, parse_docstring=True)
+def download_arxiv_paper(
+    arxiv_id: str,
+    tool_call_id: Annotated[str, InjectedToolCallId],
+) -> Command[Any]:
+    """
+    Download an arXiv paper's PDF using its unique arXiv ID.
+
+    This function:
+    1. Creates an `ArxivPaperDownloader` instance.
+    2. Fetches metadata from arXiv using the provided `arxiv_id`.
+    3. Downloads the PDF from the returned link.
+    4. Returns a `Command` object containing the PDF data and a success message.
+
+    Args:
+        arxiv_id (str): The unique arXiv paper ID.
+        tool_call_id (InjectedToolCallId): A unique identifier for tracking this tool call.
+
+    Returns:
+        Command[Any]: Contains metadata and messages about the success of the operation.
+    """
+    downloader = ArxivPaperDownloader()
+
+    # If the downloader fails or the arxiv_id is invalid, this might raise an error
+    pdf_data = downloader.download_pdf(arxiv_id)
+
+    content = f"Successfully downloaded PDF for arXiv ID {arxiv_id}"
+
+    return Command(
+        update={
+            "pdf_data": pdf_data,
+            "messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
+        }
+    )
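A direct-invocation sketch of the tool, mirroring test_download_arxiv_paper_tool_success above; it assumes the tools/download_arxiv_paper Hydra group resolves at runtime, and the arXiv ID and tool_call_id are illustrative.

# Hypothetical direct call; network access and the Hydra config are required.
from aiagents4pharma.talk2scholars.tools.paper_download import download_arxiv_paper

command = download_arxiv_paper.invoke(
    {"arxiv_id": "1234.5678", "tool_call_id": "manual-call-1"}
)

# The Command carries a state update: the PDF payload plus a ToolMessage
# confirming the download.
pdf_data = command.update["pdf_data"]
print(command.update["messages"][0].content)
print(len(pdf_data["pdf_object"]), "bytes from", pdf_data["pdf_url"])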
{aiagents4pharma-1.29.0.dist-info → aiagents4pharma-1.30.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: aiagents4pharma
-Version: 1.29.0
+Version: 1.30.0
 Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D.
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
{aiagents4pharma-1.29.0.dist-info → aiagents4pharma-1.30.0.dist-info}/RECORD
@@ -135,17 +135,19 @@ aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py,sha256
 aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py,sha256=7gwwtfzKhB8GuOBD47XRi0NprwEXkOzwNl5eeu-hDTI,86
 aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py,sha256=m5p0yoJb7I19ua5yeQfXPf7c4r6S1XPwttsrM7Qoy94,9336
 aiagents4pharma/talk2scholars/__init__.py,sha256=gphERyVKZHvOnMQsml7TIHlaIshHJ75R1J3FKExkfuY,120
-aiagents4pharma/talk2scholars/agents/__init__.py,sha256=
+aiagents4pharma/talk2scholars/agents/__init__.py,sha256=inLJpRDlT80RNSi3OFNi2lpbbTisQgzNkMYTvnhFjVY,203
 aiagents4pharma/talk2scholars/agents/main_agent.py,sha256=TABzGSOg7I0_fJ0qybBVqZDdrU8YCjyG_m-kasO4WgE,2854
+aiagents4pharma/talk2scholars/agents/paper_download_agent.py,sha256=3GxxNhA_VGf3QOozIjr5cEY2te5n6rQSdZpdFajZttA,3006
 aiagents4pharma/talk2scholars/agents/pdf_agent.py,sha256=c9-_z5qp5Zkgh6piEIlgI4uo4OMXD3janZNmfYwnFCg,3729
 aiagents4pharma/talk2scholars/agents/s2_agent.py,sha256=ua1bjKE2HBKZuLnDn8me5fuV1lSvdZbwAlo3Yp27TT4,4659
 aiagents4pharma/talk2scholars/agents/zotero_agent.py,sha256=5jfIJiLsRdlCJjkF7BQMkP5PsEY_Gr7SfztWKozbUGo,4223
 aiagents4pharma/talk2scholars/configs/__init__.py,sha256=tf2gz8n7M4ko6xLdX_C925ELVIxoP6SgkPcbeh59ad4,151
-aiagents4pharma/talk2scholars/configs/config.yaml,sha256
+aiagents4pharma/talk2scholars/configs/config.yaml,sha256=-8X0_gTmjEuXAeIrnppw3Npy8HICelHZOvTKEScI-rs,596
 aiagents4pharma/talk2scholars/configs/agents/__init__.py,sha256=yyh7PB2oY_JulnpSQCWS4wwCH_uzIdt47O2Ay48x_oU,75
 aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py,sha256=64GEWAoKOd_YHLi27eSOcOC5eSLK0IG_FNra3ZBt02Y,146
 aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml,sha256=rZfZ_dJArjlznHzusjxCnOjhptLTyejFiB0euV5R13c,662
+aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml,sha256=sn6vX6r-P0CR7UWS63ZqCmMKKn4As8pZoITRWx8sdoo,1151
@@ -155,6 +157,7 @@ aiagents4pharma/talk2scholars/configs/app/__init__.py,sha256=JoSZV6N669kGMv5zLDs
 aiagents4pharma/talk2scholars/configs/app/frontend/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/configs/app/frontend/default.yaml,sha256=wsELBdRLv6UqZ9QZfwpS7K4xfMj5s-a99-aXqIs6WEI,868
 aiagents4pharma/talk2scholars/configs/tools/__init__.py,sha256=GwpgnRrfjyZDVsangewSVTG3H3GBYM6s_YaQd9-zI10,238
+aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/default.yaml,sha256=QV7HrG7NdjBEjTMszh27MbGBYMbf_78V3sCGftdTtvo,442
 aiagents4pharma/talk2scholars/configs/tools/question_and_answer/__init__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
@@ -168,10 +171,12 @@ aiagents4pharma/talk2scholars/configs/tools/zotero_read/__init__.py,sha256=fqQQ-
 aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml,sha256=6ZvZdCsnudPeVjnatv78Z0QfMwsHZuliE2RCIRCW05Y,1221
 aiagents4pharma/talk2scholars/configs/tools/zotero_write/__inti__.py,sha256=fqQQ-GlRcbzru2KmEk3oMma0R6_SzGM8dOXzYeU4oVA,46
 aiagents4pharma/talk2scholars/state/__init__.py,sha256=S6SxlszIMZSIMJehjevPF9sKyR-PAwWb5TEdo6xWXE8,103
-aiagents4pharma/talk2scholars/state/state_talk2scholars.py,sha256=
+aiagents4pharma/talk2scholars/state/state_talk2scholars.py,sha256=0dFSdsGiiilNIuuHQFEjpjQmcZXlK0JQwMV_GCiAsuU,2490
 aiagents4pharma/talk2scholars/tests/__init__.py,sha256=U3PsTiUZaUBD1IZanFGkDIOdFieDVJtGKQ5-woYUo8c,45
 aiagents4pharma/talk2scholars/tests/test_llm_main_integration.py,sha256=SAMG-Kb2S9sei8Us5vUWCUJikTKXPZVKQ6aJJPEhJsc,1880
 aiagents4pharma/talk2scholars/tests/test_main_agent.py,sha256=5QnOPKNrQCd5GdYU-vVF3bUrmitOsUcazZA7BsXeomo,5947
+aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py,sha256=CP4fKFU_JYP_AXvTptnwpjaVar1d5lVKV5vxYgH_1j4,5309
+aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py,sha256=_bGuoo4b6zD_vwLa7jGziWDT5qRtavsf02Jiaa7JIRU,5817
 aiagents4pharma/talk2scholars/tests/test_pdf_agent.py,sha256=TN4Sq5-SCxv-9VfFyq7sOlBlxbekmnWuB7-qh4MrhkA,4656
 aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py,sha256=TpCDiGfsC2y6bOkm0ZTXjT1Vp8D-Po25wiEH5aDT_DA,6491
 aiagents4pharma/talk2scholars/tests/test_routing_logic.py,sha256=AZrvaEBDk51KL6edrZY3GpQ_N6VbrlADqXFeg_jxDoQ,2284
@@ -188,6 +193,10 @@ aiagents4pharma/talk2scholars/tests/test_zotero_path.py,sha256=XeXYqTlSkJgZ02tCz
 aiagents4pharma/talk2scholars/tests/test_zotero_read.py,sha256=vLAPAFeL8MjDju_HlsLnio-9HxzN1RqOApr9jyemYBk,14951
 aiagents4pharma/talk2scholars/tests/test_zotero_write.py,sha256=76V7ezb6Xw-BEEwdJQvJs78JPGRYpAsijHIi3bTGsW8,23206
 aiagents4pharma/talk2scholars/tools/__init__.py,sha256=UtGutYNNaRcr2nOmT_XqbTiaJpgVYKo3KVGVPFVrX2Y,107
+aiagents4pharma/talk2scholars/tools/paper_download/__init__.py,sha256=0XmPLEqCply536Y1uWksmHYjlgNWcmcMpZx63XvGEFI,413
+aiagents4pharma/talk2scholars/tools/paper_download/abstract_downloader.py,sha256=UgJOu9o9RAjlzMahUgPWV6iCGC6n7atDOa0VEp8bGx0,1325
+aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py,sha256=kP5tyLc92zlkF5EPA7zVYSjpVk724pCsjHFgOntb_Tw,3869
+aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py,sha256=EJBr9RSSog8tFa7BIFIDZ-Qn7qjqJIAuRb_hF4wZ49Q,2181
 aiagents4pharma/talk2scholars/tools/pdf/__init__.py,sha256=WOm-o-fFzyjFZBaHg658Gjzdiu1Kt-h9xvzvw0hR7aE,103
 aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py,sha256=22JvT7F0rY11TF40pBfe9Cn2Y-6Tx73NfWDt4NJv700,6639
 aiagents4pharma/talk2scholars/tools/s2/__init__.py,sha256=wytqCmGm8Fbl8y5qLdIkxhhG8VHLYMifCGjbH_LK2Fc,258
@@ -202,8 +211,8 @@ aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py,sha256=eRqdQCyWws8q6iC
 aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py,sha256=dqYc5HWMK3vz77psHYUosMLE63NYg9Nk6xbWy8TOrU4,9246
 aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py,sha256=Ll8YQZj9sYJpXmoGxj_0ZcuEHDj06_CUqdDlTlevGL4,53
 aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py,sha256=nHmYe3kcrygNOslHki4YeMztfnmRDPul4gZvXl_XsV0,1954
-aiagents4pharma-1.
-aiagents4pharma-1.
-aiagents4pharma-1.
-aiagents4pharma-1.
-aiagents4pharma-1.
+aiagents4pharma-1.30.0.dist-info/LICENSE,sha256=IcIbyB1Hyk5ZDah03VNQvJkbNk2hkBCDqQ8qtnCvB4Q,1077
+aiagents4pharma-1.30.0.dist-info/METADATA,sha256=411N0HHxJVGSKxY07zzYQ4Z60aIJRN7fd4cMaSa7uVc,13245
+aiagents4pharma-1.30.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+aiagents4pharma-1.30.0.dist-info/top_level.txt,sha256=-AH8rMmrSnJtq7HaAObS78UU-cTCwvX660dSxeM7a0A,16
+aiagents4pharma-1.30.0.dist-info/RECORD,,
File without changes
|
File without changes
|
File without changes
|