aiagents4pharma 1.38.0__py3-none-any.whl → 1.39.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. aiagents4pharma/talk2scholars/agents/main_agent.py +7 -7
  2. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +12 -4
  3. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +88 -12
  4. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +1 -20
  5. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +1 -26
  6. aiagents4pharma/talk2scholars/configs/config.yaml +2 -0
  7. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +3 -0
  8. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +3 -0
  9. aiagents4pharma/talk2scholars/tests/test_main_agent.py +20 -2
  10. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +28 -0
  11. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +151 -0
  12. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +151 -0
  13. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +107 -29
  14. aiagents4pharma/talk2scholars/tests/test_pdf_agent.py +2 -3
  15. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +194 -543
  16. aiagents4pharma/talk2scholars/tests/test_s2_agent.py +2 -2
  17. aiagents4pharma/talk2scholars/tests/{test_s2_display.py → test_s2_display_dataframe.py} +2 -3
  18. aiagents4pharma/talk2scholars/tests/test_s2_query_dataframe.py +201 -0
  19. aiagents4pharma/talk2scholars/tests/test_s2_retrieve.py +7 -6
  20. aiagents4pharma/talk2scholars/tests/test_s2_utils_ext_ids.py +413 -0
  21. aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +140 -0
  22. aiagents4pharma/talk2scholars/tests/test_zotero_agent.py +0 -1
  23. aiagents4pharma/talk2scholars/tests/test_zotero_read.py +16 -18
  24. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +4 -1
  25. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +92 -37
  26. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +112 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +112 -0
  28. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +73 -556
  29. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +10 -0
  30. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +97 -0
  31. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +77 -0
  32. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +83 -0
  33. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +125 -0
  34. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +162 -0
  35. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +33 -10
  36. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +39 -16
  37. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +124 -10
  38. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +49 -17
  39. aiagents4pharma/talk2scholars/tools/s2/search.py +39 -16
  40. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +34 -16
  41. aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +49 -14
  42. aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +51 -14
  43. aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +50 -15
  44. {aiagents4pharma-1.38.0.dist-info → aiagents4pharma-1.39.1.dist-info}/METADATA +58 -105
  45. {aiagents4pharma-1.38.0.dist-info → aiagents4pharma-1.39.1.dist-info}/RECORD +48 -35
  46. {aiagents4pharma-1.38.0.dist-info → aiagents4pharma-1.39.1.dist-info}/WHEEL +1 -1
  47. aiagents4pharma/talk2scholars/tests/test_llm_main_integration.py +0 -89
  48. aiagents4pharma/talk2scholars/tests/test_routing_logic.py +0 -74
  49. aiagents4pharma/talk2scholars/tests/test_s2_query.py +0 -95
  50. {aiagents4pharma-1.38.0.dist-info → aiagents4pharma-1.39.1.dist-info}/licenses/LICENSE +0 -0
  51. {aiagents4pharma-1.38.0.dist-info → aiagents4pharma-1.39.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,140 @@
1
+ """
2
+ Unit tests for QAToolHelper routines in tool_helper.py
3
+ """
4
+
5
+ import unittest
6
+ from types import SimpleNamespace
7
+ from unittest.mock import MagicMock, patch
8
+
9
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.tool_helper import QAToolHelper
10
+
11
+
12
+ class TestQAToolHelper(unittest.TestCase):
13
+ """tests for QAToolHelper routines in tool_helper.py"""
14
+
15
+ def setUp(self):
16
+ """set up test case"""
17
+ self.helper = QAToolHelper()
18
+
19
+ def test_start_call_sets_config_and_call_id(self):
20
+ """test start_call sets config and call_id"""
21
+ cfg = SimpleNamespace(foo="bar")
22
+ self.helper.start_call(cfg, "call123")
23
+ self.assertIs(self.helper.config, cfg)
24
+ self.assertEqual(self.helper.call_id, "call123")
25
+
26
+ def test_init_vector_store_reuse(self):
27
+ """test init_vector_store reuses existing instance"""
28
+ emb_model = MagicMock()
29
+ first = self.helper.init_vector_store(emb_model)
30
+ second = self.helper.init_vector_store(emb_model)
31
+ self.assertIs(second, first)
32
+
33
+ def test_get_state_models_and_data_success(self):
34
+ """test get_state_models_and_data returns models and data"""
35
+ emb = MagicMock()
36
+ llm = MagicMock()
37
+ articles = {"p": {}}
38
+ state = {
39
+ "text_embedding_model": emb,
40
+ "llm_model": llm,
41
+ "article_data": articles,
42
+ }
43
+ ret_emb, ret_llm, ret_articles = self.helper.get_state_models_and_data(state)
44
+ self.assertIs(ret_emb, emb)
45
+ self.assertIs(ret_llm, llm)
46
+ self.assertIs(ret_articles, articles)
47
+
48
+ def test_get_state_models_and_data_missing_text_embedding(self):
49
+ """test get_state_models_and_data raises ValueError if missing text embedding"""
50
+ state = {"llm_model": MagicMock(), "article_data": {"p": {}}}
51
+ with self.assertRaises(ValueError) as cm:
52
+ self.helper.get_state_models_and_data(state)
53
+ self.assertEqual(str(cm.exception), "No text embedding model found in state.")
54
+
55
+ def test_get_state_models_and_data_missing_llm(self):
56
+ """test get_state_models_and_data raises ValueError if missing LLM"""
57
+ state = {"text_embedding_model": MagicMock(), "article_data": {"p": {}}}
58
+ with self.assertRaises(ValueError) as cm:
59
+ self.helper.get_state_models_and_data(state)
60
+ self.assertEqual(str(cm.exception), "No LLM model found in state.")
61
+
62
+ def test_get_state_models_and_data_missing_article_data(self):
63
+ """test get_state_models_and_data raises ValueError if missing article data"""
64
+ state = {"text_embedding_model": MagicMock(), "llm_model": MagicMock()}
65
+ with self.assertRaises(ValueError) as cm:
66
+ self.helper.get_state_models_and_data(state)
67
+ self.assertEqual(str(cm.exception), "No article_data found in state.")
68
+
69
+ def test_load_candidate_papers_calls_add_paper_only_for_valid(self):
70
+ """test load_candidate_papers calls add_paper only for valid candidates"""
71
+ vs = SimpleNamespace(loaded_papers=set(), add_paper=MagicMock())
72
+ articles = {"p1": {"pdf_url": "url1"}, "p2": {}, "p3": {"pdf_url": None}}
73
+ candidates = ["p1", "p2", "p3"]
74
+ self.helper.load_candidate_papers(vs, articles, candidates)
75
+ vs.add_paper.assert_called_once_with("p1", "url1", articles["p1"])
76
+
77
+ def test_load_candidate_papers_handles_add_paper_exception(self):
78
+ """test load_candidate_papers handles add_paper exception"""
79
+ # If add_paper raises, it should be caught and not propagate
80
+ vs = SimpleNamespace(
81
+ loaded_papers=set(), add_paper=MagicMock(side_effect=ValueError("oops"))
82
+ )
83
+ articles = {"p1": {"pdf_url": "url1"}}
84
+ # Start call to set call_id (used in logging)
85
+ self.helper.start_call(SimpleNamespace(), "call001")
86
+ # Should not raise despite exception
87
+ self.helper.load_candidate_papers(vs, articles, ["p1"])
88
+ vs.add_paper.assert_called_once_with("p1", "url1", articles["p1"])
89
+
90
+ def test_run_reranker_success_and_filtering(self):
91
+ """test run_reranker success and filtering"""
92
+ # Successful rerank returns filtered candidates
93
+ cfg = SimpleNamespace(top_k_papers=2)
94
+ self.helper.config = cfg
95
+ vs = MagicMock()
96
+ with patch(
97
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.tool_helper.rank_papers_by_query",
98
+ return_value=["a", "c"],
99
+ ):
100
+ out = self.helper.run_reranker(vs, "q", ["a", "b"])
101
+ self.assertEqual(out, ["a"])
102
+
103
+ def test_run_reranker_exception_fallback(self):
104
+ """test run_reranker exception fallback"""
105
+ # On reranker failure, should return original candidates
106
+ cfg = SimpleNamespace(top_k_papers=5)
107
+ self.helper.config = cfg
108
+ vs = MagicMock()
109
+
110
+ def fail(*args, **kwargs):
111
+ raise RuntimeError("fail")
112
+
113
+ with patch(
114
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.tool_helper.rank_papers_by_query",
115
+ side_effect=fail,
116
+ ):
117
+ candidates = ["x", "y"]
118
+ out = self.helper.run_reranker(vs, "q", candidates)
119
+ self.assertEqual(out, candidates)
120
+
121
+ def test_format_answer_with_and_without_sources(self):
122
+ """test format_answer with and without sources"""
123
+ articles = {"p1": {"Title": "T1"}, "p2": {"Title": "T2"}}
124
+ # With sources
125
+ with patch(
126
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.tool_helper.generate_answer",
127
+ return_value={"output_text": "ans", "papers_used": ["p1", "p2"]},
128
+ ):
129
+ res = self.helper.format_answer("q", [], MagicMock(), articles)
130
+ self.assertIn("ans", res)
131
+ self.assertIn("Sources:", res)
132
+ self.assertIn("- T1", res)
133
+ self.assertIn("- T2", res)
134
+ # Without sources
135
+ with patch(
136
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.tool_helper.generate_answer",
137
+ return_value={"output_text": "ans", "papers_used": []},
138
+ ):
139
+ res2 = self.helper.format_answer("q", [], MagicMock(), {})
140
+ self.assertEqual(res2, "ans")
@@ -2,7 +2,6 @@
2
2
  Updated Unit Tests for the Zotero agent (Zotero Library Managent sub-agent).
3
3
  """
4
4
 
5
- # pylint: disable=redefined-outer-name
6
5
  from unittest import mock
7
6
  import pytest
8
7
  from langchain_core.messages import HumanMessage, AIMessage
@@ -17,8 +17,6 @@ from aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_pdf_downloader impo
17
17
  )
18
18
  from aiagents4pharma.talk2scholars.tools.zotero.zotero_read import zotero_read
19
19
 
20
- # pylint: disable=protected-access
21
- # pylint: disable=protected-access, too-many-arguments, too-many-positional-arguments
22
20
 
23
21
  # Dummy Hydra configuration to be used in tests
24
22
  dummy_zotero_read_config = SimpleNamespace(
@@ -211,15 +209,15 @@ class TestZoteroSearchTool(unittest.TestCase):
211
209
  @patch(
212
210
  "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.download_pdfs_in_parallel"
213
211
  )
214
- def test_filtering_no_matching_papers(
215
- self,
216
- mock_batch_download,
217
- mock_hydra_init,
218
- mock_hydra_compose,
219
- mock_zotero_class,
220
- mock_get_item_collections,
221
- ):
212
+ def test_filtering_no_matching_papers(self, *mocks):
222
213
  """Testing filtering when no paper matching"""
214
+ (
215
+ mock_batch_download,
216
+ mock_hydra_init,
217
+ mock_hydra_compose,
218
+ mock_zotero_class,
219
+ mock_get_item_collections,
220
+ ) = mocks
223
221
  mock_hydra_compose.return_value = dummy_cfg
224
222
  mock_hydra_init.return_value.__enter__.return_value = None
225
223
 
@@ -460,15 +458,15 @@ class TestZoteroSearchTool(unittest.TestCase):
460
458
  @patch(
461
459
  "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.requests.Session.get"
462
460
  )
463
- def test_pdf_attachment_success(
464
- self,
465
- mock_session_get,
466
- mock_hydra_init,
467
- mock_hydra_compose,
468
- mock_zotero_class,
469
- mock_get_item_collections,
470
- ):
461
+ def test_pdf_attachment_success(self, *mocks):
471
462
  """Test for pdf attachment success"""
463
+ (
464
+ mock_session_get,
465
+ mock_hydra_init,
466
+ mock_hydra_compose,
467
+ mock_zotero_class,
468
+ mock_get_item_collections,
469
+ ) = mocks
472
470
  mock_hydra_compose.return_value = dummy_cfg
473
471
  mock_hydra_init.return_value.__enter__.return_value = None
474
472
 
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- This package provides modules for fetching and downloading academic papers from arXiv.
3
+ This package provides modules for fetching and downloading academic papers from arXiv,
4
+ biorxiv and medrxiv.
4
5
  """
5
6
 
6
7
  # Import modules
@@ -8,4 +9,6 @@ from . import download_arxiv_input
8
9
 
9
10
  __all__ = [
10
11
  "download_arxiv_input",
12
+ "download_biorxiv_input",
13
+ "download_medrxiv_input",
11
14
  ]
@@ -5,7 +5,7 @@ Tool for downloading arXiv paper metadata and retrieving the PDF URL.
5
5
 
6
6
  import logging
7
7
  import xml.etree.ElementTree as ET
8
- from typing import Annotated, Any
8
+ from typing import Annotated, Any, List
9
9
 
10
10
  import hydra
11
11
  import requests
@@ -23,12 +23,22 @@ logger = logging.getLogger(__name__)
23
23
  class DownloadArxivPaperInput(BaseModel):
24
24
  """Input schema for the arXiv paper download tool."""
25
25
 
26
- arxiv_id: str = Field(
27
- description="The arXiv paper ID used to retrieve the paper details and PDF URL."
26
+ arxiv_ids: List[str] = Field(
27
+ description="List of arXiv paper IDs used to retrieve paper details and PDF URLs."
28
28
  )
29
29
  tool_call_id: Annotated[str, InjectedToolCallId]
30
30
 
31
31
 
32
+ # Helper to load arXiv download configuration
33
+ def _get_arxiv_config() -> Any:
34
+ """Load arXiv download configuration."""
35
+ with hydra.initialize(version_base=None, config_path="../../configs"):
36
+ cfg = hydra.compose(
37
+ config_name="config", overrides=["tools/download_arxiv_paper=default"]
38
+ )
39
+ return cfg.tools.download_arxiv_paper
40
+
41
+
32
42
  def fetch_arxiv_metadata(
33
43
  api_url: str, arxiv_id: str, request_timeout: int
34
44
  ) -> ET.Element:
@@ -42,19 +52,21 @@ def fetch_arxiv_metadata(
42
52
  def extract_metadata(entry: ET.Element, ns: dict, arxiv_id: str) -> dict:
43
53
  """Extract metadata from the XML entry."""
44
54
  title_elem = entry.find("atom:title", ns)
45
- title = title_elem.text.strip() if title_elem is not None else "N/A"
55
+ title = (title_elem.text or "").strip() if title_elem is not None else "N/A"
46
56
 
47
- authors = [
48
- author_elem.find("atom:name", ns).text.strip()
49
- for author_elem in entry.findall("atom:author", ns)
50
- if author_elem.find("atom:name", ns) is not None
51
- ]
57
+ authors = []
58
+ for author_elem in entry.findall("atom:author", ns):
59
+ name_elem = author_elem.find("atom:name", ns)
60
+ if name_elem is not None and name_elem.text:
61
+ authors.append(name_elem.text.strip())
52
62
 
53
63
  summary_elem = entry.find("atom:summary", ns)
54
- abstract = summary_elem.text.strip() if summary_elem is not None else "N/A"
64
+ abstract = (summary_elem.text or "").strip() if summary_elem is not None else "N/A"
55
65
 
56
66
  published_elem = entry.find("atom:published", ns)
57
- pub_date = published_elem.text.strip() if published_elem is not None else "N/A"
67
+ pub_date = (
68
+ (published_elem.text or "").strip() if published_elem is not None else "N/A"
69
+ )
58
70
 
59
71
  pdf_url = next(
60
72
  (
@@ -80,43 +92,86 @@ def extract_metadata(entry: ET.Element, ns: dict, arxiv_id: str) -> dict:
80
92
  }
81
93
 
82
94
 
83
- @tool(args_schema=DownloadArxivPaperInput, parse_docstring=True)
95
+ def _get_snippet(abstract: str) -> str:
96
+ """Extract the first one or two sentences from an abstract."""
97
+ if not abstract or abstract == "N/A":
98
+ return ""
99
+ sentences = abstract.split(". ")
100
+ snippet_sentences = sentences[:2]
101
+ snippet = ". ".join(snippet_sentences)
102
+ if not snippet.endswith("."):
103
+ snippet += "."
104
+ return snippet
105
+
106
+
107
+ def _build_summary(article_data: dict[str, Any]) -> str:
108
+ """Build a summary string for up to three papers with snippets."""
109
+ top = list(article_data.values())[:3]
110
+ lines: list[str] = []
111
+ for idx, paper in enumerate(top):
112
+ title = paper.get("Title", "N/A")
113
+ pub_date = paper.get("Publication Date", "N/A")
114
+ url = paper.get("URL", "")
115
+ snippet = _get_snippet(paper.get("Abstract", ""))
116
+ line = f"{idx+1}. {title} ({pub_date})"
117
+ if url:
118
+ line += f"\n View PDF: {url}"
119
+ if snippet:
120
+ line += f"\n Abstract snippet: {snippet}"
121
+ lines.append(line)
122
+ summary = "\n".join(lines)
123
+ return (
124
+ "Download was successful. Papers metadata are attached as an artifact. "
125
+ "Here is a summary of the results:\n"
126
+ f"Number of papers found: {len(article_data)}\n"
127
+ "Top 3 papers:\n" + summary
128
+ )
129
+
130
+
131
+ @tool(
132
+ args_schema=DownloadArxivPaperInput,
133
+ parse_docstring=True,
134
+ )
84
135
  def download_arxiv_paper(
85
- arxiv_id: str,
136
+ arxiv_ids: List[str],
86
137
  tool_call_id: Annotated[str, InjectedToolCallId],
87
138
  ) -> Command[Any]:
88
139
  """
89
- Get metadata and PDF URL for an arXiv paper using its unique arXiv ID.
140
+ Get metadata and PDF URLs for one or more arXiv papers using their unique arXiv IDs.
90
141
  """
91
- logger.info("Fetching metadata from arXiv for paper ID: %s", arxiv_id)
142
+ logger.info("Fetching metadata from arXiv for paper IDs: %s", arxiv_ids)
92
143
 
93
144
  # Load configuration
94
- with hydra.initialize(version_base=None, config_path="../../configs"):
95
- cfg = hydra.compose(
96
- config_name="config", overrides=["tools/download_arxiv_paper=default"]
145
+ cfg = _get_arxiv_config()
146
+ api_url = cfg.api_url
147
+ request_timeout = cfg.request_timeout
148
+
149
+ # Aggregate results
150
+ article_data: dict[str, Any] = {}
151
+ for aid in arxiv_ids:
152
+ logger.info("Processing arXiv ID: %s", aid)
153
+ # Fetch and parse metadata
154
+ entry = fetch_arxiv_metadata(api_url, aid, request_timeout).find(
155
+ "atom:entry", {"atom": "http://www.w3.org/2005/Atom"}
156
+ )
157
+ if entry is None:
158
+ logger.warning("No entry found for arXiv ID %s", aid)
159
+ continue
160
+ article_data[aid] = extract_metadata(
161
+ entry, {"atom": "http://www.w3.org/2005/Atom"}, aid
97
162
  )
98
- api_url = cfg.tools.download_arxiv_paper.api_url
99
- request_timeout = cfg.tools.download_arxiv_paper.request_timeout
100
-
101
- # Fetch and parse metadata
102
- root = fetch_arxiv_metadata(api_url, arxiv_id, request_timeout)
103
- ns = {"atom": "http://www.w3.org/2005/Atom"}
104
-
105
- entry = root.find("atom:entry", ns)
106
- if entry is None:
107
- raise ValueError(f"No entry found for arXiv ID {arxiv_id}")
108
-
109
- # Extract metadata
110
- metadata = extract_metadata(entry, ns, arxiv_id)
111
-
112
- # Create article_data entry with the paper ID as the key
113
- article_data = {arxiv_id: metadata}
114
-
115
- content = f"Successfully retrieved metadata and PDF URL for arXiv ID {arxiv_id}"
116
163
 
164
+ # Build and return summary
165
+ content = _build_summary(article_data)
117
166
  return Command(
118
167
  update={
119
168
  "article_data": article_data,
120
- "messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
169
+ "messages": [
170
+ ToolMessage(
171
+ content=content,
172
+ tool_call_id=tool_call_id,
173
+ artifact=article_data,
174
+ )
175
+ ],
121
176
  }
122
177
  )
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tool for downloading bioRxiv paper metadata and retrieving the PDF URL.
4
+ """
5
+
6
+ import logging
7
+ from typing import Annotated, Any
8
+
9
+ import hydra
10
+ import requests
11
+ from langchain_core.messages import ToolMessage
12
+ from langchain_core.tools import tool
13
+ from langchain_core.tools.base import InjectedToolCallId
14
+ from langgraph.types import Command
15
+ from pydantic import BaseModel, Field
16
+
17
+ # Configure logging
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class DownloadBiorxivPaperInput(BaseModel):
23
+ """Input schema for the bioRxiv paper download tool."""
24
+
25
+ doi: str = Field(description=
26
+ """The bioRxiv DOI, from search_helper or multi_helper or single_helper,
27
+ used to retrieve the paper details and PDF URL."""
28
+ )
29
+ logger.info("DOI Received: %s", doi)
30
+ tool_call_id: Annotated[str, InjectedToolCallId]
31
+
32
+ def fetch_biorxiv_metadata(doi: str, api_url: str, request_timeout: int) -> dict:
33
+ """
34
+ Fetch metadata for a bioRxiv paper using its DOI and extract relevant fields.
35
+
36
+ Parameters:
37
+ doi (str): The DOI of the bioRxiv paper.
38
+
39
+ Returns:
40
+ dict: A dictionary containing the title, authors, abstract, publication date, and URLs.
41
+ """
42
+ # Strip any version suffix (e.g., v1) since bioRxiv's API is version-sensitive
43
+ clean_doi = doi.split("v")[0]
44
+
45
+ api_url = f"{api_url}{clean_doi}"
46
+ logger.info("Fetching metadata from api url: %s", api_url)
47
+ response = requests.get(api_url, timeout=request_timeout)
48
+ response.raise_for_status()
49
+
50
+ data = response.json()
51
+ if not data.get("collection"):
52
+ raise ValueError(f"No metadata found for DOI: {doi}")
53
+
54
+ data = response.json()
55
+
56
+ return data["collection"][0]
57
+
58
+ def extract_metadata(paper: dict, doi: str) -> dict:
59
+ """
60
+ Extract relevant metadata fields from a bioRxiv paper entry.
61
+ """
62
+ title = paper.get("title", "")
63
+ authors = paper.get("authors", "")
64
+ abstract = paper.get("abstract", "")
65
+ pub_date = paper.get("date", "")
66
+ doi_suffix = paper.get("doi", "").split("10.1101/")[-1]
67
+ pdf_url = f"https://www.biorxiv.org/content/10.1101/{doi_suffix}.full.pdf"
68
+ logger.info("PDF URL: %s", pdf_url)
69
+ return {
70
+ "Title": title,
71
+ "Authors": authors,
72
+ "Abstract": abstract,
73
+ "Publication Date": pub_date,
74
+ "URL": pdf_url,
75
+ "pdf_url": pdf_url,
76
+ "filename": f"{doi_suffix}.pdf",
77
+ "source": "biorxiv",
78
+ "biorxiv_id": doi
79
+ }
80
+
81
+ @tool(args_schema=DownloadBiorxivPaperInput, parse_docstring=True)
82
+ def download_biorxiv_paper(
83
+ doi: str,
84
+ tool_call_id: Annotated[str, InjectedToolCallId],
85
+ ) -> Command[Any]:
86
+ """
87
+ Get metadata and PDF URL for a bioRxiv paper using its DOI.
88
+ """
89
+ logger.info("Fetching metadata from bioRxiv for DOI: %s", doi)
90
+
91
+ # Load configuration
92
+ with hydra.initialize(version_base=None, config_path="../../configs"):
93
+ cfg = hydra.compose(
94
+ config_name="config", overrides=["tools/download_biorxiv_paper=default"]
95
+ )
96
+ api_url = cfg.tools.download_biorxiv_paper.api_url
97
+ request_timeout = cfg.tools.download_biorxiv_paper.request_timeout
98
+ logger.info("API URL: %s", api_url)
99
+ logger.info("Request Timeout: %s", request_timeout)
100
+
101
+ # Fetch metadata
102
+ raw_data = fetch_biorxiv_metadata(doi, api_url, request_timeout)
103
+ metadata = extract_metadata(raw_data, doi)
104
+ article_data = {doi: metadata}
105
+ content = f"Successfully retrieved metadata and PDF URL for bioRxiv DOI {doi}"
106
+
107
+ return Command(
108
+ update={
109
+ "article_data": article_data,
110
+ "messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
111
+ }
112
+ )
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tool for downloading medRxiv paper metadata and retrieving the PDF URL.
4
+ """
5
+
6
+ import logging
7
+ from typing import Annotated, Any
8
+
9
+ import hydra
10
+ import requests
11
+ from langchain_core.messages import ToolMessage
12
+ from langchain_core.tools import tool
13
+ from langchain_core.tools.base import InjectedToolCallId
14
+ from langgraph.types import Command
15
+ from pydantic import BaseModel, Field
16
+
17
+ # Configure logging
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class DownloadMedrxivPaperInput(BaseModel):
23
+ """Input schema for the medRxiv paper download tool."""
24
+
25
+ doi: str = Field(description=
26
+ """The medRxiv DOI, from search_helper or multi_helper or single_helper,
27
+ used to retrieve the paper details and PDF URL."""
28
+ )
29
+ logger.info("DOI Received: %s", doi)
30
+ tool_call_id: Annotated[str, InjectedToolCallId]
31
+
32
+ # Fetching raw metadata from medRxiv API for a given DOI
33
+ def fetch_medrxiv_metadata(doi: str, api_url: str, request_timeout: int) -> dict:
34
+ """
35
+ Fetch metadata for a medRxiv paper using its DOI and extract relevant fields.
36
+
37
+ Parameters:
38
+ doi (str): The DOI of the medRxiv paper.
39
+
40
+ Returns:
41
+ dict: A dictionary containing the title, authors, abstract, publication date, and URLs.
42
+ """
43
+ # Strip any version suffix (e.g., v1) since bioRxiv's API is version-sensitive
44
+ clean_doi = doi.split("v")[0]
45
+
46
+ api_url = f"{api_url}{clean_doi}"
47
+ logger.info("Fetching metadata from api url: %s", api_url)
48
+ response = requests.get(api_url, timeout=request_timeout)
49
+ response.raise_for_status()
50
+
51
+ data = response.json()
52
+ if not data.get("collection"):
53
+ raise ValueError(f"No entry found for medRxiv ID {doi}")
54
+
55
+ return data["collection"][0]
56
+
57
+ # Extracting relevant metadata fields from the raw data
58
+ def extract_metadata(paper: dict, doi: str) -> dict:
59
+ """
60
+ Extract relevant metadata fields from a medRxiv paper entry.
61
+ """
62
+ title = paper.get("title", "")
63
+ authors = paper.get("authors", "")
64
+ abstract = paper.get("abstract", "")
65
+ pub_date = paper.get("date", "")
66
+ doi_suffix = paper.get("doi", "").split("10.1101/")[-1]
67
+ pdf_url = f"https://www.medrxiv.org/content/10.1101/{doi_suffix}.full.pdf"
68
+ logger.info("PDF URL: %s", pdf_url)
69
+ return {
70
+ "Title": title,
71
+ "Authors": authors,
72
+ "Abstract": abstract,
73
+ "Publication Date": pub_date,
74
+ "URL": pdf_url,
75
+ "pdf_url": pdf_url,
76
+ "filename": f"{doi_suffix}.pdf",
77
+ "source": "medrxiv",
78
+ "medrxiv_id": doi
79
+ }
80
+
81
+ # Tool to download medRxiv paper metadata and PDF URL
82
+ @tool(args_schema=DownloadMedrxivPaperInput, parse_docstring=True)
83
+ def download_medrxiv_paper(
84
+ doi: str,
85
+ tool_call_id: Annotated[str, InjectedToolCallId],
86
+ ) -> Command[Any]:
87
+ """
88
+ Get metadata and PDF URL for a medRxiv paper using its doi or medrxiv id.
89
+ """
90
+ logger.info("Fetching metadata from medRxiv for DOI: %s", doi)
91
+
92
+ # Load configuration
93
+ with hydra.initialize(version_base=None, config_path="../../configs"):
94
+ cfg = hydra.compose(
95
+ config_name="config", overrides=["tools/download_medrxiv_paper=default"]
96
+ )
97
+ api_url = cfg.tools.download_medrxiv_paper.api_url
98
+ request_timeout = cfg.tools.download_medrxiv_paper.request_timeout
99
+ logger.info("API URL: %s", api_url)
100
+
101
+ raw_data = fetch_medrxiv_metadata(doi, api_url, request_timeout)
102
+ metadata = extract_metadata(raw_data, doi)
103
+ article_data = {doi: metadata}
104
+
105
+ content = f"Successfully retrieved metadata and PDF URL for medRxiv DOI {doi}"
106
+
107
+ return Command(
108
+ update={
109
+ "article_data": article_data,
110
+ "messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
111
+ }
112
+ )