aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -1,177 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Tool for downloading arXiv paper metadata and retrieving the PDF URL.
4
- """
5
-
6
- import logging
7
- import xml.etree.ElementTree as ET
8
- from typing import Annotated, Any, List
9
-
10
- import hydra
11
- import requests
12
- from langchain_core.messages import ToolMessage
13
- from langchain_core.tools import tool
14
- from langchain_core.tools.base import InjectedToolCallId
15
- from langgraph.types import Command
16
- from pydantic import BaseModel, Field
17
-
18
- # Configure logging
19
- logging.basicConfig(level=logging.INFO)
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- class DownloadArxivPaperInput(BaseModel):
24
- """Input schema for the arXiv paper download tool."""
25
-
26
- arxiv_ids: List[str] = Field(
27
- description="List of arXiv paper IDs used to retrieve paper details and PDF URLs."
28
- )
29
- tool_call_id: Annotated[str, InjectedToolCallId]
30
-
31
-
32
- # Helper to load arXiv download configuration
33
- def _get_arxiv_config() -> Any:
34
- """Load arXiv download configuration."""
35
- with hydra.initialize(version_base=None, config_path="../../configs"):
36
- cfg = hydra.compose(
37
- config_name="config", overrides=["tools/download_arxiv_paper=default"]
38
- )
39
- return cfg.tools.download_arxiv_paper
40
-
41
-
42
- def fetch_arxiv_metadata(
43
- api_url: str, arxiv_id: str, request_timeout: int
44
- ) -> ET.Element:
45
- """Fetch and parse metadata from the arXiv API."""
46
- query_url = f"{api_url}?search_query=id:{arxiv_id}&start=0&max_results=1"
47
- response = requests.get(query_url, timeout=request_timeout)
48
- response.raise_for_status()
49
- return ET.fromstring(response.text)
50
-
51
-
52
- def extract_metadata(entry: ET.Element, ns: dict, arxiv_id: str) -> dict:
53
- """Extract metadata from the XML entry."""
54
- title_elem = entry.find("atom:title", ns)
55
- title = (title_elem.text or "").strip() if title_elem is not None else "N/A"
56
-
57
- authors = []
58
- for author_elem in entry.findall("atom:author", ns):
59
- name_elem = author_elem.find("atom:name", ns)
60
- if name_elem is not None and name_elem.text:
61
- authors.append(name_elem.text.strip())
62
-
63
- summary_elem = entry.find("atom:summary", ns)
64
- abstract = (summary_elem.text or "").strip() if summary_elem is not None else "N/A"
65
-
66
- published_elem = entry.find("atom:published", ns)
67
- pub_date = (
68
- (published_elem.text or "").strip() if published_elem is not None else "N/A"
69
- )
70
-
71
- pdf_url = next(
72
- (
73
- link.attrib.get("href")
74
- for link in entry.findall("atom:link", ns)
75
- if link.attrib.get("title") == "pdf"
76
- ),
77
- None,
78
- )
79
- if not pdf_url:
80
- raise RuntimeError(f"Could not find PDF URL for arXiv ID {arxiv_id}")
81
-
82
- return {
83
- "Title": title,
84
- "Authors": authors,
85
- "Abstract": abstract,
86
- "Publication Date": pub_date,
87
- "URL": pdf_url,
88
- "pdf_url": pdf_url,
89
- "filename": f"{arxiv_id}.pdf",
90
- "source": "arxiv",
91
- "arxiv_id": arxiv_id,
92
- }
93
-
94
-
95
- def _get_snippet(abstract: str) -> str:
96
- """Extract the first one or two sentences from an abstract."""
97
- if not abstract or abstract == "N/A":
98
- return ""
99
- sentences = abstract.split(". ")
100
- snippet_sentences = sentences[:2]
101
- snippet = ". ".join(snippet_sentences)
102
- if not snippet.endswith("."):
103
- snippet += "."
104
- return snippet
105
-
106
-
107
- def _build_summary(article_data: dict[str, Any]) -> str:
108
- """Build a summary string for up to three papers with snippets."""
109
- top = list(article_data.values())[:3]
110
- lines: list[str] = []
111
- for idx, paper in enumerate(top):
112
- title = paper.get("Title", "N/A")
113
- pub_date = paper.get("Publication Date", "N/A")
114
- url = paper.get("URL", "")
115
- snippet = _get_snippet(paper.get("Abstract", ""))
116
- line = f"{idx+1}. {title} ({pub_date})"
117
- if url:
118
- line += f"\n View PDF: {url}"
119
- if snippet:
120
- line += f"\n Abstract snippet: {snippet}"
121
- lines.append(line)
122
- summary = "\n".join(lines)
123
- return (
124
- "Download was successful. Papers metadata are attached as an artifact. "
125
- "Here is a summary of the results:\n"
126
- f"Number of papers found: {len(article_data)}\n"
127
- "Top 3 papers:\n" + summary
128
- )
129
-
130
-
131
- @tool(
132
- args_schema=DownloadArxivPaperInput,
133
- parse_docstring=True,
134
- )
135
- def download_arxiv_paper(
136
- arxiv_ids: List[str],
137
- tool_call_id: Annotated[str, InjectedToolCallId],
138
- ) -> Command[Any]:
139
- """
140
- Get metadata and PDF URLs for one or more arXiv papers using their unique arXiv IDs.
141
- """
142
- logger.info("Fetching metadata from arXiv for paper IDs: %s", arxiv_ids)
143
-
144
- # Load configuration
145
- cfg = _get_arxiv_config()
146
- api_url = cfg.api_url
147
- request_timeout = cfg.request_timeout
148
-
149
- # Aggregate results
150
- article_data: dict[str, Any] = {}
151
- for aid in arxiv_ids:
152
- logger.info("Processing arXiv ID: %s", aid)
153
- # Fetch and parse metadata
154
- entry = fetch_arxiv_metadata(api_url, aid, request_timeout).find(
155
- "atom:entry", {"atom": "http://www.w3.org/2005/Atom"}
156
- )
157
- if entry is None:
158
- logger.warning("No entry found for arXiv ID %s", aid)
159
- continue
160
- article_data[aid] = extract_metadata(
161
- entry, {"atom": "http://www.w3.org/2005/Atom"}, aid
162
- )
163
-
164
- # Build and return summary
165
- content = _build_summary(article_data)
166
- return Command(
167
- update={
168
- "article_data": article_data,
169
- "messages": [
170
- ToolMessage(
171
- content=content,
172
- tool_call_id=tool_call_id,
173
- artifact=article_data,
174
- )
175
- ],
176
- }
177
- )
@@ -1,114 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Tool for downloading bioRxiv paper metadata and retrieving the PDF URL.
4
- """
5
-
6
- import logging
7
- from typing import Annotated, Any
8
-
9
- import hydra
10
- import requests
11
- from langchain_core.messages import ToolMessage
12
- from langchain_core.tools import tool
13
- from langchain_core.tools.base import InjectedToolCallId
14
- from langgraph.types import Command
15
- from pydantic import BaseModel, Field
16
-
17
- # Configure logging
18
- logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class DownloadBiorxivPaperInput(BaseModel):
23
- """Input schema for the bioRxiv paper download tool."""
24
-
25
- doi: str = Field(
26
- description="""The bioRxiv DOI, from search_helper or multi_helper or single_helper,
27
- used to retrieve the paper details and PDF URL."""
28
- )
29
- tool_call_id: Annotated[str, InjectedToolCallId]
30
-
31
-
32
- def fetch_biorxiv_metadata(doi: str, api_url: str, request_timeout: int) -> dict:
33
- """
34
- Fetch metadata for a bioRxiv paper using its DOI and extract relevant fields.
35
-
36
- Parameters:
37
- doi (str): The DOI of the bioRxiv paper.
38
-
39
- Returns:
40
- dict: A dictionary containing the title, authors, abstract, publication date, and URLs.
41
- """
42
- # Strip any version suffix (e.g., v1) since bioRxiv's API is version-sensitive
43
- clean_doi = doi.split("v")[0]
44
-
45
- api_url = f"{api_url}{clean_doi}"
46
- logger.info("Fetching metadata from api url: %s", api_url)
47
- response = requests.get(api_url, timeout=request_timeout)
48
- response.raise_for_status()
49
-
50
- data = response.json()
51
- if not data.get("collection"):
52
- raise ValueError(f"No metadata found for DOI: {doi}")
53
-
54
- data = response.json()
55
-
56
- return data["collection"][0]
57
-
58
-
59
- def extract_metadata(paper: dict, doi: str) -> dict:
60
- """
61
- Extract relevant metadata fields from a bioRxiv paper entry.
62
- """
63
- title = paper.get("title", "")
64
- authors = paper.get("authors", "")
65
- abstract = paper.get("abstract", "")
66
- pub_date = paper.get("date", "")
67
- doi_suffix = paper.get("doi", "").split("10.1101/")[-1]
68
- pdf_url = f"https://www.biorxiv.org/content/10.1101/{doi_suffix}.full.pdf"
69
- logger.info("PDF URL: %s", pdf_url)
70
- return {
71
- "Title": title,
72
- "Authors": authors,
73
- "Abstract": abstract,
74
- "Publication Date": pub_date,
75
- "URL": pdf_url,
76
- "pdf_url": pdf_url,
77
- "filename": f"{doi_suffix}.pdf",
78
- "source": "biorxiv",
79
- "biorxiv_id": doi,
80
- }
81
-
82
-
83
- @tool(args_schema=DownloadBiorxivPaperInput, parse_docstring=True)
84
- def download_biorxiv_paper(
85
- doi: str,
86
- tool_call_id: Annotated[str, InjectedToolCallId],
87
- ) -> Command[Any]:
88
- """
89
- Get metadata and PDF URL for a bioRxiv paper using its DOI.
90
- """
91
- logger.info("Fetching metadata from bioRxiv for DOI: %s", doi)
92
-
93
- # Load configuration
94
- with hydra.initialize(version_base=None, config_path="../../configs"):
95
- cfg = hydra.compose(
96
- config_name="config", overrides=["tools/download_biorxiv_paper=default"]
97
- )
98
- api_url = cfg.tools.download_biorxiv_paper.api_url
99
- request_timeout = cfg.tools.download_biorxiv_paper.request_timeout
100
- logger.info("API URL: %s", api_url)
101
- logger.info("Request Timeout: %s", request_timeout)
102
-
103
- # Fetch metadata
104
- raw_data = fetch_biorxiv_metadata(doi, api_url, request_timeout)
105
- metadata = extract_metadata(raw_data, doi)
106
- article_data = {doi: metadata}
107
- content = f"Successfully retrieved metadata and PDF URL for bioRxiv DOI {doi}"
108
-
109
- return Command(
110
- update={
111
- "article_data": article_data,
112
- "messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
113
- }
114
- )
@@ -1,114 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Tool for downloading medRxiv paper metadata and retrieving the PDF URL.
4
- """
5
-
6
- import logging
7
- from typing import Annotated, Any
8
-
9
- import hydra
10
- import requests
11
- from langchain_core.messages import ToolMessage
12
- from langchain_core.tools import tool
13
- from langchain_core.tools.base import InjectedToolCallId
14
- from langgraph.types import Command
15
- from pydantic import BaseModel, Field
16
-
17
- # Configure logging
18
- logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class DownloadMedrxivPaperInput(BaseModel):
23
- """Input schema for the medRxiv paper download tool."""
24
-
25
- doi: str = Field(
26
- description="""The medRxiv DOI, from search_helper or multi_helper or single_helper,
27
- used to retrieve the paper details and PDF URL."""
28
- )
29
- tool_call_id: Annotated[str, InjectedToolCallId]
30
-
31
-
32
- # Fetching raw metadata from medRxiv API for a given DOI
33
- def fetch_medrxiv_metadata(doi: str, api_url: str, request_timeout: int) -> dict:
34
- """
35
- Fetch metadata for a medRxiv paper using its DOI and extract relevant fields.
36
-
37
- Parameters:
38
- doi (str): The DOI of the medRxiv paper.
39
-
40
- Returns:
41
- dict: A dictionary containing the title, authors, abstract, publication date, and URLs.
42
- """
43
- # Strip any version suffix (e.g., v1) since bioRxiv's API is version-sensitive
44
- clean_doi = doi.split("v")[0]
45
-
46
- api_url = f"{api_url}{clean_doi}"
47
- logger.info("Fetching metadata from api url: %s", api_url)
48
- response = requests.get(api_url, timeout=request_timeout)
49
- response.raise_for_status()
50
-
51
- data = response.json()
52
- if not data.get("collection"):
53
- raise ValueError(f"No entry found for medRxiv ID {doi}")
54
-
55
- return data["collection"][0]
56
-
57
-
58
- # Extracting relevant metadata fields from the raw data
59
- def extract_metadata(paper: dict, doi: str) -> dict:
60
- """
61
- Extract relevant metadata fields from a medRxiv paper entry.
62
- """
63
- title = paper.get("title", "")
64
- authors = paper.get("authors", "")
65
- abstract = paper.get("abstract", "")
66
- pub_date = paper.get("date", "")
67
- doi_suffix = paper.get("doi", "").split("10.1101/")[-1]
68
- pdf_url = f"https://www.medrxiv.org/content/10.1101/{doi_suffix}.full.pdf"
69
- logger.info("PDF URL: %s", pdf_url)
70
- return {
71
- "Title": title,
72
- "Authors": authors,
73
- "Abstract": abstract,
74
- "Publication Date": pub_date,
75
- "URL": pdf_url,
76
- "pdf_url": pdf_url,
77
- "filename": f"{doi_suffix}.pdf",
78
- "source": "medrxiv",
79
- "medrxiv_id": doi,
80
- }
81
-
82
-
83
- # Tool to download medRxiv paper metadata and PDF URL
84
- @tool(args_schema=DownloadMedrxivPaperInput, parse_docstring=True)
85
- def download_medrxiv_paper(
86
- doi: str,
87
- tool_call_id: Annotated[str, InjectedToolCallId],
88
- ) -> Command[Any]:
89
- """
90
- Get metadata and PDF URL for a medRxiv paper using its doi or medrxiv id.
91
- """
92
- logger.info("Fetching metadata from medRxiv for DOI: %s", doi)
93
-
94
- # Load configuration
95
- with hydra.initialize(version_base=None, config_path="../../configs"):
96
- cfg = hydra.compose(
97
- config_name="config", overrides=["tools/download_medrxiv_paper=default"]
98
- )
99
- api_url = cfg.tools.download_medrxiv_paper.api_url
100
- request_timeout = cfg.tools.download_medrxiv_paper.request_timeout
101
- logger.info("API URL: %s", api_url)
102
-
103
- raw_data = fetch_medrxiv_metadata(doi, api_url, request_timeout)
104
- metadata = extract_metadata(raw_data, doi)
105
- article_data = {doi: metadata}
106
-
107
- content = f"Successfully retrieved metadata and PDF URL for medRxiv DOI {doi}"
108
-
109
- return Command(
110
- update={
111
- "article_data": article_data,
112
- "messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
113
- }
114
- )