aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,457 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Unified paper download tool for LangGraph.
|
4
|
+
Supports downloading papers from arXiv, medRxiv, bioRxiv, and PubMed through a single interface.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import logging
|
8
|
+
import threading
|
9
|
+
from typing import Annotated, Any, List, Literal, Optional
|
10
|
+
|
11
|
+
import hydra
|
12
|
+
from hydra.core.global_hydra import GlobalHydra
|
13
|
+
from omegaconf import OmegaConf
|
14
|
+
from langchain_core.messages import ToolMessage
|
15
|
+
from langchain_core.tools import tool
|
16
|
+
from langchain_core.tools.base import InjectedToolCallId
|
17
|
+
from langgraph.types import Command
|
18
|
+
from pydantic import BaseModel, Field
|
19
|
+
from .utils.arxiv_downloader import ArxivDownloader
|
20
|
+
from .utils.base_paper_downloader import BasePaperDownloader
|
21
|
+
from .utils.biorxiv_downloader import BiorxivDownloader
|
22
|
+
from .utils.medrxiv_downloader import MedrxivDownloader
|
23
|
+
from .utils.pubmed_downloader import PubmedDownloader
|
24
|
+
|
25
|
+
# Configure logging
|
26
|
+
logging.basicConfig(level=logging.INFO)
|
27
|
+
logger = logging.getLogger(__name__)
|
28
|
+
|
29
|
+
|
30
|
+
class UnifiedPaperDownloadInput(BaseModel):
    """Input schema for the unified paper download tool.

    Validated by LangChain when the tool is invoked; the ``description``
    strings below are surfaced to the LLM as part of the tool schema, so
    they are runtime behavior, not mere documentation.
    """

    # Optional service selector; None defers to the configured default
    # (resolved later via PaperDownloaderFactory.get_default_service()).
    service: Optional[Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]] = Field(
        default=None,
        description=(
            "Paper service to download from: 'arxiv', 'medrxiv', 'biorxiv', or 'pubmed'. "
            "If not specified, uses the configured default service."
        ),
    )
    # Required list of identifiers; the expected format (arXiv ID, DOI, PMID)
    # depends on the chosen service.
    identifiers: List[str] = Field(
        description=(
            "List of paper identifiers. Format depends on service:\n"
            "- arxiv: arXiv IDs (e.g., ['1234.5678', '2301.12345'])\n"
            "- medrxiv: DOIs (e.g., ['10.1101/2020.09.09.20191205'])\n"
            "- biorxiv: DOIs (e.g., ['10.1101/2020.09.09.20191205'])\n"
            "- pubmed: PMIDs (e.g., ['12345678', '87654321'])"
        )
    )
    # Injected by LangGraph at call time; never supplied by the LLM.
    tool_call_id: Annotated[str, InjectedToolCallId]
|
50
|
+
|
51
|
+
|
52
|
+
class PaperDownloaderFactory:
    """Factory for creating configured paper downloader instances.

    The Hydra configuration is loaded once and cached at class level;
    subsequent factory calls reuse the cached config. Loading is guarded
    by a class-level lock so concurrent first calls are safe.
    """

    # Class-level cache for the loaded configuration (None until first load).
    _cached_config = None
    # Lock guarding config load/clear. Created eagerly at class-definition
    # time: the previous lazy creation was itself a race -- two threads could
    # each observe None and install different locks, defeating the
    # double-checked locking in _get_unified_config().
    _config_lock = threading.Lock()

    @classmethod
    def clear_cache(cls) -> None:
        """Clear cached configuration so the next access reloads it."""
        with cls._config_lock:
            cls._cached_config = None

    @staticmethod
    def get_default_service() -> Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]:
        """
        Get the default service from configuration.

        Returns:
            Default service name from config; falls back to 'pubmed' when the
            configured value is missing or not one of the supported services.
        """
        config = PaperDownloaderFactory._get_unified_config()
        default_service = getattr(config.tool, "default_service", "pubmed")
        # Explicit comparisons (rather than a membership test) keep the
        # Literal return type verifiable by static type checkers.
        if default_service == "arxiv":
            return "arxiv"
        if default_service == "medrxiv":
            return "medrxiv"
        if default_service == "biorxiv":
            return "biorxiv"
        if default_service == "pubmed":
            return "pubmed"
        logger.warning(
            "Invalid default service '%s' in config, falling back to 'pubmed'",
            default_service,
        )
        return "pubmed"

    @staticmethod
    def create(
        service: Literal["arxiv", "medrxiv", "biorxiv", "pubmed"],
    ) -> BasePaperDownloader:
        """
        Create appropriate downloader instance for the specified service.

        Args:
            service: Service name ('arxiv', 'medrxiv', 'biorxiv', 'pubmed')

        Returns:
            Configured downloader instance

        Raises:
            ValueError: If the service has no section in the configuration
                (raised by _build_service_config).
        """
        config = PaperDownloaderFactory._get_unified_config()
        service_config = PaperDownloaderFactory._build_service_config(config, service)

        if service == "arxiv":
            return ArxivDownloader(service_config)
        if service == "medrxiv":
            return MedrxivDownloader(service_config)
        if service == "biorxiv":
            return BiorxivDownloader(service_config)
        # Only "pubmed" remains given the Literal parameter type.
        return PubmedDownloader(service_config)

    @staticmethod
    def _get_unified_config() -> Any:
        """
        Load unified paper download configuration using Hydra with caching.

        Caching avoids the GlobalHydra reinitialization issue and the cost of
        recomposing the config on every tool call.

        Returns:
            Unified configuration object (the ``tools.paper_download`` node)

        Raises:
            RuntimeError: If the Hydra configuration cannot be loaded.
        """
        # Fast path: reading the cached reference needs no locking.
        if PaperDownloaderFactory._cached_config is not None:
            return PaperDownloaderFactory._cached_config

        with PaperDownloaderFactory._config_lock:
            # Double-check pattern - another thread might have loaded the
            # config while we were waiting for the lock.
            if PaperDownloaderFactory._cached_config is not None:
                return PaperDownloaderFactory._cached_config

            try:
                # Hydra allows only one global instance per process; clear any
                # existing one before initializing.
                if GlobalHydra().is_initialized():
                    logger.info(
                        "GlobalHydra already initialized, clearing for config load"
                    )
                    GlobalHydra.instance().clear()

                # Compose the configuration relative to this module's location.
                with hydra.initialize(version_base=None, config_path="../../configs"):
                    cfg = hydra.compose(
                        config_name="config", overrides=["tools/paper_download=default"]
                    )

                # Cache only the subtree this factory needs.
                PaperDownloaderFactory._cached_config = cfg.tools.paper_download
                logger.info(
                    "Successfully loaded and cached paper download configuration"
                )

                return PaperDownloaderFactory._cached_config

            except Exception as e:
                # Broad catch is deliberate at this boundary: any Hydra/IO
                # failure is surfaced as a single RuntimeError to callers.
                logger.error(
                    "Failed to load unified paper download configuration: %s", e
                )
                raise RuntimeError(f"Configuration loading failed: {e}") from e

    @staticmethod
    def _build_service_config(unified_config: Any, service: str) -> Any:
        """
        Build service-specific configuration by merging common and service settings.
        Handles Hydra's OmegaConf objects properly.

        Args:
            unified_config: The unified configuration object
            service: Service name

        Returns:
            Service-specific configuration object

        Raises:
            ValueError: If the service has no section in the configuration.
        """
        if (
            not hasattr(unified_config, "services")
            or service not in unified_config.services
        ):
            raise ValueError(f"Service '{service}' not found in configuration")

        # Create a simple config object that combines common and service-specific settings
        class ServiceConfig:
            """Service-specific configuration holder."""

            def get_config_dict(self):
                """Return configuration as dictionary."""
                return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}

            def has_attribute(self, name: str) -> bool:
                """Check if configuration has a specific attribute."""
                return hasattr(self, name)

        config_obj = ServiceConfig()

        # Apply common settings first, then service-specific ones so the
        # latter take precedence on key collisions.
        PaperDownloaderFactory._apply_config(
            config_obj, unified_config.common, "common"
        )
        PaperDownloaderFactory._apply_config(
            config_obj, unified_config.services[service], service
        )

        return config_obj

    @staticmethod
    def _apply_config(config_obj: Any, source_config: Any, config_type: str) -> None:
        """
        Apply configuration from source to target object using multiple fallback methods.

        A failure to process one section is logged and tolerated (best-effort
        merge) rather than aborting downloader creation.

        Args:
            config_obj: Target configuration object
            source_config: Source configuration to extract from
            config_type: Type description for logging
        """
        try:
            PaperDownloaderFactory._try_config_extraction(config_obj, source_config)
        except (AttributeError, TypeError, KeyError) as e:
            logger.warning("Failed to process %s config: %s", config_type, e)

    @staticmethod
    def _try_config_extraction(config_obj: Any, source_config: Any) -> None:
        """Try different methods to extract configuration data.

        Order matters: OmegaConf nodes expose ``_content``; plain objects
        expose ``__dict__``; mappings expose ``items()``; ``dir()`` is the
        last resort.
        """
        # Method 1: OmegaConf node -> convert to a plain container.
        if hasattr(source_config, "_content"):
            PaperDownloaderFactory._extract_from_omegaconf(config_obj, source_config)
            return

        # Method 2: plain object -> copy its instance attributes.
        if hasattr(source_config, "__dict__"):
            PaperDownloaderFactory._extract_from_dict(
                config_obj, source_config.__dict__
            )
            return

        # Method 3: mapping-like -> iterate items().
        if hasattr(source_config, "items"):
            PaperDownloaderFactory._extract_from_items(config_obj, source_config)
            return

        # Method 4: dir() introspection as a final fallback.
        PaperDownloaderFactory._extract_from_dir(config_obj, source_config)

    @staticmethod
    def _extract_from_omegaconf(config_obj: Any, source_config: Any) -> None:
        """Extract configuration from an OmegaConf object (interpolations resolved)."""
        config_dict = OmegaConf.to_container(source_config, resolve=True)
        if isinstance(config_dict, dict):
            for key, value in config_dict.items():
                if isinstance(key, str):  # Type guard for key
                    setattr(config_obj, key, value)

    @staticmethod
    def _extract_from_dict(config_obj: Any, config_dict: dict) -> None:
        """Extract configuration from a dictionary, skipping private keys."""
        for key, value in config_dict.items():
            if not key.startswith("_"):
                setattr(config_obj, key, value)

    @staticmethod
    def _extract_from_items(config_obj: Any, source_config: Any) -> None:
        """Extract configuration using the items() method."""
        for key, value in source_config.items():
            if isinstance(key, str):  # Type guard for key
                setattr(config_obj, key, value)

    @staticmethod
    def _extract_from_dir(config_obj: Any, source_config: Any) -> None:
        """Extract configuration via dir() introspection (non-callable, non-private)."""
        for key in dir(source_config):
            if not key.startswith("_"):
                value = getattr(source_config, key)
                if not callable(value):
                    setattr(config_obj, key, value)
|
286
|
+
|
287
|
+
|
288
|
+
# NOTE: parse_docstring=True means LangChain parses the docstring below to
# build the tool's argument/description schema, so its wording is runtime
# behavior -- do not edit it casually.
@tool(
    args_schema=UnifiedPaperDownloadInput,
    parse_docstring=True,
)
def download_papers(
    service: Optional[Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]],
    identifiers: List[str],
    tool_call_id: Annotated[str, InjectedToolCallId],
) -> Command[Any]:
    """
    Universal paper download tool supporting multiple academic paper services.

    Downloads paper metadata and PDFs from arXiv, medRxiv, bioRxiv, or PubMed and stores them
    in temporary files for further processing. The downloaded PDFs can be accessed
    using the temp_file_path in the returned metadata.

    Args:
        service: Paper service to download from (optional, uses configured default if not specified)
            - 'arxiv': For arXiv preprints (requires arXiv IDs)
            - 'medrxiv': For medRxiv preprints (requires DOIs)
            - 'biorxiv': For bioRxiv preprints (requires DOIs)
            - 'pubmed': For PubMed papers (requires PMIDs)
        identifiers: List of paper identifiers in the format expected by the service

    Returns:
        Command with article_data containing paper metadata and local file paths

    Examples:
        # Download from arXiv
        download_papers("arxiv", ["1234.5678", "2301.12345"])

        # Download from medRxiv
        download_papers("medrxiv", ["10.1101/2020.09.09.20191205"])

        # Download from bioRxiv
        download_papers("biorxiv", ["10.1101/2020.09.09.20191205"])

        # Download from PubMed
        download_papers("pubmed", ["12345678", "87654321"])

        # Use default service (configured in default.yaml)
        download_papers(None, ["12345678", "87654321"])
    """
    # Delegate to the shared implementation so the convenience wrappers below
    # can reuse the same logic without going through the tool decorator.
    return _download_papers_impl(service, identifiers, tool_call_id)
|
332
|
+
|
333
|
+
|
334
|
+
# Convenience functions for backward compatibility (optional)
|
335
|
+
# These functions explicitly specify the service, bypassing the default service config
|
336
|
+
def download_arxiv_papers(
    arxiv_ids: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download arXiv papers, pinning the service and bypassing the default.

    Thin backward-compatibility wrapper around the shared implementation.
    """
    pinned_service: Literal["arxiv"] = "arxiv"
    return _download_papers_impl(pinned_service, arxiv_ids, tool_call_id)
|
341
|
+
|
342
|
+
|
343
|
+
def download_medrxiv_papers(
    dois: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download medRxiv papers, pinning the service and bypassing the default.

    Thin backward-compatibility wrapper around the shared implementation.
    """
    pinned_service: Literal["medrxiv"] = "medrxiv"
    return _download_papers_impl(pinned_service, dois, tool_call_id)
|
348
|
+
|
349
|
+
|
350
|
+
def download_biorxiv_papers(
    dois: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download bioRxiv papers, pinning the service and bypassing the default.

    Thin backward-compatibility wrapper around the shared implementation.
    """
    pinned_service: Literal["biorxiv"] = "biorxiv"
    return _download_papers_impl(pinned_service, dois, tool_call_id)
|
355
|
+
|
356
|
+
|
357
|
+
def download_pubmed_papers(
    pmids: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download PubMed papers, pinning the service and bypassing the default.

    Thin backward-compatibility wrapper around the shared implementation.
    """
    pinned_service: Literal["pubmed"] = "pubmed"
    return _download_papers_impl(pinned_service, pmids, tool_call_id)
|
362
|
+
|
363
|
+
|
364
|
+
def _build_error_command(error_msg: str, tool_call_id: str) -> Command[Any]:
    """Build a Command that reports *error_msg* with empty article data.

    Shared by both exception handlers in _download_papers_impl, which
    previously duplicated this construction verbatim.
    """
    return Command(
        update={
            "article_data": {},
            "messages": [
                ToolMessage(
                    content=f"Error: {error_msg}",
                    tool_call_id=tool_call_id,
                    artifact={},
                )
            ],
        }
    )


def _download_papers_impl(
    service: Optional[Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]],
    identifiers: List[str],
    tool_call_id: str,
) -> Command[Any]:
    """
    Internal implementation function that contains the actual download logic.
    This is called by both the decorated tool and the convenience functions.

    Args:
        service: Service to use; when None the configured default is resolved.
        identifiers: Paper identifiers in the format expected by the service.
        tool_call_id: LangGraph tool call id echoed back in the ToolMessage.

    Returns:
        Command updating 'article_data' and 'messages'. On failure the article
        data is empty and the message carries the error text; this function
        never raises, so the agent graph keeps running.
    """
    # Resolve default service if not specified
    if service is None:
        service = PaperDownloaderFactory.get_default_service()
        logger.info("No service specified, using configured default: %s", service)
    logger.info(
        "Starting unified paper download for service '%s' with %d identifiers: %s",
        service,
        len(identifiers),
        identifiers,
    )

    try:
        # Step 1: Create appropriate downloader using factory
        downloader = PaperDownloaderFactory.create(service)
        logger.info("Created %s downloader successfully", downloader.get_service_name())

        # Step 2: Process all identifiers
        article_data = downloader.process_identifiers(identifiers)

        # Step 3: Build summary for user
        content = downloader.build_summary(article_data)

        # Step 4: Log results summary
        total_papers = len(article_data)
        successful_downloads = sum(
            1
            for paper in article_data.values()
            if paper.get("access_type") == "open_access_downloaded"
        )
        logger.info(
            "Download complete for %s: %d papers processed, %d PDFs downloaded",
            service,
            total_papers,
            successful_downloads,
        )

        # Step 5: Return command with results
        return Command(
            update={
                "article_data": article_data,
                "messages": [
                    ToolMessage(
                        content=content,
                        tool_call_id=tool_call_id,
                        artifact=article_data,
                    )
                ],
            }
        )

    except ValueError as e:
        # Service/configuration errors (e.g. service missing from config).
        error_msg = f"Service error for '{service}': {str(e)}"
        logger.error(error_msg)
        return _build_error_command(error_msg, tool_call_id)

    except Exception as e:  # pylint: disable=broad-exception-caught
        # Tool boundary: report unexpected errors to the agent instead of
        # letting them crash the graph.
        error_msg = f"Unexpected error during paper download: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return _build_error_command(error_msg, tool_call_id)
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env python3
"""
This package provides modules for fetching and downloading academic papers from arXiv,
bioRxiv, medRxiv and PubMed.
"""

# Import modules
# (docstring previously omitted PubMed although pubmed_downloader is exported)
from . import arxiv_downloader
from . import base_paper_downloader
from . import biorxiv_downloader
from . import medrxiv_downloader
from . import pubmed_downloader

__all__ = [
    "arxiv_downloader",
    "base_paper_downloader",
    "biorxiv_downloader",
    "medrxiv_downloader",
    "pubmed_downloader",
]
|
@@ -0,0 +1,209 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
ArXiv paper downloader implementation.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import logging
|
7
|
+
import xml.etree.ElementTree as ET
|
8
|
+
from typing import Any, Dict, Optional, Tuple
|
9
|
+
|
10
|
+
import requests
|
11
|
+
|
12
|
+
from .base_paper_downloader import BasePaperDownloader
|
13
|
+
|
14
|
+
logger = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class ArxivDownloader(BasePaperDownloader):
    """Downloader that retrieves paper metadata and PDFs from arXiv."""

    def __init__(self, config: Any):
        """Set up the downloader from the arXiv service configuration."""
        super().__init__(config)
        self.api_url = config.api_url
        self.pdf_base_url = config.pdf_base_url
        # Namespace map used for all Atom XML lookups; overridable via config.
        default_ns = {"atom": "http://www.w3.org/2005/Atom"}
        self.xml_namespaces = getattr(config, "xml_namespace", default_ns)

    def fetch_metadata(self, identifier: str) -> ET.Element:
        """
        Query the arXiv API for a single paper and return the parsed XML root.

        Args:
            identifier: arXiv ID (e.g., '1234.5678' or '2301.12345')

        Returns:
            XML root element from arXiv API response

        Raises:
            requests.RequestException: If API call fails
            RuntimeError: If no entry found in response
        """
        query_url = f"{self.api_url}?search_query=id:{identifier}&start=0&max_results=1"
        logger.info("Fetching metadata for arXiv ID %s from: %s", identifier, query_url)

        # request_timeout is presumably set by BasePaperDownloader.__init__ -- not
        # visible here; confirm against the base class.
        response = requests.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        root = ET.fromstring(response.text)
        if root.find("atom:entry", self.xml_namespaces) is None:
            raise RuntimeError("No entry found in arXiv API response")
        return root

    def construct_pdf_url(self, metadata: ET.Element, identifier: str) -> str:
        """
        Find the PDF link advertised in the metadata, or build one from the ID.

        Args:
            metadata: XML root from arXiv API
            identifier: arXiv ID

        Returns:
            PDF URL string (empty string when the metadata has no entry)
        """
        entry = metadata.find("atom:entry", self.xml_namespaces)
        if entry is None:
            return ""

        # Prefer the explicit PDF link on the entry, if any.
        pdf_url = None
        for link in entry.findall("atom:link", self.xml_namespaces):
            if link.attrib.get("title") == "pdf":
                pdf_url = link.attrib.get("href")
                break

        if not pdf_url:
            # Nothing advertised: derive the canonical URL from the ID.
            pdf_url = f"{self.pdf_base_url}/{identifier}.pdf"
            logger.info("Using constructed PDF URL for %s: %s", identifier, pdf_url)

        return pdf_url

    def extract_paper_metadata(
        self,
        metadata: ET.Element,
        identifier: str,
        pdf_result: Optional[Tuple[str, str]],
    ) -> Dict[str, Any]:
        """
        Build the standardized metadata dictionary for one arXiv paper.

        Args:
            metadata: XML root from arXiv API
            identifier: arXiv ID
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary

        Raises:
            RuntimeError: If the XML contains no entry element.
        """
        entry = metadata.find("atom:entry", self.xml_namespaces)
        if entry is None:
            raise RuntimeError("No entry found in metadata")

        # Merge bibliographic fields, PDF outcome, and service identifiers.
        combined: Dict[str, Any] = {}
        combined.update(self._extract_basic_metadata(entry, self.xml_namespaces))
        combined.update(self._extract_pdf_metadata(pdf_result, identifier))
        combined["source"] = "arxiv"
        combined["arxiv_id"] = identifier
        return combined

    def _extract_basic_metadata(self, entry: ET.Element, ns: dict) -> Dict[str, Any]:
        """Collect title, authors, abstract and publication date from an entry."""
        return {
            "Title": self._extract_title(entry, ns),
            "Authors": self._extract_authors(entry, ns),
            "Abstract": self._extract_abstract(entry, ns),
            "Publication Date": self._extract_publication_date(entry, ns),
        }

    def _extract_title(self, entry: ET.Element, ns: dict) -> str:
        """Return the stripped entry title, or 'N/A' when absent."""
        elem = entry.find("atom:title", ns)
        if elem is None:
            return "N/A"
        return (elem.text or "").strip()

    def _extract_authors(self, entry: ET.Element, ns: dict) -> list:
        """Return the list of author names present on the entry."""
        name_elems = (
            author.find("atom:name", ns)
            for author in entry.findall("atom:author", ns)
        )
        return [n.text.strip() for n in name_elems if n is not None and n.text]

    def _extract_abstract(self, entry: ET.Element, ns: dict) -> str:
        """Return the stripped entry summary, or 'N/A' when absent."""
        elem = entry.find("atom:summary", ns)
        if elem is None:
            return "N/A"
        return (elem.text or "").strip()

    def _extract_publication_date(self, entry: ET.Element, ns: dict) -> str:
        """Return the stripped publication date, or 'N/A' when absent."""
        elem = entry.find("atom:published", ns)
        if elem is None:
            return "N/A"
        return (elem.text or "").strip()

    def _extract_pdf_metadata(
        self, pdf_result: Optional[Tuple[str, str]], identifier: str
    ) -> Dict[str, Any]:
        """Describe the PDF download outcome (paths, filename, access type)."""
        if not pdf_result:
            # Download failed or was skipped: record empty paths.
            return {
                "URL": "",
                "pdf_url": "",
                "filename": self.get_default_filename(identifier),
                "access_type": "download_failed",
                "temp_file_path": "",
            }

        temp_file_path, filename = pdf_result
        return {
            "URL": temp_file_path,
            "pdf_url": temp_file_path,
            "filename": filename,
            "access_type": "open_access_downloaded",
            "temp_file_path": temp_file_path,
        }

    def get_service_name(self) -> str:
        """Human-readable service name."""
        return "arXiv"

    def get_identifier_name(self) -> str:
        """Display name of the identifier type used by this service."""
        return "arXiv ID"

    def get_default_filename(self, identifier: str) -> str:
        """Filename used when no better name is available for the PDF."""
        return f"{identifier}.pdf"

    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """Short identifier suffix used in the per-paper summary line."""
        arxiv_id = paper.get("arxiv_id", "N/A")
        pub_date = paper.get("Publication Date", "N/A")
        return f" (arXiv:{arxiv_id}, {pub_date})"

    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """Record the arXiv ID on a result entry."""
        entry["arxiv_id"] = identifier
|