aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -5,10 +5,8 @@ biorxiv and medrxiv.
5
5
  """
6
6
 
7
7
  # Import modules
8
- from . import download_arxiv_input
8
+ from . import paper_downloader
9
9
 
10
10
  __all__ = [
11
- "download_arxiv_input",
12
- "download_biorxiv_input",
13
- "download_medrxiv_input",
11
+ "paper_downloader",
14
12
  ]
@@ -0,0 +1,457 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unified paper download tool for LangGraph.
4
+ Supports downloading papers from arXiv, medRxiv, bioRxiv, and PubMed through a single interface.
5
+ """
6
+
7
+ import logging
8
+ import threading
9
+ from typing import Annotated, Any, List, Literal, Optional
10
+
11
+ import hydra
12
+ from hydra.core.global_hydra import GlobalHydra
13
+ from omegaconf import OmegaConf
14
+ from langchain_core.messages import ToolMessage
15
+ from langchain_core.tools import tool
16
+ from langchain_core.tools.base import InjectedToolCallId
17
+ from langgraph.types import Command
18
+ from pydantic import BaseModel, Field
19
+ from .utils.arxiv_downloader import ArxivDownloader
20
+ from .utils.base_paper_downloader import BasePaperDownloader
21
+ from .utils.biorxiv_downloader import BiorxivDownloader
22
+ from .utils.medrxiv_downloader import MedrxivDownloader
23
+ from .utils.pubmed_downloader import PubmedDownloader
24
+
25
+ # Configure logging
26
+ logging.basicConfig(level=logging.INFO)
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class UnifiedPaperDownloadInput(BaseModel):
    """Input schema for the unified paper download tool."""

    # Optional service selector; None defers to the configured default
    # (resolved later via PaperDownloaderFactory.get_default_service()).
    service: Optional[Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]] = Field(
        default=None,
        description=(
            "Paper service to download from: 'arxiv', 'medrxiv', 'biorxiv', or 'pubmed'. "
            "If not specified, uses the configured default service."
        ),
    )
    # Required identifiers; the expected format differs per service (see description).
    identifiers: List[str] = Field(
        description=(
            "List of paper identifiers. Format depends on service:\n"
            "- arxiv: arXiv IDs (e.g., ['1234.5678', '2301.12345'])\n"
            "- medrxiv: DOIs (e.g., ['10.1101/2020.09.09.20191205'])\n"
            "- biorxiv: DOIs (e.g., ['10.1101/2020.09.09.20191205'])\n"
            "- pubmed: PMIDs (e.g., ['12345678', '87654321'])"
        )
    )
    # Injected by LangGraph at call time; not supplied by the model.
    tool_call_id: Annotated[str, InjectedToolCallId]
50
+
51
+
52
class PaperDownloaderFactory:
    """Factory class for creating paper downloader instances.

    Loads the unified Hydra configuration once per process (cached at class
    level, guarded by a lock) and builds a configured downloader object for
    the requested service.
    """

    # Class-level cache for configuration (populated on first access).
    _cached_config = None
    # Created eagerly at class-definition time. The previous lazy
    # create-on-first-use pattern was itself a race: two threads could each
    # observe None and build *different* locks, defeating the double-checked
    # config load in _get_unified_config().
    _config_lock = threading.Lock()

    @classmethod
    def clear_cache(cls) -> None:
        """Clear cached configuration (next access reloads it via Hydra)."""
        # Plain rebinding is atomic; readers either see the old config or None.
        cls._cached_config = None

    @staticmethod
    def get_default_service() -> Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]:
        """
        Get the default service from configuration.

        Returns:
            Default service name from config, fallback to 'pubmed'
        """
        config = PaperDownloaderFactory._get_unified_config()
        default_service = getattr(config.tool, "default_service", "pubmed")
        # Ensure the default service is valid and return with proper type
        # (explicit branches keep the Literal return type narrow for checkers).
        if default_service == "arxiv":
            return "arxiv"
        if default_service == "medrxiv":
            return "medrxiv"
        if default_service == "biorxiv":
            return "biorxiv"
        if default_service == "pubmed":
            return "pubmed"
        logger.warning(
            "Invalid default service '%s' in config, falling back to 'pubmed'",
            default_service,
        )
        return "pubmed"

    @staticmethod
    def create(
        service: Literal["arxiv", "medrxiv", "biorxiv", "pubmed"],
    ) -> BasePaperDownloader:
        """
        Create appropriate downloader instance for the specified service.

        Args:
            service: Service name ('arxiv', 'medrxiv', 'biorxiv', 'pubmed')

        Returns:
            Configured downloader instance

        Raises:
            ValueError: If service is not supported (missing from configuration)
        """
        config = PaperDownloaderFactory._get_unified_config()
        service_config = PaperDownloaderFactory._build_service_config(config, service)

        # Table dispatch instead of an if-chain; anything outside the table
        # falls back to PubMed, matching the original fall-through behavior.
        downloader_classes = {
            "arxiv": ArxivDownloader,
            "medrxiv": MedrxivDownloader,
            "biorxiv": BiorxivDownloader,
        }
        return downloader_classes.get(service, PubmedDownloader)(service_config)

    @staticmethod
    def _get_unified_config() -> Any:
        """
        Load unified paper download configuration using Hydra with caching.
        This avoids the GlobalHydra reinitialization issue by caching the config.

        Returns:
            Unified configuration object

        Raises:
            RuntimeError: If Hydra composition fails for any reason.
        """
        # Fast path: no locking once the config has been cached.
        if PaperDownloaderFactory._cached_config is not None:
            return PaperDownloaderFactory._cached_config

        # The lock is created eagerly at class definition, so it always exists.
        with PaperDownloaderFactory._config_lock:
            # Double-checked locking: another thread might have loaded it.
            if PaperDownloaderFactory._cached_config is not None:
                return PaperDownloaderFactory._cached_config

            try:
                # Clear if already initialized
                if GlobalHydra().is_initialized():
                    logger.info(
                        "GlobalHydra already initialized, clearing for config load"
                    )
                    GlobalHydra.instance().clear()

                # Load configuration
                with hydra.initialize(version_base=None, config_path="../../configs"):
                    cfg = hydra.compose(
                        config_name="config", overrides=["tools/paper_download=default"]
                    )

                # Cache the configuration
                PaperDownloaderFactory._cached_config = cfg.tools.paper_download
                logger.info(
                    "Successfully loaded and cached paper download configuration"
                )

                return PaperDownloaderFactory._cached_config

            except Exception as e:
                logger.error(
                    "Failed to load unified paper download configuration: %s", e
                )
                raise RuntimeError(f"Configuration loading failed: {e}") from e

    @staticmethod
    def _build_service_config(unified_config: Any, service: str) -> Any:
        """
        Build service-specific configuration by merging common and service settings.
        Handles Hydra's OmegaConf objects properly.

        Args:
            unified_config: The unified configuration object
            service: Service name

        Returns:
            Service-specific configuration object

        Raises:
            ValueError: If the service has no section in the configuration.
        """
        if (
            not hasattr(unified_config, "services")
            or service not in unified_config.services
        ):
            raise ValueError(f"Service '{service}' not found in configuration")

        # Create a simple config object that combines common and service-specific settings
        class ServiceConfig:
            """Service-specific configuration holder."""

            def get_config_dict(self):
                """Return configuration as dictionary."""
                return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}

            def has_attribute(self, name: str) -> bool:
                """Check if configuration has a specific attribute."""
                return hasattr(self, name)

        config_obj = ServiceConfig()

        # Common settings first, so service-specific values override them.
        PaperDownloaderFactory._apply_config(
            config_obj, unified_config.common, "common"
        )
        PaperDownloaderFactory._apply_config(
            config_obj, unified_config.services[service], service
        )

        return config_obj

    @staticmethod
    def _apply_config(config_obj: Any, source_config: Any, config_type: str) -> None:
        """
        Apply configuration from source to target object using multiple fallback methods.

        Extraction failures are logged and swallowed deliberately: a partially
        built config is preferred over aborting downloader construction.

        Args:
            config_obj: Target configuration object
            source_config: Source configuration to extract from
            config_type: Type description for logging
        """
        try:
            PaperDownloaderFactory._try_config_extraction(config_obj, source_config)
        except (AttributeError, TypeError, KeyError) as e:
            logger.warning("Failed to process %s config: %s", config_type, e)

    @staticmethod
    def _try_config_extraction(config_obj: Any, source_config: Any) -> None:
        """Try different methods to extract configuration data."""
        # Method 1: OmegaConf node (has a private _content payload).
        if hasattr(source_config, "_content"):
            PaperDownloaderFactory._extract_from_omegaconf(config_obj, source_config)
            return

        # Method 2: plain object with attributes.
        if hasattr(source_config, "__dict__"):
            PaperDownloaderFactory._extract_from_dict(
                config_obj, source_config.__dict__
            )
            return

        # Method 3: mapping-like object exposing items().
        if hasattr(source_config, "items"):
            PaperDownloaderFactory._extract_from_items(config_obj, source_config)
            return

        # Method 4: last resort — walk dir() for data attributes.
        PaperDownloaderFactory._extract_from_dir(config_obj, source_config)

    @staticmethod
    def _extract_from_omegaconf(config_obj: Any, source_config: Any) -> None:
        """Extract configuration from OmegaConf object."""
        config_dict = OmegaConf.to_container(source_config, resolve=True)
        if isinstance(config_dict, dict):
            for key, value in config_dict.items():
                if isinstance(key, str):  # Type guard for key
                    setattr(config_obj, key, value)

    @staticmethod
    def _extract_from_dict(config_obj: Any, config_dict: dict) -> None:
        """Extract configuration from dictionary."""
        for key, value in config_dict.items():
            if not key.startswith("_"):
                setattr(config_obj, key, value)

    @staticmethod
    def _extract_from_items(config_obj: Any, source_config: Any) -> None:
        """Extract configuration using items() method."""
        for key, value in source_config.items():
            if isinstance(key, str):  # Type guard for key
                setattr(config_obj, key, value)

    @staticmethod
    def _extract_from_dir(config_obj: Any, source_config: Any) -> None:
        """Extract configuration using dir() approach as fallback."""
        for key in dir(source_config):
            if not key.startswith("_"):
                value = getattr(source_config, key)
                if not callable(value):
                    setattr(config_obj, key, value)
286
+
287
+
288
@tool(
    args_schema=UnifiedPaperDownloadInput,
    parse_docstring=True,
)
def download_papers(
    service: Optional[Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]],
    identifiers: List[str],
    tool_call_id: Annotated[str, InjectedToolCallId],
) -> Command[Any]:
    """
    Universal paper download tool supporting multiple academic paper services.

    Downloads paper metadata and PDFs from arXiv, medRxiv, bioRxiv, or PubMed and stores them
    in temporary files for further processing. The downloaded PDFs can be accessed
    using the temp_file_path in the returned metadata.

    Args:
        service: Paper service to download from (optional, uses configured default if not specified)
            - 'arxiv': For arXiv preprints (requires arXiv IDs)
            - 'medrxiv': For medRxiv preprints (requires DOIs)
            - 'biorxiv': For bioRxiv preprints (requires DOIs)
            - 'pubmed': For PubMed papers (requires PMIDs)
        identifiers: List of paper identifiers in the format expected by the service

    Returns:
        Command with article_data containing paper metadata and local file paths

    Examples:
        # Download from arXiv
        download_papers("arxiv", ["1234.5678", "2301.12345"])

        # Download from medRxiv
        download_papers("medrxiv", ["10.1101/2020.09.09.20191205"])

        # Download from bioRxiv
        download_papers("biorxiv", ["10.1101/2020.09.09.20191205"])

        # Download from PubMed
        download_papers("pubmed", ["12345678", "87654321"])

        # Use default service (configured in default.yaml)
        download_papers(None, ["12345678", "87654321"])
    """
    # NOTE: with parse_docstring=True the docstring above is parsed into the
    # LLM-facing tool schema, so its wording is runtime behavior — edit with care.
    # All real work is delegated so the convenience wrappers below can share it.
    return _download_papers_impl(service, identifiers, tool_call_id)
332
+
333
+
334
# Convenience wrappers kept for backward compatibility. Each one pins its
# service explicitly, so the configured default service never applies.
def download_arxiv_papers(
    arxiv_ids: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download arXiv papers directly; always uses the arXiv service."""
    return _download_papers_impl("arxiv", arxiv_ids, tool_call_id)
341
+
342
+
343
def download_medrxiv_papers(
    dois: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download medRxiv papers directly; always uses the medRxiv service."""
    # Service is hard-coded, bypassing the configured default.
    return _download_papers_impl("medrxiv", dois, tool_call_id)
348
+
349
+
350
def download_biorxiv_papers(
    dois: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download bioRxiv papers directly; always uses the bioRxiv service."""
    # Service is hard-coded, bypassing the configured default.
    return _download_papers_impl("biorxiv", dois, tool_call_id)
355
+
356
+
357
def download_pubmed_papers(
    pmids: List[str], tool_call_id: Annotated[str, InjectedToolCallId]
) -> Command[Any]:
    """Download PubMed papers directly; always uses the PubMed service."""
    # Service is hard-coded, bypassing the configured default.
    return _download_papers_impl("pubmed", pmids, tool_call_id)
362
+
363
+
364
def _build_error_command(error_msg: str, tool_call_id: str) -> Command[Any]:
    """Build a failure Command carrying an empty article_data payload.

    Shared by both except branches of _download_papers_impl, which previously
    duplicated this construction verbatim.
    """
    return Command(
        update={
            "article_data": {},
            "messages": [
                ToolMessage(
                    content=f"Error: {error_msg}",
                    tool_call_id=tool_call_id,
                    artifact={},
                )
            ],
        }
    )


def _download_papers_impl(
    service: Optional[Literal["arxiv", "medrxiv", "biorxiv", "pubmed"]],
    identifiers: List[str],
    tool_call_id: str,
) -> Command[Any]:
    """
    Internal implementation function that contains the actual download logic.
    This is called by both the decorated tool and the convenience functions.

    Args:
        service: Target service, or None to use the configured default.
        identifiers: Service-specific paper identifiers.
        tool_call_id: Tool call id echoed back in the ToolMessage.

    Returns:
        Command updating 'article_data' and 'messages'; on failure the
        article_data payload is empty and the message content starts with
        'Error: '.
    """
    # Resolve default service if not specified
    if service is None:
        service = PaperDownloaderFactory.get_default_service()
        logger.info("No service specified, using configured default: %s", service)
    logger.info(
        "Starting unified paper download for service '%s' with %d identifiers: %s",
        service,
        len(identifiers),
        identifiers,
    )

    try:
        # Step 1: Create appropriate downloader using factory
        downloader = PaperDownloaderFactory.create(service)
        logger.info("Created %s downloader successfully", downloader.get_service_name())

        # Step 2: Process all identifiers
        article_data = downloader.process_identifiers(identifiers)

        # Step 3: Build summary for user
        content = downloader.build_summary(article_data)

        # Step 4: Log results summary
        total_papers = len(article_data)
        successful_downloads = sum(
            1
            for paper in article_data.values()
            if paper.get("access_type") == "open_access_downloaded"
        )
        logger.info(
            "Download complete for %s: %d papers processed, %d PDFs downloaded",
            service,
            total_papers,
            successful_downloads,
        )

        # Step 5: Return command with results
        return Command(
            update={
                "article_data": article_data,
                "messages": [
                    ToolMessage(
                        content=content,
                        tool_call_id=tool_call_id,
                        artifact=article_data,
                    )
                ],
            }
        )

    except ValueError as e:
        # Handle service/configuration errors
        error_msg = f"Service error for '{service}': {str(e)}"
        logger.error(error_msg)
        return _build_error_command(error_msg, tool_call_id)

    except Exception as e:  # pylint: disable=broad-exception-caught
        # Handle unexpected errors
        error_msg = f"Unexpected error during paper download: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return _build_error_command(error_msg, tool_call_id)
@@ -0,0 +1,20 @@
1
#!/usr/bin/env python3
"""
This package provides modules for fetching and downloading academic papers from arXiv,
bioRxiv, medRxiv and PubMed, plus the shared base downloader they extend.
"""

# Import modules
from . import arxiv_downloader
from . import base_paper_downloader
from . import biorxiv_downloader
from . import medrxiv_downloader
from . import pubmed_downloader

__all__ = [
    "arxiv_downloader",
    "base_paper_downloader",
    "biorxiv_downloader",
    "medrxiv_downloader",
    "pubmed_downloader",
]
@@ -0,0 +1,209 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ArXiv paper downloader implementation.
4
+ """
5
+
6
+ import logging
7
+ import xml.etree.ElementTree as ET
8
+ from typing import Any, Dict, Optional, Tuple
9
+
10
+ import requests
11
+
12
+ from .base_paper_downloader import BasePaperDownloader
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ArxivDownloader(BasePaperDownloader):
    """ArXiv-specific implementation of paper downloader."""

    def __init__(self, config: Any):
        """Initialize ArXiv downloader with configuration."""
        super().__init__(config)
        self.api_url = config.api_url
        self.pdf_base_url = config.pdf_base_url
        # Atom namespace mapping used by every XPath lookup below; may be
        # overridden by the config's `xml_namespace` entry.
        self.xml_namespaces = getattr(
            config, "xml_namespace", {"atom": "http://www.w3.org/2005/Atom"}
        )

    def fetch_metadata(self, identifier: str) -> ET.Element:
        """
        Fetch paper metadata from arXiv API.

        Args:
            identifier: arXiv ID (e.g., '1234.5678' or '2301.12345')

        Returns:
            XML root element from arXiv API response

        Raises:
            requests.RequestException: If API call fails
            RuntimeError: If no entry found in response
        """
        query_url = f"{self.api_url}?search_query=id:{identifier}&start=0&max_results=1"
        logger.info("Fetching metadata for arXiv ID %s from: %s", identifier, query_url)

        response = requests.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        root = ET.fromstring(response.text)
        # An empty feed (no <entry>) means the ID matched nothing.
        if root.find("atom:entry", self.xml_namespaces) is None:
            raise RuntimeError("No entry found in arXiv API response")
        return root

    def construct_pdf_url(self, metadata: ET.Element, identifier: str) -> str:
        """
        Extract or construct PDF URL from arXiv metadata.

        Args:
            metadata: XML root from arXiv API
            identifier: arXiv ID

        Returns:
            PDF URL string (empty when the feed carries no entry)
        """
        entry = metadata.find("atom:entry", self.xml_namespaces)
        if entry is None:
            return ""

        # Prefer the explicit PDF <link> advertised in the entry metadata.
        pdf_url = None
        for link in entry.findall("atom:link", self.xml_namespaces):
            if link.attrib.get("title") == "pdf":
                pdf_url = link.attrib.get("href")
                break

        # Fallback to constructed PDF URL if not found in metadata
        if not pdf_url:
            pdf_url = f"{self.pdf_base_url}/{identifier}.pdf"
            logger.info("Using constructed PDF URL for %s: %s", identifier, pdf_url)

        return pdf_url

    def extract_paper_metadata(
        self,
        metadata: ET.Element,
        identifier: str,
        pdf_result: Optional[Tuple[str, str]],
    ) -> Dict[str, Any]:
        """
        Extract structured metadata from arXiv API response.

        Args:
            metadata: XML root from arXiv API
            identifier: arXiv ID
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary

        Raises:
            RuntimeError: If the response carries no entry element
        """
        entry = metadata.find("atom:entry", self.xml_namespaces)
        if entry is None:
            raise RuntimeError("No entry found in metadata")

        # Merge bibliographic fields, PDF/download fields, then identifiers.
        result: Dict[str, Any] = {}
        result.update(self._extract_basic_metadata(entry, self.xml_namespaces))
        result.update(self._extract_pdf_metadata(pdf_result, identifier))
        result["source"] = "arxiv"
        result["arxiv_id"] = identifier
        return result

    def _extract_basic_metadata(self, entry: ET.Element, ns: dict) -> Dict[str, Any]:
        """Extract basic metadata (title, authors, abstract, date) from entry."""
        return {
            "Title": self._extract_title(entry, ns),
            "Authors": self._extract_authors(entry, ns),
            "Abstract": self._extract_abstract(entry, ns),
            "Publication Date": self._extract_publication_date(entry, ns),
        }

    def _extract_title(self, entry: ET.Element, ns: dict) -> str:
        """Extract title from entry."""
        node = entry.find("atom:title", ns)
        if node is None:
            return "N/A"
        return (node.text or "").strip()

    def _extract_authors(self, entry: ET.Element, ns: dict) -> list:
        """Extract authors from entry."""
        name_nodes = (
            author.find("atom:name", ns)
            for author in entry.findall("atom:author", ns)
        )
        return [n.text.strip() for n in name_nodes if n is not None and n.text]

    def _extract_abstract(self, entry: ET.Element, ns: dict) -> str:
        """Extract abstract from entry."""
        node = entry.find("atom:summary", ns)
        if node is None:
            return "N/A"
        return (node.text or "").strip()

    def _extract_publication_date(self, entry: ET.Element, ns: dict) -> str:
        """Extract publication date from entry."""
        node = entry.find("atom:published", ns)
        if node is None:
            return "N/A"
        return (node.text or "").strip()

    def _extract_pdf_metadata(
        self, pdf_result: Optional[Tuple[str, str]], identifier: str
    ) -> Dict[str, Any]:
        """Extract PDF-related metadata."""
        # Guard clause: no PDF was downloaded for this identifier.
        if not pdf_result:
            return {
                "URL": "",
                "pdf_url": "",
                "filename": self.get_default_filename(identifier),
                "access_type": "download_failed",
                "temp_file_path": "",
            }

        temp_file_path, filename = pdf_result
        return {
            "URL": temp_file_path,
            "pdf_url": temp_file_path,
            "filename": filename,
            "access_type": "open_access_downloaded",
            "temp_file_path": temp_file_path,
        }

    def get_service_name(self) -> str:
        """Return service name."""
        return "arXiv"

    def get_identifier_name(self) -> str:
        """Return identifier display name."""
        return "arXiv ID"

    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for arXiv paper."""
        return f"{identifier}.pdf"

    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """Get arXiv-specific identifier info for paper summary."""
        arxiv_id = paper.get("arxiv_id", "N/A")
        pub_date = paper.get("Publication Date", "N/A")
        return f" (arXiv:{arxiv_id}, {pub_date})"

    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """Add arXiv ID field to entry."""
        entry["arxiv_id"] = identifier