aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py (new file)
@@ -0,0 +1,343 @@

#!/usr/bin/env python3
"""
Abstract base class for paper download tools.
Provides common functionality for arXiv, medRxiv, PubMed, and future paper sources.
"""

import logging
import re
import tempfile
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple

import requests

# Configure logging
logger = logging.getLogger(__name__)


class BasePaperDownloader(ABC):
    """Abstract base class for paper download tools."""

    def __init__(self, config: Any):
        """Initialize with service-specific configuration."""
        self.config = config
        self.request_timeout = getattr(config, "request_timeout", 15)
        self.chunk_size = getattr(config, "chunk_size", 8192)
        self.user_agent = getattr(
            config, "user_agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
        )

    # Abstract methods that each service must implement
    @abstractmethod
    def fetch_metadata(self, identifier: str) -> Any:
        """
        Fetch paper metadata from the service API.

        Args:
            identifier: Paper identifier (arXiv ID, DOI, PMID, etc.)

        Returns:
            Service-specific metadata object (XML, JSON, etc.)
        """
        raise NotImplementedError

    @abstractmethod
    def construct_pdf_url(self, metadata: Any, identifier: str) -> str:
        """
        Construct or extract PDF URL from metadata.

        Args:
            metadata: Metadata returned from fetch_metadata()
            identifier: Original paper identifier

        Returns:
            PDF URL string (empty if not available)
        """
        raise NotImplementedError

    @abstractmethod
    def extract_paper_metadata(
        self, metadata: Any, identifier: str, pdf_result: Optional[Tuple[str, str]]
    ) -> Dict[str, Any]:
        """
        Extract and structure metadata into standardized format.

        Args:
            metadata: Raw metadata from API
            identifier: Original paper identifier
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary
        """
        raise NotImplementedError

    @abstractmethod
    def get_service_name(self) -> str:
        """Return service name (e.g., 'arxiv', 'medrxiv', 'pubmed')."""
        raise NotImplementedError

    @abstractmethod
    def get_identifier_name(self) -> str:
        """Return identifier display name (e.g., 'arXiv ID', 'DOI', 'PMID')."""
        raise NotImplementedError

    @abstractmethod
    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for the paper PDF."""
        raise NotImplementedError

    # Common methods shared by all services
    def download_pdf_to_temp(
        self, pdf_url: str, identifier: str
    ) -> Optional[Tuple[str, str]]:
        """
        Download PDF from URL to a temporary file.

        Args:
            pdf_url: URL to download PDF from
            identifier: Paper identifier for logging

        Returns:
            Tuple of (temp_file_path, filename) or None if failed
        """
        if not pdf_url:
            logger.info(
                "No PDF URL available for %s %s", self.get_identifier_name(), identifier
            )
            return None

        try:
            logger.info(
                "Downloading PDF for %s %s from %s",
                self.get_identifier_name(),
                identifier,
                pdf_url,
            )

            headers = {"User-Agent": self.user_agent}
            response = requests.get(
                pdf_url, headers=headers, timeout=self.request_timeout, stream=True
            )
            response.raise_for_status()

            # Download to temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                for chunk in response.iter_content(chunk_size=self.chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        temp_file.write(chunk)
                temp_file_path = temp_file.name

            logger.info(
                "%s PDF downloaded to temporary file: %s",
                self.get_service_name(),
                temp_file_path,
            )

            # Try to extract filename from Content-Disposition header
            filename = self.get_default_filename(identifier)
            content_disposition = response.headers.get("Content-Disposition", "")

            if "filename=" in content_disposition:
                try:
                    filename_match = re.search(
                        r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
                    )
                    if filename_match:
                        extracted_filename = filename_match.group(
                            1
                        ) or filename_match.group(2)
                        extracted_filename = extracted_filename.strip().strip('"')
                        if extracted_filename and extracted_filename.endswith(".pdf"):
                            filename = extracted_filename
                            logger.info("Extracted filename from header: %s", filename)
                except requests.RequestException as e:
                    logger.warning("Failed to extract filename from header: %s", e)

            return temp_file_path, filename

        except (requests.exceptions.RequestException, OSError) as e:
            logger.error(
                "Failed to download PDF for %s %s: %s",
                self.get_identifier_name(),
                identifier,
                e,
            )
            return None

    def get_snippet(self, abstract: str) -> str:
        """
        Extract the first one or two sentences from an abstract.

        Args:
            abstract: Full abstract text

        Returns:
            Snippet of first 1-2 sentences
        """
        if not abstract or abstract == "N/A":
            return ""

        sentences = abstract.split(". ")
        snippet_sentences = sentences[:2]
        snippet = ". ".join(snippet_sentences)

        if not snippet.endswith("."):
            snippet += "."

        return snippet

    def create_error_entry(self, identifier: str, error_msg: str) -> Dict[str, Any]:
        """
        Create standardized error entry for failed paper processing.

        Args:
            identifier: Paper identifier
            error_msg: Error message

        Returns:
            Error entry dictionary
        """
        return {
            "Title": "Error fetching paper",
            "Authors": [],
            "Abstract": f"Error: {error_msg}",
            "Publication Date": "N/A",
            "URL": "",
            "pdf_url": "",
            "filename": self.get_default_filename(identifier),
            "source": self.get_service_name(),
            "access_type": "error",
            "temp_file_path": "",
            "error": error_msg,
            # Service-specific identifier field will be added by subclasses
        }

    def build_summary(self, article_data: Dict[str, Any]) -> str:
        """
        Build a summary string for up to three papers with snippets.

        Args:
            article_data: Dictionary of paper data keyed by identifier

        Returns:
            Formatted summary string
        """
        top = list(article_data.values())[:3]
        lines: List[str] = []
        downloaded_count = sum(
            1
            for paper in article_data.values()
            if paper.get("access_type") == "open_access_downloaded"
        )

        for idx, paper in enumerate(top):
            title = paper.get("Title", "N/A")
            access_type = paper.get("access_type", "N/A")
            temp_file_path = paper.get("temp_file_path", "")
            snippet = self.get_snippet(paper.get("Abstract", ""))

            # Build paper line with service-specific identifier info
            line = f"{idx+1}. {title}"
            line += self._get_paper_identifier_info(paper)
            line += f"\n Access: {access_type}"

            if temp_file_path:
                line += f"\n Downloaded to: {temp_file_path}"
            if snippet:
                line += f"\n Abstract snippet: {snippet}"

            lines.append(line)

        summary = "\n".join(lines)
        service_name = self.get_service_name()

        return (
            f"Download was successful from {service_name}. "
            "Papers metadata are attached as an artifact. "
            "Here is a summary of the results:\n"
            f"Number of papers found: {len(article_data)}\n"
            f"PDFs successfully downloaded: {downloaded_count}\n"
            "Top 3 papers:\n" + summary
        )

    @abstractmethod
    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """
        Get service-specific identifier info for paper summary.

        Args:
            paper: Paper metadata dictionary

        Returns:
            Formatted identifier string (e.g., " (arXiv:1234.5678, 2023-01-01)")
        """
        raise NotImplementedError

    def process_identifiers(self, identifiers: List[str]) -> Dict[str, Any]:
        """
        Main processing loop for downloading papers.

        Args:
            identifiers: List of paper identifiers

        Returns:
            Dictionary of paper data keyed by identifier
        """
        logger.info(
            "Processing %d identifiers from %s: %s",
            len(identifiers),
            self.get_service_name(),
            identifiers,
        )

        article_data: Dict[str, Any] = {}

        for identifier in identifiers:
            logger.info("Processing %s: %s", self.get_identifier_name(), identifier)

            try:
                # Step 1: Fetch metadata
                metadata = self.fetch_metadata(identifier)

                # Step 2: Extract PDF URL
                pdf_url = self.construct_pdf_url(metadata, identifier)

                # Step 3: Download PDF if available
                pdf_result = None
                if pdf_url:
                    pdf_result = self.download_pdf_to_temp(pdf_url, identifier)

                # Step 4: Extract and structure metadata
                article_data[identifier] = self.extract_paper_metadata(
                    metadata, identifier, pdf_result
                )

            except requests.RequestException as e:
                logger.warning(
                    "Error processing %s %s: %s",
                    self.get_identifier_name(),
                    identifier,
                    str(e),
                )

                # Create error entry
                error_entry = self.create_error_entry(identifier, str(e))
                # Add service-specific identifier field
                self._add_service_identifier(error_entry, identifier)
                article_data[identifier] = error_entry

        return article_data

    @abstractmethod
    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """
        Add service-specific identifier field to entry.

        Args:
            entry: Paper entry dictionary to modify
            identifier: Original identifier
        """
        raise NotImplementedError
aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py (new file)
@@ -0,0 +1,321 @@

#!/usr/bin/env python3
"""
BioRxiv paper downloader implementation.
"""

import logging
import re
import tempfile
from typing import Any, Dict, Optional, Tuple

import cloudscraper
import requests

from .base_paper_downloader import BasePaperDownloader

logger = logging.getLogger(__name__)


class BiorxivDownloader(BasePaperDownloader):
    """BioRxiv-specific implementation of paper downloader."""

    def __init__(self, config: Any):
        """Initialize BioRxiv downloader with configuration."""
        super().__init__(config)
        self.api_url = config.api_url
        self.pdf_base_url = getattr(
            config, "pdf_base_url", "https://www.biorxiv.org/content/10.1101/"
        )
        self.landing_url_template = getattr(
            config,
            "landing_url_template",
            "https://www.biorxiv.org/content/{doi}v{version}",
        )
        self.pdf_url_template = getattr(
            config,
            "pdf_url_template",
            "https://www.biorxiv.org/content/{doi}v{version}.full.pdf",
        )

        # Default values
        self.default_version = getattr(config, "default_version", "1")

        # CloudScraper specific settings
        self.cf_clearance_timeout = getattr(config, "cf_clearance_timeout", 30)
        self.session_reuse = getattr(config, "session_reuse", True)
        self.browser_config_type = getattr(config, "browser_config", {}).get(
            "type", "custom"
        )

        # Initialize shared CloudScraper session if enabled
        self._scraper = None
        if self.session_reuse:
            self._scraper = cloudscraper.create_scraper(
                browser={self.browser_config_type: self.user_agent},
                delay=self.cf_clearance_timeout,
            )

    def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
        """
        Fetch paper metadata from bioRxiv API.

        Args:
            identifier: DOI (e.g., '10.1101/2020.09.09.20191205')

        Returns:
            JSON response as dictionary from bioRxiv API

        Raises:
            requests.RequestException: If API call fails
            RuntimeError: If no collection data found in response
        """
        query_url = f"{self.api_url}/biorxiv/{identifier}/na/json"
        logger.info("Fetching metadata for DOI %s from: %s", identifier, query_url)

        # Use CloudScraper for metadata as well, in case API is behind CF protection
        scraper = self._scraper or cloudscraper.create_scraper(
            browser={self.browser_config_type: self.user_agent},
            delay=self.cf_clearance_timeout,
        )

        response = scraper.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        paper_data = response.json()

        if "collection" not in paper_data or not paper_data["collection"]:
            raise RuntimeError("No collection data found in bioRxiv API response")

        return paper_data

    def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
        """
        Construct PDF URL from bioRxiv metadata and DOI.

        Args:
            metadata: JSON response from bioRxiv API
            identifier: DOI

        Returns:
            Constructed PDF URL string
        """
        if "collection" not in metadata or not metadata["collection"]:
            return ""

        paper = metadata["collection"][0]  # Get first (and should be only) paper
        version = paper.get("version", self.default_version)

        # Construct bioRxiv PDF URL using template
        pdf_url = self.pdf_url_template.format(doi=identifier, version=version)
        logger.info("Constructed PDF URL for DOI %s: %s", identifier, pdf_url)

        return pdf_url

    def download_pdf_to_temp(
        self, pdf_url: str, identifier: str
    ) -> Optional[Tuple[str, str]]:
        """
        Override base method to use CloudScraper for bioRxiv PDF downloads.
        Includes landing page visit to handle CloudFlare protection.

        Args:
            pdf_url: URL to download PDF from
            identifier: DOI for logging

        Returns:
            Tuple of (temp_file_path, filename) or None if failed
        """
        if not pdf_url:
            logger.info("No PDF URL available for DOI %s", identifier)
            return None

        try:
            logger.info("Downloading PDF for DOI %s from %s", identifier, pdf_url)

            # Get scraper and visit landing page if needed
            scraper = self._get_scraper()
            self._visit_landing_page(scraper, pdf_url, identifier)

            # Download and save PDF
            response = scraper.get(pdf_url, timeout=self.request_timeout, stream=True)
            response.raise_for_status()

            temp_file_path = self._save_pdf_to_temp(response)
            filename = self._extract_filename(response, identifier)

            return temp_file_path, filename

        except requests.RequestException as e:
            logger.error("Failed to download PDF for DOI %s: %s", identifier, e)
            return None

    def _get_scraper(self):
        """Get or create CloudScraper instance."""
        return self._scraper or cloudscraper.create_scraper(
            browser={self.browser_config_type: self.user_agent},
            delay=self.cf_clearance_timeout,
        )

    def _visit_landing_page(self, scraper, pdf_url: str, identifier: str) -> None:
        """Visit landing page to handle CloudFlare protection."""
        if ".full.pdf" in pdf_url:
            landing_url = pdf_url.replace(".full.pdf", "")
            logger.info("Visiting landing page first: %s", landing_url)

            landing_response = scraper.get(landing_url, timeout=self.request_timeout)
            landing_response.raise_for_status()
            logger.info("Successfully accessed landing page for %s", identifier)

    def _save_pdf_to_temp(self, response) -> str:
        """Save PDF response to temporary file."""
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            for chunk in response.iter_content(chunk_size=self.chunk_size):
                if chunk:  # Filter out keep-alive chunks
                    temp_file.write(chunk)
            temp_file_path = temp_file.name

        logger.info("BioRxiv PDF downloaded to temporary file: %s", temp_file_path)
        return temp_file_path

    def _extract_filename(self, response, identifier: str) -> str:
        """Extract filename from response headers or generate default."""
        filename = self.get_default_filename(identifier)

        content_disposition = response.headers.get("Content-Disposition", "")
        if "filename=" in content_disposition:
            try:
                filename_match = re.search(
                    r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
                )
                if filename_match:
                    extracted_filename = filename_match.group(
                        1
                    ) or filename_match.group(2)
                    extracted_filename = extracted_filename.strip().strip('"')
                    if extracted_filename and extracted_filename.endswith(".pdf"):
                        filename = extracted_filename
                        logger.info("Extracted filename from header: %s", filename)
            except requests.RequestException as e:
                logger.warning("Failed to extract filename from header: %s", e)

        return filename

    def extract_paper_metadata(
        self,
        metadata: Dict[str, Any],
        identifier: str,
        pdf_result: Optional[Tuple[str, str]],
    ) -> Dict[str, Any]:
        """
        Extract structured metadata from bioRxiv API response.

        Args:
            metadata: JSON response from bioRxiv API
            identifier: DOI
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary
        """
        if "collection" not in metadata or not metadata["collection"]:
            raise RuntimeError("No collection data found in metadata")

        paper = metadata["collection"][0]  # Get first (and should be only) paper

        # Extract basic metadata
        basic_metadata = self._extract_basic_metadata(paper, identifier)

        # Handle PDF download results
        pdf_metadata = self._extract_pdf_metadata(pdf_result, identifier)

        # Combine all metadata
        return {
            **basic_metadata,
            **pdf_metadata,
        }

    def _extract_basic_metadata(
        self, paper: Dict[str, Any], identifier: str
    ) -> Dict[str, Any]:
        """Extract basic metadata from paper data."""
        # Extract basic fields
        title = paper.get("title", "N/A").strip()
        abstract = paper.get("abstract", "N/A").strip()
        pub_date = paper.get("date", "N/A").strip()
        category = paper.get("category", "N/A").strip()
        version = paper.get("version", "N/A")

        # Extract authors - typically in a semicolon-separated string
        authors = self._extract_authors(paper.get("authors", ""))

        return {
            "Title": title,
            "Authors": authors,
            "Abstract": abstract,
            "Publication Date": pub_date,
            "DOI": identifier,
            "Category": category,
            "Version": version,
            "source": "biorxiv",
            "server": "biorxiv",
        }

    def _extract_authors(self, authors_str: str) -> list:
        """Extract and clean authors from semicolon-separated string."""
        if not authors_str:
            return []
        return [author.strip() for author in authors_str.split(";") if author.strip()]

    def _extract_pdf_metadata(
        self, pdf_result: Optional[Tuple[str, str]], identifier: str
    ) -> Dict[str, Any]:
        """Extract PDF-related metadata."""
        if pdf_result:
            temp_file_path, filename = pdf_result
            return {
                "URL": temp_file_path,
                "pdf_url": temp_file_path,
                "filename": filename,
                "access_type": "open_access_downloaded",
                "temp_file_path": temp_file_path,
            }

        return {
            "URL": "",
            "pdf_url": "",
            "filename": self.get_default_filename(identifier),
            "access_type": "download_failed",
            "temp_file_path": "",
        }

    def get_service_name(self) -> str:
        """Return service name."""
        return "bioRxiv"

    def get_identifier_name(self) -> str:
        """Return identifier display name."""
        return "DOI"

    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for bioRxiv paper."""
        # Sanitize DOI for filename use
        return f"{identifier.replace('/', '_').replace('.', '_')}.pdf"

    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """Get bioRxiv-specific identifier info for paper summary."""
        doi = paper.get("DOI", "N/A")
        pub_date = paper.get("Publication Date", "N/A")
        category = paper.get("Category", "N/A")

        info = f" (DOI:{doi}, {pub_date})"
        if category != "N/A":
            info += f"\n Category: {category}"

        return info

    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """Add DOI and bioRxiv-specific fields to entry."""
        entry["DOI"] = identifier
        entry["Category"] = "N/A"
        entry["Version"] = "N/A"
        entry["server"] = "biorxiv"