aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +339 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +318 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py
@@ -0,0 +1,339 @@
+ #!/usr/bin/env python3
+ """
+ Abstract base class for paper download tools.
+ Provides common functionality for arXiv, medRxiv, PubMed, and future paper sources.
+ """
+
+ import logging
+ import re
+ import tempfile
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import requests
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+
+ class BasePaperDownloader(ABC):
+     """Abstract base class for paper download tools."""
+
+     def __init__(self, config: Any):
+         """Initialize with service-specific configuration."""
+         self.config = config
+         self.request_timeout = getattr(config, "request_timeout", 15)
+         self.chunk_size = getattr(config, "chunk_size", 8192)
+         self.user_agent = getattr(
+             config, "user_agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
+         )
+
+     # Abstract methods that each service must implement
+     @abstractmethod
+     def fetch_metadata(self, identifier: str) -> Any:
+         """
+         Fetch paper metadata from the service API.
+
+         Args:
+             identifier: Paper identifier (arXiv ID, DOI, PMID, etc.)
+
+         Returns:
+             Service-specific metadata object (XML, JSON, etc.)
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def construct_pdf_url(self, metadata: Any, identifier: str) -> str:
+         """
+         Construct or extract PDF URL from metadata.
+
+         Args:
+             metadata: Metadata returned from fetch_metadata()
+             identifier: Original paper identifier
+
+         Returns:
+             PDF URL string (empty if not available)
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def extract_paper_metadata(
+         self, metadata: Any, identifier: str, pdf_result: Optional[Tuple[str, str]]
+     ) -> Dict[str, Any]:
+         """
+         Extract and structure metadata into standardized format.
+
+         Args:
+             metadata: Raw metadata from API
+             identifier: Original paper identifier
+             pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+
+         Returns:
+             Standardized paper metadata dictionary
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_service_name(self) -> str:
+         """Return service name (e.g., 'arxiv', 'medrxiv', 'pubmed')."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_identifier_name(self) -> str:
+         """Return identifier display name (e.g., 'arXiv ID', 'DOI', 'PMID')."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_default_filename(self, identifier: str) -> str:
+         """Generate default filename for the paper PDF."""
+         raise NotImplementedError
+
+     # Common methods shared by all services
+     def download_pdf_to_temp(
+         self, pdf_url: str, identifier: str
+     ) -> Optional[Tuple[str, str]]:
+         """
+         Download PDF from URL to a temporary file.
+
+         Args:
+             pdf_url: URL to download PDF from
+             identifier: Paper identifier for logging
+
+         Returns:
+             Tuple of (temp_file_path, filename) or None if failed
+         """
+         if not pdf_url:
+             logger.info(
+                 "No PDF URL available for %s %s", self.get_identifier_name(), identifier
+             )
+             return None
+
+         try:
+             logger.info(
+                 "Downloading PDF for %s %s from %s",
+                 self.get_identifier_name(),
+                 identifier,
+                 pdf_url,
+             )
+
+             headers = {"User-Agent": self.user_agent}
+             response = requests.get(
+                 pdf_url, headers=headers, timeout=self.request_timeout, stream=True
+             )
+             response.raise_for_status()
+
+             # Download to temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                 for chunk in response.iter_content(chunk_size=self.chunk_size):
+                     if chunk:  # Filter out keep-alive chunks
+                         temp_file.write(chunk)
+                 temp_file_path = temp_file.name
+
+             logger.info(
+                 "%s PDF downloaded to temporary file: %s",
+                 self.get_service_name(),
+                 temp_file_path,
+             )
+
+             # Try to extract filename from Content-Disposition header
+             filename = self.get_default_filename(identifier)
+             content_disposition = response.headers.get("Content-Disposition", "")
+
+             if "filename=" in content_disposition:
+                 filename_match = re.search(
+                     r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
+                 )
+                 if filename_match:
+                     extracted_filename = filename_match.group(
+                         1
+                     ) or filename_match.group(2)
+                     extracted_filename = extracted_filename.strip().strip('"')
+                     if extracted_filename and extracted_filename.endswith(".pdf"):
+                         filename = extracted_filename
+                         logger.info("Extracted filename from header: %s", filename)
+
+             return temp_file_path, filename
+
+         except (requests.exceptions.RequestException, OSError) as e:
+             logger.error(
+                 "Failed to download PDF for %s %s: %s",
+                 self.get_identifier_name(),
+                 identifier,
+                 e,
+             )
+             return None
+
+     def get_snippet(self, abstract: str) -> str:
+         """
+         Extract the first one or two sentences from an abstract.
+
+         Args:
+             abstract: Full abstract text
+
+         Returns:
+             Snippet of first 1-2 sentences
+         """
+         if not abstract or abstract == "N/A":
+             return ""
+
+         sentences = abstract.split(". ")
+         snippet_sentences = sentences[:2]
+         snippet = ". ".join(snippet_sentences)
+
+         if not snippet.endswith("."):
+             snippet += "."
+
+         return snippet
+
+     def create_error_entry(self, identifier: str, error_msg: str) -> Dict[str, Any]:
+         """
+         Create standardized error entry for failed paper processing.
+
+         Args:
+             identifier: Paper identifier
+             error_msg: Error message
+
+         Returns:
+             Error entry dictionary
+         """
+         return {
+             "Title": "Error fetching paper",
+             "Authors": [],
+             "Abstract": f"Error: {error_msg}",
+             "Publication Date": "N/A",
+             "URL": "",
+             "pdf_url": "",
+             "filename": self.get_default_filename(identifier),
+             "source": self.get_service_name(),
+             "access_type": "error",
+             "temp_file_path": "",
+             "error": error_msg,
+             # Service-specific identifier field will be added by subclasses
+         }
+
+     def build_summary(self, article_data: Dict[str, Any]) -> str:
+         """
+         Build a summary string for up to three papers with snippets.
+
+         Args:
+             article_data: Dictionary of paper data keyed by identifier
+
+         Returns:
+             Formatted summary string
+         """
+         top = list(article_data.values())[:3]
+         lines: List[str] = []
+         downloaded_count = sum(
+             1
+             for paper in article_data.values()
+             if paper.get("access_type") == "open_access_downloaded"
+         )
+
+         for idx, paper in enumerate(top):
+             title = paper.get("Title", "N/A")
+             access_type = paper.get("access_type", "N/A")
+             temp_file_path = paper.get("temp_file_path", "")
+             snippet = self.get_snippet(paper.get("Abstract", ""))
+
+             # Build paper line with service-specific identifier info
+             line = f"{idx+1}. {title}"
+             line += self._get_paper_identifier_info(paper)
+             line += f"\n Access: {access_type}"
+
+             if temp_file_path:
+                 line += f"\n Downloaded to: {temp_file_path}"
+             if snippet:
+                 line += f"\n Abstract snippet: {snippet}"
+
+             lines.append(line)
+
+         summary = "\n".join(lines)
+         service_name = self.get_service_name()
+
+         return (
+             f"Download was successful from {service_name}. "
+             "Papers metadata are attached as an artifact. "
+             "Here is a summary of the results:\n"
+             f"Number of papers found: {len(article_data)}\n"
+             f"PDFs successfully downloaded: {downloaded_count}\n"
+             "Top 3 papers:\n" + summary
+         )
+
+     @abstractmethod
+     def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
+         """
+         Get service-specific identifier info for paper summary.
+
+         Args:
+             paper: Paper metadata dictionary
+
+         Returns:
+             Formatted identifier string (e.g., " (arXiv:1234.5678, 2023-01-01)")
+         """
+         raise NotImplementedError
+
+     def process_identifiers(self, identifiers: List[str]) -> Dict[str, Any]:
+         """
+         Main processing loop for downloading papers.
+
+         Args:
+             identifiers: List of paper identifiers
+
+         Returns:
+             Dictionary of paper data keyed by identifier
+         """
+         logger.info(
+             "Processing %d identifiers from %s: %s",
+             len(identifiers),
+             self.get_service_name(),
+             identifiers,
+         )
+
+         article_data: Dict[str, Any] = {}
+
+         for identifier in identifiers:
+             logger.info("Processing %s: %s", self.get_identifier_name(), identifier)
+
+             try:
+                 # Step 1: Fetch metadata
+                 metadata = self.fetch_metadata(identifier)
+
+                 # Step 2: Extract PDF URL
+                 pdf_url = self.construct_pdf_url(metadata, identifier)
+
+                 # Step 3: Download PDF if available
+                 pdf_result = None
+                 if pdf_url:
+                     pdf_result = self.download_pdf_to_temp(pdf_url, identifier)
+
+                 # Step 4: Extract and structure metadata
+                 article_data[identifier] = self.extract_paper_metadata(
+                     metadata, identifier, pdf_result
+                 )
+
+             except requests.RequestException as e:
+                 logger.warning(
+                     "Error processing %s %s: %s",
+                     self.get_identifier_name(),
+                     identifier,
+                     str(e),
+                 )
+
+                 # Create error entry
+                 error_entry = self.create_error_entry(identifier, str(e))
+                 # Add service-specific identifier field
+                 self._add_service_identifier(error_entry, identifier)
+                 article_data[identifier] = error_entry
+
+         return article_data
+
+     @abstractmethod
+     def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
+         """
+         Add service-specific identifier field to entry.
+
+         Args:
+             entry: Paper entry dictionary to modify
+             identifier: Original identifier
+         """
+         raise NotImplementedError
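
BasePaperDownloader is a template-method base class: process_identifiers() drives the fixed pipeline (fetch_metadata → construct_pdf_url → download_pdf_to_temp → extract_paper_metadata), and subclasses supply the service-specific hooks. A minimal sketch of wiring a new source onto this base follows; DummyDownloader, its field values, and the SimpleNamespace config are illustrative stand-ins, not part of the package:

from types import SimpleNamespace

from aiagents4pharma.talk2scholars.tools.paper_download.utils.base_paper_downloader import (
    BasePaperDownloader,
)


class DummyDownloader(BasePaperDownloader):
    """Hypothetical source showing the hooks a subclass must implement."""

    def fetch_metadata(self, identifier):
        # A real service would call its API here and return the raw response.
        return {"title": f"Paper {identifier}"}

    def construct_pdf_url(self, metadata, identifier):
        # Returning "" makes process_identifiers() skip the PDF download step.
        return ""

    def extract_paper_metadata(self, metadata, identifier, pdf_result):
        temp_file_path, filename = pdf_result or ("", self.get_default_filename(identifier))
        return {
            "Title": metadata["title"],
            "Authors": [],
            "Abstract": "N/A",
            "Publication Date": "N/A",
            "access_type": "open_access_downloaded" if pdf_result else "download_failed",
            "temp_file_path": temp_file_path,
            "filename": filename,
        }

    def get_service_name(self):
        return "dummy"

    def get_identifier_name(self):
        return "Dummy ID"

    def get_default_filename(self, identifier):
        return f"{identifier}.pdf"

    def _get_paper_identifier_info(self, paper):
        return f" (dummy:{paper.get('filename', 'N/A')})"

    def _add_service_identifier(self, entry, identifier):
        entry["Dummy ID"] = identifier


# Only request_timeout, chunk_size, and user_agent are read by the base class.
config = SimpleNamespace(request_timeout=15, chunk_size=8192)
downloader = DummyDownloader(config)
results = downloader.process_identifiers(["0001", "0002"])
print(downloader.build_summary(results))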
aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py
@@ -0,0 +1,318 @@
+ #!/usr/bin/env python3
+ """
+ BioRxiv paper downloader implementation.
+ """
+
+ import logging
+ import re
+ import tempfile
+ from typing import Any, Dict, Optional, Tuple
+
+ import cloudscraper
+ import requests
+
+ from .base_paper_downloader import BasePaperDownloader
+
+ logger = logging.getLogger(__name__)
+
+
+ class BiorxivDownloader(BasePaperDownloader):
+     """BioRxiv-specific implementation of paper downloader."""
+
+     def __init__(self, config: Any):
+         """Initialize BioRxiv downloader with configuration."""
+         super().__init__(config)
+         self.api_url = config.api_url
+         self.pdf_base_url = getattr(
+             config, "pdf_base_url", "https://www.biorxiv.org/content/10.1101/"
+         )
+         self.landing_url_template = getattr(
+             config,
+             "landing_url_template",
+             "https://www.biorxiv.org/content/{doi}v{version}",
+         )
+         self.pdf_url_template = getattr(
+             config,
+             "pdf_url_template",
+             "https://www.biorxiv.org/content/{doi}v{version}.full.pdf",
+         )
+
+         # Default values
+         self.default_version = getattr(config, "default_version", "1")
+
+         # CloudScraper specific settings
+         self.cf_clearance_timeout = getattr(config, "cf_clearance_timeout", 30)
+         self.session_reuse = getattr(config, "session_reuse", True)
+         self.browser_config_type = getattr(config, "browser_config", {}).get(
+             "type", "custom"
+         )
+
+         # Initialize shared CloudScraper session if enabled
+         self._scraper = None
+         if self.session_reuse:
+             self._scraper = cloudscraper.create_scraper(
+                 browser={self.browser_config_type: self.user_agent},
+                 delay=self.cf_clearance_timeout,
+             )
+
+     def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
+         """
+         Fetch paper metadata from bioRxiv API.
+
+         Args:
+             identifier: DOI (e.g., '10.1101/2020.09.09.20191205')
+
+         Returns:
+             JSON response as dictionary from bioRxiv API
+
+         Raises:
+             requests.RequestException: If API call fails
+             RuntimeError: If no collection data found in response
+         """
+         query_url = f"{self.api_url}/biorxiv/{identifier}/na/json"
+         logger.info("Fetching metadata for DOI %s from: %s", identifier, query_url)
+
+         # Use CloudScraper for metadata as well, in case API is behind CF protection
+         scraper = self._scraper or cloudscraper.create_scraper(
+             browser={self.browser_config_type: self.user_agent},
+             delay=self.cf_clearance_timeout,
+         )
+
+         response = scraper.get(query_url, timeout=self.request_timeout)
+         response.raise_for_status()
+
+         paper_data = response.json()
+
+         if "collection" not in paper_data or not paper_data["collection"]:
+             raise RuntimeError("No collection data found in bioRxiv API response")
+
+         return paper_data
+
+     def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
+         """
+         Construct PDF URL from bioRxiv metadata and DOI.
+
+         Args:
+             metadata: JSON response from bioRxiv API
+             identifier: DOI
+
+         Returns:
+             Constructed PDF URL string
+         """
+         if "collection" not in metadata or not metadata["collection"]:
+             return ""
+
+         paper = metadata["collection"][0]  # Get first (and should be only) paper
+         version = paper.get("version", self.default_version)
+
+         # Construct bioRxiv PDF URL using template
+         pdf_url = self.pdf_url_template.format(doi=identifier, version=version)
+         logger.info("Constructed PDF URL for DOI %s: %s", identifier, pdf_url)
+
+         return pdf_url
+
+     def download_pdf_to_temp(
+         self, pdf_url: str, identifier: str
+     ) -> Optional[Tuple[str, str]]:
+         """
+         Override base method to use CloudScraper for bioRxiv PDF downloads.
+         Includes landing page visit to handle CloudFlare protection.
+
+         Args:
+             pdf_url: URL to download PDF from
+             identifier: DOI for logging
+
+         Returns:
+             Tuple of (temp_file_path, filename) or None if failed
+         """
+         if not pdf_url:
+             logger.info("No PDF URL available for DOI %s", identifier)
+             return None
+
+         try:
+             logger.info("Downloading PDF for DOI %s from %s", identifier, pdf_url)
+
+             # Get scraper and visit landing page if needed
+             scraper = self._get_scraper()
+             self._visit_landing_page(scraper, pdf_url, identifier)
+
+             # Download and save PDF
+             response = scraper.get(pdf_url, timeout=self.request_timeout, stream=True)
+             response.raise_for_status()
+
+             temp_file_path = self._save_pdf_to_temp(response)
+             filename = self._extract_filename(response, identifier)
+
+             return temp_file_path, filename
+
+         except requests.RequestException as e:
+             logger.error("Failed to download PDF for DOI %s: %s", identifier, e)
+             return None
+
+     def _get_scraper(self):
+         """Get or create CloudScraper instance."""
+         return self._scraper or cloudscraper.create_scraper(
+             browser={self.browser_config_type: self.user_agent},
+             delay=self.cf_clearance_timeout,
+         )
+
+     def _visit_landing_page(self, scraper, pdf_url: str, identifier: str) -> None:
+         """Visit landing page to handle CloudFlare protection."""
+         if ".full.pdf" in pdf_url:
+             landing_url = pdf_url.replace(".full.pdf", "")
+             logger.info("Visiting landing page first: %s", landing_url)
+
+             landing_response = scraper.get(landing_url, timeout=self.request_timeout)
+             landing_response.raise_for_status()
+             logger.info("Successfully accessed landing page for %s", identifier)
+
+     def _save_pdf_to_temp(self, response) -> str:
+         """Save PDF response to temporary file."""
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+             for chunk in response.iter_content(chunk_size=self.chunk_size):
+                 if chunk:  # Filter out keep-alive chunks
+                     temp_file.write(chunk)
+             temp_file_path = temp_file.name
+
+         logger.info("BioRxiv PDF downloaded to temporary file: %s", temp_file_path)
+         return temp_file_path
+
+     def _extract_filename(self, response, identifier: str) -> str:
+         """Extract filename from response headers or generate default."""
+         filename = self.get_default_filename(identifier)
+
+         content_disposition = response.headers.get("Content-Disposition", "")
+         if "filename=" in content_disposition:
+             filename_match = re.search(
+                 r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
+             )
+             if filename_match:
+                 extracted_filename = filename_match.group(
+                     1
+                 ) or filename_match.group(2)
+                 extracted_filename = extracted_filename.strip().strip('"')
+                 if extracted_filename and extracted_filename.endswith(".pdf"):
+                     filename = extracted_filename
+                     logger.info("Extracted filename from header: %s", filename)
+
+         return filename
+
+     def extract_paper_metadata(
+         self,
+         metadata: Dict[str, Any],
+         identifier: str,
+         pdf_result: Optional[Tuple[str, str]],
+     ) -> Dict[str, Any]:
+         """
+         Extract structured metadata from bioRxiv API response.
+
+         Args:
+             metadata: JSON response from bioRxiv API
+             identifier: DOI
+             pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+
+         Returns:
+             Standardized paper metadata dictionary
+         """
+         if "collection" not in metadata or not metadata["collection"]:
+             raise RuntimeError("No collection data found in metadata")
+
+         paper = metadata["collection"][0]  # Get first (and should be only) paper
+
+         # Extract basic metadata
+         basic_metadata = self._extract_basic_metadata(paper, identifier)
+
+         # Handle PDF download results
+         pdf_metadata = self._extract_pdf_metadata(pdf_result, identifier)
+
+         # Combine all metadata
+         return {
+             **basic_metadata,
+             **pdf_metadata,
+         }
+
+     def _extract_basic_metadata(
+         self, paper: Dict[str, Any], identifier: str
+     ) -> Dict[str, Any]:
+         """Extract basic metadata from paper data."""
+         # Extract basic fields
+         title = paper.get("title", "N/A").strip()
+         abstract = paper.get("abstract", "N/A").strip()
+         pub_date = paper.get("date", "N/A").strip()
+         category = paper.get("category", "N/A").strip()
+         version = paper.get("version", "N/A")
+
+         # Extract authors - typically in a semicolon-separated string
+         authors = self._extract_authors(paper.get("authors", ""))
+
+         return {
+             "Title": title,
+             "Authors": authors,
+             "Abstract": abstract,
+             "Publication Date": pub_date,
+             "DOI": identifier,
+             "Category": category,
+             "Version": version,
+             "source": "biorxiv",
+             "server": "biorxiv",
+         }
+
+     def _extract_authors(self, authors_str: str) -> list:
+         """Extract and clean authors from semicolon-separated string."""
+         if not authors_str:
+             return []
+         return [author.strip() for author in authors_str.split(";") if author.strip()]
+
+     def _extract_pdf_metadata(
+         self, pdf_result: Optional[Tuple[str, str]], identifier: str
+     ) -> Dict[str, Any]:
+         """Extract PDF-related metadata."""
+         if pdf_result:
+             temp_file_path, filename = pdf_result
+             return {
+                 "URL": temp_file_path,
+                 "pdf_url": temp_file_path,
+                 "filename": filename,
+                 "access_type": "open_access_downloaded",
+                 "temp_file_path": temp_file_path,
+             }
+
+         return {
+             "URL": "",
+             "pdf_url": "",
+             "filename": self.get_default_filename(identifier),
+             "access_type": "download_failed",
+             "temp_file_path": "",
+         }
+
+     def get_service_name(self) -> str:
+         """Return service name."""
+         return "bioRxiv"
+
+     def get_identifier_name(self) -> str:
+         """Return identifier display name."""
+         return "DOI"
+
+     def get_default_filename(self, identifier: str) -> str:
+         """Generate default filename for bioRxiv paper."""
+         # Sanitize DOI for filename use
+         return f"{identifier.replace('/', '_').replace('.', '_')}.pdf"
+
+     def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
+         """Get bioRxiv-specific identifier info for paper summary."""
+         doi = paper.get("DOI", "N/A")
+         pub_date = paper.get("Publication Date", "N/A")
+         category = paper.get("Category", "N/A")
+
+         info = f" (DOI:{doi}, {pub_date})"
+         if category != "N/A":
+             info += f"\n Category: {category}"
+
+         return info
+
+     def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
+         """Add DOI and bioRxiv-specific fields to entry."""
+         entry["DOI"] = identifier
+         entry["Category"] = "N/A"
+         entry["Version"] = "N/A"
+         entry["server"] = "biorxiv"