aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""
+MedRxiv paper downloader implementation.
+"""
+
+import logging
+from typing import Any, Dict, Optional, Tuple
+
+import requests
+
+from .base_paper_downloader import BasePaperDownloader
+
+logger = logging.getLogger(__name__)
+
+
+class MedrxivDownloader(BasePaperDownloader):
+    """MedRxiv-specific implementation of paper downloader."""
+
+    def __init__(self, config: Any):
+        """Initialize MedRxiv downloader with configuration."""
+        super().__init__(config)
+        self.api_url = config.api_url
+        self.pdf_url_template = getattr(
+            config,
+            "pdf_url_template",
+            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf",
+        )
+        self.default_version = getattr(config, "default_version", "1")
+
+    def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
+        """
+        Fetch paper metadata from medRxiv API.
+
+        Args:
+            identifier: DOI (e.g., '10.1101/2020.09.09.20191205')
+
+        Returns:
+            JSON response as dictionary from medRxiv API
+
+        Raises:
+            requests.RequestException: If API call fails
+            RuntimeError: If no collection data found in response
+        """
+        query_url = f"{self.api_url}/medrxiv/{identifier}/na/json"
+        logger.info("Fetching metadata for DOI %s from: %s", identifier, query_url)
+
+        response = requests.get(query_url, timeout=self.request_timeout)
+        response.raise_for_status()
+
+        paper_data = response.json()
+
+        if "collection" not in paper_data or not paper_data["collection"]:
+            raise RuntimeError("No collection data found in medRxiv API response")
+
+        return paper_data
+
+    def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
+        """
+        Construct PDF URL from medRxiv metadata and DOI.
+
+        Args:
+            metadata: JSON response from medRxiv API
+            identifier: DOI
+
+        Returns:
+            Constructed PDF URL string
+        """
+        if "collection" not in metadata or not metadata["collection"]:
+            return ""
+
+        paper = metadata["collection"][0]  # Get first (and should be only) paper
+        version = paper.get("version", self.default_version)
+
+        # Construct medRxiv PDF URL using template
+        pdf_url = self.pdf_url_template.format(identifier=identifier, version=version)
+        logger.info("Constructed PDF URL for DOI %s: %s", identifier, pdf_url)
+
+        return pdf_url
+
+    def extract_paper_metadata(
+        self,
+        metadata: Dict[str, Any],
+        identifier: str,
+        pdf_result: Optional[Tuple[str, str]],
+    ) -> Dict[str, Any]:
+        """
+        Extract structured metadata from medRxiv API response.
+
+        Args:
+            metadata: JSON response from medRxiv API
+            identifier: DOI
+            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+
+        Returns:
+            Standardized paper metadata dictionary
+        """
+        if "collection" not in metadata or not metadata["collection"]:
+            raise RuntimeError("No collection data found in metadata")
+
+        paper = metadata["collection"][0]  # Get first (and should be only) paper
+
+        # Extract basic metadata
+        basic_metadata = self._extract_basic_metadata(paper, identifier)
+
+        # Handle PDF download results
+        pdf_metadata = self._extract_pdf_metadata(pdf_result, identifier)
+
+        # Combine all metadata
+        return {
+            **basic_metadata,
+            **pdf_metadata,
+        }
+
+    def _extract_basic_metadata(
+        self, paper: Dict[str, Any], identifier: str
+    ) -> Dict[str, Any]:
+        """Extract basic metadata from paper data."""
+        # Extract basic fields
+        title = paper.get("title", "N/A").strip()
+        abstract = paper.get("abstract", "N/A").strip()
+        pub_date = paper.get("date", "N/A").strip()
+        category = paper.get("category", "N/A").strip()
+        version = paper.get("version", "N/A")
+
+        # Extract authors - typically a semicolon-separated string
+        authors = self._extract_authors(paper.get("authors", ""))
+
+        return {
+            "Title": title,
+            "Authors": authors,
+            "Abstract": abstract,
+            "Publication Date": pub_date,
+            "DOI": identifier,
+            "Category": category,
+            "Version": version,
+            "source": "medrxiv",
+            "server": "medrxiv",
+        }
+
+    def _extract_authors(self, authors_str: str) -> list:
+        """Extract and clean authors from semicolon-separated string."""
+        if not authors_str:
+            return []
+        return [author.strip() for author in authors_str.split(";") if author.strip()]
+
+    def _extract_pdf_metadata(
+        self, pdf_result: Optional[Tuple[str, str]], identifier: str
+    ) -> Dict[str, Any]:
+        """Extract PDF-related metadata."""
+        if pdf_result:
+            temp_file_path, filename = pdf_result
+            return {
+                "URL": temp_file_path,
+                "pdf_url": temp_file_path,
+                "filename": filename,
+                "access_type": "open_access_downloaded",
+                "temp_file_path": temp_file_path,
+            }
+
+        return {
+            "URL": "",
+            "pdf_url": "",
+            "filename": self.get_default_filename(identifier),
+            "access_type": "download_failed",
+            "temp_file_path": "",
+        }
+
+    def get_service_name(self) -> str:
+        """Return service name."""
+        return "medRxiv"
+
+    def get_identifier_name(self) -> str:
+        """Return identifier display name."""
+        return "DOI"
+
+    def get_default_filename(self, identifier: str) -> str:
+        """Generate default filename for medRxiv paper."""
+        # Sanitize DOI for filename use
+        return f"{identifier.replace('/', '_').replace('.', '_')}.pdf"
+
+    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
+        """Get medRxiv-specific identifier info for paper summary."""
+        doi = paper.get("DOI", "N/A")
+        pub_date = paper.get("Publication Date", "N/A")
+        category = paper.get("Category", "N/A")
+
+        info = f" (DOI:{doi}, {pub_date})"
+        if category != "N/A":
+            info += f"\n   Category: {category}"
+
+        return info
+
+    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
+        """Add DOI and medRxiv-specific fields to entry."""
+        entry["DOI"] = identifier
+        entry["Category"] = "N/A"
+        entry["Version"] = "N/A"
+        entry["server"] = "medrxiv"
aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python3
+"""
+PubMed paper downloader implementation.
+"""
+
+import logging
+import xml.etree.ElementTree as ET
+from typing import Any, Dict, Optional, Tuple, cast
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from .base_paper_downloader import BasePaperDownloader
+
+logger = logging.getLogger(__name__)
+
+
+class PubmedDownloader(BasePaperDownloader):
+    """PubMed-specific implementation of paper downloader."""
+
+    def __init__(self, config: Any):
+        """Initialize PubMed downloader with configuration."""
+        super().__init__(config)
+        self.id_converter_url = config.id_converter_url
+        self.oa_api_url = config.oa_api_url
+
+        # Alternative PDF sources
+        self.europe_pmc_base_url = config.europe_pmc_base_url
+        self.pmc_page_base_url = config.pmc_page_base_url
+        self.direct_pmc_pdf_base_url = config.direct_pmc_pdf_base_url
+
+        # URL conversion for NCBI FTP links
+        self.ftp_base_url = config.ftp_base_url
+        self.https_base_url = config.https_base_url
+        # Configuration values
+        self.id_converter_format = getattr(config, "id_converter_format", "json")
+        self.pdf_meta_name = getattr(config, "pdf_meta_name", "citation_pdf_url")
+        self.default_error_code = getattr(config, "default_error_code", "unknown")
+
+    def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
+        """
+        Fetch paper metadata from PubMed ID Converter API.
+
+        Args:
+            identifier: PMID (e.g., '12345678')
+
+        Returns:
+            JSON response from PMC ID Converter API
+
+        Raises:
+            requests.RequestException: If API call fails
+            RuntimeError: If no records found in response
+        """
+        query_url = f"{self.id_converter_url}?ids={identifier}&format={self.id_converter_format}"
+        logger.info(
+            "Fetching metadata from ID converter for PMID %s: %s", identifier, query_url
+        )
+
+        response = requests.get(query_url, timeout=self.request_timeout)
+        response.raise_for_status()
+
+        result = response.json()
+        logger.info("ID converter response for PMID %s: %s", identifier, result)
+
+        if "records" not in result or not result["records"]:
+            raise RuntimeError("No records found in PMC ID Converter API response")
+
+        return result
+
+    def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
+        """
+        Construct PDF URL using multiple fallback strategies.
+
+        Args:
+            metadata: JSON response from ID converter
+            identifier: PMID
+
+        Returns:
+            PDF URL string (empty if no PDF available)
+        """
+        if "records" not in metadata or not metadata["records"]:
+            return ""
+
+        record = metadata["records"][0]
+        pmcid = record.get("pmcid", "")
+
+        if not pmcid or pmcid == "N/A":
+            logger.info("No PMCID available for PDF fetch: PMID %s", identifier)
+            return ""
+
+        return self._fetch_pdf_url_with_fallbacks(pmcid)
+
+    def _fetch_pdf_url_with_fallbacks(self, pmcid: str) -> str:
+        """
+        Fetch PDF URL from OA API with comprehensive fallback strategies.
+
+        Args:
+            pmcid: PMC ID (e.g., 'PMC1234567')
+
+        Returns:
+            PDF URL string (empty if all strategies fail)
+        """
+        logger.info("Fetching PDF URL for PMCID: %s", pmcid)
+
+        # Strategy 1: Official OA API (fastest when it works)
+        pdf_url = self._try_oa_api(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        # Strategy 2: Europe PMC service (most reliable fallback)
+        pdf_url = self._try_europe_pmc(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        # Strategy 3: Scrape PMC page for citation_pdf_url meta tag
+        pdf_url = self._try_pmc_page_scraping(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        # Strategy 4: Direct PMC PDF URL pattern (least reliable)
+        pdf_url = self._try_direct_pmc_url(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        logger.warning("All PDF URL strategies failed for PMCID: %s", pmcid)
+        return ""
+
+    def _try_oa_api(self, pmcid: str) -> str:
+        """Try to get PDF URL from official OA API."""
+        query_url = f"{self.oa_api_url}?id={pmcid}"
+        logger.info("Trying OA API for PMCID %s: %s", pmcid, query_url)
+
+        try:
+            response = requests.get(query_url, timeout=self.request_timeout)
+            response.raise_for_status()
+
+            logger.info("OA API response for PMCID %s: %s", pmcid, response.text[:500])
+
+            # Parse XML response
+            root = ET.fromstring(response.text)
+
+            # Check for error first
+            error_elem = root.find(".//error")
+            if error_elem is not None:
+                error_code = error_elem.get("code", self.default_error_code)
+                error_text = error_elem.text or "unknown error"
+                logger.info(
+                    "OA API error for PMCID %s: %s - %s", pmcid, error_code, error_text
+                )
+                return ""
+
+            # Look for PDF link
+            pdf_link = root.find(".//link[@format='pdf']")
+            if pdf_link is not None:
+                pdf_url = pdf_link.get("href", "")
+                logger.info(
+                    "Found PDF URL from OA API for PMCID %s: %s", pmcid, pdf_url
+                )
+
+                # Convert FTP links to HTTPS for download compatibility
+                if pdf_url.startswith(self.ftp_base_url):
+                    pdf_url = pdf_url.replace(self.ftp_base_url, self.https_base_url)
+                    logger.info("Converted FTP to HTTPS for %s: %s", pmcid, pdf_url)
+
+                return pdf_url
+
+        except requests.RequestException as e:
+            logger.info("OA API failed for %s: %s", pmcid, str(e))
+
+        return ""
+
+    def _try_europe_pmc(self, pmcid: str) -> str:
+        """Try Europe PMC service for PDF."""
+        europe_pmc_url = f"{self.europe_pmc_base_url}?accid={pmcid}&blobtype=pdf"
+        logger.info("Trying Europe PMC service for %s: %s", pmcid, europe_pmc_url)
+
+        try:
+            response = requests.head(europe_pmc_url, timeout=self.request_timeout)
+            if response.status_code == 200:
+                logger.info("Europe PMC service works for %s", pmcid)
+                return europe_pmc_url
+        except requests.RequestException as e:
+            logger.info("Europe PMC service failed for %s: %s", pmcid, str(e))
+
+        return ""
+
+    def _try_pmc_page_scraping(self, pmcid: str) -> str:
+        """Try scraping PMC page for PDF meta tag."""
+        pmc_page_url = f"{self.pmc_page_base_url}/{pmcid}/"
+        logger.info(
+            "Scraping PMC page for PDF meta tag for %s: %s", pmcid, pmc_page_url
+        )
+
+        try:
+            headers = {"User-Agent": self.user_agent}
+            response = requests.get(
+                pmc_page_url, headers=headers, timeout=self.request_timeout
+            )
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, "html.parser")
+
+            # Look for PDF meta tag
+            pdf_meta = soup.find("meta", attrs={"name": self.pdf_meta_name})
+            if pdf_meta is not None:
+                # Cast to Tag to help type checker understand this is a BeautifulSoup Tag object
+                meta_tag = cast(Tag, pdf_meta)
+                content = meta_tag.get("content")
+                if content:
+                    logger.info(
+                        "Found %s meta tag for %s: %s",
+                        self.pdf_meta_name,
+                        pmcid,
+                        content,
+                    )
+                    return str(content)
+
+        except requests.RequestException as e:
+            logger.info("PMC page scraping failed for %s: %s", pmcid, str(e))
+
+        return ""
+
+    def _try_direct_pmc_url(self, pmcid: str) -> str:
+        """Try direct PMC PDF URL pattern."""
+        direct_pmc_url = f"{self.direct_pmc_pdf_base_url}/{pmcid}/pdf/"
+        logger.info("Trying direct PMC PDF URL for %s: %s", pmcid, direct_pmc_url)
+
+        try:
+            response = requests.head(direct_pmc_url, timeout=self.request_timeout)
+            if response.status_code == 200:
+                logger.info("Direct PMC PDF URL works for %s", pmcid)
+                return direct_pmc_url
+        except requests.RequestException as e:
+            logger.info("Direct PMC PDF URL failed for %s: %s", pmcid, str(e))

+        return ""
+
+    def extract_paper_metadata(
+        self,
+        metadata: Dict[str, Any],
+        identifier: str,
+        pdf_result: Optional[Tuple[str, str]],
+    ) -> Dict[str, Any]:
+        """
+        Extract structured metadata from PubMed ID converter response.
+
+        Args:
+            metadata: JSON response from ID converter
+            identifier: PMID
+            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+
+        Returns:
+            Standardized paper metadata dictionary
+        """
+        if "records" not in metadata or not metadata["records"]:
+            raise RuntimeError("No records found in metadata")
+
+        record = metadata["records"][0]  # Get first (and should be only) record
+
+        # Extract basic fields from ID converter
+        pmcid = record.get("pmcid", "N/A")
+        doi = record.get("doi", "N/A")
+
+        # Handle PDF download results
+        if pdf_result:
+            temp_file_path, filename = pdf_result
+            access_type = "open_access_downloaded"
+            pdf_url = temp_file_path  # Use local temp file path
+        else:
+            temp_file_path = ""
+            filename = self.get_default_filename(identifier)
+            access_type = "abstract_only" if pmcid != "N/A" else "no_pmcid"
+            pdf_url = ""
+
+        # Note: For PubMed, we don't get title/authors from the ID converter.
+        # In a real implementation, you might want to call E-utilities for full metadata.
+        # For now, we'll use placeholders and focus on the ID conversion functionality.
+
+        return {
+            "Title": (
+                f"PubMed Article {identifier}"
+            ),  # Placeholder - would need E-utilities for real title
+            "Authors": [],  # Placeholder - would need E-utilities for real authors
+            "Abstract": "Abstract available in PubMed",  # Placeholder
+            "Publication Date": "N/A",  # Would need E-utilities for this
+            "PMID": identifier,
+            "PMCID": pmcid,
+            "DOI": doi,
+            "Journal": "N/A",  # Would need E-utilities for this
+            "URL": pdf_url,
+            "pdf_url": pdf_url,
+            "access_type": access_type,
+            "filename": filename,
+            "source": "pubmed",
+            "temp_file_path": temp_file_path,
+        }
+
+    def get_service_name(self) -> str:
+        """Return service name."""
+        return "PubMed"
+
+    def get_identifier_name(self) -> str:
+        """Return identifier display name."""
+        return "PMID"
+
+    def get_default_filename(self, identifier: str) -> str:
+        """Generate default filename for PubMed paper."""
+        return f"pmid_{identifier}.pdf"
+
+    def get_snippet(self, abstract: str) -> str:
+        """Override to handle PubMed-specific abstract placeholder."""
+        if (
+            not abstract
+            or abstract == "N/A"
+            or abstract == "Abstract available in PubMed"
+        ):
+            return ""
+        return super().get_snippet(abstract)
+
+    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
+        """Get PubMed-specific identifier info for paper summary."""
+        pmid = paper.get("PMID", "N/A")
+        pmcid = paper.get("PMCID", "N/A")
+
+        info = f" (PMID: {pmid})"
+        if pmcid != "N/A":
+            info += f"\n   PMCID: {pmcid}"
+
+        return info
+
+    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
+        """Add PMID and PubMed-specific fields to entry."""
+        entry["PMID"] = identifier
+        entry["PMCID"] = "N/A"
+        entry["DOI"] = "N/A"
+        entry["Journal"] = "N/A"