aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""
+MedRxiv paper downloader implementation.
+"""
+
+import logging
+from typing import Any, Dict, Optional, Tuple
+
+import requests
+
+from .base_paper_downloader import BasePaperDownloader
+
+logger = logging.getLogger(__name__)
+
+
+class MedrxivDownloader(BasePaperDownloader):
+    """MedRxiv-specific implementation of paper downloader."""
+
+    def __init__(self, config: Any):
+        """Initialize MedRxiv downloader with configuration."""
+        super().__init__(config)
+        self.api_url = config.api_url
+        self.pdf_url_template = getattr(
+            config,
+            "pdf_url_template",
+            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf",
+        )
+        self.default_version = getattr(config, "default_version", "1")
+
+    def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
+        """
+        Fetch paper metadata from medRxiv API.
+
+        Args:
+            identifier: DOI (e.g., '10.1101/2020.09.09.20191205')
+
+        Returns:
+            JSON response as dictionary from medRxiv API
+
+        Raises:
+            requests.RequestException: If API call fails
+            RuntimeError: If no collection data found in response
+        """
+        query_url = f"{self.api_url}/medrxiv/{identifier}/na/json"
+        logger.info("Fetching metadata for DOI %s from: %s", identifier, query_url)
+
+        response = requests.get(query_url, timeout=self.request_timeout)
+        response.raise_for_status()
+
+        paper_data = response.json()
+
+        if "collection" not in paper_data or not paper_data["collection"]:
+            raise RuntimeError("No collection data found in medRxiv API response")
+
+        return paper_data
+
+    def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
+        """
+        Construct PDF URL from medRxiv metadata and DOI.
+
+        Args:
+            metadata: JSON response from medRxiv API
+            identifier: DOI
+
+        Returns:
+            Constructed PDF URL string
+        """
+        if "collection" not in metadata or not metadata["collection"]:
+            return ""
+
+        paper = metadata["collection"][0]  # Get first (and should be only) paper
+        version = paper.get("version", self.default_version)
+
+        # Construct medRxiv PDF URL using template
+        pdf_url = self.pdf_url_template.format(identifier=identifier, version=version)
+        logger.info("Constructed PDF URL for DOI %s: %s", identifier, pdf_url)
+
+        return pdf_url
+
+    def extract_paper_metadata(
+        self,
+        metadata: Dict[str, Any],
+        identifier: str,
+        pdf_result: Optional[Tuple[str, str]],
+    ) -> Dict[str, Any]:
+        """
+        Extract structured metadata from medRxiv API response.
+
+        Args:
+            metadata: JSON response from medRxiv API
+            identifier: DOI
+            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+
+        Returns:
+            Standardized paper metadata dictionary
+        """
+        if "collection" not in metadata or not metadata["collection"]:
+            raise RuntimeError("No collection data found in metadata")
+
+        paper = metadata["collection"][0]  # Get first (and should be only) paper
+
+        # Extract basic metadata
+        basic_metadata = self._extract_basic_metadata(paper, identifier)
+
+        # Handle PDF download results
+        pdf_metadata = self._extract_pdf_metadata(pdf_result, identifier)
+
+        # Combine all metadata
+        return {
+            **basic_metadata,
+            **pdf_metadata,
+        }
+
+    def _extract_basic_metadata(
+        self, paper: Dict[str, Any], identifier: str
+    ) -> Dict[str, Any]:
+        """Extract basic metadata from paper data."""
+        # Extract basic fields
+        title = paper.get("title", "N/A").strip()
+        abstract = paper.get("abstract", "N/A").strip()
+        pub_date = paper.get("date", "N/A").strip()
+        category = paper.get("category", "N/A").strip()
+        version = paper.get("version", "N/A")
+
+        # Extract authors - typically in a semicolon-separated string
+        authors = self._extract_authors(paper.get("authors", ""))
+
+        return {
+            "Title": title,
+            "Authors": authors,
+            "Abstract": abstract,
+            "Publication Date": pub_date,
+            "DOI": identifier,
+            "Category": category,
+            "Version": version,
+            "source": "medrxiv",
+            "server": "medrxiv",
+        }
+
+    def _extract_authors(self, authors_str: str) -> list:
+        """Extract and clean authors from semicolon-separated string."""
+        if not authors_str:
+            return []
+        return [author.strip() for author in authors_str.split(";") if author.strip()]
+
+    def _extract_pdf_metadata(
+        self, pdf_result: Optional[Tuple[str, str]], identifier: str
+    ) -> Dict[str, Any]:
+        """Extract PDF-related metadata."""
+        if pdf_result:
+            temp_file_path, filename = pdf_result
+            return {
+                "URL": temp_file_path,
+                "pdf_url": temp_file_path,
+                "filename": filename,
+                "access_type": "open_access_downloaded",
+                "temp_file_path": temp_file_path,
+            }
+
+        return {
+            "URL": "",
+            "pdf_url": "",
+            "filename": self.get_default_filename(identifier),
+            "access_type": "download_failed",
+            "temp_file_path": "",
+        }
+
+    def get_service_name(self) -> str:
+        """Return service name."""
+        return "medRxiv"
+
+    def get_identifier_name(self) -> str:
+        """Return identifier display name."""
+        return "DOI"
+
+    def get_default_filename(self, identifier: str) -> str:
+        """Generate default filename for medRxiv paper."""
+        # Sanitize DOI for filename use
+        return f"{identifier.replace('/', '_').replace('.', '_')}.pdf"
+
+    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
+        """Get medRxiv-specific identifier info for paper summary."""
+        doi = paper.get("DOI", "N/A")
+        pub_date = paper.get("Publication Date", "N/A")
+        category = paper.get("Category", "N/A")
+
+        info = f" (DOI:{doi}, {pub_date})"
+        if category != "N/A":
+            info += f"\n Category: {category}"
+
+        return info
+
+    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
+        """Add DOI and medRxiv-specific fields to entry."""
+        entry["DOI"] = identifier
+        entry["Category"] = "N/A"
+        entry["Version"] = "N/A"
+        entry["server"] = "medrxiv"
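The hunk above hinges on one idea: medRxiv PDF links are fully determined by DOI and version, so no scraping is needed. A minimal sketch of what `construct_pdf_url` computes, using the default template from the diff; the `collection` payload below is an illustrative stand-in for the real API response, not actual output:

```python
# Sketch only: mirrors the logic of MedrxivDownloader.construct_pdf_url.
# The metadata dict is an assumed example of the medRxiv "collection" payload.
template = "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
identifier = "10.1101/2020.09.09.20191205"  # a DOI, as in the docstring above
metadata = {"collection": [{"version": "2"}]}  # first entry is the paper

paper = metadata["collection"][0]
version = paper.get("version", "1")  # "1" mirrors the default_version fallback
print(template.format(identifier=identifier, version=version))
# https://www.medrxiv.org/content/10.1101/2020.09.09.20191205v2.full.pdf
```

Because both `pdf_url_template` and `default_version` are read with `getattr`, a deployment can override them from the config without code changes.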
aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python3
+"""
+PubMed paper downloader implementation.
+"""
+
+import logging
+import xml.etree.ElementTree as ET
+from typing import Any, Dict, Optional, Tuple, cast
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from .base_paper_downloader import BasePaperDownloader
+
+logger = logging.getLogger(__name__)
+
+
+class PubmedDownloader(BasePaperDownloader):
+    """PubMed-specific implementation of paper downloader."""
+
+    def __init__(self, config: Any):
+        """Initialize PubMed downloader with configuration."""
+        super().__init__(config)
+        self.id_converter_url = config.id_converter_url
+        self.oa_api_url = config.oa_api_url
+
+        # Alternative PDF sources
+        self.europe_pmc_base_url = config.europe_pmc_base_url
+        self.pmc_page_base_url = config.pmc_page_base_url
+        self.direct_pmc_pdf_base_url = config.direct_pmc_pdf_base_url
+
+        # URL conversion for NCBI FTP links
+        self.ftp_base_url = config.ftp_base_url
+        self.https_base_url = config.https_base_url
+        # Configuration values
+        self.id_converter_format = getattr(config, "id_converter_format", "json")
+        self.pdf_meta_name = getattr(config, "pdf_meta_name", "citation_pdf_url")
+        self.default_error_code = getattr(config, "default_error_code", "unknown")
+
+    def fetch_metadata(self, identifier: str) -> Dict[str, Any]:
+        """
+        Fetch paper metadata from PubMed ID Converter API.
+
+        Args:
+            identifier: PMID (e.g., '12345678')
+
+        Returns:
+            JSON response from PMC ID Converter API
+
+        Raises:
+            requests.RequestException: If API call fails
+            RuntimeError: If no records found in response
+        """
+        query_url = f"{self.id_converter_url}?ids={identifier}&format={self.id_converter_format}"
+        logger.info(
+            "Fetching metadata from ID converter for PMID %s: %s", identifier, query_url
+        )
+
+        response = requests.get(query_url, timeout=self.request_timeout)
+        response.raise_for_status()
+
+        result = response.json()
+        logger.info("ID converter response for PMID %s: %s", identifier, result)
+
+        if "records" not in result or not result["records"]:
+            raise RuntimeError("No records found in PMC ID Converter API response")
+
+        return result
+
+    def construct_pdf_url(self, metadata: Dict[str, Any], identifier: str) -> str:
+        """
+        Construct PDF URL using multiple fallback strategies.
+
+        Args:
+            metadata: JSON response from ID converter
+            identifier: PMID
+
+        Returns:
+            PDF URL string (empty if no PDF available)
+        """
+        if "records" not in metadata or not metadata["records"]:
+            return ""
+
+        record = metadata["records"][0]
+        pmcid = record.get("pmcid", "")
+
+        if not pmcid or pmcid == "N/A":
+            logger.info("No PMCID available for PDF fetch: PMID %s", identifier)
+            return ""
+
+        return self._fetch_pdf_url_with_fallbacks(pmcid)
+
+    def _fetch_pdf_url_with_fallbacks(self, pmcid: str) -> str:
+        """
+        Fetch PDF URL from OA API with comprehensive fallback strategies.
+
+        Args:
+            pmcid: PMC ID (e.g., 'PMC1234567')
+
+        Returns:
+            PDF URL string (empty if all strategies fail)
+        """
+        logger.info("Fetching PDF URL for PMCID: %s", pmcid)
+
+        # Strategy 1: Official OA API (fastest when it works)
+        pdf_url = self._try_oa_api(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        # Strategy 2: Europe PMC Service (most reliable fallback)
+        pdf_url = self._try_europe_pmc(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        # Strategy 3: Scrape PMC page for citation_pdf_url meta tag
+        pdf_url = self._try_pmc_page_scraping(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        # Strategy 4: Direct PMC PDF URL pattern (least reliable)
+        pdf_url = self._try_direct_pmc_url(pmcid)
+        if pdf_url:
+            return pdf_url
+
+        logger.warning("All PDF URL strategies failed for PMCID: %s", pmcid)
+        return ""
+
+    def _try_oa_api(self, pmcid: str) -> str:
+        """Try to get PDF URL from official OA API."""
+        query_url = f"{self.oa_api_url}?id={pmcid}"
+        logger.info("Trying OA API for PMCID %s: %s", pmcid, query_url)
+
+        try:
+            response = requests.get(query_url, timeout=self.request_timeout)
+            response.raise_for_status()
+
+            logger.info("OA API response for PMCID %s: %s", pmcid, response.text[:500])
+
+            # Parse XML response
+
+            root = ET.fromstring(response.text)
+
+            # Check for error first
+            error_elem = root.find(".//error")
+            if error_elem is not None:
+                error_code = error_elem.get("code", self.default_error_code)
+                error_text = error_elem.text or "unknown error"
+                logger.info(
+                    "OA API error for PMCID %s: %s - %s", pmcid, error_code, error_text
+                )
+                return ""
+
+            # Look for PDF link
+            pdf_link = root.find(".//link[@format='pdf']")
+            if pdf_link is not None:
+                pdf_url = pdf_link.get("href", "")
+                logger.info(
+                    "Found PDF URL from OA API for PMCID %s: %s", pmcid, pdf_url
+                )
+
+                # Convert FTP links to HTTPS for download compatibility
+                if pdf_url.startswith(self.ftp_base_url):
+                    pdf_url = pdf_url.replace(self.ftp_base_url, self.https_base_url)
+                    logger.info("Converted FTP to HTTPS for %s: %s", pmcid, pdf_url)
+
+                return pdf_url
+
+        except requests.RequestException as e:
+            logger.info("OA API failed for %s: %s", pmcid, str(e))
+
+        return ""
+
+    def _try_europe_pmc(self, pmcid: str) -> str:
+        """Try Europe PMC service for PDF."""
+        europe_pmc_url = f"{self.europe_pmc_base_url}?accid={pmcid}&blobtype=pdf"
+        logger.info("Trying Europe PMC service for %s: %s", pmcid, europe_pmc_url)
+
+        try:
+            response = requests.head(europe_pmc_url, timeout=self.request_timeout)
+            if response.status_code == 200:
+                logger.info("Europe PMC service works for %s", pmcid)
+                return europe_pmc_url
+        except requests.RequestException as e:
+            logger.info("Europe PMC service failed for %s: %s", pmcid, str(e))
+
+        return ""
+
+    def _try_pmc_page_scraping(self, pmcid: str) -> str:
+        """Try scraping PMC page for PDF meta tag."""
+        pmc_page_url = f"{self.pmc_page_base_url}/{pmcid}/"
+        logger.info(
+            "Scraping PMC page for PDF meta tag for %s: %s", pmcid, pmc_page_url
+        )
+
+        try:
+            headers = {"User-Agent": self.user_agent}
+            response = requests.get(
+                pmc_page_url, headers=headers, timeout=self.request_timeout
+            )
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, "html.parser")
+
+            # Look for PDF meta tag
+            pdf_meta = soup.find("meta", attrs={"name": self.pdf_meta_name})
+            if pdf_meta is not None:
+                # Cast to Tag to help type checker understand this is a BeautifulSoup Tag object
+                meta_tag = cast(Tag, pdf_meta)
+                content = meta_tag.get("content")
+                if content:
+                    logger.info(
+                        "Found %s meta tag for %s: %s",
+                        self.pdf_meta_name,
+                        pmcid,
+                        content,
+                    )
+                    return str(content)
+
+        except requests.RequestException as e:
+            logger.info("PMC page scraping failed for %s: %s", pmcid, str(e))
+
+        return ""
+
+    def _try_direct_pmc_url(self, pmcid: str) -> str:
+        """Try direct PMC PDF URL pattern."""
+        direct_pmc_url = f"{self.direct_pmc_pdf_base_url}/{pmcid}/pdf/"
+        logger.info("Trying direct PMC PDF URL for %s: %s", pmcid, direct_pmc_url)
+
+        try:
+            response = requests.head(direct_pmc_url, timeout=self.request_timeout)
+            if response.status_code == 200:
+                logger.info("Direct PMC PDF URL works for %s", pmcid)
+                return direct_pmc_url
+        except requests.RequestException as e:
+            logger.info("Direct PMC PDF URL failed for %s: %s", pmcid, str(e))
+
+        return ""
+
+    def extract_paper_metadata(
+        self,
+        metadata: Dict[str, Any],
+        identifier: str,
+        pdf_result: Optional[Tuple[str, str]],
+    ) -> Dict[str, Any]:
+        """
+        Extract structured metadata from PubMed ID converter response.
+
+        Args:
+            metadata: JSON response from ID converter
+            identifier: PMID
+            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+
+        Returns:
+            Standardized paper metadata dictionary
+        """
+        if "records" not in metadata or not metadata["records"]:
+            raise RuntimeError("No records found in metadata")
+
+        record = metadata["records"][0]  # Get first (and should be only) record
+
+        # Extract basic fields from ID converter
+        pmcid = record.get("pmcid", "N/A")
+        doi = record.get("doi", "N/A")
+
+        # Handle PDF download results
+        if pdf_result:
+            temp_file_path, filename = pdf_result
+            access_type = "open_access_downloaded"
+            pdf_url = temp_file_path  # Use local temp file path
+        else:
+            temp_file_path = ""
+            filename = self.get_default_filename(identifier)
+            access_type = "abstract_only" if pmcid != "N/A" else "no_pmcid"
+            pdf_url = ""
+
+        # Note: For PubMed, we don't get title/authors from ID converter
+        # In a real implementation, you might want to call E-utilities for full metadata
+        # For now, we'll use placeholders and focus on the ID conversion functionality
+
+        return {
+            "Title": (
+                f"PubMed Article {identifier}"
+            ),  # Placeholder - would need E-utilities for real title
+            "Authors": [],  # Placeholder - would need E-utilities for real authors
+            "Abstract": "Abstract available in PubMed",  # Placeholder
+            "Publication Date": "N/A",  # Would need E-utilities for this
+            "PMID": identifier,
+            "PMCID": pmcid,
+            "DOI": doi,
+            "Journal": "N/A",  # Would need E-utilities for this
+            "URL": pdf_url,
+            "pdf_url": pdf_url,
+            "access_type": access_type,
+            "filename": filename,
+            "source": "pubmed",
+            "temp_file_path": temp_file_path,
+        }
+
+    def get_service_name(self) -> str:
+        """Return service name."""
+        return "PubMed"
+
+    def get_identifier_name(self) -> str:
+        """Return identifier display name."""
+        return "PMID"
+
+    def get_default_filename(self, identifier: str) -> str:
+        """Generate default filename for PubMed paper."""
+        return f"pmid_{identifier}.pdf"
+
+    def get_snippet(self, abstract: str) -> str:
+        """Override to handle PubMed-specific abstract placeholder."""
+        if (
+            not abstract
+            or abstract == "N/A"
+            or abstract == "Abstract available in PubMed"
+        ):
+            return ""
+        return super().get_snippet(abstract)
+
+    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
+        """Get PubMed-specific identifier info for paper summary."""
+        pmid = paper.get("PMID", "N/A")
+        pmcid = paper.get("PMCID", "N/A")
+
+        info = f" (PMID: {pmid})"
+        if pmcid != "N/A":
+            info += f"\n PMCID: {pmcid}"
+
+        return info
+
+    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
+        """Add PMID and PubMed-specific fields to entry."""
+        entry["PMID"] = identifier
+        entry["PMCID"] = "N/A"
+        entry["DOI"] = "N/A"
+        entry["Journal"] = "N/A"
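A detail in `_try_oa_api` that is easy to miss: the OA service can return `ftp://` links, which `requests` cannot fetch, so the code rewrites them onto the corresponding HTTPS host before download. A sketch of that rewrite, with assumed values for the two config fields (the shipped values are presumably defined in `talk2scholars/configs/tools/paper_download/default.yaml`):

```python
# Sketch only: the FTP-to-HTTPS rewrite performed inside _try_oa_api.
# Both base URLs are assumed config values, not taken from the diff.
ftp_base_url = "ftp://ftp.ncbi.nlm.nih.gov"      # assumed config.ftp_base_url
https_base_url = "https://ftp.ncbi.nlm.nih.gov"  # assumed config.https_base_url

pdf_url = ftp_base_url + "/pub/pmc/oa_pdf/ab/cd/example.PMC1234567.pdf"
if pdf_url.startswith(ftp_base_url):
    pdf_url = pdf_url.replace(ftp_base_url, https_base_url)

print(pdf_url)
# https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ab/cd/example.PMC1234567.pdf
```

If all four strategies fail, `construct_pdf_url` returns an empty string and `extract_paper_metadata` marks the entry `abstract_only` (or `no_pmcid` when the ID converter has no PMC mapping).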