scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__version__.py +1 -1
- scitex/browser/__init__.py +53 -0
- scitex/browser/auth/__init__.py +35 -0
- scitex/browser/auth/google.py +381 -0
- scitex/browser/collaboration/__init__.py +5 -0
- scitex/browser/debugging/__init__.py +56 -0
- scitex/browser/debugging/_failure_capture.py +372 -0
- scitex/browser/debugging/_sync_session.py +259 -0
- scitex/browser/debugging/_test_monitor.py +284 -0
- scitex/browser/debugging/_visual_cursor.py +432 -0
- scitex/scholar/citation_graph/README.md +117 -0
- scitex/scholar/citation_graph/__init__.py +29 -0
- scitex/scholar/citation_graph/builder.py +214 -0
- scitex/scholar/citation_graph/database.py +246 -0
- scitex/scholar/citation_graph/example.py +96 -0
- scitex/scholar/citation_graph/models.py +80 -0
- scitex/scholar/config/ScholarConfig.py +23 -3
- scitex/scholar/config/default.yaml +56 -0
- scitex/scholar/core/Paper.py +102 -0
- scitex/scholar/core/__init__.py +44 -0
- scitex/scholar/core/journal_normalizer.py +524 -0
- scitex/scholar/core/oa_cache.py +285 -0
- scitex/scholar/core/open_access.py +457 -0
- scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
- scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
- scitex/scholar/pdf_download/strategies/__init__.py +6 -0
- scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
- scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -33,6 +33,7 @@ from scitex.scholar.pdf_download.strategies import (
|
|
|
33
33
|
try_download_direct_async,
|
|
34
34
|
try_download_manual_async,
|
|
35
35
|
try_download_response_body_async,
|
|
36
|
+
try_download_open_access_async,
|
|
36
37
|
)
|
|
37
38
|
|
|
38
39
|
logger = logging.getLogger(__name__)
|
|
@@ -65,6 +66,17 @@ class ScholarPDFDownloader:
|
|
|
65
66
|
self.context = context
|
|
66
67
|
self.output_dir = self.config.get_library_downloads_dir()
|
|
67
68
|
|
|
69
|
+
# Load access preferences from config
|
|
70
|
+
self.prefer_open_access = self.config.resolve(
|
|
71
|
+
"prefer_open_access", default=True, type=bool
|
|
72
|
+
)
|
|
73
|
+
self.enable_paywall_access = self.config.resolve(
|
|
74
|
+
"enable_paywall_access", default=False, type=bool
|
|
75
|
+
)
|
|
76
|
+
self.track_paywall_attempts = self.config.resolve(
|
|
77
|
+
"track_paywall_attempts", default=True, type=bool
|
|
78
|
+
)
|
|
79
|
+
|
|
68
80
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
69
81
|
pass
|
|
70
82
|
|
|
@@ -130,6 +142,131 @@ class ScholarPDFDownloader:
|
|
|
130
142
|
)
|
|
131
143
|
return saved_paths
|
|
132
144
|
|
|
145
|
+
async def download_open_access(
    self,
    oa_url: str,
    output_path: Union[str, Path],
    metadata: Optional[dict] = None,
) -> Optional[Path]:
    """Fetch a PDF directly from an Open Access URL.

    No browser automation is involved: the transfer is delegated to
    ``try_download_open_access_async``, which performs a plain HTTP
    download. This method only normalizes the destination path, makes
    sure its parent directory exists, and logs the outcome.

    Args:
        oa_url: Open Access URL (from paper.metadata.access.oa_url).
            A falsy value short-circuits to ``None``.
        output_path: Destination for the PDF; ``.pdf`` is appended when
            the path does not already end with it.
        metadata: Optional paper metadata, forwarded to the download
            helper unchanged.

    Returns:
        Whatever the download helper returns: the saved path on success,
        ``None`` otherwise.
    """
    if not oa_url:
        logger.debug(f"{self.name}: No OA URL provided")
        return None

    # Normalize the destination: Path object, ".pdf" suffix, parent present.
    target = Path(output_path) if isinstance(output_path, str) else output_path
    if not str(target).endswith(".pdf"):
        target = Path(str(target) + ".pdf")
    target.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"{self.name}: Attempting OA download from {oa_url[:60]}...")

    saved = await try_download_open_access_async(
        oa_url=oa_url,
        output_path=target,
        metadata=metadata,
        func_name=self.name,
    )

    if not saved:
        logger.debug(f"{self.name}: OA download failed, may need browser-based download")
    else:
        logger.info(f"{self.name}: Successfully downloaded OA PDF to {saved}")

    return saved
|
|
190
|
+
|
|
191
|
+
async def download_smart(
    self,
    paper,
    output_path: Union[str, Path],
) -> Optional[Path]:
    """Download a paper's PDF, picking strategies from its metadata.

    Strategies are tried in this order and the first success is returned:

    1. The Open Access URL, when ``self.prefer_open_access`` is set and the
       paper's access metadata carries an ``oa_url``.
    2. Every entry of the paper's ``url.pdfs`` list, via
       ``self.download_from_url``.
    3. The DOI resolver URL (``https://doi.org/<doi>``), only when
       ``self.enable_paywall_access`` is set, the paper is not flagged as
       Open Access, and a DOI is known.

    When ``self.track_paywall_attempts`` is set and access metadata exists,
    ``paywall_bypass_attempted`` / ``paywall_bypass_success`` are updated on
    that metadata object to reflect what actually happened.

    Args:
        paper: Paper object exposing ``.metadata`` (from
            scitex.scholar.core.Paper), or the metadata object itself.
        output_path: Destination for the PDF; ``.pdf`` is appended when the
            path does not already end with it.

    Returns:
        Path to the downloaded PDF if any strategy succeeded, None otherwise.
    """
    if isinstance(output_path, str):
        output_path = Path(output_path)
    if not str(output_path).endswith(".pdf"):
        output_path = Path(str(output_path) + ".pdf")

    # Accept either a Paper (with .metadata) or a bare metadata object.
    meta = paper.metadata if hasattr(paper, 'metadata') else paper
    access = getattr(meta, 'access', None)
    url_meta = getattr(meta, 'url', None)
    id_meta = getattr(meta, 'id', None)

    is_open_access = getattr(access, 'is_open_access', False) if access else False
    oa_url = getattr(access, 'oa_url', None) if access else None
    # `pdfs` may be present but None; treat that like an empty list so the
    # loop below cannot fail on a non-iterable.
    pdf_urls = (getattr(url_meta, 'pdfs', None) if url_meta else None) or []
    doi = getattr(id_meta, 'doi', None) if id_meta else None

    # Access metadata is only mutated when it exists and tracking is on.
    track_access = bool(access and self.track_paywall_attempts)

    logger.info(f"{self.name}: Smart download for DOI={doi}, OA={is_open_access}")

    # Strategy 1: Try Open Access if available
    if self.prefer_open_access and oa_url:
        logger.info(f"{self.name}: Trying Open Access URL first")
        result = await self.download_open_access(oa_url, output_path)
        if result:
            # Record that the PDF was obtained without any paywall attempt.
            if track_access:
                access.paywall_bypass_attempted = False
            return result

    # Strategy 2: Try available PDF URLs
    for pdf_entry in pdf_urls:
        # Entries are either {'url': ..., 'source': ...} dicts or plain URLs.
        pdf_url = pdf_entry.get('url') if isinstance(pdf_entry, dict) else pdf_entry
        if not pdf_url:
            continue
        logger.info(f"{self.name}: Trying PDF URL: {pdf_url[:60]}...")
        result = await self.download_from_url(pdf_url, output_path, doi=doi)
        if result:
            return result

    # Strategy 3: Try paywall access if enabled
    if self.enable_paywall_access and not is_open_access:
        if doi:
            logger.info(f"{self.name}: Attempting paywall access (opt-in enabled)")
            # Only flag an attempt when a request is actually going to be made.
            if track_access:
                access.paywall_bypass_attempted = True

            doi_url = f"https://doi.org/{doi}"
            result = await self.download_from_url(doi_url, output_path, doi=doi)
            if track_access:
                access.paywall_bypass_success = bool(result)
            if result:
                return result
        else:
            logger.debug(f"{self.name}: Paywall access enabled but no DOI available, skipping")

    logger.warning(f"{self.name}: All download strategies exhausted for DOI={doi}")
    return None
|
|
269
|
+
|
|
133
270
|
async def download_from_url(
|
|
134
271
|
self,
|
|
135
272
|
pdf_url: str,
|
|
@@ -11,6 +11,10 @@ from .chrome_pdf_viewer import try_download_chrome_pdf_viewer_async
|
|
|
11
11
|
from .direct_download import try_download_direct_async
|
|
12
12
|
from .response_body import try_download_response_body_async
|
|
13
13
|
from .manual_download_fallback import try_download_manual_async
|
|
14
|
+
from .open_access_download import (
|
|
15
|
+
try_download_open_access_async,
|
|
16
|
+
try_download_open_access_sync,
|
|
17
|
+
)
|
|
14
18
|
|
|
15
19
|
# Manual download utilities
|
|
16
20
|
from .manual_download_utils import (
|
|
@@ -27,6 +31,8 @@ __all__ = [
|
|
|
27
31
|
"try_download_direct_async",
|
|
28
32
|
"try_download_response_body_async",
|
|
29
33
|
"try_download_manual_async",
|
|
34
|
+
"try_download_open_access_async",
|
|
35
|
+
"try_download_open_access_sync",
|
|
30
36
|
# Manual download utilities
|
|
31
37
|
"DownloadMonitorAndSync",
|
|
32
38
|
"FlexibleFilenameGenerator",
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# File: ./src/scitex/scholar/pdf_download/strategies/open_access_download.py
|
|
4
|
+
"""
|
|
5
|
+
Open Access PDF Download Strategy.
|
|
6
|
+
|
|
7
|
+
Downloads PDFs from known Open Access sources with appropriate handling
|
|
8
|
+
for each source type (arXiv, PubMed Central, OpenAlex OA URLs, etc.).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional, Dict, Any
|
|
13
|
+
import aiohttp
|
|
14
|
+
|
|
15
|
+
from scitex import logging
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Known OA source patterns and their handlers
|
|
21
|
+
def _split_url(url):
    """Parse *url* into its components, or return None if it cannot be parsed."""
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    try:
        return urlsplit(url)
    except (ValueError, TypeError, AttributeError):
        return None


def _passthrough_pdf_url(url):
    """Return *url* unchanged (sources whose links need no rewriting)."""
    return url


def _arxiv_pdf_url(url):
    """Rewrite an arXiv abstract URL (``/abs/<id>``) to its PDF URL.

    Only the path is rewritten, so a query string, fragment or trailing
    slash on the abstract URL no longer ends up in front of the ``.pdf``
    suffix. URLs without ``/abs/`` in their path are returned unchanged.
    """
    parts = _split_url(url)
    if parts is None or '/abs/' not in parts.path:
        return url
    path = parts.path.replace('/abs/', '/pdf/', 1).rstrip('/')
    if not path.endswith('.pdf'):
        path += '.pdf'
    return parts._replace(path=path, query='', fragment='').geturl()


# Page-view suffixes that may trail a bioRxiv/medRxiv content URL; they are
# removed before '.full.pdf' is appended (longest first so '.full' does not
# shadow '.full.pdf+html').
_BIORXIV_VIEW_SUFFIXES = ('.full.pdf+html', '.full-text', '.abstract', '.full')


def _biorxiv_pdf_url(url):
    """Rewrite a bioRxiv/medRxiv content URL to its ``.full.pdf`` URL.

    A URL whose path already ends in ``.pdf`` is returned unchanged.
    Otherwise a trailing page-view suffix (e.g. ``.full``) is stripped so
    the result is ``<content>.full.pdf`` rather than ``<content>.full.full.pdf``,
    and any query string or fragment is dropped.
    """
    parts = _split_url(url)
    if parts is None:
        return url
    path = parts.path.rstrip('/')
    if path.endswith('.pdf'):
        return url
    for suffix in _BIORXIV_VIEW_SUFFIXES:
        if path.endswith(suffix):
            path = path[:-len(suffix)]
            break
    return parts._replace(path=path + '.full.pdf', query='', fragment='').geturl()


# Known OA sources. For each source:
#   'patterns'      - lowercase substrings looked for in the URL's host + path
#   'pdf_transform' - callable mapping a source URL to a direct PDF URL
OA_SOURCE_PATTERNS = {
    'arxiv': {
        'patterns': ['arxiv.org'],
        'pdf_transform': _arxiv_pdf_url,
    },
    'pmc': {
        'patterns': ['ncbi.nlm.nih.gov/pmc', 'europepmc.org'],
        'pdf_transform': _passthrough_pdf_url,  # PMC links are usually direct
    },
    'biorxiv': {
        'patterns': ['biorxiv.org', 'medrxiv.org'],
        'pdf_transform': _biorxiv_pdf_url,
    },
    'doaj': {
        'patterns': ['doaj.org'],
        'pdf_transform': _passthrough_pdf_url,
    },
    'zenodo': {
        'patterns': ['zenodo.org'],
        'pdf_transform': _passthrough_pdf_url,
    },
}


def _identify_oa_source(url: str) -> Optional[str]:
    """Identify which OA source a URL belongs to.

    Patterns are matched case-insensitively against the URL's host and path
    only, so a source name that merely appears in a query string or fragment
    (e.g. ``?ref=arxiv.org``) does not cause a match.

    Args:
        url: URL to classify. Empty or ``None`` yields ``None``.

    Returns:
        The matching key of ``OA_SOURCE_PATTERNS``, or ``None`` if no
        pattern matches.
    """
    if not url:
        return None

    parts = _split_url(url)
    if parts is None:
        # Unparseable URL: fall back to matching against the raw text.
        haystack = str(url).lower()
    else:
        # For scheme-less input the host ends up in `path`, so the
        # concatenation still contains it.
        haystack = (parts.netloc + parts.path).lower()

    for source_name, config in OA_SOURCE_PATTERNS.items():
        if any(pattern in haystack for pattern in config['patterns']):
            return source_name
    return None


def _transform_to_pdf_url(url: str, source: str) -> str:
    """Transform URL to a direct PDF URL based on its source.

    Args:
        url: URL as reported by the metadata provider.
        source: Key of ``OA_SOURCE_PATTERNS`` (as returned by
            ``_identify_oa_source``). Unknown or ``None`` sources leave
            the URL untouched.

    Returns:
        The rewritten URL, or *url* itself when no transform applies.
    """
    config = OA_SOURCE_PATTERNS.get(source)
    if config is None:
        return url
    return config['pdf_transform'](url)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
async def try_download_open_access_async(
    oa_url: str,
    output_path: Path,
    metadata: Optional[Dict[str, Any]] = None,
    func_name: str = "try_download_open_access_async",
    timeout: int = 60,
) -> Optional[Path]:
    """
    Download PDF from an Open Access URL.

    The URL is first classified with ``_identify_oa_source`` and, for known
    sources, rewritten to a direct PDF link with ``_transform_to_pdf_url``.
    The document is then fetched with a single HTTP GET, validated
    (HTTP 200, ``%PDF-`` magic bytes, at least 1000 bytes) and written to
    ``output_path``. Every failure is logged and reported as ``None``;
    no exception escapes this function.

    Args:
        oa_url: Open Access URL (from OpenAlex oa_url, arXiv, PMC, etc.)
        output_path: Path to save the downloaded PDF
        metadata: Optional paper metadata (accepted for interface
            compatibility; not used by this function)
        func_name: Name used as the prefix of log messages
        timeout: Total download timeout in seconds

    Returns:
        Path to downloaded PDF if successful, None otherwise
    """
    # Function-scope import: asyncio.TimeoutError / IncompleteReadError are
    # needed below and this keeps the block self-contained.
    import asyncio

    if not oa_url:
        logger.debug(f"{func_name}: No OA URL provided")
        return None

    # Identify source and transform URL if needed
    source = _identify_oa_source(oa_url)
    pdf_url = _transform_to_pdf_url(oa_url, source) if source else oa_url

    logger.info(f"{func_name}: Attempting OA download from {source or 'unknown'}: {pdf_url[:80]}...")

    headers = {
        'User-Agent': 'SciTeX/1.0 (Academic Research Tool; mailto:contact@scitex.io)',
        'Accept': 'application/pdf,*/*',
    }

    try:
        # Create output directory if needed
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        async with aiohttp.ClientSession() as session:
            async with session.get(
                pdf_url,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=timeout),
            ) as response:
                if response.status != 200:
                    logger.warning(f"{func_name}: HTTP {response.status} from {pdf_url}")
                    return None

                content_type = response.headers.get('Content-Type', '')

                if 'pdf' not in content_type.lower() and not pdf_url.endswith('.pdf'):
                    # Neither the header nor the URL promises a PDF: peek at
                    # the magic bytes before pulling the whole body.
                    # readexactly() is used because a plain read(5) may
                    # legitimately return fewer than 5 bytes mid-stream.
                    try:
                        first_bytes = await response.content.readexactly(5)
                    except asyncio.IncompleteReadError as short_read:
                        first_bytes = short_read.partial
                    if first_bytes != b'%PDF-':
                        logger.warning(f"{func_name}: Response is not a PDF (content-type: {content_type})")
                        return None
                    # Prepend the bytes already consumed by the peek.
                    content = first_bytes + await response.content.read()
                else:
                    content = await response.read()

        # Validate PDF content
        if len(content) < 1000:  # PDF should be at least 1KB
            logger.warning(f"{func_name}: Downloaded content too small ({len(content)} bytes)")
            return None

        if not content.startswith(b'%PDF-'):
            logger.warning(f"{func_name}: Downloaded content is not a valid PDF")
            return None

        # Save to file
        with open(output_path, 'wb') as f:
            f.write(content)

        size_mb = len(content) / 1024 / 1024
        logger.info(f"{func_name}: Successfully downloaded {size_mb:.2f} MB to {output_path}")
        return output_path

    except aiohttp.ClientError as e:
        logger.warning(f"{func_name}: HTTP client error: {e}")
        return None
    except (asyncio.TimeoutError, TimeoutError):
        # Before Python 3.11 asyncio.TimeoutError is not the builtin
        # TimeoutError, so both must be named to catch request timeouts.
        logger.warning(f"{func_name}: Download timed out after {timeout}s")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as "try the
        # next strategy", so unexpected failures are logged, not raised.
        logger.error(f"{func_name}: Download failed: {e}")
        return None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def try_download_open_access_sync(
    oa_url: str,
    output_path: Path,
    metadata: Optional[Dict[str, Any]] = None,
    timeout: int = 60,
) -> Optional[Path]:
    """
    Synchronous wrapper for try_download_open_access_async.

    The coroutine is driven to completion on a private event loop that is
    closed afterwards, so the calling thread's current event loop (if any)
    is neither replaced nor left open. When the calling thread already has
    a running loop (e.g. inside a notebook or another coroutine), the
    download runs on a short-lived worker thread instead, and this call
    blocks until it finishes.

    Args:
        oa_url: Open Access URL
        output_path: Path to save the downloaded PDF
        metadata: Optional paper metadata
        timeout: Download timeout in seconds

    Returns:
        Path to downloaded PDF if successful, None otherwise
    """
    import asyncio
    import concurrent.futures

    def _run_in_private_loop():
        # The coroutine is created here so it is bound to the loop (and
        # thread) that actually runs it.
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(
                try_download_open_access_async(oa_url, output_path, metadata, timeout=timeout)
            )
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                loop.close()

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: run directly.
        return _run_in_private_loop()

    # A loop is already running here; a second loop cannot be run on the
    # same thread, so hand the work to a worker thread and wait for it.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(_run_in_private_loop).result()
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# EOF
|
|
@@ -32,6 +32,7 @@ from datetime import datetime
|
|
|
32
32
|
|
|
33
33
|
from scitex import logging
|
|
34
34
|
from scitex.scholar.core import Paper
|
|
35
|
+
from scitex.scholar.core import normalize_journal_name
|
|
35
36
|
from scitex.scholar.search_engines.individual.PubMedSearchEngine import PubMedSearchEngine
|
|
36
37
|
from scitex.scholar.search_engines.individual.CrossRefSearchEngine import CrossRefSearchEngine
|
|
37
38
|
from scitex.scholar.search_engines.individual.ArXivSearchEngine import ArXivSearchEngine
|
|
@@ -49,6 +50,7 @@ class ScholarPipelineSearchParallel:
|
|
|
49
50
|
max_workers: int = 5,
|
|
50
51
|
timeout_per_engine: float = 30.0,
|
|
51
52
|
use_cache: bool = True,
|
|
53
|
+
email: str = None,
|
|
52
54
|
):
|
|
53
55
|
"""Initialize parallel search pipeline.
|
|
54
56
|
|
|
@@ -56,19 +58,21 @@ class ScholarPipelineSearchParallel:
|
|
|
56
58
|
max_workers: Maximum number of parallel engine queries
|
|
57
59
|
timeout_per_engine: Timeout for each engine in seconds
|
|
58
60
|
use_cache: Whether to use caching for API results
|
|
61
|
+
email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
|
|
59
62
|
"""
|
|
60
63
|
self.name = self.__class__.__name__
|
|
61
64
|
self.max_workers = max_workers
|
|
62
65
|
self.timeout_per_engine = timeout_per_engine
|
|
63
66
|
self.use_cache = use_cache
|
|
67
|
+
self.email = email or "research@scitex.io"
|
|
64
68
|
|
|
65
|
-
# Initialize search engines
|
|
69
|
+
# Initialize search engines with email for rate limit benefits
|
|
66
70
|
self.engines = {
|
|
67
|
-
'PubMed': PubMedSearchEngine(),
|
|
68
|
-
'CrossRef': CrossRefSearchEngine(),
|
|
69
|
-
'arXiv': ArXivSearchEngine(),
|
|
70
|
-
'Semantic_Scholar': SemanticScholarSearchEngine(),
|
|
71
|
-
'OpenAlex': OpenAlexSearchEngine(),
|
|
71
|
+
'PubMed': PubMedSearchEngine(email=self.email),
|
|
72
|
+
'CrossRef': CrossRefSearchEngine(email=self.email),
|
|
73
|
+
'arXiv': ArXivSearchEngine(email=self.email),
|
|
74
|
+
'Semantic_Scholar': SemanticScholarSearchEngine(email=self.email),
|
|
75
|
+
'OpenAlex': OpenAlexSearchEngine(email=self.email),
|
|
72
76
|
}
|
|
73
77
|
|
|
74
78
|
# Statistics
|
|
@@ -328,12 +332,18 @@ class ScholarPipelineSearchParallel:
|
|
|
328
332
|
if 'metrics' in result:
|
|
329
333
|
if result['metrics'].get('citation_count'):
|
|
330
334
|
paper.metadata.citation_count.total = result['metrics']['citation_count']
|
|
331
|
-
|
|
335
|
+
if 'is_open_access' in result['metrics']:
|
|
336
|
+
paper.metadata.access.is_open_access = result['metrics']['is_open_access']
|
|
337
|
+
paper.metadata.access.is_open_access_engines = [engine_name]
|
|
332
338
|
|
|
333
339
|
if 'urls' in result:
|
|
334
340
|
if result['urls'].get('pdf'):
|
|
335
341
|
# pdfs is a list of dicts with url/source keys
|
|
336
342
|
paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
|
|
343
|
+
# If this is an open access paper, also store the PDF URL as oa_url
|
|
344
|
+
if paper.metadata.access.is_open_access:
|
|
345
|
+
paper.metadata.access.oa_url = result['urls']['pdf']
|
|
346
|
+
paper.metadata.access.oa_url_engines = [engine_name]
|
|
337
347
|
if result['urls'].get('publisher'):
|
|
338
348
|
paper.metadata.url.publisher = result['urls']['publisher']
|
|
339
349
|
if result['urls'].get('doi_url'):
|
|
@@ -730,13 +740,21 @@ class ScholarPipelineSearchParallel:
|
|
|
730
740
|
|
|
731
741
|
# Publication info
|
|
732
742
|
if hasattr(meta, 'publication'):
|
|
733
|
-
|
|
743
|
+
journal_raw = meta.publication.journal or ''
|
|
744
|
+
result['journal'] = normalize_journal_name(journal_raw) if journal_raw else ''
|
|
734
745
|
result['impact_factor'] = meta.publication.impact_factor
|
|
735
746
|
|
|
736
747
|
# Metrics
|
|
737
748
|
if hasattr(meta, 'citation_count'):
|
|
738
749
|
result['citation_count'] = meta.citation_count.total or 0
|
|
739
|
-
|
|
750
|
+
|
|
751
|
+
# Access metadata
|
|
752
|
+
if hasattr(meta, 'access'):
|
|
753
|
+
result['is_open_access'] = meta.access.is_open_access or False
|
|
754
|
+
result['oa_status'] = meta.access.oa_status
|
|
755
|
+
result['oa_url'] = meta.access.oa_url
|
|
756
|
+
else:
|
|
757
|
+
result['is_open_access'] = False
|
|
740
758
|
|
|
741
759
|
# URLs
|
|
742
760
|
if hasattr(meta, 'url'):
|
|
@@ -47,22 +47,25 @@ class ScholarPipelineSearchSingle:
|
|
|
47
47
|
def __init__(
|
|
48
48
|
self,
|
|
49
49
|
use_cache: bool = True,
|
|
50
|
+
email: str = None,
|
|
50
51
|
):
|
|
51
52
|
"""Initialize sequential search pipeline.
|
|
52
53
|
|
|
53
54
|
Args:
|
|
54
55
|
use_cache: Whether to use caching for API results
|
|
56
|
+
email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
|
|
55
57
|
"""
|
|
56
58
|
self.name = self.__class__.__name__
|
|
57
59
|
self.use_cache = use_cache
|
|
60
|
+
self.email = email or "research@scitex.io"
|
|
58
61
|
|
|
59
|
-
# Initialize search engines
|
|
62
|
+
# Initialize search engines with email for rate limit benefits
|
|
60
63
|
self.engines = {
|
|
61
|
-
'PubMed': PubMedSearchEngine(),
|
|
62
|
-
'CrossRef': CrossRefSearchEngine(),
|
|
63
|
-
'arXiv': ArXivSearchEngine(),
|
|
64
|
-
'Semantic_Scholar': SemanticScholarSearchEngine(),
|
|
65
|
-
'OpenAlex': OpenAlexSearchEngine(),
|
|
64
|
+
'PubMed': PubMedSearchEngine(email=self.email),
|
|
65
|
+
'CrossRef': CrossRefSearchEngine(email=self.email),
|
|
66
|
+
'arXiv': ArXivSearchEngine(email=self.email),
|
|
67
|
+
'Semantic_Scholar': SemanticScholarSearchEngine(email=self.email),
|
|
68
|
+
'OpenAlex': OpenAlexSearchEngine(email=self.email),
|
|
66
69
|
}
|
|
67
70
|
|
|
68
71
|
# Statistics
|
|
@@ -265,12 +268,18 @@ class ScholarPipelineSearchSingle:
|
|
|
265
268
|
if 'metrics' in result:
|
|
266
269
|
if result['metrics'].get('citation_count'):
|
|
267
270
|
paper.metadata.citation_count.total = result['metrics']['citation_count']
|
|
268
|
-
|
|
271
|
+
if 'is_open_access' in result['metrics']:
|
|
272
|
+
paper.metadata.access.is_open_access = result['metrics']['is_open_access']
|
|
273
|
+
paper.metadata.access.is_open_access_engines = [engine_name]
|
|
269
274
|
|
|
270
275
|
if 'urls' in result:
|
|
271
276
|
if result['urls'].get('pdf'):
|
|
272
277
|
# pdfs is a list of dicts with url/source keys
|
|
273
278
|
paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
|
|
279
|
+
# If this is an open access paper, also store the PDF URL as oa_url
|
|
280
|
+
if paper.metadata.access.is_open_access:
|
|
281
|
+
paper.metadata.access.oa_url = result['urls']['pdf']
|
|
282
|
+
paper.metadata.access.oa_url_engines = [engine_name]
|
|
274
283
|
if result['urls'].get('publisher'):
|
|
275
284
|
paper.metadata.url.publisher = result['urls']['publisher']
|
|
276
285
|
if result['urls'].get('doi_url'):
|
|
@@ -458,7 +467,14 @@ class ScholarPipelineSearchSingle:
|
|
|
458
467
|
# Metrics
|
|
459
468
|
if hasattr(meta, 'citation_count'):
|
|
460
469
|
result['citation_count'] = meta.citation_count.total or 0
|
|
461
|
-
|
|
470
|
+
|
|
471
|
+
# Access metadata
|
|
472
|
+
if hasattr(meta, 'access'):
|
|
473
|
+
result['is_open_access'] = meta.access.is_open_access or False
|
|
474
|
+
result['oa_status'] = meta.access.oa_status
|
|
475
|
+
result['oa_url'] = meta.access.oa_url
|
|
476
|
+
else:
|
|
477
|
+
result['is_open_access'] = False
|
|
462
478
|
|
|
463
479
|
# URLs
|
|
464
480
|
if hasattr(meta, 'url'):
|
|
@@ -45,26 +45,31 @@ class ScholarSearchEngine:
|
|
|
45
45
|
self,
|
|
46
46
|
default_mode: Literal['parallel', 'single'] = 'parallel',
|
|
47
47
|
use_cache: bool = True,
|
|
48
|
+
email: str = None,
|
|
48
49
|
):
|
|
49
50
|
"""Initialize unified search engine.
|
|
50
51
|
|
|
51
52
|
Args:
|
|
52
53
|
default_mode: Default search mode ('parallel' or 'single')
|
|
53
54
|
use_cache: Whether to use caching for API results
|
|
55
|
+
email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
|
|
54
56
|
"""
|
|
55
57
|
self.name = self.__class__.__name__
|
|
56
58
|
self.default_mode = default_mode
|
|
57
59
|
self.use_cache = use_cache
|
|
60
|
+
self.email = email
|
|
58
61
|
|
|
59
|
-
# Initialize both pipeline modes
|
|
62
|
+
# Initialize both pipeline modes with email for rate limit benefits
|
|
60
63
|
self.parallel_pipeline = ScholarPipelineSearchParallel(
|
|
61
64
|
max_workers=5,
|
|
62
65
|
timeout_per_engine=30.0,
|
|
63
66
|
use_cache=use_cache,
|
|
67
|
+
email=email,
|
|
64
68
|
)
|
|
65
69
|
|
|
66
70
|
self.single_pipeline = ScholarPipelineSearchSingle(
|
|
67
71
|
use_cache=use_cache,
|
|
72
|
+
email=email,
|
|
68
73
|
)
|
|
69
74
|
|
|
70
75
|
# Statistics
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scitex
|
|
3
|
-
Version: 2.4.1
|
|
3
|
+
Version: 2.4.3
|
|
4
4
|
Summary: A comprehensive Python library for scientific computing and data analysis
|
|
5
5
|
Project-URL: Homepage, https://github.com/ywatanabe1989/scitex-code
|
|
6
6
|
Project-URL: Documentation, https://scitex.readthedocs.io
|