scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (36)
  1. scitex/__version__.py +1 -1
  2. scitex/browser/__init__.py +53 -0
  3. scitex/browser/auth/__init__.py +35 -0
  4. scitex/browser/auth/google.py +381 -0
  5. scitex/browser/collaboration/__init__.py +5 -0
  6. scitex/browser/debugging/__init__.py +56 -0
  7. scitex/browser/debugging/_failure_capture.py +372 -0
  8. scitex/browser/debugging/_sync_session.py +259 -0
  9. scitex/browser/debugging/_test_monitor.py +284 -0
  10. scitex/browser/debugging/_visual_cursor.py +432 -0
  11. scitex/scholar/citation_graph/README.md +117 -0
  12. scitex/scholar/citation_graph/__init__.py +29 -0
  13. scitex/scholar/citation_graph/builder.py +214 -0
  14. scitex/scholar/citation_graph/database.py +246 -0
  15. scitex/scholar/citation_graph/example.py +96 -0
  16. scitex/scholar/citation_graph/models.py +80 -0
  17. scitex/scholar/config/ScholarConfig.py +23 -3
  18. scitex/scholar/config/default.yaml +56 -0
  19. scitex/scholar/core/Paper.py +102 -0
  20. scitex/scholar/core/__init__.py +44 -0
  21. scitex/scholar/core/journal_normalizer.py +524 -0
  22. scitex/scholar/core/oa_cache.py +285 -0
  23. scitex/scholar/core/open_access.py +457 -0
  24. scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
  25. scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
  26. scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
  27. scitex/scholar/pdf_download/strategies/__init__.py +6 -0
  28. scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
  29. scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
  30. scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
  31. scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
  32. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
  33. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
  34. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
  35. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
  36. {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
@@ -33,6 +33,7 @@ from scitex.scholar.pdf_download.strategies import (
     try_download_direct_async,
     try_download_manual_async,
     try_download_response_body_async,
+    try_download_open_access_async,
 )

 logger = logging.getLogger(__name__)
@@ -65,6 +66,17 @@ class ScholarPDFDownloader:
         self.context = context
         self.output_dir = self.config.get_library_downloads_dir()

+        # Load access preferences from config
+        self.prefer_open_access = self.config.resolve(
+            "prefer_open_access", default=True, type=bool
+        )
+        self.enable_paywall_access = self.config.resolve(
+            "enable_paywall_access", default=False, type=bool
+        )
+        self.track_paywall_attempts = self.config.resolve(
+            "track_paywall_attempts", default=True, type=bool
+        )
+
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         pass

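In plain terms, the three toggles resolved above default to OA-first, paywall-off, tracking-on. A standalone restatement (illustrative, not part of the diff):

    # Defaults resolved in __init__ above:
    prefer_open_access = True        # Strategy 1: try the OA URL before anything else
    enable_paywall_access = False    # Strategy 3 (doi.org landing page) is opt-in
    track_paywall_attempts = True    # record paywall_bypass_attempted/success flags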
@@ -130,6 +142,131 @@ class ScholarPDFDownloader:
         )
         return saved_paths

+    async def download_open_access(
+        self,
+        oa_url: str,
+        output_path: Union[str, Path],
+        metadata: Optional[dict] = None,
+    ) -> Optional[Path]:
+        """Download PDF from an Open Access URL.
+
+        This is a simpler path for known OA papers - no browser automation needed.
+        Uses direct HTTP download with appropriate handling for different OA sources
+        (arXiv, PMC, OpenAlex OA URLs, etc.).
+
+        Args:
+            oa_url: Open Access URL (from paper.metadata.access.oa_url)
+            output_path: Path to save the downloaded PDF
+            metadata: Optional paper metadata for logging
+
+        Returns:
+            Path to downloaded PDF if successful, None otherwise
+        """
+        if not oa_url:
+            logger.debug(f"{self.name}: No OA URL provided")
+            return None
+
+        if isinstance(output_path, str):
+            output_path = Path(output_path)
+        if not str(output_path).endswith(".pdf"):
+            output_path = Path(str(output_path) + ".pdf")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        logger.info(f"{self.name}: Attempting OA download from {oa_url[:60]}...")
+
+        result = await try_download_open_access_async(
+            oa_url=oa_url,
+            output_path=output_path,
+            metadata=metadata,
+            func_name=self.name,
+        )
+
+        if result:
+            logger.info(f"{self.name}: Successfully downloaded OA PDF to {result}")
+        else:
+            logger.debug(f"{self.name}: OA download failed, may need browser-based download")
+
+        return result
+
+    async def download_smart(
+        self,
+        paper,
+        output_path: Union[str, Path],
+    ) -> Optional[Path]:
+        """Smart download method that chooses the best strategy based on paper metadata.
+
+        Priority order:
+        1. Try Open Access URL if available and prefer_open_access is True
+        2. Try regular PDF URLs if available
+        3. Try paywall access if enable_paywall_access is True and OA failed
+
+        Args:
+            paper: Paper object with metadata (from scitex.scholar.core.Paper)
+            output_path: Path to save the downloaded PDF
+
+        Returns:
+            Path to downloaded PDF if successful, None otherwise
+        """
+        from scitex.scholar.core.Paper import Paper
+
+        if isinstance(output_path, str):
+            output_path = Path(output_path)
+        if not str(output_path).endswith(".pdf"):
+            output_path = Path(str(output_path) + ".pdf")
+
+        # Extract metadata
+        meta = paper.metadata if hasattr(paper, 'metadata') else paper
+        access = getattr(meta, 'access', None)
+        url_meta = getattr(meta, 'url', None)
+        id_meta = getattr(meta, 'id', None)
+
+        is_open_access = getattr(access, 'is_open_access', False) if access else False
+        oa_url = getattr(access, 'oa_url', None) if access else None
+        pdf_urls = getattr(url_meta, 'pdfs', []) if url_meta else []
+        doi = getattr(id_meta, 'doi', None) if id_meta else None
+
+        logger.info(f"{self.name}: Smart download for DOI={doi}, OA={is_open_access}")
+
+        # Strategy 1: Try Open Access if available
+        if self.prefer_open_access and oa_url:
+            logger.info(f"{self.name}: Trying Open Access URL first")
+            result = await self.download_open_access(oa_url, output_path)
+            if result:
+                # Update access metadata to record successful OA download
+                if access and self.track_paywall_attempts:
+                    access.paywall_bypass_attempted = False
+                return result
+
+        # Strategy 2: Try available PDF URLs
+        for pdf_entry in pdf_urls:
+            pdf_url = pdf_entry.get('url') if isinstance(pdf_entry, dict) else pdf_entry
+            if pdf_url:
+                logger.info(f"{self.name}: Trying PDF URL: {pdf_url[:60]}...")
+                result = await self.download_from_url(pdf_url, output_path, doi=doi)
+                if result:
+                    return result
+
+        # Strategy 3: Try paywall access if enabled
+        if self.enable_paywall_access and not is_open_access:
+            logger.info(f"{self.name}: Attempting paywall access (opt-in enabled)")
+            if access and self.track_paywall_attempts:
+                access.paywall_bypass_attempted = True
+
+            # Use DOI-based URL if available
+            if doi:
+                doi_url = f"https://doi.org/{doi}"
+                result = await self.download_from_url(doi_url, output_path, doi=doi)
+                if result:
+                    if access and self.track_paywall_attempts:
+                        access.paywall_bypass_success = True
+                    return result
+                else:
+                    if access and self.track_paywall_attempts:
+                        access.paywall_bypass_success = False
+
+        logger.warning(f"{self.name}: All download strategies exhausted for DOI={doi}")
+        return None
+
     async def download_from_url(
         self,
         pdf_url: str,
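For orientation, a minimal usage sketch of the new method (the downloader and paper construction are assumed, since neither is shown in this diff; the asyncio wiring is illustrative):

    import asyncio
    from pathlib import Path

    async def fetch_pdf(downloader, paper):
        # downloader: an initialized ScholarPDFDownloader; paper: a scitex.scholar.core.Paper
        # (both assumed to exist). download_smart tries the OA URL first, then known
        # PDF URLs, then (only if enable_paywall_access) the doi.org landing page.
        saved = await downloader.download_smart(paper, Path("downloads/paper.pdf"))
        if saved is None:
            print("All strategies exhausted")
        return saved

    # asyncio.run(fetch_pdf(downloader, paper))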
@@ -11,6 +11,10 @@ from .chrome_pdf_viewer import try_download_chrome_pdf_viewer_async
 from .direct_download import try_download_direct_async
 from .response_body import try_download_response_body_async
 from .manual_download_fallback import try_download_manual_async
+from .open_access_download import (
+    try_download_open_access_async,
+    try_download_open_access_sync,
+)

 # Manual download utilities
 from .manual_download_utils import (
@@ -27,6 +31,8 @@ __all__ = [
     "try_download_direct_async",
     "try_download_response_body_async",
     "try_download_manual_async",
+    "try_download_open_access_async",
+    "try_download_open_access_sync",
     # Manual download utilities
     "DownloadMonitorAndSync",
     "FlexibleFilenameGenerator",
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# File: ./src/scitex/scholar/pdf_download/strategies/open_access_download.py
+"""
+Open Access PDF Download Strategy.
+
+Downloads PDFs from known Open Access sources with appropriate handling
+for each source type (arXiv, PubMed Central, OpenAlex OA URLs, etc.).
+"""
+
+from pathlib import Path
+from typing import Optional, Dict, Any
+import aiohttp
+
+from scitex import logging
+
+logger = logging.getLogger(__name__)
+
+
+# Known OA source patterns and their handlers
+OA_SOURCE_PATTERNS = {
+    'arxiv': {
+        'patterns': ['arxiv.org'],
+        'pdf_transform': lambda url: url.replace('/abs/', '/pdf/') + '.pdf' if '/abs/' in url else url,
+    },
+    'pmc': {
+        'patterns': ['ncbi.nlm.nih.gov/pmc', 'europepmc.org'],
+        'pdf_transform': lambda url: url,  # PMC links are usually direct
+    },
+    'biorxiv': {
+        'patterns': ['biorxiv.org', 'medrxiv.org'],
+        'pdf_transform': lambda url: url + '.full.pdf' if not url.endswith('.pdf') else url,
+    },
+    'doaj': {
+        'patterns': ['doaj.org'],
+        'pdf_transform': lambda url: url,
+    },
+    'zenodo': {
+        'patterns': ['zenodo.org'],
+        'pdf_transform': lambda url: url,
+    },
+}
+
+
+def _identify_oa_source(url: str) -> Optional[str]:
+    """Identify which OA source a URL belongs to."""
+    url_lower = url.lower()
+    for source_name, config in OA_SOURCE_PATTERNS.items():
+        for pattern in config['patterns']:
+            if pattern in url_lower:
+                return source_name
+    return None
+
+
+def _transform_to_pdf_url(url: str, source: str) -> str:
+    """Transform URL to direct PDF URL based on source."""
+    if source in OA_SOURCE_PATTERNS:
+        transform_func = OA_SOURCE_PATTERNS[source]['pdf_transform']
+        return transform_func(url)
+    return url
+
+
+async def try_download_open_access_async(
+    oa_url: str,
+    output_path: Path,
+    metadata: Optional[Dict[str, Any]] = None,
+    func_name: str = "try_download_open_access_async",
+    timeout: int = 60,
+) -> Optional[Path]:
+    """
+    Download PDF from an Open Access URL.
+
+    This strategy is simpler than browser-based strategies because OA PDFs
+    are typically directly accessible without authentication.
+
+    Args:
+        oa_url: Open Access URL (from OpenAlex oa_url, arXiv, PMC, etc.)
+        output_path: Path to save the downloaded PDF
+        metadata: Optional paper metadata for logging
+        func_name: Function name for logging
+        timeout: Download timeout in seconds
+
+    Returns:
+        Path to downloaded PDF if successful, None otherwise
+    """
+    if not oa_url:
+        logger.debug(f"{func_name}: No OA URL provided")
+        return None
+
+    # Identify source and transform URL if needed
+    source = _identify_oa_source(oa_url)
+    pdf_url = _transform_to_pdf_url(oa_url, source) if source else oa_url
+
+    logger.info(f"{func_name}: Attempting OA download from {source or 'unknown'}: {pdf_url[:80]}...")
+
+    try:
+        # Create output directory if needed
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Use aiohttp for async download
+        async with aiohttp.ClientSession() as session:
+            headers = {
+                'User-Agent': 'SciTeX/1.0 (Academic Research Tool; mailto:contact@scitex.io)',
+                'Accept': 'application/pdf,*/*',
+            }
+
+            async with session.get(pdf_url, headers=headers, timeout=aiohttp.ClientTimeout(total=timeout)) as response:
+                if response.status != 200:
+                    logger.warning(f"{func_name}: HTTP {response.status} from {pdf_url}")
+                    return None
+
+                content_type = response.headers.get('Content-Type', '')
+
+                # Verify we're getting a PDF
+                if 'pdf' not in content_type.lower() and not pdf_url.endswith('.pdf'):
+                    # Some servers don't set content-type correctly, check magic bytes
+                    first_bytes = await response.content.read(5)
+                    if first_bytes != b'%PDF-':
+                        logger.warning(f"{func_name}: Response is not a PDF (content-type: {content_type})")
+                        return None
+                    # Reset for full download
+                    content = first_bytes + await response.content.read()
+                else:
+                    content = await response.read()
+
+                # Validate PDF content
+                if len(content) < 1000:  # PDF should be at least 1KB
+                    logger.warning(f"{func_name}: Downloaded content too small ({len(content)} bytes)")
+                    return None
+
+                if not content.startswith(b'%PDF-'):
+                    logger.warning(f"{func_name}: Downloaded content is not a valid PDF")
+                    return None
+
+                # Save to file
+                with open(output_path, 'wb') as f:
+                    f.write(content)
+
+                size_mb = len(content) / 1024 / 1024
+                logger.info(f"{func_name}: Successfully downloaded {size_mb:.2f} MB to {output_path}")
+                return output_path
+
+    except aiohttp.ClientError as e:
+        logger.warning(f"{func_name}: HTTP client error: {e}")
+        return None
+    except TimeoutError:
+        logger.warning(f"{func_name}: Download timed out after {timeout}s")
+        return None
+    except Exception as e:
+        logger.error(f"{func_name}: Download failed: {e}")
+        return None
+
+
+def try_download_open_access_sync(
+    oa_url: str,
+    output_path: Path,
+    metadata: Optional[Dict[str, Any]] = None,
+    timeout: int = 60,
+) -> Optional[Path]:
+    """
+    Synchronous wrapper for try_download_open_access_async.
+
+    Args:
+        oa_url: Open Access URL
+        output_path: Path to save the downloaded PDF
+        metadata: Optional paper metadata
+        timeout: Download timeout in seconds
+
+    Returns:
+        Path to downloaded PDF if successful, None otherwise
+    """
+    import asyncio
+
+    try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+    return loop.run_until_complete(
+        try_download_open_access_async(oa_url, output_path, metadata, timeout=timeout)
+    )
+
+
+# EOF
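Tracing the source table above by hand: arXiv /abs/ pages are rewritten to /pdf/ URLs, bioRxiv/medRxiv pages get .full.pdf appended, and PMC, DOAJ, and Zenodo URLs pass through unchanged. A quick illustration (placeholder URLs; the helpers are module-private):

    from scitex.scholar.pdf_download.strategies.open_access_download import (
        _identify_oa_source,
        _transform_to_pdf_url,
    )

    url = "https://arxiv.org/abs/2101.00001"  # placeholder arXiv ID
    src = _identify_oa_source(url)            # 'arxiv'
    print(_transform_to_pdf_url(url, src))    # https://arxiv.org/pdf/2101.00001.pdf

    url = "https://www.biorxiv.org/content/10.1101/2021.01.01.425001v1"  # placeholder
    print(_transform_to_pdf_url(url, _identify_oa_source(url)))  # ...v1.full.pdf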
@@ -32,6 +32,7 @@ from datetime import datetime

 from scitex import logging
 from scitex.scholar.core import Paper
+from scitex.scholar.core import normalize_journal_name
 from scitex.scholar.search_engines.individual.PubMedSearchEngine import PubMedSearchEngine
 from scitex.scholar.search_engines.individual.CrossRefSearchEngine import CrossRefSearchEngine
 from scitex.scholar.search_engines.individual.ArXivSearchEngine import ArXivSearchEngine
@@ -49,6 +50,7 @@ class ScholarPipelineSearchParallel:
         max_workers: int = 5,
         timeout_per_engine: float = 30.0,
         use_cache: bool = True,
+        email: str = None,
     ):
         """Initialize parallel search pipeline.

@@ -56,19 +58,21 @@ class ScholarPipelineSearchParallel:
             max_workers: Maximum number of parallel engine queries
             timeout_per_engine: Timeout for each engine in seconds
             use_cache: Whether to use caching for API results
+            email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
         """
         self.name = self.__class__.__name__
         self.max_workers = max_workers
         self.timeout_per_engine = timeout_per_engine
         self.use_cache = use_cache
+        self.email = email or "research@scitex.io"

-        # Initialize search engines
+        # Initialize search engines with email for rate limit benefits
         self.engines = {
-            'PubMed': PubMedSearchEngine(),
-            'CrossRef': CrossRefSearchEngine(),
-            'arXiv': ArXivSearchEngine(),
-            'Semantic_Scholar': SemanticScholarSearchEngine(),
-            'OpenAlex': OpenAlexSearchEngine(),
+            'PubMed': PubMedSearchEngine(email=self.email),
+            'CrossRef': CrossRefSearchEngine(email=self.email),
+            'arXiv': ArXivSearchEngine(email=self.email),
+            'Semantic_Scholar': SemanticScholarSearchEngine(email=self.email),
+            'OpenAlex': OpenAlexSearchEngine(email=self.email),
         }

         # Statistics
@@ -328,12 +332,18 @@ class ScholarPipelineSearchParallel:
         if 'metrics' in result:
             if result['metrics'].get('citation_count'):
                 paper.metadata.citation_count.total = result['metrics']['citation_count']
-            # Note: is_open_access not in Paper structure
+            if 'is_open_access' in result['metrics']:
+                paper.metadata.access.is_open_access = result['metrics']['is_open_access']
+                paper.metadata.access.is_open_access_engines = [engine_name]

         if 'urls' in result:
             if result['urls'].get('pdf'):
                 # pdfs is a list of dicts with url/source keys
                 paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
+                # If this is an open access paper, also store the PDF URL as oa_url
+                if paper.metadata.access.is_open_access:
+                    paper.metadata.access.oa_url = result['urls']['pdf']
+                    paper.metadata.access.oa_url_engines = [engine_name]
             if result['urls'].get('publisher'):
                 paper.metadata.url.publisher = result['urls']['publisher']
             if result['urls'].get('doi_url'):
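After this merge, an open-access hit leaves the Paper's access block populated roughly as below (field names as in the hunk; the engine name and URL are illustrative):

    # Illustrative post-merge state for an OA result from, e.g., OpenAlex:
    # paper.metadata.access.is_open_access         -> True
    # paper.metadata.access.is_open_access_engines -> ['OpenAlex']
    # paper.metadata.access.oa_url                 -> result['urls']['pdf']
    # paper.metadata.access.oa_url_engines         -> ['OpenAlex']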
@@ -730,13 +740,21 @@ class ScholarPipelineSearchParallel:

         # Publication info
         if hasattr(meta, 'publication'):
-            result['journal'] = meta.publication.journal or ''
+            journal_raw = meta.publication.journal or ''
+            result['journal'] = normalize_journal_name(journal_raw) if journal_raw else ''
             result['impact_factor'] = meta.publication.impact_factor

         # Metrics
         if hasattr(meta, 'citation_count'):
             result['citation_count'] = meta.citation_count.total or 0
-            result['is_open_access'] = False  # Not stored in current Paper structure
+
+        # Access metadata
+        if hasattr(meta, 'access'):
+            result['is_open_access'] = meta.access.is_open_access or False
+            result['oa_status'] = meta.access.oa_status
+            result['oa_url'] = meta.access.oa_url
+        else:
+            result['is_open_access'] = False

         # URLs
         if hasattr(meta, 'url'):
@@ -47,22 +47,25 @@ class ScholarPipelineSearchSingle:
     def __init__(
         self,
         use_cache: bool = True,
+        email: str = None,
     ):
         """Initialize sequential search pipeline.

         Args:
             use_cache: Whether to use caching for API results
+            email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
         """
         self.name = self.__class__.__name__
         self.use_cache = use_cache
+        self.email = email or "research@scitex.io"

-        # Initialize search engines
+        # Initialize search engines with email for rate limit benefits
         self.engines = {
-            'PubMed': PubMedSearchEngine(),
-            'CrossRef': CrossRefSearchEngine(),
-            'arXiv': ArXivSearchEngine(),
-            'Semantic_Scholar': SemanticScholarSearchEngine(),
-            'OpenAlex': OpenAlexSearchEngine(),
+            'PubMed': PubMedSearchEngine(email=self.email),
+            'CrossRef': CrossRefSearchEngine(email=self.email),
+            'arXiv': ArXivSearchEngine(email=self.email),
+            'Semantic_Scholar': SemanticScholarSearchEngine(email=self.email),
+            'OpenAlex': OpenAlexSearchEngine(email=self.email),
         }

         # Statistics
@@ -265,12 +268,18 @@ class ScholarPipelineSearchSingle:
         if 'metrics' in result:
             if result['metrics'].get('citation_count'):
                 paper.metadata.citation_count.total = result['metrics']['citation_count']
-            # Note: is_open_access not in Paper structure
+            if 'is_open_access' in result['metrics']:
+                paper.metadata.access.is_open_access = result['metrics']['is_open_access']
+                paper.metadata.access.is_open_access_engines = [engine_name]

         if 'urls' in result:
             if result['urls'].get('pdf'):
                 # pdfs is a list of dicts with url/source keys
                 paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
+                # If this is an open access paper, also store the PDF URL as oa_url
+                if paper.metadata.access.is_open_access:
+                    paper.metadata.access.oa_url = result['urls']['pdf']
+                    paper.metadata.access.oa_url_engines = [engine_name]
             if result['urls'].get('publisher'):
                 paper.metadata.url.publisher = result['urls']['publisher']
             if result['urls'].get('doi_url'):
@@ -458,7 +467,14 @@ class ScholarPipelineSearchSingle:
         # Metrics
         if hasattr(meta, 'citation_count'):
             result['citation_count'] = meta.citation_count.total or 0
-            result['is_open_access'] = False  # Not stored in current Paper structure
+
+        # Access metadata
+        if hasattr(meta, 'access'):
+            result['is_open_access'] = meta.access.is_open_access or False
+            result['oa_status'] = meta.access.oa_status
+            result['oa_url'] = meta.access.oa_url
+        else:
+            result['is_open_access'] = False

         # URLs
         if hasattr(meta, 'url'):
@@ -45,26 +45,31 @@ class ScholarSearchEngine:
         self,
         default_mode: Literal['parallel', 'single'] = 'parallel',
         use_cache: bool = True,
+        email: str = None,
     ):
         """Initialize unified search engine.

         Args:
             default_mode: Default search mode ('parallel' or 'single')
             use_cache: Whether to use caching for API results
+            email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
         """
         self.name = self.__class__.__name__
         self.default_mode = default_mode
         self.use_cache = use_cache
+        self.email = email

-        # Initialize both pipeline modes
+        # Initialize both pipeline modes with email for rate limit benefits
         self.parallel_pipeline = ScholarPipelineSearchParallel(
             max_workers=5,
             timeout_per_engine=30.0,
             use_cache=use_cache,
+            email=email,
         )

         self.single_pipeline = ScholarPipelineSearchSingle(
             use_cache=use_cache,
+            email=email,
         )

         # Statistics
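Putting the email plumbing together, the facade now forwards one address through both pipelines to every individual engine; a minimal construction sketch (the address is a placeholder, and only the constructor shown in this hunk is exercised):

    from scitex.scholar.search_engines.ScholarSearchEngine import ScholarSearchEngine

    engine = ScholarSearchEngine(
        default_mode='parallel',
        use_cache=True,
        email='you@university.edu',  # placeholder; forwarded to PubMed, CrossRef, OpenAlex, etc.
    )
    # engine.parallel_pipeline and engine.single_pipeline each construct their
    # individual engines with email=self.email, as shown in the hunks above.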
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scitex
-Version: 2.4.1
+Version: 2.4.3
 Summary: A comprehensive Python library for scientific computing and data analysis
 Project-URL: Homepage, https://github.com/ywatanabe1989/scitex-code
 Project-URL: Documentation, https://scitex.readthedocs.io