scitex 2.16.0__py3-none-any.whl → 2.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/_mcp_tools/audio.py +11 -65
- scitex/audio/README.md +40 -12
- scitex/audio/__init__.py +27 -235
- scitex/audio/_audio_check.py +93 -0
- scitex/audio/_mcp/speak_handlers.py +56 -8
- scitex/audio/_speak.py +295 -0
- scitex/audio/mcp_server.py +98 -73
- scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +462 -0
- scitex/scholar/url_finder/.tmp/open_url/README.md +223 -0
- scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +694 -0
- scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +1160 -0
- scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +344 -0
- scitex/scholar/url_finder/.tmp/open_url/__init__.py +24 -0
- scitex/social/__init__.py +1 -24
- scitex/writer/README.md +25 -409
- scitex/writer/__init__.py +98 -13
- {scitex-2.16.0.dist-info → scitex-2.16.2.dist-info}/METADATA +6 -1
- {scitex-2.16.0.dist-info → scitex-2.16.2.dist-info}/RECORD +21 -93
- scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +0 -90
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +0 -1571
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +0 -6262
- scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +0 -1274
- scitex/dev/plt/data/mpl/dir_ax.txt +0 -459
- scitex/scholar/data/.gitkeep +0 -0
- scitex/scholar/data/README.md +0 -44
- scitex/scholar/data/bib_files/bibliography.bib +0 -1952
- scitex/scholar/data/bib_files/neurovista.bib +0 -277
- scitex/scholar/data/bib_files/neurovista_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_processed.bib +0 -338
- scitex/scholar/data/bib_files/openaccess.bib +0 -89
- scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +0 -2178
- scitex/scholar/data/bib_files/pac.bib +0 -698
- scitex/scholar/data/bib_files/pac_enriched.bib +0 -1061
- scitex/scholar/data/bib_files/pac_processed.bib +0 -0
- scitex/scholar/data/bib_files/pac_titles.txt +0 -75
- scitex/scholar/data/bib_files/paywalled.bib +0 -98
- scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +0 -58
- scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +0 -87
- scitex/scholar/data/bib_files/seizure_prediction.bib +0 -694
- scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
- scitex/scholar/data/bib_files/test_complete_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_final_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_seizure.bib +0 -46
- scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
- scitex/scholar/data/impact_factor.db +0 -0
- scitex/writer/Writer.py +0 -487
- scitex/writer/_clone_writer_project.py +0 -160
- scitex/writer/_compile/__init__.py +0 -41
- scitex/writer/_compile/_compile_async.py +0 -130
- scitex/writer/_compile/_compile_unified.py +0 -148
- scitex/writer/_compile/_parser.py +0 -63
- scitex/writer/_compile/_runner.py +0 -457
- scitex/writer/_compile/_validator.py +0 -46
- scitex/writer/_compile/manuscript.py +0 -110
- scitex/writer/_compile/revision.py +0 -82
- scitex/writer/_compile/supplementary.py +0 -100
- scitex/writer/_dataclasses/__init__.py +0 -44
- scitex/writer/_dataclasses/config/_CONSTANTS.py +0 -46
- scitex/writer/_dataclasses/config/_WriterConfig.py +0 -175
- scitex/writer/_dataclasses/config/__init__.py +0 -9
- scitex/writer/_dataclasses/contents/_ManuscriptContents.py +0 -236
- scitex/writer/_dataclasses/contents/_RevisionContents.py +0 -136
- scitex/writer/_dataclasses/contents/_SupplementaryContents.py +0 -114
- scitex/writer/_dataclasses/contents/__init__.py +0 -9
- scitex/writer/_dataclasses/core/_Document.py +0 -146
- scitex/writer/_dataclasses/core/_DocumentSection.py +0 -546
- scitex/writer/_dataclasses/core/__init__.py +0 -7
- scitex/writer/_dataclasses/results/_CompilationResult.py +0 -165
- scitex/writer/_dataclasses/results/_LaTeXIssue.py +0 -102
- scitex/writer/_dataclasses/results/_SaveSectionsResponse.py +0 -118
- scitex/writer/_dataclasses/results/_SectionReadResponse.py +0 -131
- scitex/writer/_dataclasses/results/__init__.py +0 -11
- scitex/writer/_dataclasses/tree/MINIMUM_FILES.md +0 -121
- scitex/writer/_dataclasses/tree/_ConfigTree.py +0 -86
- scitex/writer/_dataclasses/tree/_ManuscriptTree.py +0 -84
- scitex/writer/_dataclasses/tree/_RevisionTree.py +0 -97
- scitex/writer/_dataclasses/tree/_ScriptsTree.py +0 -118
- scitex/writer/_dataclasses/tree/_SharedTree.py +0 -100
- scitex/writer/_dataclasses/tree/_SupplementaryTree.py +0 -101
- scitex/writer/_dataclasses/tree/__init__.py +0 -23
- scitex/writer/_mcp/__init__.py +0 -4
- scitex/writer/_mcp/handlers.py +0 -32
- scitex/writer/_mcp/tool_schemas.py +0 -33
- scitex/writer/_project/__init__.py +0 -29
- scitex/writer/_project/_create.py +0 -89
- scitex/writer/_project/_trees.py +0 -63
- scitex/writer/_project/_validate.py +0 -61
- scitex/writer/utils/.legacy_git_retry.py +0 -164
- scitex/writer/utils/__init__.py +0 -24
- scitex/writer/utils/_converters.py +0 -635
- scitex/writer/utils/_parse_latex_logs.py +0 -138
- scitex/writer/utils/_parse_script_args.py +0 -156
- scitex/writer/utils/_verify_tree_structure.py +0 -205
- scitex/writer/utils/_watch.py +0 -96
- {scitex-2.16.0.dist-info → scitex-2.16.2.dist-info}/WHEEL +0 -0
- {scitex-2.16.0.dist-info → scitex-2.16.2.dist-info}/entry_points.txt +0 -0
- {scitex-2.16.0.dist-info → scitex-2.16.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1160 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-08-06 16:55:39 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/open_url/_OpenURLResolver.py
# ----------------------------------------
from __future__ import annotations

import os

__FILE__ = (
    "./src/scitex/scholar/open_url/_OpenURLResolver.py"
)
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

import asyncio
import random
import time
from typing import List, Union

from scitex import logging

"""OpenURL resolver for finding full-text access through institutional libraries."""

from typing import Any, Dict, Optional
from urllib.parse import urlencode

from playwright.async_api import Page

from scitex.scholar.browser import BrowserManager
from scitex.scholar.config import ScholarConfig

from ...errors import ScholarError
from ._ResolverLinkFinder import ResolverLinkFinder

logger = logging.getLogger(__name__)


class OpenURLResolver:
    """Resolves DOIs/metadata to full-text URLs via institutional OpenURL resolver.

    OpenURL is a standardized format for encoding bibliographic information
    that libraries use to link to full-text resources."""

    AUTH_PATTERNS = [
        "openathens.net",
        "shibauth",
        "saml",
        "institutionlogin",
        "iam.atypon.com",
        "auth.elsevier.com",
        "go.gale.com/ps/headerQuickSearch",
    ]

    PUBLISHER_DOMAINS = [
        "sciencedirect.com",
        "nature.com",
        "springer.com",
        "wiley.com",
        "onlinelibrary.wiley.com",
        "acs.org",
        "tandfonline.com",
        "sagepub.com",
        "academic.oup.com",
        "science.org",
        "pnas.org",
        "bmj.com",
        "cell.com",
    ]

    def __init__(
        self,
        auth_manager,
        resolver_url: Optional[str] = None,
        browser_mode: str = "stealth",
        config: Optional[ScholarConfig] = None,
    ):
        """Initialize OpenURL resolver.

        Args:
            auth_manager: Authentication manager for institutional access
            resolver_url: Base URL of institutional OpenURL resolver
                (Details can be seen at https://www.zotero.org/openurl_resolvers)
            browser_mode: Browser mode ("stealth" or "interactive")
            config: ScholarConfig instance (creates new if None)
        """
        self.auth_manager = auth_manager

        # Initialize config
        if config is None:
            config = ScholarConfig()
        self.config = config

        # Resolve resolver URL from config
        self.resolver_url = self.config.resolve(
            "openurl_resolver_url", resolver_url, None, str
        )

        # Create BrowserManager with simplified configuration
        self.browser = BrowserManager(
            auth_manager=auth_manager,
            browser_mode=browser_mode,
            config=self.config,
        )

        self.timeout = 30
        self._link_finder = ResolverLinkFinder()

        # Screenshot capture setup (optional, controlled by config)
        self.capture_screenshots = self.config.resolve(
            "capture_screenshots", None, False, bool
        )
        if self.capture_screenshots:
            from datetime import datetime

            self.screenshot_dir = (
                self.config.paths.get_screenshots_dir() / "openurl"
            )
            self.screenshot_dir.mkdir(parents=True, exist_ok=True)
            self.session_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    async def _capture_checkpoint_screenshot_async(
        self, page, stage: str, doi: str = ""
    ) -> Optional[str]:
        """Capture screenshot at checkpoint for debugging."""
        if not self.capture_screenshots:
            return None

        try:
            from datetime import datetime

            timestamp = datetime.now().strftime("%H%M%S")
            doi_safe = (
                doi.replace("/", "-").replace(".", "_") if doi else "unknown"
            )
            screenshot_name = f"openurl_{stage}_{doi_safe}_{timestamp}.png"
            screenshot_path = self.screenshot_dir / screenshot_name

            await page.screenshot(path=str(screenshot_path), full_page=True)
            logger.info(
                f"📸 Screenshot captured: {stage} -> {screenshot_name}"
            )
            return str(screenshot_path)
        except Exception as e:
            logger.warning(f"📸 Screenshot capture failed at {stage}: {e}")
            return None

    def build_openurl(
        self,
        title: str = "",
        authors: Optional[list] = None,
        journal: str = "",
        year: Optional[int] = None,
        volume: Optional[int] = None,
        issue: Optional[int] = None,
        pages: str = "",
        doi: str = "",
        pmid: str = "",
    ) -> str:
        """Build OpenURL query string from paper metadata."""
        params = {
            "ctx_ver": "Z39.88-2004",
            "rft_val_fmt": "info:ofi/fmt:kev:mtx:journal",
            "rft.genre": "article",
        }

        if title:
            params["rft.atitle"] = title
        if journal:
            params["rft.jtitle"] = journal
        if year:
            params["rft.date"] = str(year)
        if volume:
            params["rft.volume"] = str(volume)
        if issue:
            params["rft.issue"] = str(issue)
        if pages:
            if "-" in str(pages):
                spage, epage = pages.split("-", 1)
                params["rft.spage"] = spage.strip()
                params["rft.epage"] = epage.strip()
            else:
                params["rft.spage"] = str(pages)
        if doi:
            params["rft.doi"] = doi
        if pmid:
            params["rft.pmid"] = str(pmid)

        if authors:
            first_author = authors[0]
            if "," in first_author:
                last, first = first_author.split(",", 1)
                params["rft.aulast"] = last.strip()
                params["rft.aufirst"] = first.strip()
            params["rft.au"] = first_author

        query_string = urlencode(params, safe=":/")
        return f"{self.resolver_url}?{query_string}"
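    # Example of the URL this produces (a sketch; the resolver base URL is the
    # SFX instance used in the test block at the bottom of this file):
    #
    #   build_openurl(doi="10.1002/hipo.22488")
    #   -> https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004
    #      &rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article
    #      &rft.doi=10.1002/hipo.22488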
    def _is_publisher_url(self, url: str, doi: str = "") -> bool:
        """Check if URL is from an expected publisher domain."""
        if not url:
            return False
        if any(pattern in url.lower() for pattern in self.AUTH_PATTERNS):
            return False
        return any(domain in url.lower() for domain in self.PUBLISHER_DOMAINS)

    async def _follow_saml_redirect_async(self, page, saml_url, doi=""):
        """Follow SAML/SSO redirect chain until publisher URL is reached."""
        logger.info(f"Following SAML redirect chain starting from: {saml_url}")

        if self._is_publisher_url(saml_url, doi):
            return saml_url

        await page.goto(
            saml_url,
            wait_until="domcontentloaded",
            timeout=15000,  # Increased from 1.5-3s to 15s
        )
        last_url = ""

        for attempt in range(8):
            current_url = page.url
            logger.debug(f"SAML redirect attempt {attempt + 1}: {current_url}")

            if self._is_publisher_url(current_url, doi):
                logger.info(
                    f"Successfully navigated to publisher URL: {current_url}"
                )
                return current_url

            # If URL hasn't changed in 2 attempts, we're stuck
            if current_url == last_url and attempt > 1:
                logger.warning(f"SAML redirect stuck at: {current_url}")
                return current_url

            last_url = current_url

            # Only try form submission during the first few attempts
            if attempt < 3:
                try:
                    forms = await page.query_selector_all("form")
                    for form in forms:
                        if await form.is_visible():
                            logger.debug("Submitting visible form...")
                            await form.evaluate("form => form.submit()")
                            await page.wait_for_load_state(
                                "domcontentloaded", timeout=10000
                            )
                            break
                except Exception:
                    pass

            await page.wait_for_timeout(1000)

        final_url = page.url
        logger.info(f"SAML redirect completed at: {final_url}")
        return final_url

    async def _find_and_click_publisher_go_button_async(self, page, doi=""):
        """Find and click the appropriate publisher GO button on the OpenURL resolver page.

        This method implements our proven GO button detection and clicking logic
        that successfully worked for Science.org and Nature.com access.
        """
        try:
            logger.info("Looking for publisher GO buttons on resolver page...")

            # Get all GO buttons with context information
            go_buttons = await page.evaluate(
                """() => {
                const goButtons = Array.from(document.querySelectorAll('input[value="Go"], button[value="Go"], input[value="GO"], button[value="GO"]'));
                return goButtons.map((btn, index) => {
                    const parentRow = btn.closest('tr') || btn.parentElement;
                    const rowText = parentRow ? parentRow.textContent.trim() : '';

                    // Check for publisher indicators in the row text
                    const isScience = rowText.toLowerCase().includes('american association') ||
                                      rowText.toLowerCase().includes('aaas') ||
                                      rowText.toLowerCase().includes('science');
                    const isNature = rowText.toLowerCase().includes('nature') ||
                                     rowText.toLowerCase().includes('springer');
                    const isWiley = rowText.toLowerCase().includes('wiley');
                    const isElsevier = rowText.toLowerCase().includes('elsevier') ||
                                       rowText.toLowerCase().includes('sciencedirect');

                    return {
                        index: index,
                        globalIndex: Array.from(document.querySelectorAll('input, button, a, [onclick]')).indexOf(btn),
                        value: btn.value,
                        rowText: rowText,
                        isScience: isScience,
                        isNature: isNature,
                        isWiley: isWiley,
                        isElsevier: isElsevier,
                        isPublisher: isScience || isNature || isWiley || isElsevier
                    };
                });
            }"""
            )

            if not go_buttons:
                logger.warning("No GO buttons found on resolver page")
                return {"success": False, "reason": "no_go_buttons"}

            logger.info(f"Found {len(go_buttons)} GO buttons")
            for btn in go_buttons:
                logger.debug(
                    f"GO button {btn['index']}: {btn['rowText'][:50]}... (Publisher: {btn['isPublisher']})"
                )

            # Find the most appropriate publisher button
            target_button = None

            # Priority order: Science > Nature > other publishers
            for btn in go_buttons:
                if btn["isScience"]:
                    target_button = btn
                    logger.info(
                        f"Selected Science/AAAS GO button: {btn['rowText'][:50]}..."
                    )
                    break
                elif btn["isNature"]:
                    target_button = btn
                    logger.info(
                        f"Selected Nature GO button: {btn['rowText'][:50]}..."
                    )
                    break
                elif btn["isPublisher"]:
                    target_button = btn
                    logger.info(
                        f"Selected publisher GO button: {btn['rowText'][:50]}..."
                    )
                    break

            # Fallback: if no publisher buttons, try the first GO button (might be direct access)
            if not target_button and go_buttons:
                target_button = go_buttons[0]
                logger.info(
                    f"Using fallback GO button: {target_button['rowText'][:50]}..."
                )

            if not target_button:
                logger.warning("No suitable GO button found")
                return {"success": False, "reason": "no_suitable_button"}

            # Click the selected GO button and handle popup
            logger.info("Clicking GO button and waiting for popup...")

            try:
                # Set up popup listener before clicking
                popup_promise = page.wait_for_event("popup", timeout=30000)

                # Click the button using its global index for reliability
                click_result = await page.evaluate(
                    f"""() => {{
                    const allElements = Array.from(document.querySelectorAll('input, button, a, [onclick]'));
                    const targetButton = allElements[{target_button['globalIndex']}];
                    if (targetButton && (targetButton.value === 'Go' || targetButton.value === 'GO')) {{
                        console.log('Clicking GO button:', targetButton);
                        targetButton.click();
                        return 'clicked';
                    }}
                    return 'not-found';
                }}"""
                )

                if click_result != "clicked":
                    logger.warning("Failed to click GO button")
                    return {"success": False, "reason": "click_failed"}

                # Wait for popup
                popup = await popup_promise
                logger.info("Publisher popup opened successfully")

                # Wait for popup to load
                await popup.wait_for_load_state(
                    "domcontentloaded", timeout=30000
                )
                await popup.wait_for_timeout(5000)  # Allow time for redirects

                final_url = popup.url
                popup_title = await popup.title()

                logger.info(f"Successfully accessed: {popup_title}")
                logger.info(f"Final URL: {final_url}")

                # Verify we reached a publisher URL
                if self._is_publisher_url(final_url, doi):
                    logger.success(
                        f"Successfully reached publisher: {final_url}"
                    )

                    result = {
                        "final_url": final_url,
                        "resolver_url": page.url,
                        "access_type": "publisher_go_button",
                        "success": True,
                        "publisher_detected": True,
                        "popup_page": popup,  # Keep popup open for potential PDF download
                    }

                    # Don't close popup immediately - let caller decide
                    # await popup.close()
                    return result
                else:
                    logger.info(f"Reached non-publisher URL: {final_url}")

                    result = {
                        "final_url": final_url,
                        "resolver_url": page.url,
                        "access_type": "go_button_redirect",
                        "success": True,
                        "publisher_detected": False,
                        "popup_page": popup,  # Keep popup open for potential PDF download
                    }

                    # Don't close popup immediately - let caller decide
                    # await popup.close()
                    return result

            except Exception as popup_error:
                logger.warning(f"Popup handling failed: {popup_error}")
                return {
                    "success": False,
                    "reason": f"popup_error: {popup_error}",
                }

        except Exception as e:
            logger.error(f"GO button detection failed: {e}")
            return {"success": False, "reason": f"detection_error: {e}"}

    async def _download_pdf_async_from_publisher_page(
        self, popup, filename, download_dir="downloads"
    ):
        """Download PDF from publisher page after successful GO button access.

        This method implements our proven PDF download logic that works
        with various publisher sites including Science.org and Nature.com.
        """
        from pathlib import Path

        try:
            download_path = Path(download_dir)
            download_path.mkdir(exist_ok=True)

            logger.info("Looking for PDF download links on publisher page...")

            # Find PDF download links
            pdf_links = await popup.evaluate(
                """() => {
                const allLinks = Array.from(document.querySelectorAll('a, button, input'));
                return allLinks.filter(el =>
                    el.textContent.toLowerCase().includes('pdf') ||
                    el.textContent.toLowerCase().includes('download') ||
                    el.href?.includes('pdf') ||
                    el.getAttribute('data-track-action')?.includes('pdf')
                ).map(el => ({
                    tag: el.tagName,
                    text: el.textContent.trim(),
                    href: el.href || el.value || 'no-href',
                    className: el.className,
                    id: el.id,
                    trackAction: el.getAttribute('data-track-action') || 'none'
                }));
            }"""
            )

            if not pdf_links:
                logger.warning("No PDF links found on publisher page")
                return {"success": False, "reason": "no_pdf_links"}

            logger.info(f"Found {len(pdf_links)} potential PDF links")
            for i, link in enumerate(pdf_links):
                logger.debug(
                    f"PDF link {i}: {link['text'][:30]}... | {link['href'][:50]}..."
                )

            # Find the best PDF download link
            main_pdf_link = None

            # Priority order: direct download > PDF with download > PDF view
            for link in pdf_links:
                if "download pdf" in link["text"].lower():
                    main_pdf_link = link
                    break
                elif link["href"] != "no-href" and link["href"].endswith(".pdf"):
                    main_pdf_link = link
                    break
                elif (
                    "pdf" in link["text"].lower()
                    and "view" not in link["text"].lower()
                ):
                    main_pdf_link = link
                    break

            if not main_pdf_link:
                # Fallback to the first PDF-related link
                main_pdf_link = pdf_links[0]

            logger.info(f"Selected PDF link: {main_pdf_link['text'][:40]}...")

            # Set up download path
            file_path = download_path / filename

            # Configure download headers
            await popup.set_extra_http_headers(
                {"Accept": "application/pdf,application/octet-stream,*/*"}
            )

            # Try download methods
            pdf_downloaded = False

            # Method 1: Direct URL navigation
            if (
                main_pdf_link["href"] != "no-href"
                and "pdf" in main_pdf_link["href"].lower()
            ):
                try:
                    logger.info("Attempting direct PDF URL download...")
                    download_promise = popup.wait_for_event(
                        "download", timeout=30000
                    )
                    await popup.goto(main_pdf_link["href"])

                    download = await download_promise
                    await download.save_as(str(file_path))

                    if file_path.exists():
                        size_mb = file_path.stat().st_size / (1024 * 1024)
                        logger.success(
                            f"PDF downloaded successfully: {filename} ({size_mb:.1f} MB)"
                        )
                        pdf_downloaded = True

                except Exception as e:
                    logger.debug(f"Direct download failed: {e}")

            # Method 2: Click-based download
            if not pdf_downloaded:
                try:
                    logger.info("Attempting click-based PDF download...")
                    download_promise = popup.wait_for_event(
                        "download", timeout=30000
                    )

                    # Click the first PDF link
                    await popup.evaluate(
                        """() => {
                        const allLinks = Array.from(document.querySelectorAll('a, button, input'));
                        const pdfLinks = allLinks.filter(el =>
                            el.textContent.toLowerCase().includes('pdf') ||
                            el.textContent.toLowerCase().includes('download') ||
                            el.href?.includes('pdf')
                        );
                        if (pdfLinks.length > 0) {
                            pdfLinks[0].click();
                            return 'clicked';
                        }
                        return 'no-link';
                    }"""
                    )

                    download = await download_promise
                    await download.save_as(str(file_path))

                    if file_path.exists():
                        size_mb = file_path.stat().st_size / (1024 * 1024)
                        logger.success(
                            f"PDF downloaded successfully: {filename} ({size_mb:.1f} MB)"
                        )
                        pdf_downloaded = True

                except Exception as e:
                    logger.debug(f"Click-based download failed: {e}")

            if pdf_downloaded:
                return {
                    "success": True,
                    "filename": filename,
                    "path": str(file_path),
                    "size_mb": (
                        file_path.stat().st_size / (1024 * 1024)
                        if file_path.exists()
                        else 0
                    ),
                }
            else:
                logger.warning("All PDF download methods failed")
                # Take screenshot for debugging
                screenshot_path = (
                    download_path
                    / f"pdf_download_failed_{filename.replace('.pdf', '.png')}"
                )
                await popup.screenshot(
                    path=str(screenshot_path), full_page=True
                )
                logger.info(
                    f"Screenshot saved for debugging: {screenshot_path}"
                )

                return {
                    "success": False,
                    "reason": "download_failed",
                    "screenshot": str(screenshot_path),
                    "available_links": [link["text"] for link in pdf_links],
                }

        except Exception as e:
            logger.error(f"PDF download failed: {e}")
            return {"success": False, "reason": f"error: {e}"}

    async def resolve_and_download_pdf_async(
        self,
        title: str = "",
        authors: Optional[list] = None,
        journal: str = "",
        year: Optional[int] = None,
        volume: Optional[int] = None,
        issue: Optional[int] = None,
        pages: str = "",
        doi: str = "",
        pmid: str = "",
        filename: Optional[str] = None,
        download_dir: str = "downloads",
    ) -> Dict[str, Any]:
        """Resolve paper access and download PDF in one operation.

        This method combines our GO button resolution with PDF download
        to provide a complete paper acquisition workflow.
        """
        if not filename:
            # Generate filename from metadata
            first_author = (
                authors[0].split(",")[0].strip() if authors else "Unknown"
            )
            filename = f"{first_author}-{year}-{journal.replace(' ', '')}-{title[:30].replace(' ', '_')}.pdf"
            # Clean filename
            filename = "".join(
                c for c in filename if c.isalnum() or c in ".-_"
            ).strip()

        logger.info(f"Starting resolve and download for: {filename}")

        # Create fresh context for this operation
        browser, context = await self.browser.get_authenticated_browser_and_context_async()
        page = await context.new_page()

        try:
            # Build OpenURL
            openurl = self.build_openurl(
                title, authors, journal, year, volume, issue, pages, doi, pmid
            )
            logger.info(f"Resolving and downloading via OpenURL: {openurl}")

            # Navigate to OpenURL resolver
            await page.goto(
                openurl, wait_until="domcontentloaded", timeout=30000
            )
            await page.wait_for_timeout(2000)

            # Try GO button method
            go_button_result = (
                await self._find_and_click_publisher_go_button_async(page, doi)
            )

            if (
                go_button_result["success"]
                and "popup_page" in go_button_result
            ):
                popup = go_button_result["popup_page"]

                logger.info(
                    "Successfully accessed publisher page, attempting PDF download..."
                )

                # Try to download PDF
                download_result = (
                    await self._download_pdf_async_from_publisher_page(
                        popup, filename, download_dir
                    )
                )

                # Close popup
                try:
                    await popup.close()
                except Exception:
                    pass

                # Combine results
                final_result = {
                    **go_button_result,
                    "pdf_download": download_result,
                    "filename": filename,
                }

                # Remove popup reference
                if "popup_page" in final_result:
                    del final_result["popup_page"]

                if download_result["success"]:
                    logger.success(f"Successfully downloaded PDF: {filename}")
                else:
                    logger.warning(
                        f"Paper accessed but PDF download failed: {download_result.get('reason', 'unknown')}"
                    )

                return final_result

            else:
                logger.warning("Could not access publisher page via GO button")
                return {
                    "success": False,
                    "reason": "go_button_failed",
                    "go_button_result": go_button_result,
                    "filename": filename,
                }

        except Exception as e:
            logger.error(f"Resolve and download failed: {e}")
            return {
                "success": False,
                "reason": f"error: {e}",
                "filename": filename,
            }
        finally:
            await context.close()

    async def _resolve_single_async(
        self,
        title: str = "",
        authors: Optional[list] = None,
        journal: str = "",
        year: Optional[int] = None,
        volume: Optional[int] = None,
        issue: Optional[int] = None,
        pages: str = "",
        doi: str = "",
        pmid: str = "",
    ) -> Optional[Dict[str, Any]]:
        """Resolve a single paper to a full-text URL via the OpenURL resolver."""
        # Note: Removed self.__init__ call that was creating a second BrowserManager.
        # It caused configuration inconsistency - the resolver is already initialized.

        if not doi:
            logger.warning("DOI is required for reliable resolution")

        # Create fresh context for each resolution
        browser, context = await self.browser.get_authenticated_browser_and_context_async()
        page = await context.new_page()

        openurl = self.build_openurl(
            title, authors, journal, year, volume, issue, pages, doi, pmid
        )
        logger.info(f"Resolving via OpenURL: {openurl}")

        try:
            logger.info("Navigating to OpenURL resolver...")

            # Clear any existing navigation state
            await page.wait_for_timeout(1000)

            await page.goto(
                openurl, wait_until="domcontentloaded", timeout=30000
            )

            # Checkpoint 1: After loading OpenURL resolver page
            await self._capture_checkpoint_screenshot_async(
                page, "01_openurl_loaded", doi
            )

            # Apply stealth behaviors if using standard browser
            if hasattr(self.browser, "stealth_manager"):
                await self.browser.stealth_manager.human_delay_async()
                await self.browser.stealth_manager.human_mouse_move_async(page)
                await self.browser.stealth_manager.human_scroll_async(page)

            await page.wait_for_timeout(2000)

            # Checkpoint 2: After stealth behaviors applied
            await self._capture_checkpoint_screenshot_async(
                page, "02_stealth_applied", doi
            )

            current_url = page.url
            if self._is_publisher_url(current_url, doi):
                logger.info(
                    f"Resolver redirected directly to publisher: {current_url}"
                )
                return {
                    "final_url": current_url,
                    "resolver_url": openurl,
                    "access_type": "direct_redirect",
                    "success": True,
                }

            content = await page.content()
            if any(
                phrase in content
                for phrase in [
                    "No online text available",
                    "No full text available",
                    "No electronic access",
                ]
            ):
                logger.warning("Resolver indicates no access available")
                return {
                    "final_url": None,
                    "resolver_url": current_url,
                    "access_type": "no_access",
                    "success": False,
                }

            logger.info("Looking for full-text link on resolver page...")

            # First try our GO button method
            go_button_result = (
                await self._find_and_click_publisher_go_button_async(page, doi)
            )
            if go_button_result["success"]:
                # Clean up popup if it exists
                if "popup_page" in go_button_result:
                    popup = go_button_result["popup_page"]
                    try:
                        await popup.close()
                    except Exception:
                        pass
                    # Remove popup reference from result
                    del go_button_result["popup_page"]
                return go_button_result

            # Fall back to the original link finder method
            link_result = await self._link_finder.find_link_async(page, doi)

            if not link_result["success"]:
                logger.warning(
                    "Could not find full-text link on resolver page"
                )
                return {
                    "final_url": None,
                    "resolver_url": current_url,
                    "access_type": "link_not_found",
                    "success": False,
                }

            link_url = link_result["url"]

            if link_url.startswith("javascript:"):
                logger.info("Handling JavaScript link...")
                try:
                    async with page.expect_popup(timeout=30000) as popup_info:
                        await page.evaluate(
                            link_url.replace("javascript:", "")
                        )

                    popup = await popup_info.value
                    await popup.wait_for_load_state(
                        "domcontentloaded", timeout=30000
                    )
                    final_url = popup.url

                    if any(
                        domain in final_url
                        for domain in ["openathens.net", "saml", "shibauth"]
                    ):
                        final_url = await self._follow_saml_redirect_async(
                            popup, final_url, doi
                        )

                    logger.info(f"Successfully resolved to popup: {final_url}")
                    await popup.close()

                    return {
                        "final_url": final_url,
                        "resolver_url": openurl,
                        "access_type": "institutional",
                        "success": True,
                    }

                except Exception as popup_error:
                    logger.warning(f"Popup handling failed: {popup_error}")
                    return {
                        "final_url": None,
                        "resolver_url": openurl,
                        "access_type": "popup_error",
                        "success": False,
                    }
            else:
                try:
                    new_page_promise = None

                    def handle_page_async(new_page):
                        nonlocal new_page_promise
                        new_page_promise = new_page
                        logger.info(f"New page detected: {new_page.url}")

                    context.on("page", handle_page_async)

                    await page.goto(
                        link_url, wait_until="domcontentloaded", timeout=30000
                    )
                    await page.wait_for_timeout(3000)

                    if new_page_promise:
                        target_page = new_page_promise
                        await target_page.wait_for_load_state(
                            "domcontentloaded", timeout=30000
                        )
                        final_url = target_page.url
                        logger.info(f"Using new window: {final_url}")
                        await target_page.close()
                    else:
                        final_url = page.url

                    if any(
                        domain in final_url.lower()
                        for domain in [
                            "openathens.net",
                            "saml",
                            "shibauth",
                            "institutionlogin",
                        ]
                    ):
                        final_url = await self._follow_saml_redirect_async(
                            page, final_url, doi
                        )

                    return {
                        "final_url": final_url,
                        "resolver_url": openurl,
                        "access_type": "institutional",
                        "success": True,
                    }

                except Exception as nav_error:
                    logger.error(f"Navigation failed: {nav_error}")
                    return {
                        "final_url": None,
                        "resolver_url": openurl,
                        "access_type": "navigation_error",
                        "success": False,
                    }

        except Exception as e:
            logger.error(f"OpenURL resolution failed: {e}")
            return {
                "final_url": None,
                "resolver_url": openurl,
                "access_type": "error",
                "success": False,
            }
        finally:
            await context.close()
            # await page.close()

    def _resolve_single(self, **kwargs) -> Optional[str]:
        """Synchronous wrapper for _resolve_single_async."""
        try:
            # If we're in a running loop (e.g., Jupyter/IPython), use nest_asyncio
            asyncio.get_running_loop()
            import nest_asyncio

            nest_asyncio.apply()
            result = asyncio.run(self._resolve_single_async(**kwargs))
        except RuntimeError:
            # No running loop, create a new one
            result = asyncio.run(self._resolve_single_async(**kwargs))

        self._validate_final_url(kwargs.get("doi", ""), result)
        return result.get("resolved_url") if result else None

    async def _resolve_parallel_async(
        self, dois: Union[str, List[str]], concurrency: int = 2
    ) -> Union[Optional[Dict[str, Any]], List[Optional[Dict[str, Any]]]]:
        """Resolve a list of DOIs in parallel with controlled concurrency.

        Args:
            dois: A DOI string or a list of DOI strings to resolve.
            concurrency: Maximum number of concurrent tasks (default: 2)

        Returns:
            A list of result dictionaries in the same order as the input DOIs,
            or a single result dictionary if a single DOI string was given.
        """
        if not dois:
            return []

        is_single = False
        if isinstance(dois, str):
            dois = [dois]
            is_single = True

        logger.info(
            f"--- Starting parallel resolution for {len(dois)} DOIs (concurrency: {concurrency}) ---"
        )

        # Create semaphore to limit concurrency
        semaphore = asyncio.Semaphore(concurrency)

        async def worker_async(doi):
            async with semaphore:
                # Add a random delay between requests to appear more human
                await asyncio.sleep(random.uniform(0.5, 2.0))
                return await self._resolve_single_async(doi=doi)

        # Create tasks using the worker_async function
        tasks = [worker_async(doi) for doi in dois]
        results = await asyncio.gather(*tasks)

        logger.info("--- Parallel resolution finished ---")
        return results[0] if is_single else results

    def resolve(
        self, dois: Union[str, List[str]], concurrency: int = 5
    ) -> Union[Optional[Dict[str, Any]], List[Optional[Dict[str, Any]]]]:
        """Synchronous wrapper for _resolve_parallel_async."""
        try:
            # If we're in a running loop (e.g., Jupyter/IPython), use nest_asyncio
            asyncio.get_running_loop()
            import nest_asyncio

            nest_asyncio.apply()
            results = asyncio.run(
                self._resolve_parallel_async(dois, concurrency)
            )
        except RuntimeError:
            results = asyncio.run(
                self._resolve_parallel_async(dois, concurrency)
            )

        # Validate results
        dois_list = [dois] if isinstance(dois, str) else dois
        results_list = [results] if not isinstance(results, list) else results
        for doi, result in zip(dois_list, results_list):
            self._validate_final_url(doi, result)

        return results
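    # Usage sketch (assumes an AuthenticationManager and a configured
    # "openurl_resolver_url"; see ScholarConfig):
    #
    #   resolver = OpenURLResolver(auth_manager)
    #   result = resolver.resolve("10.1002/hipo.22488")
    #   # result is a dict with "final_url", "access_type", "success", and,
    #   # after validation, "resolved_url" (None when resolution failed).
    #   results = resolver.resolve(
    #       ["10.1002/hipo.22488", "10.1038/s41593-025-01990-7"]
    #   )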
    def _validate_final_url(self, doi, result):
        if result and result.get("success"):
            final_url = result.get("final_url", "")

            # Check if we reached a publisher URL
            if self._is_publisher_url(final_url, doi=doi):
                logger.success(f"{doi}: {final_url}")
                result["resolved_url"] = final_url
                return True

            # Also accept Elsevier linking hub as success
            elif "linkinghub.elsevier.com" in final_url:
                logger.success(f"{doi}: {final_url} (Elsevier linking hub)")
                result["resolved_url"] = final_url
                return True

            # If we have a URL but it's not a publisher, still mark as partial success
            elif (
                final_url
                and "chrome-error" not in final_url
                and "openathens" not in final_url.lower()
            ):
                logger.info(f"{doi}: Reached {final_url}")
                result["resolved_url"] = final_url
                return True

        # Only mark as failed if no URL or error/auth page
        final_url = result.get("final_url") if result else "N/A"
        logger.fail(f"{doi}: Failed - {final_url}")
        if result:
            result["resolved_url"] = None
        return False

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if hasattr(self.browser, "cleanup_authenticate_async_context"):
            await self.browser.cleanup_authenticate_async_context()


async def try_openurl_resolver_async(
    title: str = "",
    authors: Optional[list] = None,
    journal: str = "",
    year: Optional[int] = None,
    volume: Optional[int] = None,
    issue: Optional[int] = None,
    pages: str = "",
    doi: str = "",
    pmid: str = "",
    resolver_url: Optional[str] = None,
    auth_manager=None,
) -> Optional[str]:
    """Try to find full-text URL via OpenURL resolver."""
    async with OpenURLResolver(auth_manager, resolver_url) as resolver:
        result = await resolver._resolve_single_async(
            title, authors, journal, year, volume, issue, pages, doi, pmid
        )
        if result and result.get("success") and result.get("final_url"):
            return result["final_url"]
        return None
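# Minimal async usage sketch for the helper above (assumes an
# AuthenticationManager from scitex.scholar.auth, as in the __main__ block):
#
#   from scitex.scholar.auth import AuthenticationManager
#   url = await try_openurl_resolver_async(
#       doi="10.1002/hipo.22488",
#       auth_manager=AuthenticationManager(),
#   )
#   # url is the resolved full-text URL, or None when no access was found.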
if __name__ == "__main__":

    async def main():
        """Test the resolver with different articles."""
        # from scitex import logging
        # logging.basicConfig(level=logging.DEBUG)
        from scitex.scholar.auth import AuthenticationManager

        auth_manager = AuthenticationManager()
        # resolver_url = "https://unimelb.hosted.exlibrisgroup.com/sfxlcl41"

        async with OpenURLResolver(auth_manager) as resolver:
            print("\n=== Test 1: Article with access ===")
            result = await resolver._resolve_single_async(
                doi="10.1002/hipo.22488",
                # title="Hippocampal sharp wave-ripple: A cognitive biomarker for episodic memory and planning",
                # authors=["Buzsáki, György"],
                # journal="Hippocampus",
                # year=2015,
                # volume=25,
                # issue=10,
                # pages="1073-1188",
            )
            print(f"Result: {result}")

            print("\n=== Test 2: Article without access ===")
            result = await resolver._resolve_single_async(
                doi="10.1038/s41593-025-01990-7",
                title="Addressing artifactual bias in large, automated MRI analyses of brain development",
                journal="Nature Neuroscience",
                year=2025,
            )
            print(f"Result: {result}")

            # Keep the session open for manual inspection without blocking
            # the event loop (the original used time.sleep(600) here)
            await asyncio.sleep(600)

    asyncio.run(main())

# python -m scitex.scholar.open_url._OpenURLResolver

# Is it a resolver url?
# https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article&rft.doi=10.1002/hipo.22488
# https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article&rft.doi=10.1038/s41593-025-01990-7
# https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article&rft.doi=10.1038/s41593-025-02020-2

# EOF