scitex 2.16.2__py3-none-any.whl → 2.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85):
  1. scitex/_mcp_resources/_cheatsheet.py +1 -1
  2. scitex/_mcp_resources/_modules.py +1 -1
  3. scitex/_mcp_tools/__init__.py +2 -0
  4. scitex/_mcp_tools/verify.py +256 -0
  5. scitex/cli/main.py +2 -0
  6. scitex/cli/verify.py +476 -0
  7. scitex/dev/plt/__init__.py +1 -1
  8. scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +90 -0
  9. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +1571 -0
  10. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +6262 -0
  11. scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +1274 -0
  12. scitex/dev/plt/data/mpl/dir_ax.txt +459 -0
  13. scitex/dev/plt/mpl/get_dir_ax.py +1 -1
  14. scitex/dev/plt/mpl/get_signatures.py +1 -1
  15. scitex/dev/plt/mpl/get_signatures_details.py +1 -1
  16. scitex/io/_load.py +8 -1
  17. scitex/io/_save.py +12 -0
  18. scitex/scholar/data/.gitkeep +0 -0
  19. scitex/scholar/data/README.md +44 -0
  20. scitex/scholar/data/bib_files/bibliography.bib +1952 -0
  21. scitex/scholar/data/bib_files/neurovista.bib +277 -0
  22. scitex/scholar/data/bib_files/neurovista_enriched.bib +441 -0
  23. scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +441 -0
  24. scitex/scholar/data/bib_files/neurovista_processed.bib +338 -0
  25. scitex/scholar/data/bib_files/openaccess.bib +89 -0
  26. scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +2178 -0
  27. scitex/scholar/data/bib_files/pac.bib +698 -0
  28. scitex/scholar/data/bib_files/pac_enriched.bib +1061 -0
  29. scitex/scholar/data/bib_files/pac_processed.bib +0 -0
  30. scitex/scholar/data/bib_files/pac_titles.txt +75 -0
  31. scitex/scholar/data/bib_files/paywalled.bib +98 -0
  32. scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +58 -0
  33. scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +87 -0
  34. scitex/scholar/data/bib_files/seizure_prediction.bib +694 -0
  35. scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
  36. scitex/scholar/data/bib_files/test_complete_enriched.bib +437 -0
  37. scitex/scholar/data/bib_files/test_final_enriched.bib +437 -0
  38. scitex/scholar/data/bib_files/test_seizure.bib +46 -0
  39. scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
  40. scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
  41. scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
  42. scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
  43. scitex/scholar/data/impact_factor.db +0 -0
  44. scitex/session/README.md +2 -2
  45. scitex/session/__init__.py +1 -0
  46. scitex/session/_decorator.py +57 -33
  47. scitex/session/_lifecycle/__init__.py +23 -0
  48. scitex/session/_lifecycle/_close.py +225 -0
  49. scitex/session/_lifecycle/_config.py +112 -0
  50. scitex/session/_lifecycle/_matplotlib.py +83 -0
  51. scitex/session/_lifecycle/_start.py +246 -0
  52. scitex/session/_lifecycle/_utils.py +186 -0
  53. scitex/session/_manager.py +40 -3
  54. scitex/session/template.py +1 -1
  55. scitex/template/_templates/plt.py +1 -1
  56. scitex/template/_templates/session.py +1 -1
  57. scitex/verify/README.md +312 -0
  58. scitex/verify/__init__.py +212 -0
  59. scitex/verify/_chain.py +369 -0
  60. scitex/verify/_db.py +600 -0
  61. scitex/verify/_hash.py +187 -0
  62. scitex/verify/_integration.py +127 -0
  63. scitex/verify/_rerun.py +253 -0
  64. scitex/verify/_tracker.py +330 -0
  65. scitex/verify/_visualize.py +48 -0
  66. scitex/verify/_viz/__init__.py +56 -0
  67. scitex/verify/_viz/_colors.py +84 -0
  68. scitex/verify/_viz/_format.py +302 -0
  69. scitex/verify/_viz/_json.py +192 -0
  70. scitex/verify/_viz/_mermaid.py +440 -0
  71. scitex/verify/_viz/_plotly.py +193 -0
  72. scitex/verify/_viz/_templates.py +246 -0
  73. scitex/verify/_viz/_utils.py +56 -0
  74. {scitex-2.16.2.dist-info → scitex-2.17.0.dist-info}/METADATA +1 -1
  75. {scitex-2.16.2.dist-info → scitex-2.17.0.dist-info}/RECORD +78 -29
  76. scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +0 -462
  77. scitex/scholar/url_finder/.tmp/open_url/README.md +0 -223
  78. scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +0 -694
  79. scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +0 -1160
  80. scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +0 -344
  81. scitex/scholar/url_finder/.tmp/open_url/__init__.py +0 -24
  82. scitex/session/_lifecycle.py +0 -827
  83. {scitex-2.16.2.dist-info → scitex-2.17.0.dist-info}/WHEEL +0 -0
  84. {scitex-2.16.2.dist-info → scitex-2.17.0.dist-info}/entry_points.txt +0 -0
  85. {scitex-2.16.2.dist-info → scitex-2.17.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,1160 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # Timestamp: "2025-08-06 16:55:39 (ywatanabe)"
4
- # File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/open_url/_OpenURLResolver.py
5
- # ----------------------------------------
6
- from __future__ import annotations
7
-
8
- import os
9
-
10
- __FILE__ = (
11
- "./src/scitex/scholar/open_url/_OpenURLResolver.py"
12
- )
13
- __DIR__ = os.path.dirname(__FILE__)
14
- # ----------------------------------------
15
-
16
- import asyncio
17
- import random
18
- import time
19
- from typing import List, Union
20
-
21
- from scitex import logging
22
-
23
- """OpenURL resolver for finding full-text access through institutional libraries."""
24
-
25
- from typing import Any, Dict, Optional
26
- from urllib.parse import urlencode
27
-
28
- from playwright.async_api import Page
29
-
30
- from scitex.scholar.browser import BrowserManager
31
- from scitex.scholar.config import ScholarConfig
32
-
33
- from ...errors import ScholarError
34
- from ._ResolverLinkFinder import ResolverLinkFinder
35
-
36
- logger = logging.getLogger(__name__)
37
-
38
-
39
- class OpenURLResolver:
40
- """Resolves DOIs/metadata to full-text URLs via institutional OpenURL resolver.
41
-
42
- OpenURL is a standardized format for encoding bibliographic information
43
- that libraries use to link to full-text resources."""
44
-
45
- AUTH_PATTERNS = [
46
- "openathens.net",
47
- "shibauth",
48
- "saml",
49
- "institutionlogin",
50
- "iam.atypon.com",
51
- "auth.elsevier.com",
52
- "go.gale.com/ps/headerQuickSearch",
53
- ]
54
-
55
- PUBLISHER_DOMAINS = [
56
- "sciencedirect.com",
57
- "nature.com",
58
- "springer.com",
59
- "wiley.com",
60
- "onlinelibrary.wiley.com",
61
- "acs.org",
62
- "tandfonline.com",
63
- "sagepub.com",
64
- "academic.oup.com",
65
- "science.org",
66
- "pnas.org",
67
- "bmj.com",
68
- "cell.com",
69
- ]
70
-
71
- def __init__(
72
- self,
73
- auth_manager,
74
- resolver_url: Optional[str] = None,
75
- browser_mode: str = "stealth",
76
- config: Optional[ScholarConfig] = None,
77
- ):
78
- """Initialize OpenURL resolver.
79
-
80
- Args:
81
- auth_manager: Authentication manager for institutional access
82
- resolver_url: Base URL of institutional OpenURL resolver
83
- (Details can be seen at https://www.zotero.org/openurl_resolvers)
84
- browser_mode: Browser mode ("stealth" or "interactive")
85
- config: ScholarConfig instance (creates new if None)
86
- """
87
- self.auth_manager = auth_manager
88
-
89
- # Initialize config
90
- if config is None:
91
- config = ScholarConfig()
92
- self.config = config
93
-
94
- # Resolve resolver URL from config
95
- self.resolver_url = self.config.resolve(
96
- "openurl_resolver_url", resolver_url, None, str
97
- )
98
-
99
- # Create BrowserManager with simplified configuration
100
- self.browser = BrowserManager(
101
- auth_manager=auth_manager,
102
- browser_mode=browser_mode,
103
- config=self.config,
104
- )
105
-
106
- self.timeout = 30
107
- self._link_finder = ResolverLinkFinder()
108
-
109
- # Screenshot capture setup (optional, controlled by config)
110
- self.capture_screenshots = self.config.resolve(
111
- "capture_screenshots", None, False, bool
112
- )
113
- if self.capture_screenshots:
114
- from datetime import datetime
115
-
116
- self.screenshot_dir = (
117
- self.config.paths.get_screenshots_dir() / "openurl"
118
- )
119
- self.screenshot_dir.mkdir(parents=True, exist_ok=True)
120
- self.session_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
121
-
122
- async def _capture_checkpoint_screenshot_async(
123
- self, page, stage: str, doi: str = ""
124
- ) -> Optional[str]:
125
- """Capture screenshot at checkpoint for debugging."""
126
- if not self.capture_screenshots:
127
- return None
128
-
129
- try:
130
- from datetime import datetime
131
-
132
- timestamp = datetime.now().strftime("%H%M%S")
133
- doi_safe = (
134
- doi.replace("/", "-").replace(".", "_") if doi else "unknown"
135
- )
136
- screenshot_name = f"openurl_{stage}_{doi_safe}_{timestamp}.png"
137
- screenshot_path = self.screenshot_dir / screenshot_name
138
-
139
- await page.screenshot(path=str(screenshot_path), full_page=True)
140
- logger.info(
141
- f"📸 Screenshot captured: {stage} -> {screenshot_name}"
142
- )
143
- return str(screenshot_path)
144
- except Exception as e:
145
- logger.warning(f"📸 Screenshot capture failed at {stage}: {e}")
146
- return None
147
-
148
- def build_openurl(
149
- self,
150
- title: str = "",
151
- authors: Optional[list] = None,
152
- journal: str = "",
153
- year: Optional[int] = None,
154
- volume: Optional[int] = None,
155
- issue: Optional[int] = None,
156
- pages: str = "",
157
- doi: str = "",
158
- pmid: str = "",
159
- ) -> str:
160
- """Build OpenURL query string from paper metadata."""
161
- params = {
162
- "ctx_ver": "Z39.88-2004",
163
- "rft_val_fmt": "info:ofi/fmt:kev:mtx:journal",
164
- "rft.genre": "article",
165
- }
166
-
167
- if title:
168
- params["rft.atitle"] = title
169
- if journal:
170
- params["rft.jtitle"] = journal
171
- if year:
172
- params["rft.date"] = str(year)
173
- if volume:
174
- params["rft.volume"] = str(volume)
175
- if issue:
176
- params["rft.issue"] = str(issue)
177
- if pages:
178
- if "-" in str(pages):
179
- spage, epage = pages.split("-", 1)
180
- params["rft.spage"] = spage.strip()
181
- params["rft.epage"] = epage.strip()
182
- else:
183
- params["rft.spage"] = str(pages)
184
- if doi:
185
- params["rft.doi"] = doi
186
- if pmid:
187
- params["rft.pmid"] = str(pmid)
188
-
189
- if authors:
190
- first_author = authors[0]
191
- if "," in first_author:
192
- last, first = first_author.split(",", 1)
193
- params["rft.aulast"] = last.strip()
194
- params["rft.aufirst"] = first.strip()
195
- params["rft.au"] = first_author
196
-
197
- query_string = urlencode(params, safe=":/")
198
- return f"{self.resolver_url}?{query_string}"
199
-
200
- def _is_publisher_url(self, url: str, doi: str = "") -> bool:
201
- """Check if URL is from expected publisher domain."""
202
- if not url:
203
- return False
204
- if any(pattern in url.lower() for pattern in self.AUTH_PATTERNS):
205
- return False
206
- if any(domain in url.lower() for domain in self.PUBLISHER_DOMAINS):
207
- return True
208
- else:
209
- return False
210
-
211
- async def _follow_saml_redirect_async(self, page, saml_url, doi=""):
212
- """Follow SAML/SSO redirect chain until publisher URL is reached."""
213
- logger.info(f"Following SAML redirect chain starting from: {saml_url}")
214
-
215
- if self._is_publisher_url(saml_url, doi):
216
- return saml_url
217
-
218
- await page.goto(
219
- saml_url,
220
- wait_until="domcontentloaded",
221
- timeout=15000, # Increased from 1.5-3s to 15s
222
- )
223
- last_url = ""
224
-
225
- for attempt in range(8):
226
- current_url = page.url
227
- logger.debug(f"SAML redirect attempt {attempt + 1}: {current_url}")
228
-
229
- if self._is_publisher_url(current_url, doi):
230
- logger.info(
231
- f"Successfully navigated to publisher URL: {current_url}"
232
- )
233
- return current_url
234
-
235
- # If URL hasn't changed in 2 attempts, we're stuck
236
- if current_url == last_url and attempt > 1:
237
- logger.warning(f"SAML redirect stuck at: {current_url}")
238
- return current_url
239
-
240
- last_url = current_url
241
-
242
- # Only try form submission first few attempts
243
- if attempt < 3:
244
- try:
245
- forms = await page.query_selector_all("form")
246
- for form in forms:
247
- if await form.is_visible():
248
- logger.debug("Submitting visible form...")
249
- await form.evaluate("form => form.submit()")
250
- await page.wait_for_load_state(
251
- "domcontentloaded", timeout=10000
252
- )
253
- break
254
- except:
255
- pass
256
-
257
- await page.wait_for_timeout(1000)
258
-
259
- final_url = page.url
260
- logger.info(f"SAML redirect completed at: {final_url}")
261
- return final_url
262
-
263
- async def _find_and_click_publisher_go_button_async(self, page, doi=""):
264
- """Find and click the appropriate publisher GO button on the OpenURL resolver page.
265
-
266
- This method implements our proven GO button detection and clicking logic
267
- that successfully worked for Science.org and Nature.com access.
268
- """
269
- try:
270
- logger.info("Looking for publisher GO buttons on resolver page...")
271
-
272
- # Get all GO buttons with context information
273
- go_buttons = await page.evaluate(
274
- """() => {
275
- const goButtons = Array.from(document.querySelectorAll('input[value="Go"], button[value="Go"], input[value="GO"], button[value="GO"]'));
276
- return goButtons.map((btn, index) => {
277
- const parentRow = btn.closest('tr') || btn.parentElement;
278
- const rowText = parentRow ? parentRow.textContent.trim() : '';
279
-
280
- // Check for publisher indicators in the row text
281
- const isScience = rowText.toLowerCase().includes('american association') ||
282
- rowText.toLowerCase().includes('aaas') ||
283
- rowText.toLowerCase().includes('science');
284
- const isNature = rowText.toLowerCase().includes('nature') ||
285
- rowText.toLowerCase().includes('springer');
286
- const isWiley = rowText.toLowerCase().includes('wiley');
287
- const isElsevier = rowText.toLowerCase().includes('elsevier') ||
288
- rowText.toLowerCase().includes('sciencedirect');
289
-
290
- return {
291
- index: index,
292
- globalIndex: Array.from(document.querySelectorAll('input, button, a, [onclick]')).indexOf(btn),
293
- value: btn.value,
294
- rowText: rowText,
295
- isScience: isScience,
296
- isNature: isNature,
297
- isWiley: isWiley,
298
- isElsevier: isElsevier,
299
- isPublisher: isScience || isNature || isWiley || isElsevier
300
- };
301
- });
302
- }"""
303
- )
304
-
305
- if not go_buttons:
306
- logger.warning("No GO buttons found on resolver page")
307
- return {"success": False, "reason": "no_go_buttons"}
308
-
309
- logger.info(f"Found {len(go_buttons)} GO buttons")
310
- for btn in go_buttons:
311
- logger.debug(
312
- f"GO button {btn['index']}: {btn['rowText'][:50]}... (Publisher: {btn['isPublisher']})"
313
- )
314
-
315
- # Find the most appropriate publisher button
316
- target_button = None
317
-
318
- # Priority order: Science > Nature > Other publishers
319
- for btn in go_buttons:
320
- if btn["isScience"]:
321
- target_button = btn
322
- logger.info(
323
- f"Selected Science/AAAS GO button: {btn['rowText'][:50]}..."
324
- )
325
- break
326
- elif btn["isNature"]:
327
- target_button = btn
328
- logger.info(
329
- f"Selected Nature GO button: {btn['rowText'][:50]}..."
330
- )
331
- break
332
- elif btn["isPublisher"]:
333
- target_button = btn
334
- logger.info(
335
- f"Selected publisher GO button: {btn['rowText'][:50]}..."
336
- )
337
- break
338
-
339
- # Fallback: if no publisher buttons, try the first few GO buttons (might be direct access)
340
- if not target_button and go_buttons:
341
- target_button = go_buttons[0]
342
- logger.info(
343
- f"Using fallback GO button: {target_button['rowText'][:50]}..."
344
- )
345
-
346
- if not target_button:
347
- logger.warning("No suitable GO button found")
348
- return {"success": False, "reason": "no_suitable_button"}
349
-
350
- # Click the selected GO button and handle popup
351
- logger.info("Clicking GO button and waiting for popup...")
352
-
353
- try:
354
- # Set up popup listener before clicking
355
- popup_promise = page.wait_for_event("popup", timeout=30000)
356
-
357
- # Click the button using its global index for reliability
358
- click_result = await page.evaluate(
359
- f"""() => {{
360
- const allElements = Array.from(document.querySelectorAll('input, button, a, [onclick]'));
361
- const targetButton = allElements[{target_button['globalIndex']}];
362
- if (targetButton && (targetButton.value === 'Go' || targetButton.value === 'GO')) {{
363
- console.log('Clicking GO button:', targetButton);
364
- targetButton.click();
365
- return 'clicked';
366
- }}
367
- return 'not-found';
368
- }}"""
369
- )
370
-
371
- if click_result != "clicked":
372
- logger.warning("Failed to click GO button")
373
- return {"success": False, "reason": "click_failed"}
374
-
375
- # Wait for popup
376
- popup = await popup_promise
377
- logger.info("Publisher popup opened successfully")
378
-
379
- # Wait for popup to load
380
- await popup.wait_for_load_state(
381
- "domcontentloaded", timeout=30000
382
- )
383
- await popup.wait_for_timeout(5000) # Allow time for redirects
384
-
385
- final_url = popup.url
386
- popup_title = await popup.title()
387
-
388
- logger.info(f"Successfully accessed: {popup_title}")
389
- logger.info(f"Final URL: {final_url}")
390
-
391
- # Verify we reached a publisher URL
392
- if self._is_publisher_url(final_url, doi):
393
- logger.success(
394
- f"Successfully reached publisher: {final_url}"
395
- )
396
-
397
- result = {
398
- "final_url": final_url,
399
- "resolver_url": page.url,
400
- "access_type": "publisher_go_button",
401
- "success": True,
402
- "publisher_detected": True,
403
- "popup_page": popup, # Keep popup open for potential PDF download
404
- }
405
-
406
- # Don't close popup immediately - let caller decide
407
- # await popup.close()
408
- return result
409
- else:
410
- logger.info(f"Reached non-publisher URL: {final_url}")
411
-
412
- result = {
413
- "final_url": final_url,
414
- "resolver_url": page.url,
415
- "access_type": "go_button_redirect",
416
- "success": True,
417
- "publisher_detected": False,
418
- "popup_page": popup, # Keep popup open for potential PDF download
419
- }
420
-
421
- # Don't close popup immediately - let caller decide
422
- # await popup.close()
423
- return result
424
-
425
- except Exception as popup_error:
426
- logger.warning(f"Popup handling failed: {popup_error}")
427
- return {
428
- "success": False,
429
- "reason": f"popup_error: {popup_error}",
430
- }
431
-
432
- except Exception as e:
433
- logger.error(f"GO button detection failed: {e}")
434
- return {"success": False, "reason": f"detection_error: {e}"}
435
-
436
- async def _download_pdf_async_from_publisher_page(
437
- self, popup, filename, download_dir="downloads"
438
- ):
439
- """Download PDF from publisher page after successful GO button access.
440
-
441
- This method implements our proven PDF download logic that works
442
- with various publisher sites including Science.org and Nature.com.
443
- """
444
- from pathlib import Path
445
-
446
- try:
447
- download_path = Path(download_dir)
448
- download_path.mkdir(exist_ok=True)
449
-
450
- logger.info("Looking for PDF download links on publisher page...")
451
-
452
- # Find PDF download links
453
- pdf_links = await popup.evaluate(
454
- """() => {
455
- const allLinks = Array.from(document.querySelectorAll('a, button, input'));
456
- return allLinks.filter(el =>
457
- el.textContent.toLowerCase().includes('pdf') ||
458
- el.textContent.toLowerCase().includes('download') ||
459
- el.href?.includes('pdf') ||
460
- el.getAttribute('data-track-action')?.includes('pdf')
461
- ).map(el => ({
462
- tag: el.tagName,
463
- text: el.textContent.trim(),
464
- href: el.href || el.value || 'no-href',
465
- className: el.className,
466
- id: el.id,
467
- trackAction: el.getAttribute('data-track-action') || 'none'
468
- }));
469
- }"""
470
- )
471
-
472
- if not pdf_links:
473
- logger.warning("No PDF links found on publisher page")
474
- return {"success": False, "reason": "no_pdf_links"}
475
-
476
- logger.info(f"Found {len(pdf_links)} potential PDF links")
477
- for i, link in enumerate(pdf_links):
478
- logger.debug(
479
- f"PDF link {i}: {link['text'][:30]}... | {link['href'][:50]}..."
480
- )
481
-
482
- # Find the best PDF download link
483
- main_pdf_link = None
484
-
485
- # Priority order: direct download > PDF with download > PDF view
486
- for link in pdf_links:
487
- if "download pdf" in link["text"].lower():
488
- main_pdf_link = link
489
- break
490
- elif link["href"] != "no-href" and link["href"].endswith(
491
- ".pdf"
492
- ):
493
- main_pdf_link = link
494
- break
495
- elif (
496
- "pdf" in link["text"].lower()
497
- and "view" not in link["text"].lower()
498
- ):
499
- main_pdf_link = link
500
- break
501
-
502
- if not main_pdf_link:
503
- main_pdf_link = pdf_links[
504
- 0
505
- ] # Fallback to first PDF-related link
506
-
507
- logger.info(f"Selected PDF link: {main_pdf_link['text'][:40]}...")
508
-
509
- # Set up download path
510
- file_path = download_path / filename
511
-
512
- # Configure download headers
513
- await popup.set_extra_http_headers(
514
- {"Accept": "application/pdf,application/octet-stream,*/*"}
515
- )
516
-
517
- # Try download methods
518
- pdf_download = False
519
-
520
- # Method 1: Direct URL navigation
521
- if (
522
- main_pdf_link["href"] != "no-href"
523
- and "pdf" in main_pdf_link["href"].lower()
524
- ):
525
- try:
526
- logger.info("Attempting direct PDF URL download...")
527
- download_promise = popup.wait_for_event(
528
- "download", timeout=30000
529
- )
530
- await popup.goto(main_pdf_link["href"])
531
-
532
- download = await download_promise
533
- await download.save_as(str(file_path))
534
-
535
- if file_path.exists():
536
- size_mb = file_path.stat().st_size / (1024 * 1024)
537
- logger.success(
538
- f"PDF download successfully: {filename} ({size_mb:.1f} MB)"
539
- )
540
- pdf_download = True
541
-
542
- except Exception as e:
543
- logger.debug(f"Direct download failed: {e}")
544
-
545
- # Method 2: Click-based download
546
- if not pdf_download:
547
- try:
548
- logger.info("Attempting click-based PDF download...")
549
- download_promise = popup.wait_for_event(
550
- "download", timeout=30000
551
- )
552
-
553
- # Click the first PDF link
554
- await popup.evaluate(
555
- """() => {
556
- const allLinks = Array.from(document.querySelectorAll('a, button, input'));
557
- const pdfLinks = allLinks.filter(el =>
558
- el.textContent.toLowerCase().includes('pdf') ||
559
- el.textContent.toLowerCase().includes('download') ||
560
- el.href?.includes('pdf')
561
- );
562
- if (pdfLinks.length > 0) {
563
- pdfLinks[0].click();
564
- return 'clicked';
565
- }
566
- return 'no-link';
567
- }"""
568
- )
569
-
570
- download = await download_promise
571
- await download.save_as(str(file_path))
572
-
573
- if file_path.exists():
574
- size_mb = file_path.stat().st_size / (1024 * 1024)
575
- logger.success(
576
- f"PDF download successfully: {filename} ({size_mb:.1f} MB)"
577
- )
578
- pdf_download = True
579
-
580
- except Exception as e:
581
- logger.debug(f"Click-based download failed: {e}")
582
-
583
- if pdf_download:
584
- return {
585
- "success": True,
586
- "filename": filename,
587
- "path": str(file_path),
588
- "size_mb": (
589
- file_path.stat().st_size / (1024 * 1024)
590
- if file_path.exists()
591
- else 0
592
- ),
593
- }
594
- else:
595
- logger.warning("All PDF download methods failed")
596
- # Take screenshot for debugging
597
- screenshot_path = (
598
- download_path
599
- / f"pdf_download_failed_{filename.replace('.pdf', '.png')}"
600
- )
601
- await popup.screenshot(
602
- path=str(screenshot_path), full_page=True
603
- )
604
- logger.info(
605
- f"Screenshot saved for debugging: {screenshot_path}"
606
- )
607
-
608
- return {
609
- "success": False,
610
- "reason": "download_failed",
611
- "screenshot": str(screenshot_path),
612
- "available_links": [link["text"] for link in pdf_links],
613
- }
614
-
615
- except Exception as e:
616
- logger.error(f"PDF download failed: {e}")
617
- return {"success": False, "reason": f"error: {e}"}
618
-
619
- async def resolve_and_download_pdf_async(
620
- self,
621
- title: str = "",
622
- authors: Optional[list] = None,
623
- journal: str = "",
624
- year: Optional[int] = None,
625
- volume: Optional[int] = None,
626
- issue: Optional[int] = None,
627
- pages: str = "",
628
- doi: str = "",
629
- pmid: str = "",
630
- filename: str = None,
631
- download_dir: str = "downloads",
632
- ) -> Dict[str, Any]:
633
- """Resolve paper access and download PDF in one operation.
634
-
635
- This method combines our GO button resolution with PDF download
636
- to provide a complete paper acquisition workflow.
637
- """
638
- if not filename:
639
- # Generate filename from metadata
640
- first_author = (
641
- authors[0].split(",")[0].strip() if authors else "Unknown"
642
- )
643
- filename = f"{first_author}-{year}-{journal.replace(' ', '')}-{title[:30].replace(' ', '_')}.pdf"
644
- # Clean filename
645
- filename = "".join(
646
- c for c in filename if c.isalnum() or c in ".-_"
647
- ).strip()
648
-
649
- logger.info(f"Starting resolve and download for: {filename}")
650
-
651
- # Create fresh context for this operation
652
- browser, context = await self.browser.get_authenticated_browser_and_context_async()
653
- page = await context.new_page()
654
-
655
- try:
656
- # Build OpenURL
657
- openurl = self.build_openurl(
658
- title, authors, journal, year, volume, issue, pages, doi, pmid
659
- )
660
- logger.info(f"Resolving and downloading via OpenURL: {openurl}")
661
-
662
- # Navigate to OpenURL resolver
663
- await page.goto(
664
- openurl, wait_until="domcontentloaded", timeout=30000
665
- )
666
- await page.wait_for_timeout(2000)
667
-
668
- # Try GO button method
669
- go_button_result = (
670
- await self._find_and_click_publisher_go_button_async(page, doi)
671
- )
672
-
673
- if (
674
- go_button_result["success"]
675
- and "popup_page" in go_button_result
676
- ):
677
- popup = go_button_result["popup_page"]
678
-
679
- logger.info(
680
- "Successfully accessed publisher page, attempting PDF download..."
681
- )
682
-
683
- # Try to download PDF
684
- download_result = (
685
- await self._download_pdf_async_from_publisher_page(
686
- popup, filename, download_dir
687
- )
688
- )
689
-
690
- # Close popup
691
- try:
692
- await popup.close()
693
- except:
694
- pass
695
-
696
- # Combine results
697
- final_result = {
698
- **go_button_result,
699
- "pdf_download": download_result,
700
- "filename": filename,
701
- }
702
-
703
- # Remove popup reference
704
- if "popup_page" in final_result:
705
- del final_result["popup_page"]
706
-
707
- if download_result["success"]:
708
- logger.success(f"Successfully download PDF: {filename}")
709
- else:
710
- logger.warning(
711
- f"Paper accessed but PDF download failed: {download_result.get('reason', 'unknown')}"
712
- )
713
-
714
- return final_result
715
-
716
- else:
717
- logger.warning("Could not access publisher page via GO button")
718
- return {
719
- "success": False,
720
- "reason": "go_button_failed",
721
- "go_button_result": go_button_result,
722
- "filename": filename,
723
- }
724
-
725
- except Exception as e:
726
- logger.error(f"Resolve and download failed: {e}")
727
- return {
728
- "success": False,
729
- "reason": f"error: {e}",
730
- "filename": filename,
731
- }
732
- finally:
733
- await context.close()
734
-
735
- async def _resolve_single_async(
736
- self,
737
- title: str = "",
738
- authors: Optional[list] = None,
739
- journal: str = "",
740
- year: Optional[int] = None,
741
- volume: Optional[int] = None,
742
- issue: Optional[int] = None,
743
- pages: str = "",
744
- doi: str = "",
745
- pmid: str = "",
746
- ) -> Optional[Dict[str, Any]]:
747
-
748
- # Note: Removed self.__init__ call that was creating second BrowserManager
749
- # This was causing configuration inconsistency - the resolver is already initialized
750
-
751
- if not doi:
752
- logger.warning("DOI is required for reliable resolution")
753
-
754
- # Create fresh context for each resolution
755
- browser, context = await self.browser.get_authenticated_browser_and_context_async()
756
- page = await context.new_page()
757
-
758
- openurl = self.build_openurl(
759
- title, authors, journal, year, volume, issue, pages, doi, pmid
760
- )
761
- logger.info(f"Resolving via OpenURL: {openurl}")
762
-
763
- try:
764
- logger.info("Navigating to OpenURL resolver...")
765
-
766
- # Clear any existing navigation state
767
- await page.wait_for_timeout(1000)
768
-
769
- await page.goto(
770
- openurl, wait_until="domcontentloaded", timeout=30000
771
- )
772
-
773
- # Checkpoint 1: After loading OpenURL resolver page
774
- await self._capture_checkpoint_screenshot_async(
775
- page, "01_openurl_loaded", doi
776
- )
777
-
778
- # Apply stealth behaviors if using standard browser
779
- if hasattr(self.browser, "stealth_manager"):
780
- await self.browser.stealth_manager.human_delay_async()
781
- await self.browser.stealth_manager.human_mouse_move_async(page)
782
- await self.browser.stealth_manager.human_scroll_async(page)
783
-
784
- await page.wait_for_timeout(2000)
785
-
786
- # Checkpoint 2: After stealth behaviors applied
787
- await self._capture_checkpoint_screenshot_async(
788
- page, "02_stealth_applied", doi
789
- )
790
-
791
- current_url = page.url
792
- if self._is_publisher_url(current_url, doi):
793
- logger.info(
794
- f"Resolver redirected directly to publisher: {current_url}"
795
- )
796
- return {
797
- "final_url": current_url,
798
- "resolver_url": openurl,
799
- "access_type": "direct_redirect",
800
- "success": True,
801
- }
802
-
803
- content = await page.content()
804
- if any(
805
- phrase in content
806
- for phrase in [
807
- "No online text available",
808
- "No full text available",
809
- "No electronic access",
810
- ]
811
- ):
812
- logger.warn("Resolver indicates no access available")
813
- return {
814
- "final_url": None,
815
- "resolver_url": current_url,
816
- "access_type": "no_access",
817
- "success": False,
818
- }
819
-
820
- logger.info("Looking for full-text link on resolver page...")
821
-
822
- # First try our GO button method
823
- go_button_result = (
824
- await self._find_and_click_publisher_go_button_async(page, doi)
825
- )
826
- if go_button_result["success"]:
827
- # Clean up popup if it exists
828
- if "popup_page" in go_button_result:
829
- popup = go_button_result["popup_page"]
830
- try:
831
- await popup.close()
832
- except:
833
- pass
834
- # Remove popup reference from result
835
- del go_button_result["popup_page"]
836
- return go_button_result
837
-
838
- # Fallback to original link finder method
839
- link_result = await self._link_finder.find_link_async(page, doi)
840
-
841
- if not link_result["success"]:
842
- logger.warning(
843
- "Could not find full-text link on resolver page"
844
- )
845
- return {
846
- "final_url": None,
847
- "resolver_url": current_url,
848
- "access_type": "link_not_found",
849
- "success": False,
850
- }
851
-
852
- link_url = link_result["url"]
853
-
854
- if link_url.startswith("javascript:"):
855
- logger.info("Handling JavaScript link...")
856
- try:
857
- async with page.expect_popup(timeout=30000) as popup_info:
858
- await page.evaluate(
859
- link_url.replace("javascript:", "")
860
- )
861
-
862
- popup = await popup_info.value
863
- await popup.wait_for_load_state(
864
- "domcontentloaded", timeout=30000
865
- )
866
- final_url = popup.url
867
-
868
- if any(
869
- domain in final_url
870
- for domain in ["openathens.net", "saml", "shibauth"]
871
- ):
872
- final_url = await self._follow_saml_redirect_async(
873
- popup, final_url, doi
874
- )
875
-
876
- logger.info(f"Successfully resolved to popup: {final_url}")
877
- await popup.close()
878
-
879
- return {
880
- "final_url": final_url,
881
- "resolver_url": openurl,
882
- "access_type": "institutional",
883
- "success": True,
884
- }
885
-
886
- except Exception as popup_error:
887
- logger.warning(f"Popup handling failed: {popup_error}")
888
- return {
889
- "final_url": None,
890
- "resolver_url": openurl,
891
- "access_type": "popup_error",
892
- "success": False,
893
- }
894
- else:
895
- try:
896
- new_page_promise = None
897
-
898
- def handle_page_async(new_page):
899
- nonlocal new_page_promise
900
- new_page_promise = new_page
901
- logger.info(f"New page detected: {new_page.url}")
902
-
903
- context.on("page", handle_page_async)
904
-
905
- await page.goto(
906
- link_url, wait_until="domcontentloaded", timeout=30000
907
- )
908
- await page.wait_for_timeout(3000)
909
-
910
- if new_page_promise:
911
- target_page = new_page_promise
912
- await target_page.wait_for_load_state(
913
- "domcontentloaded", timeout=30000
914
- )
915
- final_url = target_page.url
916
- logger.info(f"Using new window: {final_url}")
917
- await target_page.close()
918
- else:
919
- final_url = page.url
920
-
921
- if any(
922
- domain in final_url.lower()
923
- for domain in [
924
- "openathens.net",
925
- "saml",
926
- "shibauth",
927
- "institutionlogin",
928
- ]
929
- ):
930
- final_url = await self._follow_saml_redirect_async(
931
- page, final_url, doi
932
- )
933
-
934
- return {
935
- "final_url": final_url,
936
- "resolver_url": openurl,
937
- "access_type": "institutional",
938
- "success": True,
939
- }
940
-
941
- except Exception as nav_error:
942
- logger.error(f"Navigation failed: {nav_error}")
943
- return {
944
- "final_url": None,
945
- "resolver_url": openurl,
946
- "access_type": "navigation_error",
947
- "success": False,
948
- }
949
-
950
- except Exception as e:
951
- logger.error(f"OpenURL resolution failed: {e}")
952
- return {
953
- "final_url": None,
954
- "resolver_url": openurl,
955
- "access_type": "error",
956
- "success": False,
957
- }
958
- finally:
959
- await context.close()
960
- # await page.close()
961
-
962
- def _resolve_single(self, **kwargs) -> str:
963
- """Synchronous wrapper for _resolve_single_async."""
964
- import asyncio
965
-
966
- try:
967
- # Try to get existing loop
968
- loop = asyncio.get_running_loop()
969
- # If we're in Jupyter/IPython, use nest_asyncio
970
- import nest_asyncio
971
-
972
- nest_asyncio.apply()
973
- result = asyncio.run(self._resolve_single_async(**kwargs))
974
- except RuntimeError:
975
- # No running loop, create new one
976
- result = asyncio.run(self._resolve_single_async(**kwargs))
977
-
978
- self._validate_final_url(kwargs.get("doi", ""), result)
979
- return result.get("resolved_url") if result else None
980
-
981
async def _resolve_parallel_async(
    self, dois: Union[str, List[str]], concurrency: int = 2
) -> Union[Optional[Dict[str, Any]], List[Optional[Dict[str, Any]]]]:
    """Resolves a list of DOIs in parallel with controlled concurrency.

    Args:
        dois: A DOI string or a list of DOI strings to resolve.
        concurrency: Maximum number of concurrent tasks (default: 2).
            Values below 1 are treated as 1.

    Returns:
        For a list input, a list of result dictionaries in the same order
        as the input DOIs. For a single DOI string, that DOI's result
        dictionary alone. An empty input (empty list or empty string)
        returns an empty list.
    """
    if not dois:
        return []

    is_single = isinstance(dois, str)
    if is_single:
        dois = [dois]

    # Semaphore(0) would block every worker forever and a negative value
    # raises ValueError, so always allow at least one task to run.
    concurrency = max(1, concurrency)

    logger.info(
        f"--- Starting parallel resolution for {len(dois)} DOIs (concurrency: {concurrency}) ---"
    )

    # Create semaphore to limit concurrency
    semaphore = asyncio.Semaphore(concurrency)

    async def worker_async(doi):
        async with semaphore:
            # Add random delay between requests to appear more human
            await asyncio.sleep(random.uniform(0.5, 2.0))
            return await self._resolve_single_async(doi=doi)

    # gather preserves the order of the awaitables it is given
    results = await asyncio.gather(*(worker_async(doi) for doi in dois))

    logger.info("--- Parallel resolution finished ---")
    return results[0] if is_single else results
1020
-
1021
def resolve(
    self, dois: Union[str, List[str]], concurrency: int = 5
) -> Union[Optional[Dict[str, Any]], List[Optional[Dict[str, Any]]]]:
    """Synchronous wrapper for _resolve_parallel_async.

    Args:
        dois: A DOI string or a list of DOI strings to resolve.
        concurrency: Passed through to ``_resolve_parallel_async``.

    Returns:
        Whatever ``_resolve_parallel_async`` returned, after each
        (doi, result) pair has been passed to ``_validate_final_url``.
    """
    # Detect a running loop on its own, so that a RuntimeError raised by the
    # resolution itself is not mistaken for "no running loop" and the whole
    # batch silently executed a second time.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_is_running = False
    else:
        loop_is_running = True

    if loop_is_running:
        # Inside an already running loop: patch asyncio so asyncio.run can nest
        import nest_asyncio

        nest_asyncio.apply()

    results = asyncio.run(self._resolve_parallel_async(dois, concurrency))

    # Validate results
    dois_list = [dois] if isinstance(dois, str) else dois
    results_list = results if isinstance(results, list) else [results]
    for doi, result in zip(dois_list, results_list):
        self._validate_final_url(doi, result)

    return results
1045
-
1046
def _validate_final_url(self, doi, result) -> bool:
    """Classify a resolution result and record its resolved URL.

    Sets ``result["resolved_url"]`` in place: to the final URL when the
    result is accepted, otherwise to ``None`` (when ``result`` is truthy).

    Args:
        doi: DOI used in log messages and passed to ``_is_publisher_url``.
        result: Result dictionary (or ``None``) with ``"success"`` and
            ``"final_url"`` entries.

    Returns:
        True when the result is marked successful and its final URL is a
        publisher URL, an Elsevier linking-hub URL, or any other non-empty
        URL that is neither a chrome error page nor an OpenAthens page;
        False otherwise.
    """
    if result and result.get("success"):
        # "final_url" may be present with a None value; normalise it so the
        # substring checks below cannot raise TypeError.
        final_url = result.get("final_url") or ""

        # Check if we reached a publisher URL
        if self._is_publisher_url(final_url, doi=doi):
            logger.success(f"{doi}: {final_url}")
            result["resolved_url"] = final_url
            return True

        # Also accept Elsevier linking hub as success
        if "linkinghub.elsevier.com" in final_url:
            logger.success(f"{doi}: {final_url} (Elsevier linking hub)")
            result["resolved_url"] = final_url
            return True

        # If we have a URL but it's not a publisher, still mark as partial success
        if (
            final_url
            and "chrome-error" not in final_url
            and "openathens" not in final_url.lower()
        ):
            logger.info(f"{doi}: Reached {final_url}")
            result["resolved_url"] = final_url
            return True

    # Only mark as failed if no URL or error/auth page
    final_url = result.get("final_url") if result else "N/A"
    logger.fail(f"{doi}: Failed - {final_url}")
    if result:
        result["resolved_url"] = None
    return False
1078
-
1079
- async def __aenter__(self):
1080
- return self
1081
-
1082
- async def __aexit__(self, exc_type, exc_val, exc_tb):
1083
- if hasattr(self.browser, "cleanup_authenticate_async_context"):
1084
- await self.browser.cleanup_authenticate_async_context()
1085
-
1086
-
1087
async def try_openurl_resolver_async(
    title: str = "",
    authors: Optional[list] = None,
    journal: str = "",
    year: Optional[int] = None,
    volume: Optional[int] = None,
    issue: Optional[int] = None,
    pages: str = "",
    doi: str = "",
    pmid: str = "",
    resolver_url: Optional[str] = None,
    auth_manager=None,
) -> Optional[str]:
    """Try to find full-text URL via OpenURL resolver.

    Opens an ``OpenURLResolver`` built from ``auth_manager`` and
    ``resolver_url`` as an async context manager and forwards the
    bibliographic fields positionally to its ``_resolve_single_async``.

    Returns:
        The result's ``"final_url"`` when the result is marked successful
        and that URL is non-empty; otherwise ``None``.
    """
    async with OpenURLResolver(auth_manager, resolver_url) as resolver:
        outcome = await resolver._resolve_single_async(
            title, authors, journal, year, volume, issue, pages, doi, pmid
        )
        # Guard clauses: anything short of a successful result yields None
        if not outcome or not outcome.get("success"):
            return None
        return outcome.get("final_url") or None
1108
-
1109
-
1110
if __name__ == "__main__":
    import asyncio

    async def main():
        """Test the resolver with different articles."""
        # from scitex import logging
        # logging.basicConfig(level=logging.DEBUG)
        from scitex.scholar.auth import AuthenticationManager

        auth_manager = AuthenticationManager()
        # resolver_url = "https://unimelb.hosted.exlibrisgroup.com/sfxlcl41"

        async with OpenURLResolver(auth_manager) as resolver:
            print("\n=== Test 1: Article with access ===")
            result = await resolver._resolve_single_async(
                doi="10.1002/hipo.22488",
                # title="Hippocampal sharp wave-ripple: A cognitive biomarker for episodic memory and planning",
                # authors=["Buzsáki, György"],
                # journal="Hippocampus",
                # year=2015,
                # volume=25,
                # issue=10,
                # pages="1073-1188",
            )
            print(f"Result: {result}")

            print("\n=== Test 2: Article without access ===")
            result = await resolver._resolve_single_async(
                doi="10.1038/s41593-025-01990-7",
                title="Addressing artifactual bias in large, automated MRI analyses of brain development",
                journal="Nature Neuroscience",
                year=2025,
            )
            print(f"Result: {result}")

            # Non-blocking pause: time.sleep() inside a coroutine would stall
            # the event loop for the whole ten minutes.
            # NOTE(review): presumably keeps the session alive for manual
            # inspection before cleanup — confirm the pause is still wanted.
            await asyncio.sleep(600)

    asyncio.run(main())
1148
-
1149
- # python -m scitex.scholar.open_url._OpenURLResolver
1150
-
1151
- # Is it a resolver url?
1152
- # https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article&rft.doi=10.1002/hipo.22488
1153
-
1154
- # EOF
1155
- # https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article&rft.doi=10.1038/s41593-025-01990-7
1156
-
1157
- # https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article&rft.doi=10.1002/hipo.22488
1158
- # https://unimelb.hosted.exlibrisgroup.com/sfxlcl41?ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rft.genre=article&rft.doi=10.1038/s41593-025-02020-2
1159
-
1160
- # EOF