scitex 2.14.0__py3-none-any.whl → 2.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218)
  1. scitex/__init__.py +47 -0
  2. scitex/_env_loader.py +156 -0
  3. scitex/_mcp_resources/__init__.py +37 -0
  4. scitex/_mcp_resources/_cheatsheet.py +135 -0
  5. scitex/_mcp_resources/_figrecipe.py +138 -0
  6. scitex/_mcp_resources/_formats.py +102 -0
  7. scitex/_mcp_resources/_modules.py +337 -0
  8. scitex/_mcp_resources/_session.py +149 -0
  9. scitex/_mcp_tools/__init__.py +4 -0
  10. scitex/_mcp_tools/audio.py +66 -0
  11. scitex/_mcp_tools/diagram.py +11 -95
  12. scitex/_mcp_tools/introspect.py +191 -0
  13. scitex/_mcp_tools/plt.py +260 -305
  14. scitex/_mcp_tools/scholar.py +74 -0
  15. scitex/_mcp_tools/social.py +244 -0
  16. scitex/_mcp_tools/writer.py +21 -204
  17. scitex/ai/_gen_ai/_PARAMS.py +10 -7
  18. scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
  19. scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
  20. scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
  21. scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
  22. scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
  23. scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
  24. scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
  25. scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
  26. scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
  27. scitex/audio/README.md +40 -36
  28. scitex/audio/__init__.py +127 -59
  29. scitex/audio/_branding.py +185 -0
  30. scitex/audio/_mcp/__init__.py +32 -0
  31. scitex/audio/_mcp/handlers.py +59 -6
  32. scitex/audio/_mcp/speak_handlers.py +238 -0
  33. scitex/audio/_relay.py +225 -0
  34. scitex/audio/engines/elevenlabs_engine.py +6 -1
  35. scitex/audio/mcp_server.py +228 -75
  36. scitex/canvas/README.md +1 -1
  37. scitex/canvas/editor/_dearpygui/__init__.py +25 -0
  38. scitex/canvas/editor/_dearpygui/_editor.py +147 -0
  39. scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
  40. scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
  41. scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
  42. scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
  43. scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
  44. scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
  45. scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
  46. scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
  47. scitex/canvas/editor/_dearpygui/_selection.py +295 -0
  48. scitex/canvas/editor/_dearpygui/_state.py +93 -0
  49. scitex/canvas/editor/_dearpygui/_utils.py +61 -0
  50. scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
  51. scitex/cli/__init__.py +38 -43
  52. scitex/cli/audio.py +76 -27
  53. scitex/cli/capture.py +13 -20
  54. scitex/cli/introspect.py +443 -0
  55. scitex/cli/main.py +198 -109
  56. scitex/cli/mcp.py +60 -34
  57. scitex/cli/scholar/__init__.py +8 -0
  58. scitex/cli/scholar/_crossref_scitex.py +296 -0
  59. scitex/cli/scholar/_fetch.py +25 -3
  60. scitex/cli/social.py +314 -0
  61. scitex/cli/writer.py +117 -0
  62. scitex/config/README.md +1 -1
  63. scitex/config/__init__.py +16 -2
  64. scitex/config/_env_registry.py +191 -0
  65. scitex/diagram/__init__.py +42 -19
  66. scitex/diagram/mcp_server.py +13 -125
  67. scitex/introspect/__init__.py +75 -0
  68. scitex/introspect/_call_graph.py +303 -0
  69. scitex/introspect/_class_hierarchy.py +163 -0
  70. scitex/introspect/_core.py +42 -0
  71. scitex/introspect/_docstring.py +131 -0
  72. scitex/introspect/_examples.py +113 -0
  73. scitex/introspect/_imports.py +271 -0
  74. scitex/introspect/_mcp/__init__.py +37 -0
  75. scitex/introspect/_mcp/handlers.py +208 -0
  76. scitex/introspect/_members.py +151 -0
  77. scitex/introspect/_resolve.py +89 -0
  78. scitex/introspect/_signature.py +131 -0
  79. scitex/introspect/_source.py +80 -0
  80. scitex/introspect/_type_hints.py +172 -0
  81. scitex/io/bundle/README.md +1 -1
  82. scitex/mcp_server.py +98 -5
  83. scitex/plt/__init__.py +248 -550
  84. scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
  85. scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  86. scitex/plt/gallery/README.md +1 -1
  87. scitex/plt/utils/_hitmap/__init__.py +82 -0
  88. scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
  89. scitex/plt/utils/_hitmap/_color_application.py +346 -0
  90. scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
  91. scitex/plt/utils/_hitmap/_constants.py +40 -0
  92. scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
  93. scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
  94. scitex/plt/utils/_hitmap/_query.py +113 -0
  95. scitex/plt/utils/_hitmap.py +46 -1616
  96. scitex/plt/utils/_metadata/__init__.py +80 -0
  97. scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
  98. scitex/plt/utils/_metadata/_artists/_base.py +195 -0
  99. scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
  100. scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
  101. scitex/plt/utils/_metadata/_artists/_images.py +80 -0
  102. scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
  103. scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
  104. scitex/plt/utils/_metadata/_artists/_text.py +106 -0
  105. scitex/plt/utils/_metadata/_csv.py +416 -0
  106. scitex/plt/utils/_metadata/_detect.py +225 -0
  107. scitex/plt/utils/_metadata/_legend.py +127 -0
  108. scitex/plt/utils/_metadata/_rounding.py +117 -0
  109. scitex/plt/utils/_metadata/_verification.py +202 -0
  110. scitex/schema/README.md +1 -1
  111. scitex/scholar/__init__.py +8 -0
  112. scitex/scholar/_mcp/crossref_handlers.py +265 -0
  113. scitex/scholar/core/Scholar.py +63 -1700
  114. scitex/scholar/core/_mixins/__init__.py +36 -0
  115. scitex/scholar/core/_mixins/_enrichers.py +270 -0
  116. scitex/scholar/core/_mixins/_library_handlers.py +100 -0
  117. scitex/scholar/core/_mixins/_loaders.py +103 -0
  118. scitex/scholar/core/_mixins/_pdf_download.py +375 -0
  119. scitex/scholar/core/_mixins/_pipeline.py +312 -0
  120. scitex/scholar/core/_mixins/_project_handlers.py +125 -0
  121. scitex/scholar/core/_mixins/_savers.py +69 -0
  122. scitex/scholar/core/_mixins/_search.py +103 -0
  123. scitex/scholar/core/_mixins/_services.py +88 -0
  124. scitex/scholar/core/_mixins/_url_finding.py +105 -0
  125. scitex/scholar/crossref_scitex.py +367 -0
  126. scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  127. scitex/scholar/examples/00_run_all.sh +120 -0
  128. scitex/scholar/jobs/_executors.py +27 -3
  129. scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
  130. scitex/scholar/pdf_download/_cli.py +154 -0
  131. scitex/scholar/pdf_download/strategies/__init__.py +11 -8
  132. scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
  133. scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
  134. scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
  135. scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
  136. scitex/scholar/pipelines/_single_steps.py +71 -36
  137. scitex/scholar/storage/_LibraryManager.py +97 -1695
  138. scitex/scholar/storage/_mixins/__init__.py +30 -0
  139. scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
  140. scitex/scholar/storage/_mixins/_library_operations.py +218 -0
  141. scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
  142. scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
  143. scitex/scholar/storage/_mixins/_resolution.py +376 -0
  144. scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
  145. scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
  146. scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +462 -0
  147. scitex/scholar/url_finder/.tmp/open_url/README.md +223 -0
  148. scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +694 -0
  149. scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +1160 -0
  150. scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +344 -0
  151. scitex/scholar/url_finder/.tmp/open_url/__init__.py +24 -0
  152. scitex/security/README.md +3 -3
  153. scitex/session/README.md +1 -1
  154. scitex/sh/README.md +1 -1
  155. scitex/social/__init__.py +153 -0
  156. scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  157. scitex/template/README.md +1 -1
  158. scitex/template/clone_writer_directory.py +5 -5
  159. scitex/writer/README.md +1 -1
  160. scitex/writer/_mcp/handlers.py +11 -744
  161. scitex/writer/_mcp/tool_schemas.py +5 -335
  162. scitex-2.15.1.dist-info/METADATA +648 -0
  163. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/RECORD +166 -111
  164. scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
  165. scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
  166. scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +0 -90
  167. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +0 -1571
  168. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +0 -6262
  169. scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +0 -1274
  170. scitex/dev/plt/data/mpl/dir_ax.txt +0 -459
  171. scitex/diagram/_compile.py +0 -312
  172. scitex/diagram/_diagram.py +0 -355
  173. scitex/diagram/_mcp/__init__.py +0 -4
  174. scitex/diagram/_mcp/handlers.py +0 -400
  175. scitex/diagram/_mcp/tool_schemas.py +0 -157
  176. scitex/diagram/_presets.py +0 -173
  177. scitex/diagram/_schema.py +0 -182
  178. scitex/diagram/_split.py +0 -278
  179. scitex/plt/_mcp/__init__.py +0 -4
  180. scitex/plt/_mcp/_handlers_annotation.py +0 -102
  181. scitex/plt/_mcp/_handlers_figure.py +0 -195
  182. scitex/plt/_mcp/_handlers_plot.py +0 -252
  183. scitex/plt/_mcp/_handlers_style.py +0 -219
  184. scitex/plt/_mcp/handlers.py +0 -74
  185. scitex/plt/_mcp/tool_schemas.py +0 -497
  186. scitex/plt/mcp_server.py +0 -231
  187. scitex/scholar/data/.gitkeep +0 -0
  188. scitex/scholar/data/README.md +0 -44
  189. scitex/scholar/data/bib_files/bibliography.bib +0 -1952
  190. scitex/scholar/data/bib_files/neurovista.bib +0 -277
  191. scitex/scholar/data/bib_files/neurovista_enriched.bib +0 -441
  192. scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +0 -441
  193. scitex/scholar/data/bib_files/neurovista_processed.bib +0 -338
  194. scitex/scholar/data/bib_files/openaccess.bib +0 -89
  195. scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +0 -2178
  196. scitex/scholar/data/bib_files/pac.bib +0 -698
  197. scitex/scholar/data/bib_files/pac_enriched.bib +0 -1061
  198. scitex/scholar/data/bib_files/pac_processed.bib +0 -0
  199. scitex/scholar/data/bib_files/pac_titles.txt +0 -75
  200. scitex/scholar/data/bib_files/paywalled.bib +0 -98
  201. scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +0 -58
  202. scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +0 -87
  203. scitex/scholar/data/bib_files/seizure_prediction.bib +0 -694
  204. scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
  205. scitex/scholar/data/bib_files/test_complete_enriched.bib +0 -437
  206. scitex/scholar/data/bib_files/test_final_enriched.bib +0 -437
  207. scitex/scholar/data/bib_files/test_seizure.bib +0 -46
  208. scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
  209. scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
  210. scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
  211. scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
  212. scitex/scholar/data/impact_factor.db +0 -0
  213. scitex/scholar/examples/SUGGESTIONS.md +0 -865
  214. scitex/scholar/examples/dev.py +0 -38
  215. scitex-2.14.0.dist-info/METADATA +0 -1238
  216. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/WHEEL +0 -0
  217. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/entry_points.txt +0 -0
  218. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,694 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Timestamp: "2025-08-09 00:47:35 (ywatanabe)"
4
+ # File: /home/ywatanabe/proj/scitex_repo/scholar/metadata/urls/open_url/_DOIToURLResolver.py
5
+ # ----------------------------------------
6
+ from __future__ import annotations
7
+
8
+ import os
9
+
10
+ __FILE__ = (
11
+ "./scholar/metadata/urls/open_url/_DOIToURLResolver.py"
12
+ )
13
+ __DIR__ = os.path.dirname(__FILE__)
14
+ # ----------------------------------------
15
+
16
+ from datetime import datetime
17
+
18
+ """
19
+ Convert DOIs to accessible publisher URLs using OpenURL resolvers.
20
+
21
+ This module implements Critical Task #5: Resolve publisher URLs from DOIs
22
+ using institutional OpenURL resolvers for authenticated access.
23
+ """
24
+
25
+ import asyncio
26
+ import json
27
+ import re
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+ from urllib.parse import urlencode
31
+
32
+ import aiohttp
33
+ from playwright.async_api import Page, async_playwright
34
+
35
+ from scitex import logging
36
+ from scitex.scholar.auth import AuthenticationManager
37
+ from scitex.scholar.config import ScholarConfig
38
+
39
+ from ._OpenURLResolver import OpenURLResolver
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
class DOIToURLResolver:
    """Resolve DOIs to accessible publisher URLs via OpenURL."""

    def __init__(self, config: Optional[ScholarConfig] = None):
        """
        Initialize DOI to URL resolver.

        Args:
            config: Scholar configuration (uses default if not provided)
        """
        self.config = config if config is not None else ScholarConfig()

        # OpenURL resolution goes through the institutional auth manager.
        self.openurl_resolver = OpenURLResolver(
            auth_manager=AuthenticationManager(config=self.config)
        )

        # Path manager decides where workflow screenshots are stored.
        self.path_manager = self.config.path_manager

        # On-disk cache of previously resolved DOI -> URL results.
        cache_root = Path.home() / ".scitex" / "scholar" / "url_cache"
        cache_root.mkdir(parents=True, exist_ok=True)
        self.cache_dir = cache_root
        self.cache_file = cache_root / "doi_url_cache.json"
        self.cache = self._load_cache()

        # Track failures for adaptive behavior.
        self.failures = {}
72
    async def _capture_workflow_screenshot_async(
        self, doi: str, url: str, stage: str, page: Optional[Page] = None
    ):
        """
        Capture systematic screenshots during DOI resolution workflow.

        Screenshots land under the paper's storage directory (as computed
        by ``path_manager.get_paper_storage_paths``) in a ``screenshots/``
        subfolder. This is strictly best-effort: any exception is logged
        at debug level and swallowed so screenshot failures never abort
        the resolution workflow.

        Args:
            doi: DOI being resolved
            url: Current URL
            stage: Workflow stage (e.g., "doi_redirect", "openurl_result",
                "publisher_page", "access_check")
            page: Existing page object (if None, creates new browser)
        """
        try:
            # Create a minimal paper-info dict from the DOI so the path
            # manager can derive a per-paper storage directory.
            paper_info = {
                "title": f"DOI_Resolution_{doi.replace('/', '_')}",
                "authors": [],
                "year": None,
                "doi": doi,
                "journal": None,
                "url": url,
            }

            storage_paths = self.path_manager.get_paper_storage_paths(
                paper_info, "doi_resolution"
            )
            screenshots_dir = storage_paths["storage_path"] / "screenshots"
            screenshots_dir.mkdir(parents=True, exist_ok=True)

            # Filename pattern: <timestamp>-<stage>-<sanitized doi>.png;
            # '/' and ':' are replaced so the DOI is filesystem-safe.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_doi = doi.replace("/", "_").replace(":", "_")
            filename = f"{timestamp}-{stage}-{safe_doi}.png"
            screenshot_path = screenshots_dir / filename

            if page:
                # Reuse the caller's live page (no navigation performed).
                await page.screenshot(
                    path=str(screenshot_path), full_page=True, timeout=10000
                )
                logger.info(
                    f"DOI workflow screenshot: {stage} -> {screenshot_path}"
                )
            else:
                # No page supplied: spin up a throwaway headless browser,
                # navigate to the URL ourselves, then capture.
                async with async_playwright() as p:
                    browser = await p.chromium.launch(headless=True)
                    new_page = await browser.new_page()

                    try:
                        await new_page.goto(
                            url, wait_until="networkidle", timeout=30000
                        )
                        await new_page.screenshot(
                            path=str(screenshot_path),
                            full_page=True,
                            timeout=10000,
                        )

                        # Write a sidecar .txt next to the PNG describing
                        # what was captured (useful when triaging later).
                        info_file = screenshot_path.with_suffix(".txt")
                        with open(info_file, "w", encoding="utf-8") as f:
                            f.write(f"DOI: {doi}\n")
                            f.write(f"URL: {url}\n")
                            f.write(f"Stage: {stage}\n")
                            f.write(f"Timestamp: {timestamp}\n")
                            f.write(f"Page Title: {await new_page.title()}\n")
                            f.write(f"Final URL: {new_page.url}\n")

                        logger.info(
                            f"DOI workflow screenshot: {stage} -> {screenshot_path}"
                        )
                    finally:
                        # Always release the throwaway browser.
                        await browser.close()

        except Exception as e:
            # Best-effort by design: never let screenshots break resolution.
            logger.debug(
                f"Failed to capture DOI workflow screenshot for {stage}: {e}"
            )
152
+ def _load_cache(self) -> Dict[str, Dict[str, any]]:
153
+ """Load URL cache from disk."""
154
+ if self.cache_file.exists():
155
+ try:
156
+ with open(self.cache_file, "r") as f:
157
+ return json.load(f)
158
+ except Exception as e:
159
+ logger.warning(f"Failed to load cache: {e}")
160
+ return {}
161
+
162
+ def _save_cache(self):
163
+ """Save URL cache to disk."""
164
+ try:
165
+ with open(self.cache_file, "w") as f:
166
+ json.dump(self.cache, f, indent=2)
167
+ except Exception as e:
168
+ logger.error(f"Failed to save cache: {e}")
169
+
170
+ def _extract_doi_info(self, doi: str) -> Dict[str, str]:
171
+ """Extract publisher and article ID from DOI."""
172
+ # Common DOI patterns
173
+ patterns = {
174
+ "elsevier": r"10\.1016/(.+)",
175
+ "springer": r"10\.1007/(.+)",
176
+ "nature": r"10\.1038/(.+)",
177
+ "wiley": r"10\.1002/(.+)",
178
+ "ieee": r"10\.1109/(.+)",
179
+ "acs": r"10\.1021/(.+)",
180
+ "rsc": r"10\.1039/(.+)",
181
+ "plos": r"10\.1371/(.+)",
182
+ "frontiers": r"10\.3389/(.+)",
183
+ "mdpi": r"10\.3390/(.+)",
184
+ "oxford": r"10\.1093/(.+)",
185
+ "sage": r"10\.1177/(.+)",
186
+ "taylor_francis": r"10\.1080/(.+)",
187
+ "apa": r"10\.1037/(.+)",
188
+ "iop": r"10\.1088/(.+)",
189
+ }
190
+
191
+ for publisher, pattern in patterns.items():
192
+ match = re.match(pattern, doi)
193
+ if match:
194
+ return {
195
+ "publisher": publisher,
196
+ "article_id": match.group(1),
197
+ "doi": doi,
198
+ }
199
+
200
+ # Generic pattern
201
+ match = re.match(r"(10\.\d+)/(.+)", doi)
202
+ if match:
203
+ return {
204
+ "publisher": "unknown",
205
+ "prefix": match.group(1),
206
+ "article_id": match.group(2),
207
+ "doi": doi,
208
+ }
209
+
210
+ return {"doi": doi, "publisher": "unknown"}
211
+
212
+ def _build_direct_urls(self, doi: str) -> List[str]:
213
+ """Build potential direct publisher URLs for a DOI."""
214
+ info = self._extract_doi_info(doi)
215
+ urls = []
216
+
217
+ # Always include standard DOI URL
218
+ urls.append(f"https://doi.org/{doi}")
219
+
220
+ # Publisher-specific patterns
221
+ if info["publisher"] == "elsevier":
222
+ # ScienceDirect pattern
223
+ urls.append(
224
+ f"https://www.sciencedirect.com/science/article/pii/{info['article_id']}"
225
+ )
226
+
227
+ elif info["publisher"] == "springer":
228
+ # SpringerLink pattern
229
+ urls.append(f"https://link.springer.com/article/{doi}")
230
+ urls.append(f"https://link.springer.com/chapter/{doi}")
231
+
232
+ elif info["publisher"] == "nature":
233
+ # Nature pattern
234
+ urls.append(
235
+ f"https://www.nature.com/articles/{info['article_id']}"
236
+ )
237
+
238
+ elif info["publisher"] == "wiley":
239
+ # Wiley Online Library pattern
240
+ urls.append(f"https://onlinelibrary.wiley.com/doi/abs/{doi}")
241
+ urls.append(f"https://onlinelibrary.wiley.com/doi/full/{doi}")
242
+
243
+ elif info["publisher"] == "ieee":
244
+ # IEEE Xplore pattern (needs document ID)
245
+ urls.append(
246
+ f"https://ieeexplore.ieee.org/document/{info['article_id']}"
247
+ )
248
+
249
+ elif info["publisher"] == "oxford":
250
+ # Oxford Academic pattern
251
+ urls.append(f"https://academic.oup.com/article-lookup/doi/{doi}")
252
+
253
+ return urls
254
+
255
    async def resolve_single_async(
        self, doi: str, use_openurl: bool = True, verify_access: bool = True
    ) -> Optional[Dict[str, any]]:
        """
        Resolve a single DOI to accessible URL.

        Resolution order: (1) on-disk cache, (2) institutional OpenURL
        resolver (when enabled and a resolver URL is configured),
        (3) direct publisher URL patterns. A successful result is written
        back to the cache before returning.

        Args:
            doi: DOI to resolve
            use_openurl: Whether to use OpenURL resolver
            verify_access: Whether to verify PDF access

        Returns:
            Dict with 'url', 'access_type', 'verified' fields if successful,
            otherwise None (including on unexpected errors, which are logged).
        """
        # Check cache first -- cached entries are returned verbatim,
        # even ones that were stored with verified=False.
        if doi in self.cache:
            logger.debug(f"Using cached URL for {doi}")
            return self.cache[doi]

        result = None

        try:
            # Try OpenURL resolver first if configured
            openurl_resolver_url = self.config.resolve(
                "openurl_resolver_url", None, None, type=str
            )
            if use_openurl and openurl_resolver_url:
                logger.info(f"Trying OpenURL resolver for {doi}")

                # Capture screenshot of initial DOI URL (best effort;
                # the helper swallows its own failures).
                doi_url = f"https://doi.org/{doi}"
                await self._capture_workflow_screenshot_async(
                    doi, doi_url, "01_initial_doi"
                )

                openurl_result = await self._try_openurl_async(doi)
                if openurl_result:
                    result = openurl_result
                    # Capture screenshot of OpenURL result
                    if result.get("url"):
                        await self._capture_workflow_screenshot_async(
                            doi, result["url"], "02_openurl_resolved"
                        )

            # Try direct publisher URLs (fallback when OpenURL is skipped
            # or produced nothing).
            if not result:
                logger.info(f"Trying direct publisher URLs for {doi}")
                direct_result = await self._try_direct_urls_async(
                    doi, verify_access
                )
                if direct_result:
                    result = direct_result
                    # Capture screenshot of direct URL result
                    if result.get("url"):
                        await self._capture_workflow_screenshot_async(
                            doi, result["url"], "03_direct_publisher"
                        )

            # Cache successful result (persisted to disk immediately).
            if result:
                self.cache[doi] = result
                self._save_cache()
                logger.success(f"Resolved {doi} to {result['url']}")
            else:
                logger.warning(f"Failed to resolve {doi}")

            return result

        except Exception as e:
            # Resolution is best-effort per DOI; log and report failure.
            logger.error(f"Error resolving {doi}: {e}")
            return None
327
    async def _try_openurl_async(self, doi: str) -> Optional[Dict[str, any]]:
        """Try to resolve DOI using OpenURL.

        Builds an OpenURL (KEV journal format) query against the configured
        resolver, follows it in a headless browser, and -- if the final URL
        left doi.org for a recognized publisher domain -- returns a result
        dict including whether a PDF link is visible. Returns None on any
        failure or when no publisher page was reached.
        """
        try:
            # Build OpenURL query: the DOI rides in rft_id as an info URI.
            params = {
                "rft_id": f"info:doi/{doi}",
                "rft.genre": "article",
                "rft_val_fmt": "info:ofi/fmt:kev:mtx:journal",
                "req_dat": "format=pdf",
            }

            openurl_resolver_url = self.config.resolve(
                "openurl_resolver_url", None, None, type=str
            )
            openurl = f"{openurl_resolver_url}?{urlencode(params)}"

            # Use the OpenURL resolver to navigate
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()

                try:
                    # Navigate to OpenURL
                    await page.goto(
                        openurl, wait_until="networkidle", timeout=30000
                    )

                    # Wait for potential redirects (fixed 3 s grace period).
                    await page.wait_for_timeout(3000)

                    # Get final URL after any redirect chain.
                    final_url = page.url

                    # Success only if we left doi.org AND landed on one of
                    # the recognized publisher domains below.
                    if "doi.org" not in final_url and any(
                        domain in final_url
                        for domain in [
                            "sciencedirect",
                            "springer",
                            "nature",
                            "wiley",
                            "ieee",
                        ]
                    ):
                        # Check for PDF access on the landing page.
                        pdf_available = await self._check_pdf_access_async(
                            page
                        )

                        return {
                            "url": final_url,
                            "access_type": "openurl",
                            "pdf_available": pdf_available,
                            "verified": True,
                        }

                finally:
                    await browser.close()

        except Exception as e:
            # Treated as a soft failure; caller falls back to direct URLs.
            logger.debug(f"OpenURL resolution failed for {doi}: {e}")

        return None
391
    async def _try_direct_urls_async(
        self, doi: str, verify_access: bool = True
    ) -> Optional[Dict[str, any]]:
        """Try direct publisher URLs.

        Iterates the candidates from _build_direct_urls in order and
        returns the first that works. With verify_access=True each
        candidate is opened in a headless browser (paywall + PDF checks);
        otherwise an HTTP HEAD returning 200 suffices and the result is
        marked verified=False. Returns None when every candidate fails.
        """
        urls = self._build_direct_urls(doi)

        for url in urls:
            try:
                if verify_access:
                    # Verify with a real browser session.
                    result = await self._verify_url_access_async(url)
                    if result:
                        return {
                            "url": url,
                            "access_type": "direct",
                            "pdf_available": result.get(
                                "pdf_available", False
                            ),
                            "verified": True,
                        }
                else:
                    # Lightweight check: does the URL respond with 200?
                    async with aiohttp.ClientSession() as session:
                        async with session.head(
                            url, allow_redirects=True
                        ) as resp:
                            if resp.status == 200:
                                return {
                                    "url": url,
                                    "access_type": "direct",
                                    "verified": False,
                                }

            except Exception as e:
                # Any network/browser error: skip to the next candidate.
                logger.debug(f"Failed to access {url}: {e}")
                continue

        return None
430
    async def _verify_url_access_async(
        self, url: str
    ) -> Optional[Dict[str, any]]:
        """Verify URL provides article access.

        Loads *url* in a headless browser and scans the rendered HTML for
        paywall keywords and PDF links. Returns a dict with
        'pdf_available' and 'has_paywall' when the page looks accessible
        (no paywall markers, or a PDF link despite them); None otherwise
        or on any navigation error.
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()

                try:
                    # Navigate to URL
                    await page.goto(
                        url, wait_until="networkidle", timeout=30000
                    )

                    # Keyword heuristics for common paywall indicators;
                    # matched case-insensitively against the full HTML.
                    paywall_indicators = [
                        "purchase",
                        "buy",
                        "subscribe",
                        "access denied",
                        "log in",
                        "sign in",
                        "institutional login",
                    ]

                    page_text = await page.content()
                    page_text_lower = page_text.lower()

                    has_paywall = any(
                        indicator in page_text_lower
                        for indicator in paywall_indicators
                    )

                    # Check for PDF access -- a visible PDF link overrides
                    # the paywall heuristic below.
                    pdf_available = await self._check_pdf_access_async(page)

                    if not has_paywall or pdf_available:
                        return {
                            "pdf_available": pdf_available,
                            "has_paywall": has_paywall,
                        }

                finally:
                    await browser.close()

        except Exception as e:
            # Navigation/timeout failures are soft: caller tries next URL.
            logger.debug(f"Failed to verify {url}: {e}")

        return None
481
    async def _check_pdf_access_async(self, page: Page) -> bool:
        """Check if PDF download is available on the page.

        Heuristic: looks for common PDF download links/buttons and for
        embedded PDF viewers. Errors are logged at debug level and
        reported as "no PDF" (returns False).
        """
        try:
            # Selectors covering typical publisher PDF links/buttons.
            pdf_selectors = [
                'a[href*=".pdf"]',
                'a[href*="/pdf/"]',
                'button:has-text("Download PDF")',
                'a:has-text("Download PDF")',
                'a:has-text("View PDF")',
                'a:has-text("Full Text PDF")',
                ".pdf-download",
                '[class*="pdf-link"]',
            ]

            for selector in pdf_selectors:
                elements = await page.query_selector_all(selector)
                if elements:
                    return True

            # Also accept an embedded PDF viewer (iframe/embed).
            pdf_viewers = await page.query_selector_all(
                'iframe[src*="pdf"], embed[type="application/pdf"]'
            )
            if pdf_viewers:
                return True

        except Exception as e:
            logger.debug(f"Error checking PDF access: {e}")

        return False
513
+ async def resolve_batch_async(
514
+ self, dois: List[str], max_concurrent: int = 3, progress_callback=None
515
+ ) -> Dict[str, Dict[str, any]]:
516
+ """
517
+ Resolve multiple DOIs concurrently.
518
+
519
+ Args:
520
+ dois: List of DOIs to resolve
521
+ max_concurrent: Maximum concurrent resolutions
522
+ progress_callback: Optional callback for progress updates
523
+
524
+ Returns:
525
+ Dict mapping DOIs to resolution results
526
+ """
527
+ results = {}
528
+ semaphore = asyncio.Semaphore(max_concurrent)
529
+
530
+ async def resolve_with_limit_async(doi: str, index: int):
531
+ async with semaphore:
532
+ if progress_callback:
533
+ progress_callback(index, len(dois), f"Resolving {doi}")
534
+
535
+ result = await self.resolve_single_async(doi)
536
+ results[doi] = result
537
+ return doi, result
538
+
539
+ # Create tasks
540
+ tasks = [
541
+ resolve_with_limit_async(doi, i) for i, doi in enumerate(dois)
542
+ ]
543
+
544
+ # Process all DOIs
545
+ await asyncio.gather(*tasks, return_exceptions=True)
546
+
547
+ return results
548
+
549
+ def resolve_from_bibtex(
550
+ self, bibtex_path: Path, output_path: Optional[Path] = None
551
+ ) -> Dict[str, Dict[str, any]]:
552
+ """
553
+ Resolve URLs for all DOIs in a BibTeX file.
554
+
555
+ Args:
556
+ bibtex_path: Path to BibTeX file
557
+ output_path: Optional path for updated BibTeX
558
+
559
+ Returns:
560
+ Dict mapping DOIs to resolution results
561
+ """
562
+ import bibtexparser
563
+
564
+ # Load BibTeX
565
+ with open(bibtex_path, "r", encoding="utf-8") as f:
566
+ bib_db = bibtexparser.load(f)
567
+
568
+ # Extract DOIs
569
+ dois = []
570
+ doi_to_entry = {}
571
+
572
+ for entry in bib_db.entries:
573
+ if "doi" in entry:
574
+ doi = entry["doi"]
575
+ dois.append(doi)
576
+ doi_to_entry[doi] = entry
577
+
578
+ logger.info(f"Found {len(dois)} DOIs in {bibtex_path}")
579
+
580
+ # Resolve URLs
581
+ loop = asyncio.new_event_loop()
582
+ asyncio.set_event_loop(loop)
583
+
584
+ try:
585
+ results = loop.run_until_complete(self.resolve_batch_async(dois))
586
+ finally:
587
+ loop.close()
588
+
589
+ # Update BibTeX entries with URLs
590
+ success_count = 0
591
+ for doi, result in results.items():
592
+ if result and result.get("url"):
593
+ entry = doi_to_entry[doi]
594
+ entry["url"] = result["url"]
595
+ entry["url_source"] = result["access_type"]
596
+ if result.get("pdf_available"):
597
+ entry["pdf_available"] = "yes"
598
+ success_count += 1
599
+
600
+ logger.info(f"Resolved URLs for {success_count}/{len(dois)} DOIs")
601
+
602
+ # Save updated BibTeX if requested
603
+ if output_path:
604
+ with open(output_path, "w", encoding="utf-8") as f:
605
+ bibtexparser.dump(bib_db, f)
606
+ logger.info(f"Saved updated BibTeX to {output_path}")
607
+
608
+ return results
609
+
610
+
611
async def main():
    """Command-line interface for DOI to URL resolution.

    Two mutually exclusive modes: --doi resolves a single DOI and prints
    the outcome; --bibtex resolves every DOI in a BibTeX file (optionally
    writing an updated file via --output) and prints a short summary.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Resolve DOIs to accessible publisher URLs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Resolve single DOI
  python -m scitex.scholar.open_url.resolve_urls --doi "10.1038/nature12373"

  # Resolve DOIs from BibTeX file
  python -m scitex.scholar.open_url.resolve_urls --bibtex papers.bib

  # Save URLs to new BibTeX file
  python -m scitex.scholar.open_url.resolve_urls --bibtex papers.bib --output papers-with-urls.bib
""",
    )

    # Exactly one input source is required: a DOI or a BibTeX file.
    input_group = parser.add_mutually_exclusive_group(required=True)

    input_group.add_argument("--doi", type=str, help="Single DOI to resolve")

    input_group.add_argument(
        "--bibtex", "-b", type=str, help="BibTeX file containing DOIs"
    )

    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output BibTeX file (for --bibtex mode)",
    )

    parser.add_argument(
        "--no-verify", action="store_true", help="Skip access verification"
    )

    args = parser.parse_args()

    # Initialize resolver with default Scholar configuration.
    resolver = DOIToURLResolver()

    if args.doi:
        # Single DOI mode
        result = await resolver.resolve_single_async(
            args.doi, verify_access=not args.no_verify
        )

        if result:
            print(f"\nResolved URL: {result['url']}")
            print(f"Access type: {result['access_type']}")
            if "pdf_available" in result:
                print(
                    f"PDF available: {'Yes' if result['pdf_available'] else 'No'}"
                )
        else:
            print("\nFailed to resolve DOI")

    else:
        # BibTeX mode (synchronous; runs its own event loop internally)
        results = resolver.resolve_from_bibtex(
            Path(args.bibtex), Path(args.output) if args.output else None
        )

        # Print summary
        success = sum(1 for r in results.values() if r and r.get("url"))
        print(f"\nResolved {success}/{len(results)} DOIs")

        # Show first few results only, to keep output readable.
        for doi, result in list(results.items())[:5]:
            if result:
                print(f"\n{doi}:")
                print(f"  URL: {result['url']}")
                print(f"  Type: {result['access_type']}")
689
if __name__ == "__main__":
    import sys

    # main() returns None, so the process exits with status 0 on success.
    raise SystemExit(asyncio.run(main()))

# EOF