scitex 2.14.0__py3-none-any.whl → 2.15.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +71 -17
- scitex/_env_loader.py +156 -0
- scitex/_mcp_resources/__init__.py +37 -0
- scitex/_mcp_resources/_cheatsheet.py +135 -0
- scitex/_mcp_resources/_figrecipe.py +138 -0
- scitex/_mcp_resources/_formats.py +102 -0
- scitex/_mcp_resources/_modules.py +337 -0
- scitex/_mcp_resources/_session.py +149 -0
- scitex/_mcp_tools/__init__.py +4 -0
- scitex/_mcp_tools/audio.py +66 -0
- scitex/_mcp_tools/diagram.py +11 -95
- scitex/_mcp_tools/introspect.py +210 -0
- scitex/_mcp_tools/plt.py +260 -305
- scitex/_mcp_tools/scholar.py +74 -0
- scitex/_mcp_tools/social.py +244 -0
- scitex/_mcp_tools/template.py +24 -0
- scitex/_mcp_tools/writer.py +21 -204
- scitex/ai/_gen_ai/_PARAMS.py +10 -7
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
- scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
- scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
- scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
- scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
- scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
- scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
- scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
- scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +30 -1550
- scitex/ai/classification/timeseries/_sliding_window_core.py +467 -0
- scitex/ai/classification/timeseries/_sliding_window_plotting.py +369 -0
- scitex/audio/README.md +40 -36
- scitex/audio/__init__.py +129 -61
- scitex/audio/_branding.py +185 -0
- scitex/audio/_mcp/__init__.py +32 -0
- scitex/audio/_mcp/handlers.py +59 -6
- scitex/audio/_mcp/speak_handlers.py +238 -0
- scitex/audio/_relay.py +225 -0
- scitex/audio/_tts.py +18 -10
- scitex/audio/engines/base.py +17 -10
- scitex/audio/engines/elevenlabs_engine.py +7 -2
- scitex/audio/mcp_server.py +228 -75
- scitex/canvas/README.md +1 -1
- scitex/canvas/editor/_dearpygui/__init__.py +25 -0
- scitex/canvas/editor/_dearpygui/_editor.py +147 -0
- scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
- scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
- scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
- scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
- scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
- scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
- scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
- scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
- scitex/canvas/editor/_dearpygui/_selection.py +295 -0
- scitex/canvas/editor/_dearpygui/_state.py +93 -0
- scitex/canvas/editor/_dearpygui/_utils.py +61 -0
- scitex/canvas/editor/flask_editor/_core/__init__.py +27 -0
- scitex/canvas/editor/flask_editor/_core/_bbox_extraction.py +200 -0
- scitex/canvas/editor/flask_editor/_core/_editor.py +173 -0
- scitex/canvas/editor/flask_editor/_core/_export_helpers.py +353 -0
- scitex/canvas/editor/flask_editor/_core/_routes_basic.py +190 -0
- scitex/canvas/editor/flask_editor/_core/_routes_export.py +332 -0
- scitex/canvas/editor/flask_editor/_core/_routes_panels.py +252 -0
- scitex/canvas/editor/flask_editor/_core/_routes_save.py +218 -0
- scitex/canvas/editor/flask_editor/_core.py +25 -1684
- scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
- scitex/cli/__init__.py +38 -43
- scitex/cli/audio.py +76 -27
- scitex/cli/capture.py +13 -20
- scitex/cli/introspect.py +481 -0
- scitex/cli/main.py +200 -109
- scitex/cli/mcp.py +60 -34
- scitex/cli/plt.py +357 -0
- scitex/cli/repro.py +15 -8
- scitex/cli/resource.py +15 -8
- scitex/cli/scholar/__init__.py +23 -8
- scitex/cli/scholar/_crossref_scitex.py +296 -0
- scitex/cli/scholar/_fetch.py +25 -3
- scitex/cli/social.py +314 -0
- scitex/cli/stats.py +15 -8
- scitex/cli/template.py +129 -12
- scitex/cli/tex.py +15 -8
- scitex/cli/writer.py +132 -8
- scitex/cloud/__init__.py +41 -2
- scitex/config/README.md +1 -1
- scitex/config/__init__.py +16 -2
- scitex/config/_env_registry.py +256 -0
- scitex/context/__init__.py +22 -0
- scitex/dev/__init__.py +20 -1
- scitex/diagram/__init__.py +42 -19
- scitex/diagram/mcp_server.py +13 -125
- scitex/gen/__init__.py +50 -14
- scitex/gen/_list_packages.py +4 -4
- scitex/introspect/__init__.py +82 -0
- scitex/introspect/_call_graph.py +303 -0
- scitex/introspect/_class_hierarchy.py +163 -0
- scitex/introspect/_core.py +41 -0
- scitex/introspect/_docstring.py +131 -0
- scitex/introspect/_examples.py +113 -0
- scitex/introspect/_imports.py +271 -0
- scitex/{gen/_inspect_module.py → introspect/_list_api.py} +43 -54
- scitex/introspect/_mcp/__init__.py +41 -0
- scitex/introspect/_mcp/handlers.py +233 -0
- scitex/introspect/_members.py +155 -0
- scitex/introspect/_resolve.py +89 -0
- scitex/introspect/_signature.py +131 -0
- scitex/introspect/_source.py +80 -0
- scitex/introspect/_type_hints.py +172 -0
- scitex/io/_save.py +1 -2
- scitex/io/bundle/README.md +1 -1
- scitex/logging/_formatters.py +19 -9
- scitex/mcp_server.py +98 -5
- scitex/os/__init__.py +4 -0
- scitex/{gen → os}/_check_host.py +4 -5
- scitex/plt/__init__.py +245 -550
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
- scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/plt/gallery/README.md +1 -1
- scitex/plt/utils/_hitmap/__init__.py +82 -0
- scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
- scitex/plt/utils/_hitmap/_color_application.py +346 -0
- scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
- scitex/plt/utils/_hitmap/_constants.py +40 -0
- scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
- scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
- scitex/plt/utils/_hitmap/_query.py +113 -0
- scitex/plt/utils/_hitmap.py +46 -1616
- scitex/plt/utils/_metadata/__init__.py +80 -0
- scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
- scitex/plt/utils/_metadata/_artists/_base.py +195 -0
- scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
- scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
- scitex/plt/utils/_metadata/_artists/_images.py +80 -0
- scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
- scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
- scitex/plt/utils/_metadata/_artists/_text.py +106 -0
- scitex/plt/utils/_metadata/_csv.py +416 -0
- scitex/plt/utils/_metadata/_detect.py +225 -0
- scitex/plt/utils/_metadata/_legend.py +127 -0
- scitex/plt/utils/_metadata/_rounding.py +117 -0
- scitex/plt/utils/_metadata/_verification.py +202 -0
- scitex/schema/README.md +1 -1
- scitex/scholar/__init__.py +8 -0
- scitex/scholar/_mcp/crossref_handlers.py +265 -0
- scitex/scholar/core/Scholar.py +63 -1700
- scitex/scholar/core/_mixins/__init__.py +36 -0
- scitex/scholar/core/_mixins/_enrichers.py +270 -0
- scitex/scholar/core/_mixins/_library_handlers.py +100 -0
- scitex/scholar/core/_mixins/_loaders.py +103 -0
- scitex/scholar/core/_mixins/_pdf_download.py +375 -0
- scitex/scholar/core/_mixins/_pipeline.py +312 -0
- scitex/scholar/core/_mixins/_project_handlers.py +125 -0
- scitex/scholar/core/_mixins/_savers.py +69 -0
- scitex/scholar/core/_mixins/_search.py +103 -0
- scitex/scholar/core/_mixins/_services.py +88 -0
- scitex/scholar/core/_mixins/_url_finding.py +105 -0
- scitex/scholar/crossref_scitex.py +367 -0
- scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/scholar/examples/00_run_all.sh +120 -0
- scitex/scholar/jobs/_executors.py +27 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
- scitex/scholar/pdf_download/_cli.py +154 -0
- scitex/scholar/pdf_download/strategies/__init__.py +11 -8
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
- scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
- scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
- scitex/scholar/pipelines/_single_steps.py +71 -36
- scitex/scholar/storage/_LibraryManager.py +97 -1695
- scitex/scholar/storage/_mixins/__init__.py +30 -0
- scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
- scitex/scholar/storage/_mixins/_library_operations.py +218 -0
- scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
- scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
- scitex/scholar/storage/_mixins/_resolution.py +376 -0
- scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
- scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
- scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +462 -0
- scitex/scholar/url_finder/.tmp/open_url/README.md +223 -0
- scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +694 -0
- scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +1160 -0
- scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +344 -0
- scitex/scholar/url_finder/.tmp/open_url/__init__.py +24 -0
- scitex/security/README.md +3 -3
- scitex/session/README.md +1 -1
- scitex/session/__init__.py +26 -7
- scitex/session/_decorator.py +1 -1
- scitex/sh/README.md +1 -1
- scitex/sh/__init__.py +7 -4
- scitex/social/__init__.py +155 -0
- scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/stats/_mcp/_handlers/__init__.py +31 -0
- scitex/stats/_mcp/_handlers/_corrections.py +113 -0
- scitex/stats/_mcp/_handlers/_descriptive.py +78 -0
- scitex/stats/_mcp/_handlers/_effect_size.py +106 -0
- scitex/stats/_mcp/_handlers/_format.py +94 -0
- scitex/stats/_mcp/_handlers/_normality.py +110 -0
- scitex/stats/_mcp/_handlers/_posthoc.py +224 -0
- scitex/stats/_mcp/_handlers/_power.py +247 -0
- scitex/stats/_mcp/_handlers/_recommend.py +102 -0
- scitex/stats/_mcp/_handlers/_run_test.py +279 -0
- scitex/stats/_mcp/_handlers/_stars.py +48 -0
- scitex/stats/_mcp/handlers.py +19 -1171
- scitex/stats/auto/_stat_style.py +175 -0
- scitex/stats/auto/_style_definitions.py +411 -0
- scitex/stats/auto/_styles.py +22 -620
- scitex/stats/descriptive/__init__.py +11 -8
- scitex/stats/descriptive/_ci.py +39 -0
- scitex/stats/power/_power.py +15 -4
- scitex/str/__init__.py +2 -1
- scitex/str/_title_case.py +63 -0
- scitex/template/README.md +1 -1
- scitex/template/__init__.py +25 -10
- scitex/template/_code_templates.py +147 -0
- scitex/template/_mcp/handlers.py +81 -0
- scitex/template/_mcp/tool_schemas.py +55 -0
- scitex/template/_templates/__init__.py +51 -0
- scitex/template/_templates/audio.py +233 -0
- scitex/template/_templates/canvas.py +312 -0
- scitex/template/_templates/capture.py +268 -0
- scitex/template/_templates/config.py +43 -0
- scitex/template/_templates/diagram.py +294 -0
- scitex/template/_templates/io.py +107 -0
- scitex/template/_templates/module.py +53 -0
- scitex/template/_templates/plt.py +202 -0
- scitex/template/_templates/scholar.py +267 -0
- scitex/template/_templates/session.py +130 -0
- scitex/template/_templates/session_minimal.py +43 -0
- scitex/template/_templates/session_plot.py +67 -0
- scitex/template/_templates/session_stats.py +77 -0
- scitex/template/_templates/stats.py +323 -0
- scitex/template/_templates/writer.py +296 -0
- scitex/template/clone_writer_directory.py +5 -5
- scitex/ui/_backends/_email.py +10 -2
- scitex/ui/_backends/_webhook.py +5 -1
- scitex/web/_search_pubmed.py +10 -6
- scitex/writer/README.md +1 -1
- scitex/writer/_mcp/handlers.py +11 -744
- scitex/writer/_mcp/tool_schemas.py +5 -335
- scitex-2.15.2.dist-info/METADATA +648 -0
- {scitex-2.14.0.dist-info → scitex-2.15.2.dist-info}/RECORD +246 -150
- scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
- scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
- scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +0 -90
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +0 -1571
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +0 -6262
- scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +0 -1274
- scitex/dev/plt/data/mpl/dir_ax.txt +0 -459
- scitex/diagram/_compile.py +0 -312
- scitex/diagram/_diagram.py +0 -355
- scitex/diagram/_mcp/__init__.py +0 -4
- scitex/diagram/_mcp/handlers.py +0 -400
- scitex/diagram/_mcp/tool_schemas.py +0 -157
- scitex/diagram/_presets.py +0 -173
- scitex/diagram/_schema.py +0 -182
- scitex/diagram/_split.py +0 -278
- scitex/gen/_ci.py +0 -12
- scitex/gen/_title_case.py +0 -89
- scitex/plt/_mcp/__init__.py +0 -4
- scitex/plt/_mcp/_handlers_annotation.py +0 -102
- scitex/plt/_mcp/_handlers_figure.py +0 -195
- scitex/plt/_mcp/_handlers_plot.py +0 -252
- scitex/plt/_mcp/_handlers_style.py +0 -219
- scitex/plt/_mcp/handlers.py +0 -74
- scitex/plt/_mcp/tool_schemas.py +0 -497
- scitex/plt/mcp_server.py +0 -231
- scitex/scholar/data/.gitkeep +0 -0
- scitex/scholar/data/README.md +0 -44
- scitex/scholar/data/bib_files/bibliography.bib +0 -1952
- scitex/scholar/data/bib_files/neurovista.bib +0 -277
- scitex/scholar/data/bib_files/neurovista_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_processed.bib +0 -338
- scitex/scholar/data/bib_files/openaccess.bib +0 -89
- scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +0 -2178
- scitex/scholar/data/bib_files/pac.bib +0 -698
- scitex/scholar/data/bib_files/pac_enriched.bib +0 -1061
- scitex/scholar/data/bib_files/pac_processed.bib +0 -0
- scitex/scholar/data/bib_files/pac_titles.txt +0 -75
- scitex/scholar/data/bib_files/paywalled.bib +0 -98
- scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +0 -58
- scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +0 -87
- scitex/scholar/data/bib_files/seizure_prediction.bib +0 -694
- scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
- scitex/scholar/data/bib_files/test_complete_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_final_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_seizure.bib +0 -46
- scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
- scitex/scholar/data/impact_factor.db +0 -0
- scitex/scholar/examples/SUGGESTIONS.md +0 -865
- scitex/scholar/examples/dev.py +0 -38
- scitex-2.14.0.dist-info/METADATA +0 -1238
- /scitex/{gen → context}/_detect_environment.py +0 -0
- /scitex/{gen → context}/_get_notebook_path.py +0 -0
- /scitex/{gen/_shell.py → sh/_shell_legacy.py} +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.2.dist-info}/WHEEL +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.2.dist-info}/entry_points.txt +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,694 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Timestamp: "2025-08-09 00:47:35 (ywatanabe)"
|
|
4
|
+
# File: /home/ywatanabe/proj/scitex_repo/scholar/metadata/urls/open_url/_DOIToURLResolver.py
|
|
5
|
+
# ----------------------------------------
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
__FILE__ = (
|
|
11
|
+
"./scholar/metadata/urls/open_url/_DOIToURLResolver.py"
|
|
12
|
+
)
|
|
13
|
+
__DIR__ = os.path.dirname(__FILE__)
|
|
14
|
+
# ----------------------------------------
|
|
15
|
+
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
Convert DOIs to accessible publisher URLs using OpenURL resolvers.
|
|
20
|
+
|
|
21
|
+
This module implements Critical Task #5: Resolve publisher URLs from DOIs
|
|
22
|
+
using institutional OpenURL resolvers for authenticate_async access.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import asyncio
|
|
26
|
+
import json
|
|
27
|
+
import re
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Any, Dict, List, Optional
|
|
30
|
+
from urllib.parse import urlencode
|
|
31
|
+
|
|
32
|
+
import aiohttp
|
|
33
|
+
from playwright.async_api import Page, async_playwright
|
|
34
|
+
|
|
35
|
+
from scitex import logging
|
|
36
|
+
from scitex.scholar.auth import AuthenticationManager
|
|
37
|
+
from scitex.scholar.config import ScholarConfig
|
|
38
|
+
|
|
39
|
+
from ._OpenURLResolver import OpenURLResolver
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger(__name__)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DOIToURLResolver:
|
|
45
|
+
"""Resolve DOIs to accessible publisher URLs via OpenURL."""
|
|
46
|
+
|
|
47
|
+
def __init__(self, config: Optional[ScholarConfig] = None):
|
|
48
|
+
"""
|
|
49
|
+
Initialize DOI to URL resolver.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
config: Scholar configuration (uses default if not provided)
|
|
53
|
+
"""
|
|
54
|
+
self.config = config or ScholarConfig()
|
|
55
|
+
|
|
56
|
+
# Initialize auth manager for OpenURL resolver
|
|
57
|
+
auth_manager = AuthenticationManager(config=self.config)
|
|
58
|
+
self.openurl_resolver = OpenURLResolver(auth_manager=auth_manager)
|
|
59
|
+
|
|
60
|
+
# Initialize path manager for screenshots
|
|
61
|
+
self.path_manager = self.config.path_manager
|
|
62
|
+
|
|
63
|
+
# Cache for resolved URLs
|
|
64
|
+
self.cache_dir = Path.home() / ".scitex" / "scholar" / "url_cache"
|
|
65
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
66
|
+
self.cache_file = self.cache_dir / "doi_url_cache.json"
|
|
67
|
+
self.cache = self._load_cache()
|
|
68
|
+
|
|
69
|
+
# Track failures for adaptive behavior
|
|
70
|
+
self.failures = {}
|
|
71
|
+
|
|
72
|
+
    async def _capture_workflow_screenshot_async(
        self, doi: str, url: str, stage: str, page: Optional[Page] = None
    ) -> None:
        """
        Capture systematic screenshots during DOI resolution workflow.

        Screenshots are stored under the paper's storage directory
        (``<storage_path>/screenshots``) with a timestamped filename.
        Any error is logged at debug level and swallowed, so screenshot
        capture never breaks the resolution workflow.

        Args:
            doi: DOI being resolved
            url: Current URL
            stage: Workflow stage (e.g., "doi_redirect", "openurl_result", "publisher_page", "access_check")
            page: Existing page object (if None, creates new browser)
        """
        try:
            # Create paper info from DOI for directory structure.
            # NOTE(review): only the DOI-derived title and URL are populated;
            # presumably get_paper_storage_paths tolerates the None fields —
            # confirm against the path manager implementation.
            paper_info = {
                "title": f"DOI_Resolution_{doi.replace('/', '_')}",
                "authors": [],
                "year": None,
                "doi": doi,
                "journal": None,
                "url": url,
            }

            storage_paths = self.path_manager.get_paper_storage_paths(
                paper_info, "doi_resolution"
            )
            screenshots_dir = storage_paths["storage_path"] / "screenshots"
            screenshots_dir.mkdir(parents=True, exist_ok=True)

            # Generate timestamp and filename; '/' and ':' are replaced so
            # the DOI is safe to embed in a filename.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_doi = doi.replace("/", "_").replace(":", "_")
            filename = f"{timestamp}-{stage}-{safe_doi}.png"
            screenshot_path = screenshots_dir / filename

            if page:
                # Use existing page — caller keeps ownership; no navigation
                # is performed here.
                await page.screenshot(
                    path=str(screenshot_path), full_page=True, timeout=10000
                )
                logger.info(
                    f"DOI workflow screenshot: {stage} -> {screenshot_path}"
                )
            else:
                # Create new browser for screenshot: navigate to the URL,
                # capture the page, and record page metadata alongside it.
                async with async_playwright() as p:
                    browser = await p.chromium.launch(headless=True)
                    new_page = await browser.new_page()

                    try:
                        await new_page.goto(
                            url, wait_until="networkidle", timeout=30000
                        )
                        await new_page.screenshot(
                            path=str(screenshot_path),
                            full_page=True,
                            timeout=10000,
                        )

                        # Save page info as a sidecar .txt next to the PNG.
                        info_file = screenshot_path.with_suffix(".txt")
                        with open(info_file, "w", encoding="utf-8") as f:
                            f.write(f"DOI: {doi}\n")
                            f.write(f"URL: {url}\n")
                            f.write(f"Stage: {stage}\n")
                            f.write(f"Timestamp: {timestamp}\n")
                            f.write(f"Page Title: {await new_page.title()}\n")
                            f.write(f"Final URL: {new_page.url}\n")

                        logger.info(
                            f"DOI workflow screenshot: {stage} -> {screenshot_path}"
                        )
                    finally:
                        await browser.close()

        except Exception as e:
            # Best-effort by design: never propagate screenshot failures.
            logger.debug(
                f"Failed to capture DOI workflow screenshot for {stage}: {e}"
            )
|
|
151
|
+
|
|
152
|
+
def _load_cache(self) -> Dict[str, Dict[str, any]]:
|
|
153
|
+
"""Load URL cache from disk."""
|
|
154
|
+
if self.cache_file.exists():
|
|
155
|
+
try:
|
|
156
|
+
with open(self.cache_file, "r") as f:
|
|
157
|
+
return json.load(f)
|
|
158
|
+
except Exception as e:
|
|
159
|
+
logger.warning(f"Failed to load cache: {e}")
|
|
160
|
+
return {}
|
|
161
|
+
|
|
162
|
+
def _save_cache(self):
|
|
163
|
+
"""Save URL cache to disk."""
|
|
164
|
+
try:
|
|
165
|
+
with open(self.cache_file, "w") as f:
|
|
166
|
+
json.dump(self.cache, f, indent=2)
|
|
167
|
+
except Exception as e:
|
|
168
|
+
logger.error(f"Failed to save cache: {e}")
|
|
169
|
+
|
|
170
|
+
def _extract_doi_info(self, doi: str) -> Dict[str, str]:
|
|
171
|
+
"""Extract publisher and article ID from DOI."""
|
|
172
|
+
# Common DOI patterns
|
|
173
|
+
patterns = {
|
|
174
|
+
"elsevier": r"10\.1016/(.+)",
|
|
175
|
+
"springer": r"10\.1007/(.+)",
|
|
176
|
+
"nature": r"10\.1038/(.+)",
|
|
177
|
+
"wiley": r"10\.1002/(.+)",
|
|
178
|
+
"ieee": r"10\.1109/(.+)",
|
|
179
|
+
"acs": r"10\.1021/(.+)",
|
|
180
|
+
"rsc": r"10\.1039/(.+)",
|
|
181
|
+
"plos": r"10\.1371/(.+)",
|
|
182
|
+
"frontiers": r"10\.3389/(.+)",
|
|
183
|
+
"mdpi": r"10\.3390/(.+)",
|
|
184
|
+
"oxford": r"10\.1093/(.+)",
|
|
185
|
+
"sage": r"10\.1177/(.+)",
|
|
186
|
+
"taylor_francis": r"10\.1080/(.+)",
|
|
187
|
+
"apa": r"10\.1037/(.+)",
|
|
188
|
+
"iop": r"10\.1088/(.+)",
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
for publisher, pattern in patterns.items():
|
|
192
|
+
match = re.match(pattern, doi)
|
|
193
|
+
if match:
|
|
194
|
+
return {
|
|
195
|
+
"publisher": publisher,
|
|
196
|
+
"article_id": match.group(1),
|
|
197
|
+
"doi": doi,
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
# Generic pattern
|
|
201
|
+
match = re.match(r"(10\.\d+)/(.+)", doi)
|
|
202
|
+
if match:
|
|
203
|
+
return {
|
|
204
|
+
"publisher": "unknown",
|
|
205
|
+
"prefix": match.group(1),
|
|
206
|
+
"article_id": match.group(2),
|
|
207
|
+
"doi": doi,
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
return {"doi": doi, "publisher": "unknown"}
|
|
211
|
+
|
|
212
|
+
def _build_direct_urls(self, doi: str) -> List[str]:
|
|
213
|
+
"""Build potential direct publisher URLs for a DOI."""
|
|
214
|
+
info = self._extract_doi_info(doi)
|
|
215
|
+
urls = []
|
|
216
|
+
|
|
217
|
+
# Always include standard DOI URL
|
|
218
|
+
urls.append(f"https://doi.org/{doi}")
|
|
219
|
+
|
|
220
|
+
# Publisher-specific patterns
|
|
221
|
+
if info["publisher"] == "elsevier":
|
|
222
|
+
# ScienceDirect pattern
|
|
223
|
+
urls.append(
|
|
224
|
+
f"https://www.sciencedirect.com/science/article/pii/{info['article_id']}"
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
elif info["publisher"] == "springer":
|
|
228
|
+
# SpringerLink pattern
|
|
229
|
+
urls.append(f"https://link.springer.com/article/{doi}")
|
|
230
|
+
urls.append(f"https://link.springer.com/chapter/{doi}")
|
|
231
|
+
|
|
232
|
+
elif info["publisher"] == "nature":
|
|
233
|
+
# Nature pattern
|
|
234
|
+
urls.append(
|
|
235
|
+
f"https://www.nature.com/articles/{info['article_id']}"
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
elif info["publisher"] == "wiley":
|
|
239
|
+
# Wiley Online Library pattern
|
|
240
|
+
urls.append(f"https://onlinelibrary.wiley.com/doi/abs/{doi}")
|
|
241
|
+
urls.append(f"https://onlinelibrary.wiley.com/doi/full/{doi}")
|
|
242
|
+
|
|
243
|
+
elif info["publisher"] == "ieee":
|
|
244
|
+
# IEEE Xplore pattern (needs document ID)
|
|
245
|
+
urls.append(
|
|
246
|
+
f"https://ieeexplore.ieee.org/document/{info['article_id']}"
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
elif info["publisher"] == "oxford":
|
|
250
|
+
# Oxford Academic pattern
|
|
251
|
+
urls.append(f"https://academic.oup.com/article-lookup/doi/{doi}")
|
|
252
|
+
|
|
253
|
+
return urls
|
|
254
|
+
|
|
255
|
+
    async def resolve_single_async(
        self, doi: str, use_openurl: bool = True, verify_access: bool = True
    ) -> Optional[Dict[str, Any]]:
        """
        Resolve a single DOI to accessible URL.

        Resolution order: in-memory cache, then the institutional OpenURL
        resolver (when configured and *use_openurl* is True), then direct
        publisher URL patterns. A successful result is written back to the
        persistent cache. Workflow screenshots are captured at each stage.

        Args:
            doi: DOI to resolve
            use_openurl: Whether to use OpenURL resolver
            verify_access: Whether to verify PDF access

        Returns:
            Dict with 'url', 'access_type', 'verified' fields if successful,
            or None when every strategy fails or raises.
        """
        # Check cache first — cached entries are returned without
        # re-verification, so they may be stale.
        if doi in self.cache:
            logger.debug(f"Using cached URL for {doi}")
            return self.cache[doi]

        result = None

        try:
            # Try OpenURL resolver first if configured.
            # NOTE(review): the positional None, None args to config.resolve
            # presumably mean "no direct value / no default" — confirm
            # against ScholarConfig.resolve's signature.
            openurl_resolver_url = self.config.resolve(
                "openurl_resolver_url", None, None, type=str
            )
            if use_openurl and openurl_resolver_url:
                logger.info(f"Trying OpenURL resolver for {doi}")

                # Capture screenshot of initial DOI URL
                doi_url = f"https://doi.org/{doi}"
                await self._capture_workflow_screenshot_async(
                    doi, doi_url, "01_initial_doi"
                )

                openurl_result = await self._try_openurl_async(doi)
                if openurl_result:
                    result = openurl_result
                    # Capture screenshot of OpenURL result
                    if result.get("url"):
                        await self._capture_workflow_screenshot_async(
                            doi, result["url"], "02_openurl_resolved"
                        )

            # Try direct publisher URLs (fallback when OpenURL was skipped
            # or returned nothing)
            if not result:
                logger.info(f"Trying direct publisher URLs for {doi}")
                direct_result = await self._try_direct_urls_async(
                    doi, verify_access
                )
                if direct_result:
                    result = direct_result
                    # Capture screenshot of direct URL result
                    if result.get("url"):
                        await self._capture_workflow_screenshot_async(
                            doi, result["url"], "03_direct_publisher"
                        )

            # Cache successful result; failures are not cached, so they
            # will be retried on the next call.
            if result:
                self.cache[doi] = result
                self._save_cache()
                # logger.success is a scitex logging extension (not stdlib).
                logger.success(f"Resolved {doi} to {result['url']}")
            else:
                logger.warning(f"Failed to resolve {doi}")

            return result

        except Exception as e:
            # Any unexpected error is reported as a failed resolution.
            logger.error(f"Error resolving {doi}: {e}")
            return None
|
|
326
|
+
|
|
327
|
+
    async def _try_openurl_async(self, doi: str) -> Optional[Dict[str, Any]]:
        """Try to resolve *doi* through the configured OpenURL resolver.

        Builds an OpenURL (KEV-format) query for the DOI, navigates it in a
        headless browser, and accepts the final URL only when it redirected
        away from doi.org onto a known publisher domain.

        Returns:
            Dict with 'url', 'access_type' == "openurl", 'pdf_available',
            and 'verified' keys on success; None on failure or when the
            redirect did not land on a recognized publisher.
        """
        try:
            # Build OpenURL query (KEV keys; rft_id carries the DOI).
            params = {
                "rft_id": f"info:doi/{doi}",
                "rft.genre": "article",
                "rft_val_fmt": "info:ofi/fmt:kev:mtx:journal",
                "req_dat": "format=pdf",
            }

            openurl_resolver_url = self.config.resolve(
                "openurl_resolver_url", None, None, type=str
            )
            openurl = f"{openurl_resolver_url}?{urlencode(params)}"

            # Use the OpenURL resolver to navigate
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()

                try:
                    # Navigate to OpenURL
                    await page.goto(
                        openurl, wait_until="networkidle", timeout=30000
                    )

                    # Wait for potential redirects (fixed 3s grace period)
                    await page.wait_for_timeout(3000)

                    # Get final URL
                    final_url = page.url

                    # Check if we reached a publisher page: must have left
                    # doi.org AND landed on a known publisher domain.
                    if "doi.org" not in final_url and any(
                        domain in final_url
                        for domain in [
                            "sciencedirect",
                            "springer",
                            "nature",
                            "wiley",
                            "ieee",
                        ]
                    ):
                        # Check for PDF access
                        pdf_available = await self._check_pdf_access_async(
                            page
                        )

                        return {
                            "url": final_url,
                            "access_type": "openurl",
                            "pdf_available": pdf_available,
                            "verified": True,
                        }

                finally:
                    await browser.close()

        except Exception as e:
            logger.debug(f"OpenURL resolution failed for {doi}: {e}")

        return None
|
|
390
|
+
|
|
391
|
+
async def _try_direct_urls_async(
|
|
392
|
+
self, doi: str, verify_access: bool = True
|
|
393
|
+
) -> Optional[Dict[str, any]]:
|
|
394
|
+
"""Try direct publisher URLs."""
|
|
395
|
+
urls = self._build_direct_urls(doi)
|
|
396
|
+
|
|
397
|
+
for url in urls:
|
|
398
|
+
try:
|
|
399
|
+
if verify_access:
|
|
400
|
+
# Verify with browser
|
|
401
|
+
result = await self._verify_url_access_async(url)
|
|
402
|
+
if result:
|
|
403
|
+
return {
|
|
404
|
+
"url": url,
|
|
405
|
+
"access_type": "direct",
|
|
406
|
+
"pdf_available": result.get(
|
|
407
|
+
"pdf_available", False
|
|
408
|
+
),
|
|
409
|
+
"verified": True,
|
|
410
|
+
}
|
|
411
|
+
else:
|
|
412
|
+
# Just check if URL responds
|
|
413
|
+
async with aiohttp.ClientSession() as session:
|
|
414
|
+
async with session.head(
|
|
415
|
+
url, allow_redirects=True
|
|
416
|
+
) as resp:
|
|
417
|
+
if resp.status == 200:
|
|
418
|
+
return {
|
|
419
|
+
"url": url,
|
|
420
|
+
"access_type": "direct",
|
|
421
|
+
"verified": False,
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
except Exception as e:
|
|
425
|
+
logger.debug(f"Failed to access {url}: {e}")
|
|
426
|
+
continue
|
|
427
|
+
|
|
428
|
+
return None
|
|
429
|
+
|
|
430
|
+
    async def _verify_url_access_async(
        self, url: str
    ) -> Optional[Dict[str, Any]]:
        """Verify that *url* provides article access via a headless browser.

        Loads the page, scans its HTML for paywall keywords, and checks for
        PDF download affordances. Access is accepted when no paywall marker
        is found OR a PDF is available.

        Returns:
            Dict with 'pdf_available' and 'has_paywall' on success, or
            None when the page looks paywalled or navigation fails.
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()

                try:
                    # Navigate to URL
                    await page.goto(
                        url, wait_until="networkidle", timeout=30000
                    )

                    # Check for common paywall indicators.
                    # NOTE(review): keyword matching over the full HTML is a
                    # heuristic — terms like "log in" appear on many open
                    # pages too, so false positives are possible.
                    paywall_indicators = [
                        "purchase",
                        "buy",
                        "subscribe",
                        "access denied",
                        "log in",
                        "sign in",
                        "institutional login",
                    ]

                    page_text = await page.content()
                    page_text_lower = page_text.lower()

                    has_paywall = any(
                        indicator in page_text_lower
                        for indicator in paywall_indicators
                    )

                    # Check for PDF access
                    pdf_available = await self._check_pdf_access_async(page)

                    # PDF availability overrides a suspected paywall.
                    if not has_paywall or pdf_available:
                        return {
                            "pdf_available": pdf_available,
                            "has_paywall": has_paywall,
                        }

                finally:
                    await browser.close()

        except Exception as e:
            logger.debug(f"Failed to verify {url}: {e}")

        return None
|
|
480
|
+
|
|
481
|
+
async def _check_pdf_access_async(self, page: Page) -> bool:
    """Check if PDF download is available on the page.

    Looks for PDF download links/buttons first, then for an embedded PDF
    viewer (iframe/embed). Best-effort: any scraping error is logged at
    debug level and reported as "no PDF".
    """
    # CSS selectors that typically mark a PDF download affordance.
    link_selectors = (
        'a[href*=".pdf"]',
        'a[href*="/pdf/"]',
        'button:has-text("Download PDF")',
        'a:has-text("Download PDF")',
        'a:has-text("View PDF")',
        'a:has-text("Full Text PDF")',
        ".pdf-download",
        '[class*="pdf-link"]',
    )

    try:
        for css in link_selectors:
            if await page.query_selector_all(css):
                return True

        # Fall back to detecting an in-page PDF viewer.
        if await page.query_selector_all(
            'iframe[src*="pdf"], embed[type="application/pdf"]'
        ):
            return True
    except Exception as e:
        logger.debug(f"Error checking PDF access: {e}")

    return False
|
|
512
|
+
|
|
513
|
+
async def resolve_batch_async(
    self, dois: List[str], max_concurrent: int = 3, progress_callback=None
) -> Dict[str, Dict[str, any]]:
    """
    Resolve multiple DOIs concurrently.

    Args:
        dois: List of DOIs to resolve
        max_concurrent: Maximum concurrent resolutions
        progress_callback: Optional callback for progress updates,
            called as ``progress_callback(index, total, message)``

    Returns:
        Dict mapping DOIs to resolution results
    """
    resolved: Dict[str, Dict[str, any]] = {}
    gate = asyncio.Semaphore(max_concurrent)
    total = len(dois)

    async def _worker(position: int, doi: str):
        # The shared semaphore bounds how many resolutions run at once.
        async with gate:
            if progress_callback:
                progress_callback(position, total, f"Resolving {doi}")

            outcome = await self.resolve_single_async(doi)
            resolved[doi] = outcome
            return doi, outcome

    # return_exceptions=True captures per-task failures instead of
    # raising, so one bad DOI cannot abort the whole batch; DOIs whose
    # worker raised are simply absent from the result mapping.
    await asyncio.gather(
        *(_worker(idx, doi) for idx, doi in enumerate(dois)),
        return_exceptions=True,
    )

    return resolved
|
|
548
|
+
|
|
549
|
+
def resolve_from_bibtex(
    self, bibtex_path: Path, output_path: Optional[Path] = None
) -> Dict[str, Dict[str, any]]:
    """
    Resolve URLs for all DOIs in a BibTeX file.

    Args:
        bibtex_path: Path to BibTeX file
        output_path: Optional path for updated BibTeX

    Returns:
        Dict mapping DOIs to resolution results

    Note:
        This is a synchronous entry point: it starts its own event loop
        via ``asyncio.run`` and therefore must not be called from inside
        a running event loop (use ``resolve_batch_async`` there).
    """
    import bibtexparser

    # Load BibTeX
    with open(bibtex_path, "r", encoding="utf-8") as f:
        bib_db = bibtexparser.load(f)

    # Extract DOIs. NOTE(review): duplicate DOIs keep only the last
    # entry seen, so earlier duplicate entries are never updated.
    dois = []
    doi_to_entry = {}
    for entry in bib_db.entries:
        if "doi" in entry:
            doi = entry["doi"]
            dois.append(doi)
            doi_to_entry[doi] = entry

    logger.info(f"Found {len(dois)} DOIs in {bibtex_path}")

    # Resolve URLs. asyncio.run creates, runs, and cleanly tears down a
    # fresh event loop; the previous manual new_event_loop()/
    # set_event_loop()/close() sequence left the thread-local "current
    # loop" slot pointing at a closed loop on exit.
    results = asyncio.run(self.resolve_batch_async(dois))

    # Update BibTeX entries with URLs
    success_count = 0
    for doi, result in results.items():
        if result and result.get("url"):
            entry = doi_to_entry[doi]
            entry["url"] = result["url"]
            entry["url_source"] = result["access_type"]
            if result.get("pdf_available"):
                entry["pdf_available"] = "yes"
            success_count += 1

    logger.info(f"Resolved URLs for {success_count}/{len(dois)} DOIs")

    # Save updated BibTeX if requested
    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            bibtexparser.dump(bib_db, f)
        logger.info(f"Saved updated BibTeX to {output_path}")

    return results
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
async def main():
    """Command-line interface for DOI to URL resolution.

    Returns:
        Exit status: 0 on success, 1 when a single DOI fails to resolve.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Resolve DOIs to accessible publisher URLs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Resolve single DOI
    python -m scitex.scholar.open_url.resolve_urls --doi "10.1038/nature12373"

    # Resolve DOIs from BibTeX file
    python -m scitex.scholar.open_url.resolve_urls --bibtex papers.bib

    # Save URLs to new BibTeX file
    python -m scitex.scholar.open_url.resolve_urls --bibtex papers.bib --output papers-with-urls.bib
""",
    )

    input_group = parser.add_mutually_exclusive_group(required=True)

    input_group.add_argument("--doi", type=str, help="Single DOI to resolve")

    input_group.add_argument(
        "--bibtex", "-b", type=str, help="BibTeX file containing DOIs"
    )

    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output BibTeX file (for --bibtex mode)",
    )

    parser.add_argument(
        "--no-verify", action="store_true", help="Skip access verification"
    )

    args = parser.parse_args()

    # Initialize resolver
    resolver = DOIToURLResolver()

    if args.doi:
        # Single DOI mode
        result = await resolver.resolve_single_async(
            args.doi, verify_access=not args.no_verify
        )

        if result:
            print(f"\nResolved URL: {result['url']}")
            print(f"Access type: {result['access_type']}")
            if "pdf_available" in result:
                print(
                    f"PDF available: {'Yes' if result['pdf_available'] else 'No'}"
                )
        else:
            print("\nFailed to resolve DOI")
            # Previously this path still exited 0; report failure to
            # shell callers via a nonzero status.
            return 1

    else:
        # BibTeX mode. resolve_from_bibtex is synchronous and starts its
        # own event loop internally, which raises RuntimeError if invoked
        # directly from this already-running loop -- run it in a worker
        # thread where no loop is active.
        results = await asyncio.to_thread(
            resolver.resolve_from_bibtex,
            Path(args.bibtex),
            Path(args.output) if args.output else None,
        )

        # Print summary
        success = sum(1 for r in results.values() if r and r.get("url"))
        print(f"\nResolved {success}/{len(results)} DOIs")

        # Show first few results
        for doi, result in list(results.items())[:5]:
            if result:
                print(f"\n{doi}:")
                print(f"  URL: {result['url']}")
                print(f"  Type: {result['access_type']}")

    return 0
|
|
688
|
+
|
|
689
|
+
if __name__ == "__main__":
    import sys

    # Run the async CLI to completion and propagate main()'s return
    # value as the process exit status via sys.exit.
    sys.exit(asyncio.run(main()))

# EOF
|