scitex 2.14.0__py3-none-any.whl → 2.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +47 -0
- scitex/_env_loader.py +156 -0
- scitex/_mcp_resources/__init__.py +37 -0
- scitex/_mcp_resources/_cheatsheet.py +135 -0
- scitex/_mcp_resources/_figrecipe.py +138 -0
- scitex/_mcp_resources/_formats.py +102 -0
- scitex/_mcp_resources/_modules.py +337 -0
- scitex/_mcp_resources/_session.py +149 -0
- scitex/_mcp_tools/__init__.py +4 -0
- scitex/_mcp_tools/audio.py +66 -0
- scitex/_mcp_tools/diagram.py +11 -95
- scitex/_mcp_tools/introspect.py +191 -0
- scitex/_mcp_tools/plt.py +260 -305
- scitex/_mcp_tools/scholar.py +74 -0
- scitex/_mcp_tools/social.py +244 -0
- scitex/_mcp_tools/writer.py +21 -204
- scitex/ai/_gen_ai/_PARAMS.py +10 -7
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
- scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
- scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
- scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
- scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
- scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
- scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
- scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
- scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
- scitex/audio/README.md +40 -36
- scitex/audio/__init__.py +127 -59
- scitex/audio/_branding.py +185 -0
- scitex/audio/_mcp/__init__.py +32 -0
- scitex/audio/_mcp/handlers.py +59 -6
- scitex/audio/_mcp/speak_handlers.py +238 -0
- scitex/audio/_relay.py +225 -0
- scitex/audio/engines/elevenlabs_engine.py +6 -1
- scitex/audio/mcp_server.py +228 -75
- scitex/canvas/README.md +1 -1
- scitex/canvas/editor/_dearpygui/__init__.py +25 -0
- scitex/canvas/editor/_dearpygui/_editor.py +147 -0
- scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
- scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
- scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
- scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
- scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
- scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
- scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
- scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
- scitex/canvas/editor/_dearpygui/_selection.py +295 -0
- scitex/canvas/editor/_dearpygui/_state.py +93 -0
- scitex/canvas/editor/_dearpygui/_utils.py +61 -0
- scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
- scitex/cli/__init__.py +38 -43
- scitex/cli/audio.py +76 -27
- scitex/cli/capture.py +13 -20
- scitex/cli/introspect.py +443 -0
- scitex/cli/main.py +198 -109
- scitex/cli/mcp.py +60 -34
- scitex/cli/scholar/__init__.py +8 -0
- scitex/cli/scholar/_crossref_scitex.py +296 -0
- scitex/cli/scholar/_fetch.py +25 -3
- scitex/cli/social.py +314 -0
- scitex/cli/writer.py +117 -0
- scitex/config/README.md +1 -1
- scitex/config/__init__.py +16 -2
- scitex/config/_env_registry.py +191 -0
- scitex/diagram/__init__.py +42 -19
- scitex/diagram/mcp_server.py +13 -125
- scitex/introspect/__init__.py +75 -0
- scitex/introspect/_call_graph.py +303 -0
- scitex/introspect/_class_hierarchy.py +163 -0
- scitex/introspect/_core.py +42 -0
- scitex/introspect/_docstring.py +131 -0
- scitex/introspect/_examples.py +113 -0
- scitex/introspect/_imports.py +271 -0
- scitex/introspect/_mcp/__init__.py +37 -0
- scitex/introspect/_mcp/handlers.py +208 -0
- scitex/introspect/_members.py +151 -0
- scitex/introspect/_resolve.py +89 -0
- scitex/introspect/_signature.py +131 -0
- scitex/introspect/_source.py +80 -0
- scitex/introspect/_type_hints.py +172 -0
- scitex/io/bundle/README.md +1 -1
- scitex/mcp_server.py +98 -5
- scitex/plt/__init__.py +248 -550
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
- scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/plt/gallery/README.md +1 -1
- scitex/plt/utils/_hitmap/__init__.py +82 -0
- scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
- scitex/plt/utils/_hitmap/_color_application.py +346 -0
- scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
- scitex/plt/utils/_hitmap/_constants.py +40 -0
- scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
- scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
- scitex/plt/utils/_hitmap/_query.py +113 -0
- scitex/plt/utils/_hitmap.py +46 -1616
- scitex/plt/utils/_metadata/__init__.py +80 -0
- scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
- scitex/plt/utils/_metadata/_artists/_base.py +195 -0
- scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
- scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
- scitex/plt/utils/_metadata/_artists/_images.py +80 -0
- scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
- scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
- scitex/plt/utils/_metadata/_artists/_text.py +106 -0
- scitex/plt/utils/_metadata/_csv.py +416 -0
- scitex/plt/utils/_metadata/_detect.py +225 -0
- scitex/plt/utils/_metadata/_legend.py +127 -0
- scitex/plt/utils/_metadata/_rounding.py +117 -0
- scitex/plt/utils/_metadata/_verification.py +202 -0
- scitex/schema/README.md +1 -1
- scitex/scholar/__init__.py +8 -0
- scitex/scholar/_mcp/crossref_handlers.py +265 -0
- scitex/scholar/core/Scholar.py +63 -1700
- scitex/scholar/core/_mixins/__init__.py +36 -0
- scitex/scholar/core/_mixins/_enrichers.py +270 -0
- scitex/scholar/core/_mixins/_library_handlers.py +100 -0
- scitex/scholar/core/_mixins/_loaders.py +103 -0
- scitex/scholar/core/_mixins/_pdf_download.py +375 -0
- scitex/scholar/core/_mixins/_pipeline.py +312 -0
- scitex/scholar/core/_mixins/_project_handlers.py +125 -0
- scitex/scholar/core/_mixins/_savers.py +69 -0
- scitex/scholar/core/_mixins/_search.py +103 -0
- scitex/scholar/core/_mixins/_services.py +88 -0
- scitex/scholar/core/_mixins/_url_finding.py +105 -0
- scitex/scholar/crossref_scitex.py +367 -0
- scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/scholar/examples/00_run_all.sh +120 -0
- scitex/scholar/jobs/_executors.py +27 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
- scitex/scholar/pdf_download/_cli.py +154 -0
- scitex/scholar/pdf_download/strategies/__init__.py +11 -8
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
- scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
- scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
- scitex/scholar/pipelines/_single_steps.py +71 -36
- scitex/scholar/storage/_LibraryManager.py +97 -1695
- scitex/scholar/storage/_mixins/__init__.py +30 -0
- scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
- scitex/scholar/storage/_mixins/_library_operations.py +218 -0
- scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
- scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
- scitex/scholar/storage/_mixins/_resolution.py +376 -0
- scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
- scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
- scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +462 -0
- scitex/scholar/url_finder/.tmp/open_url/README.md +223 -0
- scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +694 -0
- scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +1160 -0
- scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +344 -0
- scitex/scholar/url_finder/.tmp/open_url/__init__.py +24 -0
- scitex/security/README.md +3 -3
- scitex/session/README.md +1 -1
- scitex/sh/README.md +1 -1
- scitex/social/__init__.py +153 -0
- scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/template/README.md +1 -1
- scitex/template/clone_writer_directory.py +5 -5
- scitex/writer/README.md +1 -1
- scitex/writer/_mcp/handlers.py +11 -744
- scitex/writer/_mcp/tool_schemas.py +5 -335
- scitex-2.15.1.dist-info/METADATA +648 -0
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/RECORD +166 -111
- scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
- scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
- scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +0 -90
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +0 -1571
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +0 -6262
- scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +0 -1274
- scitex/dev/plt/data/mpl/dir_ax.txt +0 -459
- scitex/diagram/_compile.py +0 -312
- scitex/diagram/_diagram.py +0 -355
- scitex/diagram/_mcp/__init__.py +0 -4
- scitex/diagram/_mcp/handlers.py +0 -400
- scitex/diagram/_mcp/tool_schemas.py +0 -157
- scitex/diagram/_presets.py +0 -173
- scitex/diagram/_schema.py +0 -182
- scitex/diagram/_split.py +0 -278
- scitex/plt/_mcp/__init__.py +0 -4
- scitex/plt/_mcp/_handlers_annotation.py +0 -102
- scitex/plt/_mcp/_handlers_figure.py +0 -195
- scitex/plt/_mcp/_handlers_plot.py +0 -252
- scitex/plt/_mcp/_handlers_style.py +0 -219
- scitex/plt/_mcp/handlers.py +0 -74
- scitex/plt/_mcp/tool_schemas.py +0 -497
- scitex/plt/mcp_server.py +0 -231
- scitex/scholar/data/.gitkeep +0 -0
- scitex/scholar/data/README.md +0 -44
- scitex/scholar/data/bib_files/bibliography.bib +0 -1952
- scitex/scholar/data/bib_files/neurovista.bib +0 -277
- scitex/scholar/data/bib_files/neurovista_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_processed.bib +0 -338
- scitex/scholar/data/bib_files/openaccess.bib +0 -89
- scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +0 -2178
- scitex/scholar/data/bib_files/pac.bib +0 -698
- scitex/scholar/data/bib_files/pac_enriched.bib +0 -1061
- scitex/scholar/data/bib_files/pac_processed.bib +0 -0
- scitex/scholar/data/bib_files/pac_titles.txt +0 -75
- scitex/scholar/data/bib_files/paywalled.bib +0 -98
- scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +0 -58
- scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +0 -87
- scitex/scholar/data/bib_files/seizure_prediction.bib +0 -694
- scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
- scitex/scholar/data/bib_files/test_complete_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_final_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_seizure.bib +0 -46
- scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
- scitex/scholar/data/impact_factor.db +0 -0
- scitex/scholar/examples/SUGGESTIONS.md +0 -865
- scitex/scholar/examples/dev.py +0 -38
- scitex-2.14.0.dist-info/METADATA +0 -1238
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/WHEEL +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/entry_points.txt +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Timestamp: "2026-01-22 (ywatanabe)"
|
|
3
|
+
# File: src/scitex/scholar/pdf_download/_cli.py
|
|
4
|
+
"""CLI entry point for ScholarPDFDownloader."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import asyncio
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from scitex import logging
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
__FILE__ = __file__
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
async def main_async(args):
    """Resolve PDF URLs for a DOI and download them via an authenticated browser.

    Args:
        args: Parsed CLI namespace. Reads ``args.doi``, ``args.output`` and
            ``args.browser_mode``.

    Returns
    -------
    None. When no usable PDF URL is found an error is logged and the
    function returns early without downloading anything.
    """
    from scitex.scholar import (
        ScholarAuthManager,
        ScholarBrowserManager,
        ScholarURLFinder,
    )
    from scitex.scholar.auth import AuthenticationGateway
    from scitex.scholar.pdf_download import ScholarPDFDownloader

    # Authenticated Browser and Context
    auth_manager = ScholarAuthManager()
    browser_manager = ScholarBrowserManager(
        chrome_profile_name="system",
        browser_mode=args.browser_mode,
        auth_manager=auth_manager,
        use_zenrows_proxy=False,
    )
    # Only the context is used below; the browser handle is intentionally ignored.
    (
        _browser,
        context,
    ) = await browser_manager.get_authenticated_browser_and_context_async()

    # Authentication Gateway
    auth_gateway = AuthenticationGateway(
        auth_manager=auth_manager,
        browser_manager=browser_manager,
    )
    url_context = await auth_gateway.prepare_context_async(
        doi=args.doi, context=context
    )

    # URL Resolution
    url_finder = ScholarURLFinder(context)
    resolved_url = url_context.url if url_context else None
    if resolved_url:
        logger.info(f"Using resolved URL from auth_gateway: {resolved_url}")
        urls = await url_finder.find_pdf_urls(resolved_url)
    else:
        logger.info(f"No resolved URL, using DOI: {args.doi}")
        urls = await url_finder.find_pdf_urls(args.doi)

    # Extract URL strings from list of dicts.
    # Keep only non-empty strings: a dict entry without a "url" key would
    # otherwise contribute None, defeat the emptiness check below and be
    # handed to the downloader as if it were a URL.
    pdf_urls = []
    for entry in urls or []:
        candidate = entry.get("url") if isinstance(entry, dict) else entry
        if isinstance(candidate, str) and candidate:
            pdf_urls.append(candidate)

    if not pdf_urls:
        logger.error(f"No PDF URLs found for DOI: {args.doi}")
        return

    logger.info(f"Found {len(pdf_urls)} PDF URL(s) for DOI: {args.doi}")

    # The default --output starts with "~", which is never expanded by a
    # shell when it comes from argparse's default, so expand it here.
    output_path = Path(args.output).expanduser()

    # PDF Download
    pdf_downloader = ScholarPDFDownloader(context)
    if len(pdf_urls) == 1:
        # Passed as str to keep the argument type the original call used.
        await pdf_downloader.download_from_url(pdf_urls[0], str(output_path))
    else:
        # Several candidates: download them all next to the requested file.
        output_dir = output_path.parent
        await pdf_downloader.download_from_urls(
            pdf_urls, output_dir=output_dir, max_concurrent=3
        )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def main(args):
    """Run the async download workflow to completion.

    Args:
        args: Parsed CLI namespace, forwarded unchanged to ``main_async``.

    Returns
    -------
    Always 0; the result of ``main_async`` is not inspected.
    """
    workflow = main_async(args)
    asyncio.run(workflow)
    return 0
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so existing zero-argument callers behave
            exactly as before; passing a list allows programmatic use.

    Returns
    -------
    Namespace with ``doi``, ``output`` and ``browser_mode`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Download a PDF using DOI with authentication support"
    )
    parser.add_argument(
        "--doi",
        type=str,
        required=True,
        help="DOI of the paper (e.g., 10.1088/1741-2552/aaf92e)",
    )
    parser.add_argument(
        "--output",
        type=str,
        # NOTE: the leading "~" is returned verbatim; argparse does not expand it.
        default="~/.scitex/scholar/library/downloads/downloaded_paper.pdf",
        help="Output path for the PDF",
    )
    parser.add_argument(
        "--browser-mode",
        type=str,
        choices=["stealth", "interactive"],
        default="stealth",
        help="Browser mode (default: stealth)",
    )
    return parser.parse_args(argv)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def run_main() -> None:
    """Start a scitex session, run ``main`` with the parsed CLI args, then close it.

    The names listed in the ``global`` statement are bound at module level,
    including ``sys`` and ``plt`` imported below. ``sys.stdout`` and
    ``sys.stderr`` are rebound to the streams returned by
    ``stx.session.start``.
    """
    global CONFIG, CC, sys, plt, rng

    import sys

    import matplotlib.pyplot as plt

    import scitex as stx

    args = parse_args()

    session_state = stx.session.start(
        sys,
        plt,
        args=args,
        file=__FILE__,
        sdir_suffix=None,
        verbose=False,
        agg=True,
    )
    CONFIG, sys.stdout, sys.stderr, plt, CC, rng = session_state

    exit_status = main(args)

    # Hand the exit status to the session so it is recorded on close.
    close_options = dict(
        verbose=False,
        notify=False,
        message="",
        exit_status=exit_status,
    )
    stx.session.close(CONFIG, **close_options)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
if __name__ == "__main__":
|
|
152
|
+
run_main()
|
|
153
|
+
|
|
154
|
+
# EOF
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
2
|
"""PDF Download Strategies
|
|
4
3
|
|
|
5
4
|
This module contains different strategies for downloading PDFs from academic publishers.
|
|
@@ -9,21 +8,24 @@ Each strategy is tried in sequence until one succeeds.
|
|
|
9
8
|
# Download strategies
|
|
10
9
|
from .chrome_pdf_viewer import try_download_chrome_pdf_viewer_async
|
|
11
10
|
from .direct_download import try_download_direct_async
|
|
12
|
-
from .
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
try_download_open_access_async,
|
|
16
|
-
try_download_open_access_sync,
|
|
11
|
+
from .manual_download_fallback import (
|
|
12
|
+
handle_manual_download_on_page_async,
|
|
13
|
+
try_download_manual_async,
|
|
17
14
|
)
|
|
18
15
|
|
|
19
16
|
# Manual download utilities
|
|
20
17
|
from .manual_download_utils import (
|
|
21
18
|
DownloadMonitorAndSync,
|
|
22
19
|
FlexibleFilenameGenerator,
|
|
23
|
-
show_stop_automation_button_async,
|
|
24
|
-
show_manual_download_button_async,
|
|
25
20
|
complete_manual_download_workflow_async,
|
|
21
|
+
show_manual_download_button_async,
|
|
22
|
+
show_stop_automation_button_async,
|
|
23
|
+
)
|
|
24
|
+
from .open_access_download import (
|
|
25
|
+
try_download_open_access_async,
|
|
26
|
+
try_download_open_access_sync,
|
|
26
27
|
)
|
|
28
|
+
from .response_body import try_download_response_body_async
|
|
27
29
|
|
|
28
30
|
__all__ = [
|
|
29
31
|
# Download strategies
|
|
@@ -33,6 +35,7 @@ __all__ = [
|
|
|
33
35
|
"try_download_manual_async",
|
|
34
36
|
"try_download_open_access_async",
|
|
35
37
|
"try_download_open_access_sync",
|
|
38
|
+
"handle_manual_download_on_page_async",
|
|
36
39
|
# Manual download utilities
|
|
37
40
|
"DownloadMonitorAndSync",
|
|
38
41
|
"FlexibleFilenameGenerator",
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
2
|
# Timestamp: "2025-10-13 08:00:08 (ywatanabe)"
|
|
4
3
|
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/pdf_download/strategies/manual_download_fallback.py
|
|
5
4
|
# ----------------------------------------
|
|
6
5
|
from __future__ import annotations
|
|
6
|
+
|
|
7
7
|
import os
|
|
8
8
|
|
|
9
9
|
__FILE__ = "./src/scitex/scholar/pdf_download/strategies/manual_download_fallback.py"
|
|
@@ -21,7 +21,6 @@ from scitex.scholar import ScholarConfig
|
|
|
21
21
|
from scitex.scholar.browser import browser_logger
|
|
22
22
|
from scitex.scholar.pdf_download.strategies.manual_download_utils import (
|
|
23
23
|
DownloadMonitorAndSync,
|
|
24
|
-
complete_manual_download_workflow_async,
|
|
25
24
|
)
|
|
26
25
|
|
|
27
26
|
logger = logging.getLogger(__name__)
|
|
@@ -51,7 +50,8 @@ async def try_download_manual_async(
|
|
|
51
50
|
config: Scholar configuration
|
|
52
51
|
doi: Optional DOI for filename generation
|
|
53
52
|
|
|
54
|
-
Returns
|
|
53
|
+
Returns
|
|
54
|
+
-------
|
|
55
55
|
Path to downloaded file, or None if failed
|
|
56
56
|
"""
|
|
57
57
|
config = config or ScholarConfig()
|
|
@@ -164,4 +164,81 @@ async def try_download_manual_async(
|
|
|
164
164
|
return None
|
|
165
165
|
|
|
166
166
|
|
|
167
|
+
async def handle_manual_download_on_page_async(
    page,
    pdf_url: str,
    output_path: Path,
    func_name: str = "handle_manual_download",
    config: ScholarConfig = None,
    doi: Optional[str] = None,
) -> Optional[Path]:
    """Handle manual download on an already-open page.

    Unlike try_download_manual_async, this uses an existing page
    (e.g., from the stop automation button workflow).

    Args:
        page: Already-open Playwright page
        pdf_url: URL of the PDF
        output_path: Target output path. NOTE(review): currently unused;
            the detected file is left in the library downloads directory
            and that location is what gets returned.
        func_name: Prefix used in every log message emitted here
        config: Scholar configuration (a default ScholarConfig is created
            when omitted)
        doi: Optional DOI for metadata; derived from ``pdf_url`` when it
            contains "doi.org/" and no DOI was given

    Returns
    -------
    Path to downloaded file, or None if failed
    """
    config = config or ScholarConfig()
    downloads_dir = config.get_library_downloads_dir()

    # Extract DOI from URL if not provided (query string and fragment dropped)
    if not doi and "doi.org/" in pdf_url:
        doi = pdf_url.split("doi.org/")[-1].split("?")[0].split("#")[0]

    await browser_logger.info(page, f"{func_name}: Manual download mode activated")
    await browser_logger.info(
        page, f"{func_name}: Please download the PDF manually from this page"
    )

    # Monitor for download; the same directory is passed for both arguments.
    monitor = DownloadMonitorAndSync(downloads_dir, downloads_dir)

    def log_progress(msg: str):
        logger.info(f"{func_name}: {msg}")

    temp_file = await monitor.monitor_for_new_download_async(
        timeout_sec=120, logger_func=log_progress
    )

    if not temp_file:
        await browser_logger.error(
            page, f"{func_name}: No new PDF detected in downloads directory"
        )
        return None

    await browser_logger.info(
        page,
        f"{func_name}: Detected PDF: {temp_file.name} ({temp_file.stat().st_size / 1e6:.1f} MB)",
    )

    # Save minimal metadata as a sidecar file next to the PDF
    if doi:
        import json

        metadata_file = temp_file.parent / f"{temp_file.name}.meta.json"
        metadata = {"doi": doi, "pdf_url": pdf_url, "pdf_file": temp_file.name}
        # Explicit encoding so the sidecar does not depend on the platform default.
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

    await browser_logger.info(
        page, f"{func_name}: Manual download complete - saved in downloads/"
    )

    logger.info(f"{func_name}: PDF: {temp_file}")
    if doi:
        logger.info(f"{func_name}: DOI: {doi} (saved in {temp_file.name}.meta.json)")

    return temp_file
|
|
242
|
+
|
|
243
|
+
|
|
167
244
|
# EOF
|
|
@@ -1,14 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
|
-
# Timestamp: "
|
|
4
|
-
# File: /home/ywatanabe/proj/
|
|
5
|
-
# ----------------------------------------
|
|
6
|
-
from __future__ import annotations
|
|
7
|
-
import os
|
|
3
|
+
# Timestamp: "2026-01-22 16:32:41 (ywatanabe)"
|
|
4
|
+
# File: /home/ywatanabe/proj/scitex-code/src/scitex/scholar/pipelines/ScholarPipelineBibTeX.py
|
|
8
5
|
|
|
9
|
-
__FILE__ = "./src/scitex/scholar/pipelines/ScholarPipelineBibTeX.py"
|
|
10
|
-
__DIR__ = os.path.dirname(__FILE__)
|
|
11
|
-
# ----------------------------------------
|
|
12
6
|
|
|
13
7
|
"""
|
|
14
8
|
Functionalities:
|
|
@@ -33,12 +27,12 @@ IO:
|
|
|
33
27
|
- {input_bibtex}_processed.bib (enriched BibTeX with download status)
|
|
34
28
|
"""
|
|
35
29
|
|
|
36
|
-
"""Imports"""
|
|
37
|
-
import argparse
|
|
38
30
|
import asyncio
|
|
39
31
|
from pathlib import Path
|
|
40
|
-
from typing import Optional
|
|
32
|
+
from typing import Optional
|
|
33
|
+
from typing import Union
|
|
41
34
|
|
|
35
|
+
import scitex as stx
|
|
42
36
|
from scitex import logging
|
|
43
37
|
from scitex.scholar.core import Papers
|
|
44
38
|
from scitex.scholar.pipelines.ScholarPipelineParallel import (
|
|
@@ -49,8 +43,6 @@ from scitex.scholar.storage import BibTeXHandler
|
|
|
49
43
|
logger = logging.getLogger(__name__)
|
|
50
44
|
|
|
51
45
|
"""Functions & Classes"""
|
|
52
|
-
|
|
53
|
-
|
|
54
46
|
class ScholarPipelineBibTeX:
|
|
55
47
|
"""Processes BibTeX files through parallel paper acquisition pipeline"""
|
|
56
48
|
|
|
@@ -96,7 +88,8 @@ class ScholarPipelineBibTeX:
|
|
|
96
88
|
project: Project name for symlinking (optional)
|
|
97
89
|
output_bibtex_path: Path to save enriched BibTeX (optional, defaults to {input}_processed.bib)
|
|
98
90
|
|
|
99
|
-
Returns
|
|
91
|
+
Returns
|
|
92
|
+
-------
|
|
100
93
|
Papers collection with processed papers
|
|
101
94
|
"""
|
|
102
95
|
bibtex_path = Path(bibtex_path)
|
|
@@ -145,7 +138,9 @@ class ScholarPipelineBibTeX:
|
|
|
145
138
|
logger.success(
|
|
146
139
|
f"{self.name}: Processed {len(processed_papers)}/{len(papers)} papers"
|
|
147
140
|
)
|
|
148
|
-
logger.success(
|
|
141
|
+
logger.success(
|
|
142
|
+
f"{self.name}: Saved enriched BibTeX: {output_bibtex_path}"
|
|
143
|
+
)
|
|
149
144
|
|
|
150
145
|
# Update project bibliography if project specified
|
|
151
146
|
if project:
|
|
@@ -161,7 +156,9 @@ class ScholarPipelineBibTeX:
|
|
|
161
156
|
bibtex_files=[bibtex_path, output_bibtex_path],
|
|
162
157
|
)
|
|
163
158
|
|
|
164
|
-
logger.success(
|
|
159
|
+
logger.success(
|
|
160
|
+
f"{self.name}: Updated project bibliography: {project}"
|
|
161
|
+
)
|
|
165
162
|
except Exception as e:
|
|
166
163
|
logger.warning(f"Failed to update bibliography: {e}")
|
|
167
164
|
|
|
@@ -180,7 +177,8 @@ class ScholarPipelineBibTeX:
|
|
|
180
177
|
project: Project name for symlinking (optional)
|
|
181
178
|
output_bibtex_path: Path to save enriched BibTeX (optional)
|
|
182
179
|
|
|
183
|
-
Returns
|
|
180
|
+
Returns
|
|
181
|
+
-------
|
|
184
182
|
Papers collection with processed papers
|
|
185
183
|
"""
|
|
186
184
|
logger.info(f"{self.name}: Processing BibTeX text content")
|
|
@@ -193,7 +191,9 @@ class ScholarPipelineBibTeX:
|
|
|
193
191
|
logger.warning(f"{self.name}: No papers found in BibTeX text")
|
|
194
192
|
return Papers([], project=project)
|
|
195
193
|
|
|
196
|
-
logger.info(
|
|
194
|
+
logger.info(
|
|
195
|
+
f"{self.name}: Loaded {len(papers)} papers from BibTeX text"
|
|
196
|
+
)
|
|
197
197
|
|
|
198
198
|
# Step 2: Process papers in parallel
|
|
199
199
|
papers_collection = Papers(papers, project=project)
|
|
@@ -213,7 +213,9 @@ class ScholarPipelineBibTeX:
|
|
|
213
213
|
processed_collection,
|
|
214
214
|
output_path=output_bibtex_path,
|
|
215
215
|
)
|
|
216
|
-
logger.success(
|
|
216
|
+
logger.success(
|
|
217
|
+
f"{self.name}: Saved enriched BibTeX: {output_bibtex_path}"
|
|
218
|
+
)
|
|
217
219
|
|
|
218
220
|
logger.success(
|
|
219
221
|
f"{self.name}: Processed {len(processed_papers)}/{len(papers)} papers"
|
|
@@ -222,134 +224,84 @@ class ScholarPipelineBibTeX:
|
|
|
222
224
|
return processed_collection
|
|
223
225
|
|
|
224
226
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
227
|
+
@stx.session
|
|
228
|
+
def main(
|
|
229
|
+
bibtex: str = None,
|
|
230
|
+
project: str = None,
|
|
231
|
+
output: str = None,
|
|
232
|
+
num_workers: int = 4,
|
|
233
|
+
browser_mode: str = "stealth",
|
|
234
|
+
chrome_profile: str = "system",
|
|
235
|
+
CONFIG=stx.INJECTED,
|
|
236
|
+
logger=stx.INJECTED,
|
|
237
|
+
) -> int:
|
|
238
|
+
"""Process BibTeX files through parallel paper acquisition pipeline.
|
|
239
|
+
|
|
240
|
+
Parameters
|
|
241
|
+
----------
|
|
242
|
+
bibtex : str
|
|
243
|
+
Path to BibTeX file (required)
|
|
244
|
+
project : str
|
|
245
|
+
Project name for symlinking (optional)
|
|
246
|
+
output : str
|
|
247
|
+
Output BibTeX path (default: {input}_processed.bib)
|
|
248
|
+
num_workers : int
|
|
249
|
+
Number of parallel workers (default: 4)
|
|
250
|
+
browser_mode : str
|
|
251
|
+
Browser mode: 'stealth' or 'interactive' (default: stealth)
|
|
252
|
+
chrome_profile : str
|
|
253
|
+
Base Chrome profile name to sync from (default: system)
|
|
254
|
+
|
|
255
|
+
Returns
|
|
256
|
+
-------
|
|
257
|
+
int
|
|
258
|
+
Exit status code (0 for success)
|
|
259
|
+
"""
|
|
260
|
+
if not bibtex:
|
|
229
261
|
logger.error("No BibTeX file provided. Use --bibtex")
|
|
230
262
|
return 1
|
|
231
263
|
|
|
232
|
-
bibtex_path = Path(
|
|
264
|
+
bibtex_path = Path(bibtex)
|
|
233
265
|
if not bibtex_path.exists():
|
|
234
266
|
logger.error(f"BibTeX file not found: {bibtex_path}")
|
|
235
267
|
return 1
|
|
236
268
|
|
|
237
269
|
logger.info(f"Processing BibTeX file: {bibtex_path}")
|
|
238
|
-
logger.info(f"Workers: {
|
|
239
|
-
logger.info(f"Project: {
|
|
270
|
+
logger.info(f"Workers: {num_workers}")
|
|
271
|
+
logger.info(f"Project: {project or 'None'}")
|
|
240
272
|
|
|
241
273
|
# Create BibTeX pipeline
|
|
242
274
|
bibtex_pipeline = ScholarPipelineBibTeX(
|
|
243
|
-
num_workers=
|
|
244
|
-
browser_mode=
|
|
245
|
-
base_chrome_profile=
|
|
275
|
+
num_workers=num_workers,
|
|
276
|
+
browser_mode=browser_mode,
|
|
277
|
+
base_chrome_profile=chrome_profile,
|
|
246
278
|
)
|
|
247
279
|
|
|
248
280
|
# Run pipeline
|
|
249
281
|
papers = asyncio.run(
|
|
250
282
|
bibtex_pipeline.process_bibtex_file_async(
|
|
251
283
|
bibtex_path=bibtex_path,
|
|
252
|
-
project=
|
|
253
|
-
output_bibtex_path=
|
|
284
|
+
project=project,
|
|
285
|
+
output_bibtex_path=output,
|
|
254
286
|
)
|
|
255
287
|
)
|
|
256
288
|
|
|
257
|
-
logger.success(
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def parse_args() -> argparse.Namespace:
|
|
262
|
-
"""Parse command line arguments."""
|
|
263
|
-
parser = argparse.ArgumentParser(
|
|
264
|
-
description="Process BibTeX files through parallel paper acquisition pipeline"
|
|
265
|
-
)
|
|
266
|
-
parser.add_argument(
|
|
267
|
-
"--bibtex",
|
|
268
|
-
type=str,
|
|
269
|
-
required=True,
|
|
270
|
-
help="Path to BibTeX file",
|
|
271
|
-
)
|
|
272
|
-
parser.add_argument(
|
|
273
|
-
"--project",
|
|
274
|
-
type=str,
|
|
275
|
-
default=None,
|
|
276
|
-
help="Project name for symlinking (optional)",
|
|
277
|
-
)
|
|
278
|
-
parser.add_argument(
|
|
279
|
-
"--output",
|
|
280
|
-
type=str,
|
|
281
|
-
default=None,
|
|
282
|
-
help="Output BibTeX path (default: {input}_processed.bib)",
|
|
283
|
-
)
|
|
284
|
-
parser.add_argument(
|
|
285
|
-
"--num-workers",
|
|
286
|
-
type=int,
|
|
287
|
-
default=4,
|
|
288
|
-
help="Number of parallel workers (default: 4)",
|
|
289
|
-
)
|
|
290
|
-
parser.add_argument(
|
|
291
|
-
"--browser-mode",
|
|
292
|
-
type=str,
|
|
293
|
-
choices=["stealth", "interactive"],
|
|
294
|
-
default="stealth",
|
|
295
|
-
help="Browser mode (default: stealth)",
|
|
296
|
-
)
|
|
297
|
-
parser.add_argument(
|
|
298
|
-
"--chrome-profile",
|
|
299
|
-
type=str,
|
|
300
|
-
default="system",
|
|
301
|
-
help="Base Chrome profile name to sync from (default: system)",
|
|
302
|
-
)
|
|
303
|
-
args = parser.parse_args()
|
|
304
|
-
return args
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def run_main() -> None:
|
|
308
|
-
"""Initialize scitex framework, run main function, and cleanup."""
|
|
309
|
-
global CONFIG, CC, sys, plt, rng
|
|
310
|
-
|
|
311
|
-
import sys
|
|
312
|
-
|
|
313
|
-
import matplotlib.pyplot as plt
|
|
314
|
-
|
|
315
|
-
import scitex as stx
|
|
316
|
-
|
|
317
|
-
args = parse_args()
|
|
318
|
-
|
|
319
|
-
CONFIG, sys.stdout, sys.stderr, plt, CC, rng = stx.session.start(
|
|
320
|
-
sys,
|
|
321
|
-
plt,
|
|
322
|
-
args=args,
|
|
323
|
-
file=__FILE__,
|
|
324
|
-
sdir_suffix=None,
|
|
325
|
-
verbose=False,
|
|
326
|
-
agg=True,
|
|
327
|
-
)
|
|
328
|
-
|
|
329
|
-
exit_status = main(args)
|
|
330
|
-
|
|
331
|
-
stx.session.close(
|
|
332
|
-
CONFIG,
|
|
333
|
-
verbose=False,
|
|
334
|
-
notify=False,
|
|
335
|
-
message="",
|
|
336
|
-
exit_status=exit_status,
|
|
289
|
+
logger.success(
|
|
290
|
+
f"BibTeX processing complete: {len(papers)} papers processed"
|
|
337
291
|
)
|
|
292
|
+
return 0
|
|
338
293
|
|
|
339
294
|
|
|
340
295
|
if __name__ == "__main__":
|
|
341
|
-
|
|
296
|
+
main()
|
|
342
297
|
|
|
343
298
|
"""
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
--num-workers 8 \
|
|
351
|
-
--chrome-profile system \
|
|
352
|
-
--browser-mode stealth
|
|
299
|
+
python -m scitex.scholar.pipelines.ScholarPipelineBibTeX \
|
|
300
|
+
--bibtex ./data/scholar/bib_files/neurovista.bib \
|
|
301
|
+
--project neurovista \
|
|
302
|
+
--num-workers 8 \
|
|
303
|
+
--chrome-profile system \
|
|
304
|
+
--browser-mode interactive
|
|
353
305
|
"""
|
|
354
306
|
|
|
355
307
|
# EOF
|