scitex 2.14.0__py3-none-any.whl → 2.15.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +71 -17
- scitex/_env_loader.py +156 -0
- scitex/_mcp_resources/__init__.py +37 -0
- scitex/_mcp_resources/_cheatsheet.py +135 -0
- scitex/_mcp_resources/_figrecipe.py +138 -0
- scitex/_mcp_resources/_formats.py +102 -0
- scitex/_mcp_resources/_modules.py +337 -0
- scitex/_mcp_resources/_session.py +149 -0
- scitex/_mcp_tools/__init__.py +4 -0
- scitex/_mcp_tools/audio.py +66 -0
- scitex/_mcp_tools/diagram.py +11 -95
- scitex/_mcp_tools/introspect.py +210 -0
- scitex/_mcp_tools/plt.py +260 -305
- scitex/_mcp_tools/scholar.py +74 -0
- scitex/_mcp_tools/social.py +27 -0
- scitex/_mcp_tools/template.py +24 -0
- scitex/_mcp_tools/writer.py +17 -210
- scitex/ai/_gen_ai/_PARAMS.py +10 -7
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
- scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
- scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
- scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
- scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
- scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
- scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
- scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
- scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +30 -1550
- scitex/ai/classification/timeseries/_sliding_window_core.py +467 -0
- scitex/ai/classification/timeseries/_sliding_window_plotting.py +369 -0
- scitex/audio/README.md +40 -36
- scitex/audio/__init__.py +129 -61
- scitex/audio/_branding.py +185 -0
- scitex/audio/_mcp/__init__.py +32 -0
- scitex/audio/_mcp/handlers.py +59 -6
- scitex/audio/_mcp/speak_handlers.py +238 -0
- scitex/audio/_relay.py +225 -0
- scitex/audio/_tts.py +18 -10
- scitex/audio/engines/base.py +17 -10
- scitex/audio/engines/elevenlabs_engine.py +7 -2
- scitex/audio/mcp_server.py +228 -75
- scitex/canvas/README.md +1 -1
- scitex/canvas/editor/_dearpygui/__init__.py +25 -0
- scitex/canvas/editor/_dearpygui/_editor.py +147 -0
- scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
- scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
- scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
- scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
- scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
- scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
- scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
- scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
- scitex/canvas/editor/_dearpygui/_selection.py +295 -0
- scitex/canvas/editor/_dearpygui/_state.py +93 -0
- scitex/canvas/editor/_dearpygui/_utils.py +61 -0
- scitex/canvas/editor/flask_editor/_core/__init__.py +27 -0
- scitex/canvas/editor/flask_editor/_core/_bbox_extraction.py +200 -0
- scitex/canvas/editor/flask_editor/_core/_editor.py +173 -0
- scitex/canvas/editor/flask_editor/_core/_export_helpers.py +353 -0
- scitex/canvas/editor/flask_editor/_core/_routes_basic.py +190 -0
- scitex/canvas/editor/flask_editor/_core/_routes_export.py +332 -0
- scitex/canvas/editor/flask_editor/_core/_routes_panels.py +252 -0
- scitex/canvas/editor/flask_editor/_core/_routes_save.py +218 -0
- scitex/canvas/editor/flask_editor/_core.py +25 -1684
- scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
- scitex/cli/__init__.py +38 -43
- scitex/cli/audio.py +160 -41
- scitex/cli/capture.py +133 -20
- scitex/cli/introspect.py +488 -0
- scitex/cli/main.py +200 -109
- scitex/cli/mcp.py +60 -34
- scitex/cli/plt.py +414 -0
- scitex/cli/repro.py +15 -8
- scitex/cli/resource.py +15 -8
- scitex/cli/scholar/__init__.py +154 -8
- scitex/cli/scholar/_crossref_scitex.py +296 -0
- scitex/cli/scholar/_fetch.py +25 -3
- scitex/cli/social.py +355 -0
- scitex/cli/stats.py +136 -11
- scitex/cli/template.py +129 -12
- scitex/cli/tex.py +15 -8
- scitex/cli/writer.py +49 -299
- scitex/cloud/__init__.py +41 -2
- scitex/config/README.md +1 -1
- scitex/config/__init__.py +16 -2
- scitex/config/_env_registry.py +256 -0
- scitex/context/__init__.py +22 -0
- scitex/dev/__init__.py +20 -1
- scitex/diagram/__init__.py +42 -19
- scitex/diagram/mcp_server.py +13 -125
- scitex/gen/__init__.py +50 -14
- scitex/gen/_list_packages.py +4 -4
- scitex/introspect/__init__.py +82 -0
- scitex/introspect/_call_graph.py +303 -0
- scitex/introspect/_class_hierarchy.py +163 -0
- scitex/introspect/_core.py +41 -0
- scitex/introspect/_docstring.py +131 -0
- scitex/introspect/_examples.py +113 -0
- scitex/introspect/_imports.py +271 -0
- scitex/{gen/_inspect_module.py → introspect/_list_api.py} +48 -56
- scitex/introspect/_mcp/__init__.py +41 -0
- scitex/introspect/_mcp/handlers.py +233 -0
- scitex/introspect/_members.py +155 -0
- scitex/introspect/_resolve.py +89 -0
- scitex/introspect/_signature.py +131 -0
- scitex/introspect/_source.py +80 -0
- scitex/introspect/_type_hints.py +172 -0
- scitex/io/_save.py +1 -2
- scitex/io/bundle/README.md +1 -1
- scitex/logging/_formatters.py +19 -9
- scitex/mcp_server.py +98 -5
- scitex/os/__init__.py +4 -0
- scitex/{gen → os}/_check_host.py +4 -5
- scitex/plt/__init__.py +245 -550
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
- scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/plt/gallery/README.md +1 -1
- scitex/plt/utils/_hitmap/__init__.py +82 -0
- scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
- scitex/plt/utils/_hitmap/_color_application.py +346 -0
- scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
- scitex/plt/utils/_hitmap/_constants.py +40 -0
- scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
- scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
- scitex/plt/utils/_hitmap/_query.py +113 -0
- scitex/plt/utils/_hitmap.py +46 -1616
- scitex/plt/utils/_metadata/__init__.py +80 -0
- scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
- scitex/plt/utils/_metadata/_artists/_base.py +195 -0
- scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
- scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
- scitex/plt/utils/_metadata/_artists/_images.py +80 -0
- scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
- scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
- scitex/plt/utils/_metadata/_artists/_text.py +106 -0
- scitex/plt/utils/_metadata/_csv.py +416 -0
- scitex/plt/utils/_metadata/_detect.py +225 -0
- scitex/plt/utils/_metadata/_legend.py +127 -0
- scitex/plt/utils/_metadata/_rounding.py +117 -0
- scitex/plt/utils/_metadata/_verification.py +202 -0
- scitex/schema/README.md +1 -1
- scitex/scholar/__init__.py +8 -0
- scitex/scholar/_mcp/crossref_handlers.py +265 -0
- scitex/scholar/core/Scholar.py +63 -1700
- scitex/scholar/core/_mixins/__init__.py +36 -0
- scitex/scholar/core/_mixins/_enrichers.py +270 -0
- scitex/scholar/core/_mixins/_library_handlers.py +100 -0
- scitex/scholar/core/_mixins/_loaders.py +103 -0
- scitex/scholar/core/_mixins/_pdf_download.py +375 -0
- scitex/scholar/core/_mixins/_pipeline.py +312 -0
- scitex/scholar/core/_mixins/_project_handlers.py +125 -0
- scitex/scholar/core/_mixins/_savers.py +69 -0
- scitex/scholar/core/_mixins/_search.py +103 -0
- scitex/scholar/core/_mixins/_services.py +88 -0
- scitex/scholar/core/_mixins/_url_finding.py +105 -0
- scitex/scholar/crossref_scitex.py +367 -0
- scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/scholar/examples/00_run_all.sh +120 -0
- scitex/scholar/jobs/_executors.py +27 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
- scitex/scholar/pdf_download/_cli.py +154 -0
- scitex/scholar/pdf_download/strategies/__init__.py +11 -8
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
- scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
- scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
- scitex/scholar/pipelines/_single_steps.py +71 -36
- scitex/scholar/storage/_LibraryManager.py +97 -1695
- scitex/scholar/storage/_mixins/__init__.py +30 -0
- scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
- scitex/scholar/storage/_mixins/_library_operations.py +218 -0
- scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
- scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
- scitex/scholar/storage/_mixins/_resolution.py +376 -0
- scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
- scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
- scitex/security/README.md +3 -3
- scitex/session/README.md +1 -1
- scitex/session/__init__.py +26 -7
- scitex/session/_decorator.py +1 -1
- scitex/sh/README.md +1 -1
- scitex/sh/__init__.py +7 -4
- scitex/social/__init__.py +155 -0
- scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/stats/_mcp/_handlers/__init__.py +31 -0
- scitex/stats/_mcp/_handlers/_corrections.py +113 -0
- scitex/stats/_mcp/_handlers/_descriptive.py +78 -0
- scitex/stats/_mcp/_handlers/_effect_size.py +106 -0
- scitex/stats/_mcp/_handlers/_format.py +94 -0
- scitex/stats/_mcp/_handlers/_normality.py +110 -0
- scitex/stats/_mcp/_handlers/_posthoc.py +224 -0
- scitex/stats/_mcp/_handlers/_power.py +247 -0
- scitex/stats/_mcp/_handlers/_recommend.py +102 -0
- scitex/stats/_mcp/_handlers/_run_test.py +279 -0
- scitex/stats/_mcp/_handlers/_stars.py +48 -0
- scitex/stats/_mcp/handlers.py +19 -1171
- scitex/stats/auto/_stat_style.py +175 -0
- scitex/stats/auto/_style_definitions.py +411 -0
- scitex/stats/auto/_styles.py +22 -620
- scitex/stats/descriptive/__init__.py +11 -8
- scitex/stats/descriptive/_ci.py +39 -0
- scitex/stats/power/_power.py +15 -4
- scitex/str/__init__.py +2 -1
- scitex/str/_title_case.py +63 -0
- scitex/template/README.md +1 -1
- scitex/template/__init__.py +25 -10
- scitex/template/_code_templates.py +147 -0
- scitex/template/_mcp/handlers.py +81 -0
- scitex/template/_mcp/tool_schemas.py +55 -0
- scitex/template/_templates/__init__.py +51 -0
- scitex/template/_templates/audio.py +233 -0
- scitex/template/_templates/canvas.py +312 -0
- scitex/template/_templates/capture.py +268 -0
- scitex/template/_templates/config.py +43 -0
- scitex/template/_templates/diagram.py +294 -0
- scitex/template/_templates/io.py +107 -0
- scitex/template/_templates/module.py +53 -0
- scitex/template/_templates/plt.py +202 -0
- scitex/template/_templates/scholar.py +267 -0
- scitex/template/_templates/session.py +130 -0
- scitex/template/_templates/session_minimal.py +43 -0
- scitex/template/_templates/session_plot.py +67 -0
- scitex/template/_templates/session_stats.py +77 -0
- scitex/template/_templates/stats.py +323 -0
- scitex/template/_templates/writer.py +296 -0
- scitex/template/clone_writer_directory.py +5 -5
- scitex/ui/_backends/_email.py +10 -2
- scitex/ui/_backends/_webhook.py +5 -1
- scitex/web/_search_pubmed.py +10 -6
- scitex/writer/README.md +1 -1
- scitex/writer/__init__.py +43 -34
- scitex/writer/_mcp/handlers.py +11 -744
- scitex/writer/_mcp/tool_schemas.py +5 -335
- scitex-2.15.3.dist-info/METADATA +667 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/RECORD +241 -120
- scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
- scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
- scitex/diagram/_compile.py +0 -312
- scitex/diagram/_diagram.py +0 -355
- scitex/diagram/_mcp/__init__.py +0 -4
- scitex/diagram/_mcp/handlers.py +0 -400
- scitex/diagram/_mcp/tool_schemas.py +0 -157
- scitex/diagram/_presets.py +0 -173
- scitex/diagram/_schema.py +0 -182
- scitex/diagram/_split.py +0 -278
- scitex/gen/_ci.py +0 -12
- scitex/gen/_title_case.py +0 -89
- scitex/plt/_mcp/__init__.py +0 -4
- scitex/plt/_mcp/_handlers_annotation.py +0 -102
- scitex/plt/_mcp/_handlers_figure.py +0 -195
- scitex/plt/_mcp/_handlers_plot.py +0 -252
- scitex/plt/_mcp/_handlers_style.py +0 -219
- scitex/plt/_mcp/handlers.py +0 -74
- scitex/plt/_mcp/tool_schemas.py +0 -497
- scitex/plt/mcp_server.py +0 -231
- scitex/scholar/examples/SUGGESTIONS.md +0 -865
- scitex/scholar/examples/dev.py +0 -38
- scitex-2.14.0.dist-info/METADATA +0 -1238
- /scitex/{gen → context}/_detect_environment.py +0 -0
- /scitex/{gen → context}/_get_notebook_path.py +0 -0
- /scitex/{gen/_shell.py → sh/_shell_legacy.py} +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/WHEEL +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Timestamp: "2026-01-24 (ywatanabe)"
|
|
3
|
+
# File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/core/_mixins/_pdf_download.py
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
PDF download mixin for Scholar class.
|
|
7
|
+
|
|
8
|
+
Provides PDF downloading functionality from DOIs and BibTeX files.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
import shutil
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
|
19
|
+
|
|
20
|
+
from scitex import logging
|
|
21
|
+
from scitex.scholar.auth.core.AuthenticationGateway import AuthenticationGateway
|
|
22
|
+
from scitex.scholar.pdf_download.ScholarPDFDownloader import ScholarPDFDownloader
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from ..Papers import Papers
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PDFDownloadMixin:
|
|
31
|
+
"""Mixin providing PDF download methods."""
|
|
32
|
+
|
|
33
|
+
async def download_pdfs_from_dois_async(
    self,
    dois: List[str],
    output_dir: Optional[Path] = None,
    max_concurrent: int = 1,
) -> Dict[str, int]:
    """Download PDFs for the given DOIs and organize them in the library.

    PDFs are first fetched by ``ScholarPDFDownloader`` (into ``output_dir``
    if given, else ``/tmp/``), then moved into the library MASTER directory
    with a per-paper ``metadata.json`` updated alongside.

    Args:
        dois: List of DOI strings.
        output_dir: Temporary download directory passed to the downloader;
            final PDFs always end up under the library MASTER directory.
        max_concurrent: Maximum concurrent downloads (default: 1 for
            sequential processing).

    Returns
    -------
    Dictionary with "downloaded", "failed" and "errors" counts.
    """
    # Nothing to do — return zeroed stats without opening a browser.
    if not dois:
        return {"downloaded": 0, "failed": 0, "errors": 0}

    (
        browser,
        context,
    ) = await self._browser_manager.get_authenticated_browser_and_context_async()

    try:
        pdf_downloader = ScholarPDFDownloader(
            context=context,
            config=self.config,
        )

        logger.info(
            f"{self.name}: Starting PDF download for {len(dois)} DOIs "
            f"(max_concurrent={max_concurrent})"
        )

        results = await pdf_downloader.download_from_dois(
            dois=dois,
            output_dir=str(output_dir) if output_dir else "/tmp/",
            max_concurrent=max_concurrent,
        )

        stats = {"downloaded": 0, "failed": 0, "errors": 0}
        library_dir = self.config.path_manager.library_dir
        master_dir = library_dir / "MASTER"
        master_dir.mkdir(parents=True, exist_ok=True)

        # NOTE(review): assumes `results` is positionally aligned with
        # `dois` — a shorter result list would silently drop trailing DOIs;
        # confirm against download_from_dois' contract.
        for doi, downloaded_paths in zip(dois, results):
            try:
                if downloaded_paths and len(downloaded_paths) > 0:
                    # Only the first downloaded file per DOI is organized.
                    temp_pdf_path = downloaded_paths[0]

                    paper_id = self.config.path_manager._generate_paper_id(doi=doi)
                    storage_path = master_dir / paper_id
                    storage_path.mkdir(parents=True, exist_ok=True)

                    # Filesystem-safe filename derived from the DOI.
                    pdf_filename = (
                        f"DOI_{doi.replace('/', '_').replace(':', '_')}.pdf"
                    )
                    master_pdf_path = storage_path / pdf_filename
                    shutil.move(str(temp_pdf_path), str(master_pdf_path))

                    # Merge into existing per-paper metadata when present,
                    # otherwise create a fresh record.
                    metadata_file = storage_path / "metadata.json"
                    if metadata_file.exists():
                        with open(metadata_file) as f:
                            metadata = json.load(f)
                    else:
                        metadata = {
                            "doi": doi,
                            "scitex_id": paper_id,
                            "created_at": datetime.now().isoformat(),
                            "created_by": "SciTeX Scholar",
                        }

                    # Path is stored relative to the library root so the
                    # library remains relocatable.
                    metadata["pdf_path"] = str(
                        master_pdf_path.relative_to(library_dir)
                    )
                    metadata["pdf_downloaded_at"] = datetime.now().isoformat()
                    metadata["pdf_size_bytes"] = master_pdf_path.stat().st_size
                    metadata["updated_at"] = datetime.now().isoformat()

                    with open(metadata_file, "w") as f:
                        json.dump(metadata, f, indent=2, ensure_ascii=False)

                    # Mirror the MASTER entry into the active project via
                    # symlink, unless we are already in the master project.
                    if self.project not in ["master", "MASTER"]:
                        self._library_manager.update_symlink(
                            master_storage_path=storage_path,
                            project=self.project,
                        )

                    logger.success(
                        f"{self.name}: Downloaded and organized PDF for {doi}: "
                        f"{master_pdf_path}"
                    )
                    stats["downloaded"] += 1
                else:
                    logger.warning(f"{self.name}: No PDF downloaded for DOI: {doi}")
                    stats["failed"] += 1

            except Exception as e:
                # Organizing failures count as both an error and a failure.
                logger.error(f"{self.name}: Failed to organize PDF for {doi}: {e}")
                stats["errors"] += 1
                stats["failed"] += 1

        return stats

    finally:
        # Always release the authenticated browser, even on error.
        await self._browser_manager.close()
+
|
|
142
|
+
async def _download_pdfs_sequential(
    self, dois: List[str], output_dir: Optional[Path] = None
) -> Dict[str, int]:
    """Sequential PDF download with authentication gateway.

    For each DOI: prepare an authenticated context, resolve candidate PDF
    URLs, try them in order until one downloads, then organize the file in
    the library via ``_store_downloaded_pdf``.

    Args:
        dois: List of DOI strings.
        output_dir: Unused by this path; temporary files go to ``/tmp``
            and final PDFs are organized under the library MASTER
            directory.

    Returns
    -------
    Dictionary with "downloaded", "failed" and "errors" counts.
    """
    results = {"downloaded": 0, "failed": 0, "errors": 0}

    (
        browser,
        context,
    ) = await self._browser_manager.get_authenticated_browser_and_context_async()

    # FIX: run everything after browser acquisition under try/finally so the
    # browser is closed even when gateway/downloader construction or an
    # unexpected error escaping the per-DOI handler raises. Previously the
    # close() only ran on the fall-through path, leaking the browser
    # (matches the try/finally pattern used by download_pdfs_from_dois_async).
    try:
        auth_gateway = AuthenticationGateway(
            auth_manager=self._auth_manager,
            browser_manager=self._browser_manager,
            config=self.config,
        )

        pdf_downloader = ScholarPDFDownloader(
            context=context,
            config=self.config,
        )

        library_dir = self.config.path_manager.library_dir
        master_dir = library_dir / "MASTER"
        project_dir = library_dir / self.project
        master_dir.mkdir(parents=True, exist_ok=True)
        project_dir.mkdir(parents=True, exist_ok=True)

        for doi in dois:
            try:
                logger.info(f"{self.name}: Processing DOI: {doi}")

                # Called for its side effect: authenticate the shared
                # context for this DOI's publisher.
                _url_context = await auth_gateway.prepare_context_async(
                    doi=doi, context=context
                )

                urls = await self._find_urls_for_doi_async(doi, context)
                pdf_urls = urls.get("urls_pdf", [])

                if not pdf_urls:
                    logger.warning(f"{self.name}: No PDF URLs found for DOI: {doi}")
                    results["failed"] += 1
                    continue

                downloaded_path = None
                for pdf_entry in pdf_urls:
                    # Entries may be plain URL strings or {"url": ...} dicts.
                    pdf_url = (
                        pdf_entry.get("url")
                        if isinstance(pdf_entry, dict)
                        else pdf_entry
                    )

                    if not pdf_url:
                        continue

                    # Filesystem-safe temp name derived from the DOI.
                    temp_output = (
                        Path("/tmp") / f"{doi.replace('/', '_').replace(':', '_')}.pdf"
                    )

                    result = await pdf_downloader.download_from_url(
                        pdf_url=pdf_url, output_path=temp_output
                    )

                    if result and result.exists():
                        downloaded_path = result
                        break  # first successful URL wins

                if downloaded_path:
                    self._store_downloaded_pdf(
                        doi, downloaded_path, library_dir, master_dir
                    )
                    downloaded_path.unlink()  # remove the /tmp copy
                    results["downloaded"] += 1
                else:
                    logger.warning(
                        f"{self.name}: Failed to download any PDF for DOI: {doi}"
                    )
                    results["failed"] += 1

            except Exception as e:
                # Per-DOI failures count as both an error and a failure.
                logger.error(f"{self.name}: Failed to process {doi}: {e}")
                results["errors"] += 1
                results["failed"] += 1
    finally:
        await self._browser_manager.close()

    logger.info(f"{self.name}: PDF download complete: {results}")
    return results
|
|
229
|
+
|
|
230
|
+
def _store_downloaded_pdf(
    self,
    doi: str,
    downloaded_path: Path,
    library_dir: Path,
    master_dir: Path,
) -> None:
    """Store downloaded PDF in library structure.

    Copies the PDF into ``MASTER/<paper_id>/``, merges/creates the
    per-paper ``metadata.json``, and updates the project symlink.

    Args:
        doi: DOI of the downloaded paper.
        downloaded_path: Temporary path of the downloaded PDF (not
            deleted here; the caller removes it).
        library_dir: Root of the scholar library.
        master_dir: The library's MASTER directory.
    """
    from ..Paper import Paper
    from ..Papers import Papers

    paper_id = self.config.path_manager._generate_paper_id(doi=doi)
    storage_path = master_dir / paper_id
    storage_path.mkdir(parents=True, exist_ok=True)

    # Best-effort enrichment to build a human-readable name; any failure
    # falls through to the DOI-based fallback below.
    readable_name = None
    temp_paper = None
    try:
        temp_paper = Paper()
        temp_paper.metadata.id.doi = doi
        temp_papers = Papers([temp_paper])
        # NOTE(review): asyncio.run() raises RuntimeError when a loop is
        # already running (e.g. when called from the async
        # _download_pdfs_sequential); the broad except below swallows it,
        # so enrichment silently degrades to the fallback — confirm
        # whether that is intended.
        enriched = asyncio.run(self.enrich_papers_async(temp_papers))
        if enriched and len(enriched) > 0:
            temp_paper = enriched[0]

        # Use the last token of the first author as the surname; a
        # single-token author name is used as-is.
        first_author = "Unknown"
        authors = temp_paper.metadata.basic.authors
        if authors and len(authors) > 0:
            author_parts = authors[0].split()
            if len(author_parts) > 1:
                first_author = author_parts[-1]
            else:
                first_author = author_parts[0]

        year = temp_paper.metadata.basic.year
        year_str = str(year) if year else "Unknown"

        # Strip the journal name to alphanumerics only.
        journal_clean = "Unknown"
        journal = temp_paper.metadata.publication.journal
        if journal:
            journal_clean = "".join(
                c for c in journal if c.isalnum() or c in " "
            ).replace(" ", "")
            if not journal_clean:
                journal_clean = "Unknown"

        readable_name = f"{first_author}-{year_str}-{journal_clean}"
    except Exception:
        pass

    if not readable_name:
        readable_name = f"DOI_{doi.replace('/', '_').replace(':', '_')}"

    # NOTE(review): `readable_name` is computed but never used below — the
    # stored filename is always DOI-based. Possibly leftover from an older
    # naming scheme; confirm before removing (enrichment may have side
    # effects callers rely on).
    pdf_filename = f"DOI_{doi.replace('/', '_').replace(':', '_')}.pdf"
    master_pdf_path = storage_path / pdf_filename
    shutil.copy2(downloaded_path, master_pdf_path)

    # Merge into existing metadata when present, else create a fresh record.
    metadata_file = storage_path / "metadata.json"
    if metadata_file.exists():
        with open(metadata_file) as f:
            metadata = json.load(f)
        logger.debug(f"{self.name}: Loaded existing metadata for {paper_id}")
    else:
        metadata = {
            "doi": doi,
            "scitex_id": paper_id,
            "created_at": datetime.now().isoformat(),
            "created_by": "SciTeX Scholar",
        }

    # Fold enriched fields into metadata without clobbering identifiers.
    if temp_paper:
        paper_dict = temp_paper.to_dict()
        for key, value in paper_dict.items():
            if value is not None and key not in ["doi", "scitex_id"]:
                metadata[key] = value

    # Path stored relative to the library root so the library is relocatable.
    metadata["pdf_path"] = str(master_pdf_path.relative_to(library_dir))
    metadata["pdf_downloaded_at"] = datetime.now().isoformat()
    metadata["pdf_size_bytes"] = master_pdf_path.stat().st_size
    metadata["updated_at"] = datetime.now().isoformat()

    with open(metadata_file, "w") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    # Mirror the MASTER entry into the active project via symlink.
    if self.project not in ["master", "MASTER"]:
        self._library_manager.update_symlink(
            master_storage_path=storage_path,
            project=self.project,
        )

    logger.success(
        f"{self.name}: Downloaded PDF for {doi}: MASTER/{paper_id}/{pdf_filename}"
    )
|
|
323
|
+
|
|
324
|
+
def download_pdfs_from_dois(
    self, dois: List[str], output_dir: Optional[Path] = None
) -> Dict[str, int]:
    """Blocking entry point for DOI-based PDF downloads.

    Runs :meth:`download_pdfs_from_dois_async` to completion on a fresh
    event loop.

    Args:
        dois: List of DOI strings.
        output_dir: Output directory (uses config default if None).

    Returns
    -------
    Dictionary with download statistics.
    """
    download_coro = self.download_pdfs_from_dois_async(dois, output_dir)
    return asyncio.run(download_coro)
|
|
338
|
+
|
|
339
|
+
def download_pdfs_from_bibtex(
    self,
    bibtex_input: Union[str, Path, Papers],
    output_dir: Optional[Path] = None,
) -> Dict[str, int]:
    """Download PDFs for every paper with a DOI in a BibTeX source.

    Args:
        bibtex_input: BibTeX file path, content string, or Papers collection.
        output_dir: Output directory (uses config default if None).

    Returns
    -------
    Dictionary with download statistics.
    """
    from ..Papers import Papers

    # Accept an already-parsed Papers collection as-is; otherwise parse
    # the BibTeX path/content first.
    papers = (
        bibtex_input
        if isinstance(bibtex_input, Papers)
        else self.load_bibtex(bibtex_input)
    )

    # Collect only papers that actually carry a DOI.
    dois = []
    for paper in papers:
        paper_doi = paper.metadata.id.doi
        if paper_doi:
            dois.append(paper_doi)

    if not dois:
        logger.warning(f"{self.name}: No papers with DOIs found in BibTeX input")
        return {"downloaded": 0, "failed": 0, "errors": 0}

    logger.info(
        f"{self.name}: Found {len(dois)} papers with DOIs "
        f"out of {len(papers)} total papers"
    )

    return self.download_pdfs_from_dois(dois, output_dir)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
# EOF
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Timestamp: "2026-01-24 (ywatanabe)"
|
|
3
|
+
# File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/core/_mixins/_pipeline.py
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Pipeline mixin for Scholar class.
|
|
7
|
+
|
|
8
|
+
Provides paper processing pipeline functionality for single and batch operations.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
|
15
|
+
|
|
16
|
+
from scitex import logging
|
|
17
|
+
from scitex.scholar.pdf_download.ScholarPDFDownloader import ScholarPDFDownloader
|
|
18
|
+
from scitex.scholar.url_finder.ScholarURLFinder import ScholarURLFinder
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from ..Paper import Paper
|
|
22
|
+
from ..Papers import Papers
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class PipelineMixin:
|
|
28
|
+
"""Mixin providing paper processing pipeline methods."""
|
|
29
|
+
|
|
30
|
+
async def process_paper_async(
    self,
    title: Optional[str] = None,
    doi: Optional[str] = None,
    project: Optional[str] = None,
) -> Paper:
    """Complete sequential pipeline for processing a single paper.

    Accepts either title OR doi. Uses storage-first approach:
    each stage checks storage before processing.

    Workflow:
        Stage 0: Resolve DOI from title (if needed)
        Stage 1: Load or create Paper from storage
        Stage 2: Find PDF URLs -> save to storage
        Stage 3: Download PDF -> save to storage
        Stage 4: Update project symlinks

    Args:
        title: Paper title (will resolve DOI using engine)
        doi: DOI of the paper (preferred if available)
        project: Project name (uses self.project if None)

    Returns
    -------
    Fully processed Paper object

    Raises
    ------
    ValueError
        If neither title nor doi is given, or the DOI cannot be
        resolved from the title.

    Examples
    --------
    paper = await scholar.process_paper_async(doi="10.1038/s41598-017-02626-y")
    paper = await scholar.process_paper_async(title="Attention Is All You Need")
    """
    from ..Paper import Paper

    if not title and not doi:
        raise ValueError("Must provide either title or doi")

    project = project or self.project

    logger.info(f"{'=' * 60}")
    logger.info("Processing paper")
    if title:
        logger.info(f"Title: {title[:50]}...")
    if doi:
        logger.info(f"DOI: {doi}")
    logger.info(f"{'=' * 60}")

    # Stage 0: Resolve DOI from title (if needed)
    if not doi and title:
        logger.info("Stage 0: Resolving DOI from title...")
        results = await self._scholar_engine.search_async(title=title)

        if results and results.get("id", {}).get("doi"):
            doi = results["id"]["doi"]
            logger.success(f"Resolved DOI: {doi}")
        else:
            # No DOI means nothing downstream can proceed — fail fast.
            logger.error(f"Could not resolve DOI from title: {title}")
            raise ValueError(f"Could not resolve DOI from title: {title}")

    # Deterministic per-paper storage location under the MASTER library.
    paper_id = self.config.path_manager._generate_paper_id(doi=doi)
    storage_path = self.config.get_library_master_dir() / paper_id

    logger.info(f"Paper ID: {paper_id}")
    logger.info(f"Storage: {storage_path}")

    # Stage 1: Load or create Paper from storage
    logger.info("\nStage 1: Loading/creating metadata...")
    if self._library_manager.has_metadata(paper_id):
        paper = self._library_manager.load_paper_from_id(paper_id)
        logger.info("Loaded existing metadata from storage")
    else:
        paper = Paper()
        paper.metadata.set_doi(doi)
        paper.container.scitex_id = paper_id

        if title:
            paper.metadata.basic.title = title

        self._library_manager.save_paper_incremental(paper_id, paper)
        logger.success("Created new paper entry in storage")

    # Stage 2: Check/find URLs (skipped when storage already has them)
    logger.info("\nStage 2: Checking/finding PDF URLs...")
    if not self._library_manager.has_urls(paper_id):
        logger.info(f"Finding PDF URLs for DOI: {doi}")
        (
            browser,
            context,
        ) = await self._browser_manager.get_authenticated_browser_and_context_async()
        try:
            url_finder = ScholarURLFinder(context, config=self.config)
            urls = await url_finder.find_pdf_urls(doi)

            paper.metadata.url.pdfs = urls
            self._library_manager.save_paper_incremental(paper_id, paper)
            logger.success(f"Found {len(urls)} PDF URLs, saved to storage")
        finally:
            # Always release the browser after URL finding.
            await self._browser_manager.close()
    else:
        # NOTE(review): assumes the paper loaded in Stage 1 already carries
        # the stored URLs in metadata.url.pdfs — confirm has_urls and
        # load_paper_from_id agree on that.
        logger.info(
            f"PDF URLs already in storage ({len(paper.metadata.url.pdfs)} URLs)"
        )

    # Stage 3: Check/download PDF (skipped when storage already has one)
    logger.info("\nStage 3: Checking/downloading PDF...")
    if not self._library_manager.has_pdf(paper_id):
        logger.info("Downloading PDF...")
        if paper.metadata.url.pdfs:
            (
                browser,
                context,
            ) = await self._browser_manager.get_authenticated_browser_and_context_async()
            try:
                downloader = ScholarPDFDownloader(context, config=self.config)

                # Only the first URL is tried; entries may be plain URL
                # strings or {"url": ...} dicts.
                pdf_url = (
                    paper.metadata.url.pdfs[0]["url"]
                    if isinstance(paper.metadata.url.pdfs[0], dict)
                    else paper.metadata.url.pdfs[0]
                )
                temp_path = storage_path / "main.pdf"

                result = await downloader.download_from_url(
                    pdf_url, temp_path, doi=doi
                )
                if result and result.exists():
                    paper.metadata.path.pdfs.append(str(result))
                    self._library_manager.save_paper_incremental(paper_id, paper)
                    logger.success(f"{self.name}: Downloaded PDF, saved to storage")
                else:
                    logger.warning(f"{self.name}: Failed to download PDF")
            finally:
                # Always release the browser after the download attempt.
                await self._browser_manager.close()
        else:
            logger.warning(f"{self.name}: No PDF URLs available for download")
    else:
        logger.info(f"{self.name}: PDF already in storage")

    # Stage 4: Update project symlinks (skipped for the master project)
    if project and project not in ["master", "MASTER"]:
        logger.info(f"{self.name}: \nStage 4: Updating project symlinks...")
        self._library_manager.update_symlink(
            master_storage_path=storage_path,
            project=project,
        )
        logger.success(f"{self.name}: Updated symlink in project: {project}")

    logger.info(f"\n{'=' * 60}")
    logger.success(f"{self.name}: Paper processing complete")
    logger.info(f"{'=' * 60}\n")

    return paper
|
|
182
|
+
|
|
183
|
+
def process_paper(
    self,
    title: Optional[str] = None,
    doi: Optional[str] = None,
    project: Optional[str] = None,
) -> Paper:
    """Blocking entry point that drives process_paper_async.

    See process_paper_async() for the complete parameter and return
    documentation.
    """
    coro = self.process_paper_async(title=title, doi=doi, project=project)
    return asyncio.run(coro)
|
|
196
|
+
|
|
197
|
+
async def process_papers_async(
    self,
    papers: Union[Papers, List[str]],
    project: Optional[str] = None,
    max_concurrent: int = 3,
) -> Papers:
    """Process multiple papers with controlled parallelism.

    Each paper goes through the complete sequential pipeline; a
    semaphore limits how many papers are in flight at once.

    Architecture:
    - Parallel papers (``max_concurrent`` at a time)
    - Sequential stages per paper
    - Storage checks before each stage

    Parameters
    ----------
    papers : Papers or list of str
        Papers collection, or a list of DOI strings to be wrapped
        into ``Paper`` objects.
    project : str, optional
        Project name (uses ``self.project`` if None).
    max_concurrent : int, default 3
        Maximum concurrent papers. Set to 1 for purely sequential
        processing.

    Returns
    -------
    Papers
        Papers collection containing the successfully processed papers.

    Examples
    --------
    papers = scholar.load_bibtex("papers.bib")
    processed = await scholar.process_papers_async(papers, max_concurrent=3)

    dois = ["10.1038/...", "10.1016/...", "10.1109/..."]
    processed = await scholar.process_papers_async(dois, max_concurrent=1)
    """
    # Local imports — presumably to avoid a circular import at module
    # load time; keep them function-scoped.
    from ..Paper import Paper
    from ..Papers import Papers

    project = project or self.project

    # Accept a bare list of DOI strings by wrapping each into a Paper.
    if isinstance(papers, list):
        papers_list = []
        for doi in papers:
            p = Paper()
            p.metadata.set_doi(doi)
            papers_list.append(p)
        papers = Papers(papers_list, project=project, config=self.config)

    total = len(papers)
    logger.info(f"{self.name}: \n{'=' * 60}")
    logger.info(
        f"{self.name}: Processing {total} papers (max_concurrent={max_concurrent})"
    )
    logger.info(f"{self.name}: Project: {project}")
    logger.info(f"{self.name}: {'=' * 60}\n")

    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_with_semaphore(paper, index):
        """Process one paper with semaphore control."""
        async with semaphore:
            logger.info(f"{self.name}: \n[{index}/{total}] Starting paper...")
            try:
                result = await self.process_paper_async(
                    title=paper.metadata.basic.title,
                    doi=paper.metadata.id.doi,
                    project=project,
                )
                logger.success(f"{self.name}: [{index}/{total}] Completed")
                return result
            except Exception as e:
                # One failing paper must not abort the batch: log and
                # report None so the gather below can continue.
                logger.error(f"{self.name}: [{index}/{total}] Failed: {e}")
                return None

    tasks = [process_with_semaphore(paper, i + 1) for i, paper in enumerate(papers)]

    # The worker already converts ordinary exceptions to None;
    # return_exceptions=True is a safety net for anything it does not
    # catch (e.g. cancellation-related BaseExceptions).
    results = await asyncio.gather(*tasks, return_exceptions=True)

    processed_papers = []
    errors = 0
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            logger.error(f"{self.name}: Paper {i + 1} raised exception: {result}")
            errors += 1
        elif result is not None:
            processed_papers.append(result)

    logger.info(f"{self.name}: \n{'=' * 60}")
    logger.info(f"{self.name}: Batch Processing Complete")
    logger.info(f"{self.name}: Total: {total}")
    logger.info(f"{self.name}: Successful: {len(processed_papers)}")
    logger.info(f"{self.name}: Failed: {total - len(processed_papers)}")
    logger.info(f"{self.name}: Errors: {errors}")
    logger.info(f"{self.name}: {'=' * 60}\n")

    return Papers(processed_papers, project=project, config=self.config)
|
|
292
|
+
|
|
293
|
+
def process_papers(
    self,
    papers: Union[Papers, List[str]],
    project: Optional[str] = None,
    max_concurrent: int = 3,
) -> Papers:
    """Blocking entry point that drives process_papers_async.

    See process_papers_async() for the complete parameter and return
    documentation.
    """
    coro = self.process_papers_async(
        papers=papers,
        project=project,
        max_concurrent=max_concurrent,
    )
    return asyncio.run(coro)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# EOF
|