scitex 2.14.0__py3-none-any.whl → 2.15.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (264)
  1. scitex/__init__.py +71 -17
  2. scitex/_env_loader.py +156 -0
  3. scitex/_mcp_resources/__init__.py +37 -0
  4. scitex/_mcp_resources/_cheatsheet.py +135 -0
  5. scitex/_mcp_resources/_figrecipe.py +138 -0
  6. scitex/_mcp_resources/_formats.py +102 -0
  7. scitex/_mcp_resources/_modules.py +337 -0
  8. scitex/_mcp_resources/_session.py +149 -0
  9. scitex/_mcp_tools/__init__.py +4 -0
  10. scitex/_mcp_tools/audio.py +66 -0
  11. scitex/_mcp_tools/diagram.py +11 -95
  12. scitex/_mcp_tools/introspect.py +210 -0
  13. scitex/_mcp_tools/plt.py +260 -305
  14. scitex/_mcp_tools/scholar.py +74 -0
  15. scitex/_mcp_tools/social.py +27 -0
  16. scitex/_mcp_tools/template.py +24 -0
  17. scitex/_mcp_tools/writer.py +17 -210
  18. scitex/ai/_gen_ai/_PARAMS.py +10 -7
  19. scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
  20. scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
  21. scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
  22. scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
  23. scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
  24. scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
  25. scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
  26. scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
  27. scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
  28. scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +30 -1550
  29. scitex/ai/classification/timeseries/_sliding_window_core.py +467 -0
  30. scitex/ai/classification/timeseries/_sliding_window_plotting.py +369 -0
  31. scitex/audio/README.md +40 -36
  32. scitex/audio/__init__.py +129 -61
  33. scitex/audio/_branding.py +185 -0
  34. scitex/audio/_mcp/__init__.py +32 -0
  35. scitex/audio/_mcp/handlers.py +59 -6
  36. scitex/audio/_mcp/speak_handlers.py +238 -0
  37. scitex/audio/_relay.py +225 -0
  38. scitex/audio/_tts.py +18 -10
  39. scitex/audio/engines/base.py +17 -10
  40. scitex/audio/engines/elevenlabs_engine.py +7 -2
  41. scitex/audio/mcp_server.py +228 -75
  42. scitex/canvas/README.md +1 -1
  43. scitex/canvas/editor/_dearpygui/__init__.py +25 -0
  44. scitex/canvas/editor/_dearpygui/_editor.py +147 -0
  45. scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
  46. scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
  47. scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
  48. scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
  49. scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
  50. scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
  51. scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
  52. scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
  53. scitex/canvas/editor/_dearpygui/_selection.py +295 -0
  54. scitex/canvas/editor/_dearpygui/_state.py +93 -0
  55. scitex/canvas/editor/_dearpygui/_utils.py +61 -0
  56. scitex/canvas/editor/flask_editor/_core/__init__.py +27 -0
  57. scitex/canvas/editor/flask_editor/_core/_bbox_extraction.py +200 -0
  58. scitex/canvas/editor/flask_editor/_core/_editor.py +173 -0
  59. scitex/canvas/editor/flask_editor/_core/_export_helpers.py +353 -0
  60. scitex/canvas/editor/flask_editor/_core/_routes_basic.py +190 -0
  61. scitex/canvas/editor/flask_editor/_core/_routes_export.py +332 -0
  62. scitex/canvas/editor/flask_editor/_core/_routes_panels.py +252 -0
  63. scitex/canvas/editor/flask_editor/_core/_routes_save.py +218 -0
  64. scitex/canvas/editor/flask_editor/_core.py +25 -1684
  65. scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
  66. scitex/cli/__init__.py +38 -43
  67. scitex/cli/audio.py +160 -41
  68. scitex/cli/capture.py +133 -20
  69. scitex/cli/introspect.py +488 -0
  70. scitex/cli/main.py +200 -109
  71. scitex/cli/mcp.py +60 -34
  72. scitex/cli/plt.py +414 -0
  73. scitex/cli/repro.py +15 -8
  74. scitex/cli/resource.py +15 -8
  75. scitex/cli/scholar/__init__.py +154 -8
  76. scitex/cli/scholar/_crossref_scitex.py +296 -0
  77. scitex/cli/scholar/_fetch.py +25 -3
  78. scitex/cli/social.py +355 -0
  79. scitex/cli/stats.py +136 -11
  80. scitex/cli/template.py +129 -12
  81. scitex/cli/tex.py +15 -8
  82. scitex/cli/writer.py +49 -299
  83. scitex/cloud/__init__.py +41 -2
  84. scitex/config/README.md +1 -1
  85. scitex/config/__init__.py +16 -2
  86. scitex/config/_env_registry.py +256 -0
  87. scitex/context/__init__.py +22 -0
  88. scitex/dev/__init__.py +20 -1
  89. scitex/diagram/__init__.py +42 -19
  90. scitex/diagram/mcp_server.py +13 -125
  91. scitex/gen/__init__.py +50 -14
  92. scitex/gen/_list_packages.py +4 -4
  93. scitex/introspect/__init__.py +82 -0
  94. scitex/introspect/_call_graph.py +303 -0
  95. scitex/introspect/_class_hierarchy.py +163 -0
  96. scitex/introspect/_core.py +41 -0
  97. scitex/introspect/_docstring.py +131 -0
  98. scitex/introspect/_examples.py +113 -0
  99. scitex/introspect/_imports.py +271 -0
  100. scitex/{gen/_inspect_module.py → introspect/_list_api.py} +48 -56
  101. scitex/introspect/_mcp/__init__.py +41 -0
  102. scitex/introspect/_mcp/handlers.py +233 -0
  103. scitex/introspect/_members.py +155 -0
  104. scitex/introspect/_resolve.py +89 -0
  105. scitex/introspect/_signature.py +131 -0
  106. scitex/introspect/_source.py +80 -0
  107. scitex/introspect/_type_hints.py +172 -0
  108. scitex/io/_save.py +1 -2
  109. scitex/io/bundle/README.md +1 -1
  110. scitex/logging/_formatters.py +19 -9
  111. scitex/mcp_server.py +98 -5
  112. scitex/os/__init__.py +4 -0
  113. scitex/{gen → os}/_check_host.py +4 -5
  114. scitex/plt/__init__.py +245 -550
  115. scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
  116. scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  117. scitex/plt/gallery/README.md +1 -1
  118. scitex/plt/utils/_hitmap/__init__.py +82 -0
  119. scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
  120. scitex/plt/utils/_hitmap/_color_application.py +346 -0
  121. scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
  122. scitex/plt/utils/_hitmap/_constants.py +40 -0
  123. scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
  124. scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
  125. scitex/plt/utils/_hitmap/_query.py +113 -0
  126. scitex/plt/utils/_hitmap.py +46 -1616
  127. scitex/plt/utils/_metadata/__init__.py +80 -0
  128. scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
  129. scitex/plt/utils/_metadata/_artists/_base.py +195 -0
  130. scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
  131. scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
  132. scitex/plt/utils/_metadata/_artists/_images.py +80 -0
  133. scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
  134. scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
  135. scitex/plt/utils/_metadata/_artists/_text.py +106 -0
  136. scitex/plt/utils/_metadata/_csv.py +416 -0
  137. scitex/plt/utils/_metadata/_detect.py +225 -0
  138. scitex/plt/utils/_metadata/_legend.py +127 -0
  139. scitex/plt/utils/_metadata/_rounding.py +117 -0
  140. scitex/plt/utils/_metadata/_verification.py +202 -0
  141. scitex/schema/README.md +1 -1
  142. scitex/scholar/__init__.py +8 -0
  143. scitex/scholar/_mcp/crossref_handlers.py +265 -0
  144. scitex/scholar/core/Scholar.py +63 -1700
  145. scitex/scholar/core/_mixins/__init__.py +36 -0
  146. scitex/scholar/core/_mixins/_enrichers.py +270 -0
  147. scitex/scholar/core/_mixins/_library_handlers.py +100 -0
  148. scitex/scholar/core/_mixins/_loaders.py +103 -0
  149. scitex/scholar/core/_mixins/_pdf_download.py +375 -0
  150. scitex/scholar/core/_mixins/_pipeline.py +312 -0
  151. scitex/scholar/core/_mixins/_project_handlers.py +125 -0
  152. scitex/scholar/core/_mixins/_savers.py +69 -0
  153. scitex/scholar/core/_mixins/_search.py +103 -0
  154. scitex/scholar/core/_mixins/_services.py +88 -0
  155. scitex/scholar/core/_mixins/_url_finding.py +105 -0
  156. scitex/scholar/crossref_scitex.py +367 -0
  157. scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  158. scitex/scholar/examples/00_run_all.sh +120 -0
  159. scitex/scholar/jobs/_executors.py +27 -3
  160. scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
  161. scitex/scholar/pdf_download/_cli.py +154 -0
  162. scitex/scholar/pdf_download/strategies/__init__.py +11 -8
  163. scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
  164. scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
  165. scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
  166. scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
  167. scitex/scholar/pipelines/_single_steps.py +71 -36
  168. scitex/scholar/storage/_LibraryManager.py +97 -1695
  169. scitex/scholar/storage/_mixins/__init__.py +30 -0
  170. scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
  171. scitex/scholar/storage/_mixins/_library_operations.py +218 -0
  172. scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
  173. scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
  174. scitex/scholar/storage/_mixins/_resolution.py +376 -0
  175. scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
  176. scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
  177. scitex/security/README.md +3 -3
  178. scitex/session/README.md +1 -1
  179. scitex/session/__init__.py +26 -7
  180. scitex/session/_decorator.py +1 -1
  181. scitex/sh/README.md +1 -1
  182. scitex/sh/__init__.py +7 -4
  183. scitex/social/__init__.py +155 -0
  184. scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  185. scitex/stats/_mcp/_handlers/__init__.py +31 -0
  186. scitex/stats/_mcp/_handlers/_corrections.py +113 -0
  187. scitex/stats/_mcp/_handlers/_descriptive.py +78 -0
  188. scitex/stats/_mcp/_handlers/_effect_size.py +106 -0
  189. scitex/stats/_mcp/_handlers/_format.py +94 -0
  190. scitex/stats/_mcp/_handlers/_normality.py +110 -0
  191. scitex/stats/_mcp/_handlers/_posthoc.py +224 -0
  192. scitex/stats/_mcp/_handlers/_power.py +247 -0
  193. scitex/stats/_mcp/_handlers/_recommend.py +102 -0
  194. scitex/stats/_mcp/_handlers/_run_test.py +279 -0
  195. scitex/stats/_mcp/_handlers/_stars.py +48 -0
  196. scitex/stats/_mcp/handlers.py +19 -1171
  197. scitex/stats/auto/_stat_style.py +175 -0
  198. scitex/stats/auto/_style_definitions.py +411 -0
  199. scitex/stats/auto/_styles.py +22 -620
  200. scitex/stats/descriptive/__init__.py +11 -8
  201. scitex/stats/descriptive/_ci.py +39 -0
  202. scitex/stats/power/_power.py +15 -4
  203. scitex/str/__init__.py +2 -1
  204. scitex/str/_title_case.py +63 -0
  205. scitex/template/README.md +1 -1
  206. scitex/template/__init__.py +25 -10
  207. scitex/template/_code_templates.py +147 -0
  208. scitex/template/_mcp/handlers.py +81 -0
  209. scitex/template/_mcp/tool_schemas.py +55 -0
  210. scitex/template/_templates/__init__.py +51 -0
  211. scitex/template/_templates/audio.py +233 -0
  212. scitex/template/_templates/canvas.py +312 -0
  213. scitex/template/_templates/capture.py +268 -0
  214. scitex/template/_templates/config.py +43 -0
  215. scitex/template/_templates/diagram.py +294 -0
  216. scitex/template/_templates/io.py +107 -0
  217. scitex/template/_templates/module.py +53 -0
  218. scitex/template/_templates/plt.py +202 -0
  219. scitex/template/_templates/scholar.py +267 -0
  220. scitex/template/_templates/session.py +130 -0
  221. scitex/template/_templates/session_minimal.py +43 -0
  222. scitex/template/_templates/session_plot.py +67 -0
  223. scitex/template/_templates/session_stats.py +77 -0
  224. scitex/template/_templates/stats.py +323 -0
  225. scitex/template/_templates/writer.py +296 -0
  226. scitex/template/clone_writer_directory.py +5 -5
  227. scitex/ui/_backends/_email.py +10 -2
  228. scitex/ui/_backends/_webhook.py +5 -1
  229. scitex/web/_search_pubmed.py +10 -6
  230. scitex/writer/README.md +1 -1
  231. scitex/writer/__init__.py +43 -34
  232. scitex/writer/_mcp/handlers.py +11 -744
  233. scitex/writer/_mcp/tool_schemas.py +5 -335
  234. scitex-2.15.3.dist-info/METADATA +667 -0
  235. {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/RECORD +241 -120
  236. scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
  237. scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
  238. scitex/diagram/_compile.py +0 -312
  239. scitex/diagram/_diagram.py +0 -355
  240. scitex/diagram/_mcp/__init__.py +0 -4
  241. scitex/diagram/_mcp/handlers.py +0 -400
  242. scitex/diagram/_mcp/tool_schemas.py +0 -157
  243. scitex/diagram/_presets.py +0 -173
  244. scitex/diagram/_schema.py +0 -182
  245. scitex/diagram/_split.py +0 -278
  246. scitex/gen/_ci.py +0 -12
  247. scitex/gen/_title_case.py +0 -89
  248. scitex/plt/_mcp/__init__.py +0 -4
  249. scitex/plt/_mcp/_handlers_annotation.py +0 -102
  250. scitex/plt/_mcp/_handlers_figure.py +0 -195
  251. scitex/plt/_mcp/_handlers_plot.py +0 -252
  252. scitex/plt/_mcp/_handlers_style.py +0 -219
  253. scitex/plt/_mcp/handlers.py +0 -74
  254. scitex/plt/_mcp/tool_schemas.py +0 -497
  255. scitex/plt/mcp_server.py +0 -231
  256. scitex/scholar/examples/SUGGESTIONS.md +0 -865
  257. scitex/scholar/examples/dev.py +0 -38
  258. scitex-2.14.0.dist-info/METADATA +0 -1238
  259. /scitex/{gen → context}/_detect_environment.py +0 -0
  260. /scitex/{gen → context}/_get_notebook_path.py +0 -0
  261. /scitex/{gen/_shell.py → sh/_shell_legacy.py} +0 -0
  262. {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/WHEEL +0 -0
  263. {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/entry_points.txt +0 -0
  264. {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,375 @@
1
+ #!/usr/bin/env python3
2
+ # Timestamp: "2026-01-24 (ywatanabe)"
3
+ # File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/core/_mixins/_pdf_download.py
4
+
5
+ """
6
+ PDF download mixin for Scholar class.
7
+
8
+ Provides PDF downloading functionality from DOIs and BibTeX files.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import json
15
+ import shutil
16
+ from datetime import datetime
17
+ from pathlib import Path
18
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
19
+
20
+ from scitex import logging
21
+ from scitex.scholar.auth.core.AuthenticationGateway import AuthenticationGateway
22
+ from scitex.scholar.pdf_download.ScholarPDFDownloader import ScholarPDFDownloader
23
+
24
+ if TYPE_CHECKING:
25
+ from ..Papers import Papers
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class PDFDownloadMixin:
    """Mixin providing PDF download methods.

    Host class is expected to provide ``self.config`` (with
    ``path_manager``), ``self.project``, ``self.name``,
    ``self._browser_manager``, ``self._auth_manager``,
    ``self._library_manager``, ``self.load_bibtex`` and
    ``self.enrich_papers_async`` — TODO confirm against Scholar.
    """

    @staticmethod
    def _pdf_filename(doi: str) -> str:
        """Return the sanitized on-disk PDF filename for a DOI.

        '/' and ':' are not safe in filenames, so both are mapped to '_'.
        """
        return f"DOI_{doi.replace('/', '_').replace(':', '_')}.pdf"

    def _write_pdf_metadata(
        self,
        doi: str,
        paper_id: str,
        storage_path: Path,
        master_pdf_path: Path,
        library_dir: Path,
        extra: Optional[Dict] = None,
    ) -> None:
        """Create or update ``metadata.json`` next to a stored PDF.

        Loads existing metadata when present, otherwise seeds a new record;
        optionally merges ``extra`` fields (never overwriting ``doi`` or
        ``scitex_id``), then records the PDF path, size, and timestamps.

        This consolidates logic previously duplicated between
        ``download_pdfs_from_dois_async`` and ``_store_downloaded_pdf``.
        """
        metadata_file = storage_path / "metadata.json"
        if metadata_file.exists():
            with open(metadata_file) as f:
                metadata = json.load(f)
            logger.debug(f"{self.name}: Loaded existing metadata for {paper_id}")
        else:
            metadata = {
                "doi": doi,
                "scitex_id": paper_id,
                "created_at": datetime.now().isoformat(),
                "created_by": "SciTeX Scholar",
            }

        if extra:
            for key, value in extra.items():
                # Keep identity fields authoritative; skip empty values.
                if value is not None and key not in ["doi", "scitex_id"]:
                    metadata[key] = value

        metadata["pdf_path"] = str(master_pdf_path.relative_to(library_dir))
        metadata["pdf_downloaded_at"] = datetime.now().isoformat()
        metadata["pdf_size_bytes"] = master_pdf_path.stat().st_size
        metadata["updated_at"] = datetime.now().isoformat()

        with open(metadata_file, "w") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

    async def download_pdfs_from_dois_async(
        self,
        dois: List[str],
        output_dir: Optional[Path] = None,
        max_concurrent: int = 1,
    ) -> Dict[str, int]:
        """Download PDFs for given DOIs using ScholarPDFDownloader.

        Args:
            dois: List of DOI strings
            output_dir: Output directory (not used - downloads to library MASTER)
            max_concurrent: Maximum concurrent downloads (default: 1 for sequential)

        Returns
        -------
        Dictionary with download statistics
        """
        if not dois:
            return {"downloaded": 0, "failed": 0, "errors": 0}

        (
            browser,
            context,
        ) = await self._browser_manager.get_authenticated_browser_and_context_async()

        try:
            pdf_downloader = ScholarPDFDownloader(
                context=context,
                config=self.config,
            )

            logger.info(
                f"{self.name}: Starting PDF download for {len(dois)} DOIs "
                f"(max_concurrent={max_concurrent})"
            )

            # results is assumed ordered like dois; each entry is a (possibly
            # empty) list of downloaded file paths — TODO confirm in
            # ScholarPDFDownloader.download_from_dois.
            results = await pdf_downloader.download_from_dois(
                dois=dois,
                output_dir=str(output_dir) if output_dir else "/tmp/",
                max_concurrent=max_concurrent,
            )

            stats = {"downloaded": 0, "failed": 0, "errors": 0}
            library_dir = self.config.path_manager.library_dir
            master_dir = library_dir / "MASTER"
            master_dir.mkdir(parents=True, exist_ok=True)

            for doi, downloaded_paths in zip(dois, results):
                try:
                    if downloaded_paths:
                        temp_pdf_path = downloaded_paths[0]

                        paper_id = self.config.path_manager._generate_paper_id(
                            doi=doi
                        )
                        storage_path = master_dir / paper_id
                        storage_path.mkdir(parents=True, exist_ok=True)

                        master_pdf_path = storage_path / self._pdf_filename(doi)
                        # move (not copy): the temp file is consumed here.
                        shutil.move(str(temp_pdf_path), str(master_pdf_path))

                        self._write_pdf_metadata(
                            doi=doi,
                            paper_id=paper_id,
                            storage_path=storage_path,
                            master_pdf_path=master_pdf_path,
                            library_dir=library_dir,
                        )

                        if self.project not in ["master", "MASTER"]:
                            self._library_manager.update_symlink(
                                master_storage_path=storage_path,
                                project=self.project,
                            )

                        logger.success(
                            f"{self.name}: Downloaded and organized PDF for {doi}: "
                            f"{master_pdf_path}"
                        )
                        stats["downloaded"] += 1
                    else:
                        logger.warning(
                            f"{self.name}: No PDF downloaded for DOI: {doi}"
                        )
                        stats["failed"] += 1

                except Exception as e:
                    logger.error(
                        f"{self.name}: Failed to organize PDF for {doi}: {e}"
                    )
                    stats["errors"] += 1
                    stats["failed"] += 1

            return stats

        finally:
            await self._browser_manager.close()

    async def _download_pdfs_sequential(
        self, dois: List[str], output_dir: Optional[Path] = None
    ) -> Dict[str, int]:
        """Sequential PDF download with authentication gateway.

        One DOI at a time: prepare auth context, find candidate PDF URLs,
        try each URL until one downloads, then store into the library.
        """
        results = {"downloaded": 0, "failed": 0, "errors": 0}

        (
            browser,
            context,
        ) = await self._browser_manager.get_authenticated_browser_and_context_async()

        auth_gateway = AuthenticationGateway(
            auth_manager=self._auth_manager,
            browser_manager=self._browser_manager,
            config=self.config,
        )

        pdf_downloader = ScholarPDFDownloader(
            context=context,
            config=self.config,
        )

        library_dir = self.config.path_manager.library_dir
        master_dir = library_dir / "MASTER"
        project_dir = library_dir / self.project
        master_dir.mkdir(parents=True, exist_ok=True)
        project_dir.mkdir(parents=True, exist_ok=True)

        for doi in dois:
            try:
                logger.info(f"{self.name}: Processing DOI: {doi}")

                # Called for its side effect (authenticated context prep);
                # the returned value is intentionally unused.
                _url_context = await auth_gateway.prepare_context_async(
                    doi=doi, context=context
                )

                urls = await self._find_urls_for_doi_async(doi, context)
                pdf_urls = urls.get("urls_pdf", [])

                if not pdf_urls:
                    logger.warning(f"{self.name}: No PDF URLs found for DOI: {doi}")
                    results["failed"] += 1
                    continue

                downloaded_path = None
                for pdf_entry in pdf_urls:
                    # Entries may be plain URL strings or {"url": ...} dicts.
                    pdf_url = (
                        pdf_entry.get("url")
                        if isinstance(pdf_entry, dict)
                        else pdf_entry
                    )

                    if not pdf_url:
                        continue

                    temp_output = (
                        Path("/tmp") / f"{doi.replace('/', '_').replace(':', '_')}.pdf"
                    )

                    result = await pdf_downloader.download_from_url(
                        pdf_url=pdf_url, output_path=temp_output
                    )

                    if result and result.exists():
                        downloaded_path = result
                        break

                if downloaded_path:
                    self._store_downloaded_pdf(
                        doi, downloaded_path, library_dir, master_dir
                    )
                    downloaded_path.unlink()
                    results["downloaded"] += 1
                else:
                    logger.warning(
                        f"{self.name}: Failed to download any PDF for DOI: {doi}"
                    )
                    results["failed"] += 1

            except Exception as e:
                logger.error(f"{self.name}: Failed to process {doi}: {e}")
                results["errors"] += 1
                results["failed"] += 1

        await self._browser_manager.close()
        logger.info(f"{self.name}: PDF download complete: {results}")
        return results

    def _store_downloaded_pdf(
        self,
        doi: str,
        downloaded_path: Path,
        library_dir: Path,
        master_dir: Path,
    ) -> None:
        """Store a downloaded PDF in the library MASTER structure.

        Copies the PDF into ``MASTER/<paper_id>/``, writes/updates
        ``metadata.json`` (best-effort enriched), and updates the project
        symlink when not operating on the master project.
        """
        from ..Paper import Paper
        from ..Papers import Papers

        paper_id = self.config.path_manager._generate_paper_id(doi=doi)
        storage_path = master_dir / paper_id
        storage_path.mkdir(parents=True, exist_ok=True)

        # Best-effort enrichment so richer metadata can be merged below.
        # FIX: the previous implementation unconditionally called
        # asyncio.run() here, which raises RuntimeError when this method is
        # invoked from a running event loop (as _download_pdfs_sequential
        # does); the error was silently swallowed, so enrichment never
        # happened on that path. We now detect a running loop and skip
        # enrichment explicitly instead of relying on exception swallowing.
        temp_paper = None
        try:
            asyncio.get_running_loop()
            in_running_loop = True
        except RuntimeError:
            in_running_loop = False

        if in_running_loop:
            logger.debug(
                f"{self.name}: Skipping metadata enrichment for {doi} "
                "(called from a running event loop)"
            )
        else:
            try:
                temp_paper = Paper()
                temp_paper.metadata.id.doi = doi
                enriched = asyncio.run(
                    self.enrich_papers_async(Papers([temp_paper]))
                )
                if enriched and len(enriched) > 0:
                    temp_paper = enriched[0]
            except Exception as e:
                # Enrichment is optional; keep whatever bare Paper we have.
                logger.debug(
                    f"{self.name}: Metadata enrichment failed for {doi}: {e}"
                )

        pdf_filename = self._pdf_filename(doi)
        master_pdf_path = storage_path / pdf_filename
        # copy2 (not move): the caller owns and unlinks the temp file.
        shutil.copy2(downloaded_path, master_pdf_path)

        self._write_pdf_metadata(
            doi=doi,
            paper_id=paper_id,
            storage_path=storage_path,
            master_pdf_path=master_pdf_path,
            library_dir=library_dir,
            extra=temp_paper.to_dict() if temp_paper else None,
        )

        if self.project not in ["master", "MASTER"]:
            self._library_manager.update_symlink(
                master_storage_path=storage_path,
                project=self.project,
            )

        logger.success(
            f"{self.name}: Downloaded PDF for {doi}: MASTER/{paper_id}/{pdf_filename}"
        )

    def download_pdfs_from_dois(
        self, dois: List[str], output_dir: Optional[Path] = None
    ) -> Dict[str, int]:
        """Download PDFs for given DOIs.

        Args:
            dois: List of DOI strings
            output_dir: Output directory (uses config default if None)

        Returns
        -------
        Dictionary with download statistics
        """
        return asyncio.run(self.download_pdfs_from_dois_async(dois, output_dir))

    def download_pdfs_from_bibtex(
        self,
        bibtex_input: Union[str, Path, Papers],
        output_dir: Optional[Path] = None,
    ) -> Dict[str, int]:
        """Download PDFs from BibTeX file or Papers collection.

        Args:
            bibtex_input: BibTeX file path, content string, or Papers collection
            output_dir: Output directory (uses config default if None)

        Returns
        -------
        Dictionary with download statistics
        """
        from ..Papers import Papers

        if isinstance(bibtex_input, Papers):
            papers = bibtex_input
        else:
            papers = self.load_bibtex(bibtex_input)

        dois = [paper.metadata.id.doi for paper in papers if paper.metadata.id.doi]

        if not dois:
            logger.warning(f"{self.name}: No papers with DOIs found in BibTeX input")
            return {"downloaded": 0, "failed": 0, "errors": 0}

        logger.info(
            f"{self.name}: Found {len(dois)} papers with DOIs "
            f"out of {len(papers)} total papers"
        )

        return self.download_pdfs_from_dois(dois, output_dir)
373
+
374
+
375
+ # EOF
@@ -0,0 +1,312 @@
1
+ #!/usr/bin/env python3
2
+ # Timestamp: "2026-01-24 (ywatanabe)"
3
+ # File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/core/_mixins/_pipeline.py
4
+
5
+ """
6
+ Pipeline mixin for Scholar class.
7
+
8
+ Provides paper processing pipeline functionality for single and batch operations.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ from typing import TYPE_CHECKING, List, Optional, Union
15
+
16
+ from scitex import logging
17
+ from scitex.scholar.pdf_download.ScholarPDFDownloader import ScholarPDFDownloader
18
+ from scitex.scholar.url_finder.ScholarURLFinder import ScholarURLFinder
19
+
20
+ if TYPE_CHECKING:
21
+ from ..Paper import Paper
22
+ from ..Papers import Papers
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class PipelineMixin:
28
+ """Mixin providing paper processing pipeline methods."""
29
+
30
+ async def process_paper_async(
31
+ self,
32
+ title: Optional[str] = None,
33
+ doi: Optional[str] = None,
34
+ project: Optional[str] = None,
35
+ ) -> Paper:
36
+ """Complete sequential pipeline for processing a single paper.
37
+
38
+ Accepts either title OR doi. Uses storage-first approach:
39
+ each stage checks storage before processing.
40
+
41
+ Workflow:
42
+ Stage 0: Resolve DOI from title (if needed)
43
+ Stage 1: Load or create Paper from storage
44
+ Stage 2: Find PDF URLs -> save to storage
45
+ Stage 3: Download PDF -> save to storage
46
+ Stage 4: Update project symlinks
47
+
48
+ Args:
49
+ title: Paper title (will resolve DOI using engine)
50
+ doi: DOI of the paper (preferred if available)
51
+ project: Project name (uses self.project if None)
52
+
53
+ Returns
54
+ -------
55
+ Fully processed Paper object
56
+
57
+ Examples
58
+ --------
59
+ paper = await scholar.process_paper_async(doi="10.1038/s41598-017-02626-y")
60
+ paper = await scholar.process_paper_async(title="Attention Is All You Need")
61
+ """
62
+ from ..Paper import Paper
63
+
64
+ if not title and not doi:
65
+ raise ValueError("Must provide either title or doi")
66
+
67
+ project = project or self.project
68
+
69
+ logger.info(f"{'=' * 60}")
70
+ logger.info("Processing paper")
71
+ if title:
72
+ logger.info(f"Title: {title[:50]}...")
73
+ if doi:
74
+ logger.info(f"DOI: {doi}")
75
+ logger.info(f"{'=' * 60}")
76
+
77
+ # Stage 0: Resolve DOI from title (if needed)
78
+ if not doi and title:
79
+ logger.info("Stage 0: Resolving DOI from title...")
80
+ results = await self._scholar_engine.search_async(title=title)
81
+
82
+ if results and results.get("id", {}).get("doi"):
83
+ doi = results["id"]["doi"]
84
+ logger.success(f"Resolved DOI: {doi}")
85
+ else:
86
+ logger.error(f"Could not resolve DOI from title: {title}")
87
+ raise ValueError(f"Could not resolve DOI from title: {title}")
88
+
89
+ paper_id = self.config.path_manager._generate_paper_id(doi=doi)
90
+ storage_path = self.config.get_library_master_dir() / paper_id
91
+
92
+ logger.info(f"Paper ID: {paper_id}")
93
+ logger.info(f"Storage: {storage_path}")
94
+
95
+ # Stage 1: Load or create Paper from storage
96
+ logger.info("\nStage 1: Loading/creating metadata...")
97
+ if self._library_manager.has_metadata(paper_id):
98
+ paper = self._library_manager.load_paper_from_id(paper_id)
99
+ logger.info("Loaded existing metadata from storage")
100
+ else:
101
+ paper = Paper()
102
+ paper.metadata.set_doi(doi)
103
+ paper.container.scitex_id = paper_id
104
+
105
+ if title:
106
+ paper.metadata.basic.title = title
107
+
108
+ self._library_manager.save_paper_incremental(paper_id, paper)
109
+ logger.success("Created new paper entry in storage")
110
+
111
+ # Stage 2: Check/find URLs
112
+ logger.info("\nStage 2: Checking/finding PDF URLs...")
113
+ if not self._library_manager.has_urls(paper_id):
114
+ logger.info(f"Finding PDF URLs for DOI: {doi}")
115
+ (
116
+ browser,
117
+ context,
118
+ ) = await self._browser_manager.get_authenticated_browser_and_context_async()
119
+ try:
120
+ url_finder = ScholarURLFinder(context, config=self.config)
121
+ urls = await url_finder.find_pdf_urls(doi)
122
+
123
+ paper.metadata.url.pdfs = urls
124
+ self._library_manager.save_paper_incremental(paper_id, paper)
125
+ logger.success(f"Found {len(urls)} PDF URLs, saved to storage")
126
+ finally:
127
+ await self._browser_manager.close()
128
+ else:
129
+ logger.info(
130
+ f"PDF URLs already in storage ({len(paper.metadata.url.pdfs)} URLs)"
131
+ )
132
+
133
+ # Stage 3: Check/download PDF
134
+ logger.info("\nStage 3: Checking/downloading PDF...")
135
+ if not self._library_manager.has_pdf(paper_id):
136
+ logger.info("Downloading PDF...")
137
+ if paper.metadata.url.pdfs:
138
+ (
139
+ browser,
140
+ context,
141
+ ) = await self._browser_manager.get_authenticated_browser_and_context_async()
142
+ try:
143
+ downloader = ScholarPDFDownloader(context, config=self.config)
144
+
145
+ pdf_url = (
146
+ paper.metadata.url.pdfs[0]["url"]
147
+ if isinstance(paper.metadata.url.pdfs[0], dict)
148
+ else paper.metadata.url.pdfs[0]
149
+ )
150
+ temp_path = storage_path / "main.pdf"
151
+
152
+ result = await downloader.download_from_url(
153
+ pdf_url, temp_path, doi=doi
154
+ )
155
+ if result and result.exists():
156
+ paper.metadata.path.pdfs.append(str(result))
157
+ self._library_manager.save_paper_incremental(paper_id, paper)
158
+ logger.success(f"{self.name}: Downloaded PDF, saved to storage")
159
+ else:
160
+ logger.warning(f"{self.name}: Failed to download PDF")
161
+ finally:
162
+ await self._browser_manager.close()
163
+ else:
164
+ logger.warning(f"{self.name}: No PDF URLs available for download")
165
+ else:
166
+ logger.info(f"{self.name}: PDF already in storage")
167
+
168
+ # Stage 4: Update project symlinks
169
+ if project and project not in ["master", "MASTER"]:
170
+ logger.info(f"{self.name}: \nStage 4: Updating project symlinks...")
171
+ self._library_manager.update_symlink(
172
+ master_storage_path=storage_path,
173
+ project=project,
174
+ )
175
+ logger.success(f"{self.name}: Updated symlink in project: {project}")
176
+
177
+ logger.info(f"\n{'=' * 60}")
178
+ logger.success(f"{self.name}: Paper processing complete")
179
+ logger.info(f"{'=' * 60}\n")
180
+
181
+ return paper
182
+
183
+ def process_paper(
184
+ self,
185
+ title: Optional[str] = None,
186
+ doi: Optional[str] = None,
187
+ project: Optional[str] = None,
188
+ ) -> Paper:
189
+ """Synchronous wrapper for process_paper_async.
190
+
191
+ See process_paper_async() for full documentation.
192
+ """
193
+ return asyncio.run(
194
+ self.process_paper_async(title=title, doi=doi, project=project)
195
+ )
196
+
197
+ async def process_papers_async(
198
+ self,
199
+ papers: Union[Papers, List[str]],
200
+ project: Optional[str] = None,
201
+ max_concurrent: int = 3,
202
+ ) -> Papers:
203
+ """Process multiple papers with controlled parallelism.
204
+
205
+ Each paper goes through complete sequential pipeline.
206
+ Semaphore controls how many papers process concurrently.
207
+
208
+ Architecture:
209
+ - Parallel papers (max_concurrent at a time)
210
+ - Sequential stages per paper
211
+ - Storage checks before each stage
212
+
213
+ Args:
214
+ papers: Papers collection or list of DOIs
215
+ project: Project name (uses self.project if None)
216
+ max_concurrent: Maximum concurrent papers (default: 3)
217
+ Set to 1 for purely sequential processing
218
+
219
+ Returns
220
+ -------
221
+ Papers collection with processed papers
222
+
223
+ Examples
224
+ --------
225
+ papers = scholar.load_bibtex("papers.bib")
226
+ processed = await scholar.process_papers_async(papers, max_concurrent=3)
227
+
228
+ dois = ["10.1038/...", "10.1016/...", "10.1109/..."]
229
+ processed = await scholar.process_papers_async(dois, max_concurrent=1)
230
+ """
231
+ from ..Paper import Paper
232
+ from ..Papers import Papers
233
+
234
+ project = project or self.project
235
+
236
+ if isinstance(papers, list):
237
+ papers_list = []
238
+ for doi in papers:
239
+ p = Paper()
240
+ p.metadata.set_doi(doi)
241
+ papers_list.append(p)
242
+ papers = Papers(papers_list, project=project, config=self.config)
243
+
244
+ total = len(papers)
245
+ logger.info(f"{self.name}: \n{'=' * 60}")
246
+ logger.info(
247
+ f"{self.name}: Processing {total} papers (max_concurrent={max_concurrent})"
248
+ )
249
+ logger.info(f"{self.name}: Project: {project}")
250
+ logger.info(f"{self.name}: {'=' * 60}\n")
251
+
252
+ semaphore = asyncio.Semaphore(max_concurrent)
253
+
254
+ async def process_with_semaphore(paper, index):
255
+ """Process one paper with semaphore control."""
256
+ async with semaphore:
257
+ logger.info(f"{self.name}: \n[{index}/{total}] Starting paper...")
258
+ try:
259
+ result = await self.process_paper_async(
260
+ title=paper.metadata.basic.title,
261
+ doi=paper.metadata.id.doi,
262
+ project=project,
263
+ )
264
+ logger.success(f"{self.name}: [{index}/{total}] Completed")
265
+ return result
266
+ except Exception as e:
267
+ logger.error(f"{self.name}: [{index}/{total}] Failed: {e}")
268
+ return None
269
+
270
+ tasks = [process_with_semaphore(paper, i + 1) for i, paper in enumerate(papers)]
271
+
272
+ results = await asyncio.gather(*tasks, return_exceptions=True)
273
+
274
+ processed_papers = []
275
+ errors = 0
276
+ for i, result in enumerate(results):
277
+ if isinstance(result, Exception):
278
+ logger.error(f"{self.name}: Paper {i + 1} raised exception: {result}")
279
+ errors += 1
280
+ elif result is not None:
281
+ processed_papers.append(result)
282
+
283
+ logger.info(f"{self.name}: \n{'=' * 60}")
284
+ logger.info(f"{self.name}: Batch Processing Complete")
285
+ logger.info(f"{self.name}: Total: {total}")
286
+ logger.info(f"{self.name}: Successful: {len(processed_papers)}")
287
+ logger.info(f"{self.name}: Failed: {total - len(processed_papers)}")
288
+ logger.info(f"{self.name}: Errors: {errors}")
289
+ logger.info(f"{self.name}: {'=' * 60}\n")
290
+
291
+ return Papers(processed_papers, project=project, config=self.config)
292
+
293
+ def process_papers(
294
+ self,
295
+ papers: Union[Papers, List[str]],
296
+ project: Optional[str] = None,
297
+ max_concurrent: int = 3,
298
+ ) -> Papers:
299
+ """Synchronous wrapper for process_papers_async.
300
+
301
+ See process_papers_async() for full documentation.
302
+ """
303
+ return asyncio.run(
304
+ self.process_papers_async(
305
+ papers=papers,
306
+ project=project,
307
+ max_concurrent=max_concurrent,
308
+ )
309
+ )
310
+
311
+
312
+ # EOF