scitex 2.14.0__py3-none-any.whl → 2.15.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +71 -17
- scitex/_env_loader.py +156 -0
- scitex/_mcp_resources/__init__.py +37 -0
- scitex/_mcp_resources/_cheatsheet.py +135 -0
- scitex/_mcp_resources/_figrecipe.py +138 -0
- scitex/_mcp_resources/_formats.py +102 -0
- scitex/_mcp_resources/_modules.py +337 -0
- scitex/_mcp_resources/_session.py +149 -0
- scitex/_mcp_tools/__init__.py +4 -0
- scitex/_mcp_tools/audio.py +66 -0
- scitex/_mcp_tools/diagram.py +11 -95
- scitex/_mcp_tools/introspect.py +210 -0
- scitex/_mcp_tools/plt.py +260 -305
- scitex/_mcp_tools/scholar.py +74 -0
- scitex/_mcp_tools/social.py +27 -0
- scitex/_mcp_tools/template.py +24 -0
- scitex/_mcp_tools/writer.py +17 -210
- scitex/ai/_gen_ai/_PARAMS.py +10 -7
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
- scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
- scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
- scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
- scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
- scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
- scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
- scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
- scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +30 -1550
- scitex/ai/classification/timeseries/_sliding_window_core.py +467 -0
- scitex/ai/classification/timeseries/_sliding_window_plotting.py +369 -0
- scitex/audio/README.md +40 -36
- scitex/audio/__init__.py +129 -61
- scitex/audio/_branding.py +185 -0
- scitex/audio/_mcp/__init__.py +32 -0
- scitex/audio/_mcp/handlers.py +59 -6
- scitex/audio/_mcp/speak_handlers.py +238 -0
- scitex/audio/_relay.py +225 -0
- scitex/audio/_tts.py +18 -10
- scitex/audio/engines/base.py +17 -10
- scitex/audio/engines/elevenlabs_engine.py +7 -2
- scitex/audio/mcp_server.py +228 -75
- scitex/canvas/README.md +1 -1
- scitex/canvas/editor/_dearpygui/__init__.py +25 -0
- scitex/canvas/editor/_dearpygui/_editor.py +147 -0
- scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
- scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
- scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
- scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
- scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
- scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
- scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
- scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
- scitex/canvas/editor/_dearpygui/_selection.py +295 -0
- scitex/canvas/editor/_dearpygui/_state.py +93 -0
- scitex/canvas/editor/_dearpygui/_utils.py +61 -0
- scitex/canvas/editor/flask_editor/_core/__init__.py +27 -0
- scitex/canvas/editor/flask_editor/_core/_bbox_extraction.py +200 -0
- scitex/canvas/editor/flask_editor/_core/_editor.py +173 -0
- scitex/canvas/editor/flask_editor/_core/_export_helpers.py +353 -0
- scitex/canvas/editor/flask_editor/_core/_routes_basic.py +190 -0
- scitex/canvas/editor/flask_editor/_core/_routes_export.py +332 -0
- scitex/canvas/editor/flask_editor/_core/_routes_panels.py +252 -0
- scitex/canvas/editor/flask_editor/_core/_routes_save.py +218 -0
- scitex/canvas/editor/flask_editor/_core.py +25 -1684
- scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
- scitex/cli/__init__.py +38 -43
- scitex/cli/audio.py +160 -41
- scitex/cli/capture.py +133 -20
- scitex/cli/introspect.py +488 -0
- scitex/cli/main.py +200 -109
- scitex/cli/mcp.py +60 -34
- scitex/cli/plt.py +414 -0
- scitex/cli/repro.py +15 -8
- scitex/cli/resource.py +15 -8
- scitex/cli/scholar/__init__.py +154 -8
- scitex/cli/scholar/_crossref_scitex.py +296 -0
- scitex/cli/scholar/_fetch.py +25 -3
- scitex/cli/social.py +355 -0
- scitex/cli/stats.py +136 -11
- scitex/cli/template.py +129 -12
- scitex/cli/tex.py +15 -8
- scitex/cli/writer.py +49 -299
- scitex/cloud/__init__.py +41 -2
- scitex/config/README.md +1 -1
- scitex/config/__init__.py +16 -2
- scitex/config/_env_registry.py +256 -0
- scitex/context/__init__.py +22 -0
- scitex/dev/__init__.py +20 -1
- scitex/diagram/__init__.py +42 -19
- scitex/diagram/mcp_server.py +13 -125
- scitex/gen/__init__.py +50 -14
- scitex/gen/_list_packages.py +4 -4
- scitex/introspect/__init__.py +82 -0
- scitex/introspect/_call_graph.py +303 -0
- scitex/introspect/_class_hierarchy.py +163 -0
- scitex/introspect/_core.py +41 -0
- scitex/introspect/_docstring.py +131 -0
- scitex/introspect/_examples.py +113 -0
- scitex/introspect/_imports.py +271 -0
- scitex/{gen/_inspect_module.py → introspect/_list_api.py} +48 -56
- scitex/introspect/_mcp/__init__.py +41 -0
- scitex/introspect/_mcp/handlers.py +233 -0
- scitex/introspect/_members.py +155 -0
- scitex/introspect/_resolve.py +89 -0
- scitex/introspect/_signature.py +131 -0
- scitex/introspect/_source.py +80 -0
- scitex/introspect/_type_hints.py +172 -0
- scitex/io/_save.py +1 -2
- scitex/io/bundle/README.md +1 -1
- scitex/logging/_formatters.py +19 -9
- scitex/mcp_server.py +98 -5
- scitex/os/__init__.py +4 -0
- scitex/{gen → os}/_check_host.py +4 -5
- scitex/plt/__init__.py +245 -550
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
- scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/plt/gallery/README.md +1 -1
- scitex/plt/utils/_hitmap/__init__.py +82 -0
- scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
- scitex/plt/utils/_hitmap/_color_application.py +346 -0
- scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
- scitex/plt/utils/_hitmap/_constants.py +40 -0
- scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
- scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
- scitex/plt/utils/_hitmap/_query.py +113 -0
- scitex/plt/utils/_hitmap.py +46 -1616
- scitex/plt/utils/_metadata/__init__.py +80 -0
- scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
- scitex/plt/utils/_metadata/_artists/_base.py +195 -0
- scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
- scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
- scitex/plt/utils/_metadata/_artists/_images.py +80 -0
- scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
- scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
- scitex/plt/utils/_metadata/_artists/_text.py +106 -0
- scitex/plt/utils/_metadata/_csv.py +416 -0
- scitex/plt/utils/_metadata/_detect.py +225 -0
- scitex/plt/utils/_metadata/_legend.py +127 -0
- scitex/plt/utils/_metadata/_rounding.py +117 -0
- scitex/plt/utils/_metadata/_verification.py +202 -0
- scitex/schema/README.md +1 -1
- scitex/scholar/__init__.py +8 -0
- scitex/scholar/_mcp/crossref_handlers.py +265 -0
- scitex/scholar/core/Scholar.py +63 -1700
- scitex/scholar/core/_mixins/__init__.py +36 -0
- scitex/scholar/core/_mixins/_enrichers.py +270 -0
- scitex/scholar/core/_mixins/_library_handlers.py +100 -0
- scitex/scholar/core/_mixins/_loaders.py +103 -0
- scitex/scholar/core/_mixins/_pdf_download.py +375 -0
- scitex/scholar/core/_mixins/_pipeline.py +312 -0
- scitex/scholar/core/_mixins/_project_handlers.py +125 -0
- scitex/scholar/core/_mixins/_savers.py +69 -0
- scitex/scholar/core/_mixins/_search.py +103 -0
- scitex/scholar/core/_mixins/_services.py +88 -0
- scitex/scholar/core/_mixins/_url_finding.py +105 -0
- scitex/scholar/crossref_scitex.py +367 -0
- scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/scholar/examples/00_run_all.sh +120 -0
- scitex/scholar/jobs/_executors.py +27 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
- scitex/scholar/pdf_download/_cli.py +154 -0
- scitex/scholar/pdf_download/strategies/__init__.py +11 -8
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
- scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
- scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
- scitex/scholar/pipelines/_single_steps.py +71 -36
- scitex/scholar/storage/_LibraryManager.py +97 -1695
- scitex/scholar/storage/_mixins/__init__.py +30 -0
- scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
- scitex/scholar/storage/_mixins/_library_operations.py +218 -0
- scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
- scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
- scitex/scholar/storage/_mixins/_resolution.py +376 -0
- scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
- scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
- scitex/security/README.md +3 -3
- scitex/session/README.md +1 -1
- scitex/session/__init__.py +26 -7
- scitex/session/_decorator.py +1 -1
- scitex/sh/README.md +1 -1
- scitex/sh/__init__.py +7 -4
- scitex/social/__init__.py +155 -0
- scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/stats/_mcp/_handlers/__init__.py +31 -0
- scitex/stats/_mcp/_handlers/_corrections.py +113 -0
- scitex/stats/_mcp/_handlers/_descriptive.py +78 -0
- scitex/stats/_mcp/_handlers/_effect_size.py +106 -0
- scitex/stats/_mcp/_handlers/_format.py +94 -0
- scitex/stats/_mcp/_handlers/_normality.py +110 -0
- scitex/stats/_mcp/_handlers/_posthoc.py +224 -0
- scitex/stats/_mcp/_handlers/_power.py +247 -0
- scitex/stats/_mcp/_handlers/_recommend.py +102 -0
- scitex/stats/_mcp/_handlers/_run_test.py +279 -0
- scitex/stats/_mcp/_handlers/_stars.py +48 -0
- scitex/stats/_mcp/handlers.py +19 -1171
- scitex/stats/auto/_stat_style.py +175 -0
- scitex/stats/auto/_style_definitions.py +411 -0
- scitex/stats/auto/_styles.py +22 -620
- scitex/stats/descriptive/__init__.py +11 -8
- scitex/stats/descriptive/_ci.py +39 -0
- scitex/stats/power/_power.py +15 -4
- scitex/str/__init__.py +2 -1
- scitex/str/_title_case.py +63 -0
- scitex/template/README.md +1 -1
- scitex/template/__init__.py +25 -10
- scitex/template/_code_templates.py +147 -0
- scitex/template/_mcp/handlers.py +81 -0
- scitex/template/_mcp/tool_schemas.py +55 -0
- scitex/template/_templates/__init__.py +51 -0
- scitex/template/_templates/audio.py +233 -0
- scitex/template/_templates/canvas.py +312 -0
- scitex/template/_templates/capture.py +268 -0
- scitex/template/_templates/config.py +43 -0
- scitex/template/_templates/diagram.py +294 -0
- scitex/template/_templates/io.py +107 -0
- scitex/template/_templates/module.py +53 -0
- scitex/template/_templates/plt.py +202 -0
- scitex/template/_templates/scholar.py +267 -0
- scitex/template/_templates/session.py +130 -0
- scitex/template/_templates/session_minimal.py +43 -0
- scitex/template/_templates/session_plot.py +67 -0
- scitex/template/_templates/session_stats.py +77 -0
- scitex/template/_templates/stats.py +323 -0
- scitex/template/_templates/writer.py +296 -0
- scitex/template/clone_writer_directory.py +5 -5
- scitex/ui/_backends/_email.py +10 -2
- scitex/ui/_backends/_webhook.py +5 -1
- scitex/web/_search_pubmed.py +10 -6
- scitex/writer/README.md +1 -1
- scitex/writer/__init__.py +43 -34
- scitex/writer/_mcp/handlers.py +11 -744
- scitex/writer/_mcp/tool_schemas.py +5 -335
- scitex-2.15.3.dist-info/METADATA +667 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/RECORD +241 -120
- scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
- scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
- scitex/diagram/_compile.py +0 -312
- scitex/diagram/_diagram.py +0 -355
- scitex/diagram/_mcp/__init__.py +0 -4
- scitex/diagram/_mcp/handlers.py +0 -400
- scitex/diagram/_mcp/tool_schemas.py +0 -157
- scitex/diagram/_presets.py +0 -173
- scitex/diagram/_schema.py +0 -182
- scitex/diagram/_split.py +0 -278
- scitex/gen/_ci.py +0 -12
- scitex/gen/_title_case.py +0 -89
- scitex/plt/_mcp/__init__.py +0 -4
- scitex/plt/_mcp/_handlers_annotation.py +0 -102
- scitex/plt/_mcp/_handlers_figure.py +0 -195
- scitex/plt/_mcp/_handlers_plot.py +0 -252
- scitex/plt/_mcp/_handlers_style.py +0 -219
- scitex/plt/_mcp/handlers.py +0 -74
- scitex/plt/_mcp/tool_schemas.py +0 -497
- scitex/plt/mcp_server.py +0 -231
- scitex/scholar/examples/SUGGESTIONS.md +0 -865
- scitex/scholar/examples/dev.py +0 -38
- scitex-2.14.0.dist-info/METADATA +0 -1238
- /scitex/{gen → context}/_detect_environment.py +0 -0
- /scitex/{gen → context}/_get_notebook_path.py +0 -0
- /scitex/{gen/_shell.py → sh/_shell_legacy.py} +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/WHEEL +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,15 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
# Timestamp: "
|
|
3
|
-
# File:
|
|
4
|
-
# ----------------------------------------
|
|
5
|
-
from __future__ import annotations
|
|
6
|
-
|
|
7
|
-
import os
|
|
8
|
-
|
|
9
|
-
__FILE__ = "./src/scitex/scholar/pipelines/ScholarPipelineParallel.py"
|
|
10
|
-
__DIR__ = os.path.dirname(__FILE__)
|
|
11
|
-
# ----------------------------------------
|
|
12
|
-
|
|
2
|
+
# Timestamp: "2026-01-22 (ywatanabe)"
|
|
3
|
+
# File: src/scitex/scholar/pipelines/ScholarPipelineParallel.py
|
|
13
4
|
"""
|
|
14
5
|
Functionalities:
|
|
15
6
|
- Orchestrates parallel paper acquisition using multiple browser profiles
|
|
@@ -33,11 +24,12 @@ IO:
|
|
|
33
24
|
- library/{project}/{paper_id} -> ../MASTER/{paper_id} (multiple symlinks)
|
|
34
25
|
"""
|
|
35
26
|
|
|
36
|
-
|
|
37
|
-
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
38
29
|
import asyncio
|
|
39
30
|
from typing import List, Optional
|
|
40
31
|
|
|
32
|
+
import scitex as stx
|
|
41
33
|
from scitex import logging
|
|
42
34
|
from scitex.browser.core import ChromeProfileManager
|
|
43
35
|
from scitex.scholar.auth import ScholarAuthManager
|
|
@@ -79,7 +71,8 @@ class ScholarPipelineParallel:
|
|
|
79
71
|
async def _verify_authentication_async(self) -> bool:
|
|
80
72
|
"""Pre-verify authentication once before spawning workers.
|
|
81
73
|
|
|
82
|
-
Returns
|
|
74
|
+
Returns
|
|
75
|
+
-------
|
|
83
76
|
True if authenticated, False otherwise
|
|
84
77
|
"""
|
|
85
78
|
logger.info(f"{self.name}: Verifying authentication...")
|
|
@@ -110,7 +103,8 @@ class ScholarPipelineParallel:
|
|
|
110
103
|
Args:
|
|
111
104
|
num_workers: Number of workers to prepare (defaults to self.num_workers)
|
|
112
105
|
|
|
113
|
-
Returns
|
|
106
|
+
Returns
|
|
107
|
+
-------
|
|
114
108
|
List of worker profile names
|
|
115
109
|
"""
|
|
116
110
|
workers_to_prepare = (
|
|
@@ -157,7 +151,8 @@ class ScholarPipelineParallel:
|
|
|
157
151
|
worker_id: Worker ID for logging
|
|
158
152
|
worker_profile: Chrome profile name for this worker
|
|
159
153
|
|
|
160
|
-
Returns
|
|
154
|
+
Returns
|
|
155
|
+
-------
|
|
161
156
|
Paper object if successful, None otherwise
|
|
162
157
|
"""
|
|
163
158
|
logger.info(
|
|
@@ -200,7 +195,8 @@ class ScholarPipelineParallel:
|
|
|
200
195
|
doi_or_title_list: List of DOI or title strings
|
|
201
196
|
project: Project name for symlinking (optional)
|
|
202
197
|
|
|
203
|
-
Returns
|
|
198
|
+
Returns
|
|
199
|
+
-------
|
|
204
200
|
List of successfully processed Paper objects
|
|
205
201
|
"""
|
|
206
202
|
if not doi_or_title_list:
|
|
@@ -291,7 +287,8 @@ class ScholarPipelineParallel:
|
|
|
291
287
|
papers: Papers collection
|
|
292
288
|
project: Project name for symlinking (optional, uses papers.project if None)
|
|
293
289
|
|
|
294
|
-
Returns
|
|
290
|
+
Returns
|
|
291
|
+
-------
|
|
295
292
|
List of successfully processed Paper objects
|
|
296
293
|
"""
|
|
297
294
|
# Extract DOIs or titles from papers
|
|
@@ -314,34 +311,64 @@ class ScholarPipelineParallel:
|
|
|
314
311
|
)
|
|
315
312
|
|
|
316
313
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
314
|
+
@stx.session
|
|
315
|
+
def main(
|
|
316
|
+
dois: str = None,
|
|
317
|
+
titles: str = None,
|
|
318
|
+
project: str = None,
|
|
319
|
+
num_workers: int = 4,
|
|
320
|
+
browser_mode: str = "stealth",
|
|
321
|
+
chrome_profile: str = "system",
|
|
322
|
+
CONFIG=stx.INJECTED,
|
|
323
|
+
logger=stx.INJECTED,
|
|
324
|
+
) -> int:
|
|
325
|
+
"""Orchestrate parallel paper acquisition pipeline.
|
|
326
|
+
|
|
327
|
+
Parameters
|
|
328
|
+
----------
|
|
329
|
+
dois : str
|
|
330
|
+
Comma-separated DOIs (e.g., '10.1038/...,10.1016/...')
|
|
331
|
+
titles : str
|
|
332
|
+
Comma-separated paper titles
|
|
333
|
+
project : str
|
|
334
|
+
Project name for symlinking (optional)
|
|
335
|
+
num_workers : int
|
|
336
|
+
Number of parallel workers (default: 4)
|
|
337
|
+
browser_mode : str
|
|
338
|
+
Browser mode: 'stealth' or 'interactive' (default: stealth)
|
|
339
|
+
chrome_profile : str
|
|
340
|
+
Base Chrome profile name to sync from (default: system)
|
|
341
|
+
|
|
342
|
+
Returns
|
|
343
|
+
-------
|
|
344
|
+
int
|
|
345
|
+
Exit status code (0 for success)
|
|
346
|
+
"""
|
|
320
347
|
# Parse input queries
|
|
321
348
|
queries = []
|
|
322
|
-
if
|
|
323
|
-
queries.extend(
|
|
324
|
-
if
|
|
325
|
-
queries.extend(
|
|
349
|
+
if dois:
|
|
350
|
+
queries.extend(dois.split(","))
|
|
351
|
+
if titles:
|
|
352
|
+
queries.extend(titles.split(","))
|
|
326
353
|
|
|
327
354
|
if not queries:
|
|
328
355
|
logger.error("No queries provided. Use --dois or --titles")
|
|
329
356
|
return 1
|
|
330
357
|
|
|
331
|
-
logger.info(f"Processing {len(queries)} queries with {
|
|
358
|
+
logger.info(f"Processing {len(queries)} queries with {num_workers} workers")
|
|
332
359
|
|
|
333
360
|
# Create parallel pipeline
|
|
334
361
|
parallel_pipeline = ScholarPipelineParallel(
|
|
335
|
-
num_workers=
|
|
336
|
-
browser_mode=
|
|
337
|
-
base_chrome_profile=
|
|
362
|
+
num_workers=num_workers,
|
|
363
|
+
browser_mode=browser_mode,
|
|
364
|
+
base_chrome_profile=chrome_profile,
|
|
338
365
|
)
|
|
339
366
|
|
|
340
367
|
# Run pipeline
|
|
341
368
|
papers = asyncio.run(
|
|
342
369
|
parallel_pipeline.process_papers_from_list_async(
|
|
343
370
|
doi_or_title_list=queries,
|
|
344
|
-
project=
|
|
371
|
+
project=project,
|
|
345
372
|
)
|
|
346
373
|
)
|
|
347
374
|
|
|
@@ -349,114 +376,29 @@ def main(args):
|
|
|
349
376
|
return 0
|
|
350
377
|
|
|
351
378
|
|
|
352
|
-
def parse_args() -> argparse.Namespace:
|
|
353
|
-
"""Parse command line arguments."""
|
|
354
|
-
parser = argparse.ArgumentParser(
|
|
355
|
-
description="Orchestrate parallel paper acquisition pipeline"
|
|
356
|
-
)
|
|
357
|
-
parser.add_argument(
|
|
358
|
-
"--dois",
|
|
359
|
-
type=str,
|
|
360
|
-
default=None,
|
|
361
|
-
help="Comma-separated DOIs (e.g., '10.1038/...,10.1016/...')",
|
|
362
|
-
)
|
|
363
|
-
parser.add_argument(
|
|
364
|
-
"--titles",
|
|
365
|
-
type=str,
|
|
366
|
-
default=None,
|
|
367
|
-
help="Comma-separated paper titles",
|
|
368
|
-
)
|
|
369
|
-
parser.add_argument(
|
|
370
|
-
"--project",
|
|
371
|
-
type=str,
|
|
372
|
-
default=None,
|
|
373
|
-
help="Project name for symlinking (optional)",
|
|
374
|
-
)
|
|
375
|
-
parser.add_argument(
|
|
376
|
-
"--num-workers",
|
|
377
|
-
type=int,
|
|
378
|
-
default=4,
|
|
379
|
-
help="Number of parallel workers (default: 4)",
|
|
380
|
-
)
|
|
381
|
-
parser.add_argument(
|
|
382
|
-
"--browser-mode",
|
|
383
|
-
type=str,
|
|
384
|
-
choices=["stealth", "interactive"],
|
|
385
|
-
default="stealth",
|
|
386
|
-
help="Browser mode (default: stealth)",
|
|
387
|
-
)
|
|
388
|
-
parser.add_argument(
|
|
389
|
-
"--chrome-profile",
|
|
390
|
-
type=str,
|
|
391
|
-
default="system",
|
|
392
|
-
help="Base Chrome profile name to sync from (default: system)",
|
|
393
|
-
)
|
|
394
|
-
args = parser.parse_args()
|
|
395
|
-
return args
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
def run_main() -> None:
|
|
399
|
-
"""Initialize scitex framework, run main function, and cleanup."""
|
|
400
|
-
global CONFIG, CC, sys, plt, rng
|
|
401
|
-
|
|
402
|
-
import sys
|
|
403
|
-
|
|
404
|
-
import matplotlib.pyplot as plt
|
|
405
|
-
|
|
406
|
-
import scitex as stx
|
|
407
|
-
|
|
408
|
-
args = parse_args()
|
|
409
|
-
|
|
410
|
-
CONFIG, sys.stdout, sys.stderr, plt, CC, rng = stx.session.start(
|
|
411
|
-
sys,
|
|
412
|
-
plt,
|
|
413
|
-
args=args,
|
|
414
|
-
file=__FILE__,
|
|
415
|
-
sdir_suffix=None,
|
|
416
|
-
verbose=False,
|
|
417
|
-
agg=True,
|
|
418
|
-
)
|
|
419
|
-
|
|
420
|
-
exit_status = main(args)
|
|
421
|
-
|
|
422
|
-
stx.session.close(
|
|
423
|
-
CONFIG,
|
|
424
|
-
verbose=False,
|
|
425
|
-
notify=False,
|
|
426
|
-
message="",
|
|
427
|
-
exit_status=exit_status,
|
|
428
|
-
)
|
|
429
|
-
|
|
430
|
-
|
|
431
379
|
if __name__ == "__main__":
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
--titles "Neural State Monitoring in the Treatment of Epilepsy" \
|
|
456
|
-
--project epilepsy \
|
|
457
|
-
--num-workers 8 \
|
|
458
|
-
--browser-mode stealth \
|
|
459
|
-
--chrome-profile system
|
|
460
|
-
"""
|
|
380
|
+
main()
|
|
381
|
+
|
|
382
|
+
# Usage:
|
|
383
|
+
# # With DOIs (4 workers)
|
|
384
|
+
# python -m scitex.scholar.pipelines.ScholarPipelineParallel \
|
|
385
|
+
# --dois "10.1212/wnl.0000000000200348,10.1038/s41598-017-02626-y" \
|
|
386
|
+
# --project neurovista \
|
|
387
|
+
# --num-workers 4 \
|
|
388
|
+
# --browser-mode stealth \
|
|
389
|
+
# --chrome-profile system
|
|
390
|
+
#
|
|
391
|
+
# # With titles (2 workers)
|
|
392
|
+
# python -m scitex.scholar.pipelines.ScholarPipelineParallel \
|
|
393
|
+
# --titles "Attention Is All You Need,BERT: Pre-training" \
|
|
394
|
+
# --project transformers \
|
|
395
|
+
# --num-workers 2
|
|
396
|
+
#
|
|
397
|
+
# # Mixed DOIs and titles (8 workers)
|
|
398
|
+
# python -m scitex.scholar.pipelines.ScholarPipelineParallel \
|
|
399
|
+
# --dois "10.1038/s41593-025-01990-7" \
|
|
400
|
+
# --titles "Neural State Monitoring in the Treatment of Epilepsy" \
|
|
401
|
+
# --project epilepsy \
|
|
402
|
+
# --num-workers 8
|
|
461
403
|
|
|
462
404
|
# EOF
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
# Timestamp: "2026-01-
|
|
2
|
+
# Timestamp: "2026-01-22 (ywatanabe)"
|
|
3
3
|
# File: src/scitex/scholar/pipelines/ScholarPipelineSingle.py
|
|
4
4
|
"""
|
|
5
5
|
Single paper acquisition pipeline orchestrator.
|
|
@@ -21,10 +21,10 @@ IO:
|
|
|
21
21
|
|
|
22
22
|
from __future__ import annotations
|
|
23
23
|
|
|
24
|
-
import argparse
|
|
25
24
|
import asyncio
|
|
26
25
|
from typing import Optional
|
|
27
26
|
|
|
27
|
+
import scitex as stx
|
|
28
28
|
from scitex import logging
|
|
29
29
|
from scitex.scholar.storage import PaperIO
|
|
30
30
|
|
|
@@ -90,10 +90,10 @@ class ScholarPipelineSingle(PipelineStepsMixin, PipelineHelpersMixin):
|
|
|
90
90
|
)
|
|
91
91
|
if context:
|
|
92
92
|
await self._step_06_find_pdf_urls(
|
|
93
|
-
paper, io, context, auth_gateway, force
|
|
93
|
+
paper, io, context, auth_gateway, force, browser_manager
|
|
94
94
|
)
|
|
95
95
|
await self._step_07_download_pdf(
|
|
96
|
-
paper, io, context, auth_gateway, force
|
|
96
|
+
paper, io, context, auth_gateway, force, browser_manager
|
|
97
97
|
)
|
|
98
98
|
if browser_manager:
|
|
99
99
|
await browser_manager.close()
|
|
@@ -108,75 +108,55 @@ class ScholarPipelineSingle(PipelineStepsMixin, PipelineHelpersMixin):
|
|
|
108
108
|
return paper, symlink_path
|
|
109
109
|
|
|
110
110
|
|
|
111
|
-
|
|
112
|
-
|
|
111
|
+
@stx.session
|
|
112
|
+
def main(
|
|
113
|
+
doi_or_title: str = None,
|
|
114
|
+
project: str = None,
|
|
115
|
+
browser_mode: str = "stealth",
|
|
116
|
+
chrome_profile: str = "system",
|
|
117
|
+
force: bool = False,
|
|
118
|
+
CONFIG=stx.INJECTED,
|
|
119
|
+
logger=stx.INJECTED,
|
|
120
|
+
) -> int:
|
|
121
|
+
"""Orchestrate full paper acquisition pipeline.
|
|
122
|
+
|
|
123
|
+
Parameters
|
|
124
|
+
----------
|
|
125
|
+
doi_or_title : str
|
|
126
|
+
DOI or paper title (required)
|
|
127
|
+
project : str
|
|
128
|
+
Project name for symlinking (optional)
|
|
129
|
+
browser_mode : str
|
|
130
|
+
Browser mode: 'stealth' or 'interactive' (default: stealth)
|
|
131
|
+
chrome_profile : str
|
|
132
|
+
Chrome profile name (default: system)
|
|
133
|
+
force : bool
|
|
134
|
+
Force fresh processing (default: False)
|
|
135
|
+
|
|
136
|
+
Returns
|
|
137
|
+
-------
|
|
138
|
+
int
|
|
139
|
+
Exit status code (0 for success)
|
|
140
|
+
"""
|
|
141
|
+
if not doi_or_title:
|
|
142
|
+
logger.error("--doi-or-title is required")
|
|
143
|
+
return 1
|
|
144
|
+
|
|
113
145
|
pipeline = ScholarPipelineSingle(
|
|
114
|
-
browser_mode=
|
|
146
|
+
browser_mode=browser_mode, chrome_profile=chrome_profile
|
|
115
147
|
)
|
|
116
148
|
paper, symlink_path = asyncio.run(
|
|
117
149
|
pipeline.process_single_paper(
|
|
118
|
-
doi_or_title=
|
|
119
|
-
project=
|
|
120
|
-
force=
|
|
150
|
+
doi_or_title=doi_or_title,
|
|
151
|
+
project=project,
|
|
152
|
+
force=force,
|
|
121
153
|
)
|
|
122
154
|
)
|
|
123
155
|
return 0
|
|
124
156
|
|
|
125
157
|
|
|
126
|
-
def parse_args() -> argparse.Namespace:
|
|
127
|
-
"""Parse command line arguments."""
|
|
128
|
-
parser = argparse.ArgumentParser(
|
|
129
|
-
description="Orchestrate full paper acquisition pipeline"
|
|
130
|
-
)
|
|
131
|
-
parser.add_argument(
|
|
132
|
-
"--doi-or-title", type=str, required=True, help="DOI or paper title"
|
|
133
|
-
)
|
|
134
|
-
parser.add_argument(
|
|
135
|
-
"--project", type=str, default=None, help="Project name for symlinking"
|
|
136
|
-
)
|
|
137
|
-
parser.add_argument(
|
|
138
|
-
"--browser-mode",
|
|
139
|
-
type=str,
|
|
140
|
-
choices=["stealth", "interactive"],
|
|
141
|
-
default="stealth",
|
|
142
|
-
help="Browser mode (default: stealth)",
|
|
143
|
-
)
|
|
144
|
-
parser.add_argument(
|
|
145
|
-
"--chrome-profile",
|
|
146
|
-
type=str,
|
|
147
|
-
required=True,
|
|
148
|
-
help="Chrome profile name (default: system)",
|
|
149
|
-
)
|
|
150
|
-
parser.add_argument(
|
|
151
|
-
"--force",
|
|
152
|
-
"-f",
|
|
153
|
-
action="store_true",
|
|
154
|
-
default=False,
|
|
155
|
-
help="Force fresh processing",
|
|
156
|
-
)
|
|
157
|
-
return parser.parse_args()
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
def run_main() -> None:
|
|
161
|
-
"""Initialize scitex framework, run main function, and cleanup."""
|
|
162
|
-
import sys
|
|
163
|
-
|
|
164
|
-
import matplotlib.pyplot as plt
|
|
165
|
-
|
|
166
|
-
import scitex as stx
|
|
167
|
-
|
|
168
|
-
args = parse_args()
|
|
169
|
-
CONFIG, sys.stdout, sys.stderr, plt, CC, rng = stx.session.start(
|
|
170
|
-
sys, plt, args=args, file=__file__, sdir_suffix=None, verbose=False, agg=True
|
|
171
|
-
)
|
|
172
|
-
exit_status = main(args)
|
|
173
|
-
stx.session.close(
|
|
174
|
-
CONFIG, verbose=False, notify=False, message="", exit_status=exit_status
|
|
175
|
-
)
|
|
176
|
-
|
|
177
|
-
|
|
178
158
|
if __name__ == "__main__":
|
|
179
|
-
|
|
159
|
+
main()
|
|
180
160
|
|
|
181
161
|
# Usage:
|
|
182
162
|
# python -m scitex.scholar.pipelines.ScholarPipelineSingle \
|
|
@@ -115,7 +115,9 @@ class PipelineStepsMixin:
|
|
|
115
115
|
)
|
|
116
116
|
return browser_manager, context, auth_gateway
|
|
117
117
|
|
|
118
|
-
async def _step_06_find_pdf_urls(
|
|
118
|
+
async def _step_06_find_pdf_urls(
|
|
119
|
+
self, paper, io, context, auth_gateway, force, browser_manager=None
|
|
120
|
+
):
|
|
119
121
|
if not paper.metadata.url.pdfs or force:
|
|
120
122
|
logger.info(f"{self.name}: Finding PDF URLs...")
|
|
121
123
|
try:
|
|
@@ -127,6 +129,9 @@ class PipelineStepsMixin:
|
|
|
127
129
|
)
|
|
128
130
|
except Exception as e:
|
|
129
131
|
logger.warning(f"{self.name}: Auth gateway failed: {e}")
|
|
132
|
+
await self._capture_screenshot(
|
|
133
|
+
browser_manager, context, io, "auth_gateway_failed"
|
|
134
|
+
)
|
|
130
135
|
publisher_url = paper.metadata.id.doi
|
|
131
136
|
from scitex.scholar import ScholarURLFinder
|
|
132
137
|
|
|
@@ -135,23 +140,34 @@ class PipelineStepsMixin:
|
|
|
135
140
|
paper.metadata.url.pdfs = urls
|
|
136
141
|
paper.metadata.url.pdfs_engines = ["ScholarURLFinder"]
|
|
137
142
|
io.save_metadata()
|
|
143
|
+
if not urls:
|
|
144
|
+
await self._capture_screenshot(
|
|
145
|
+
browser_manager, context, io, "no_pdf_urls_found"
|
|
146
|
+
)
|
|
138
147
|
logger.info(f"{self.name}: Found {len(urls)} PDF URL(s)")
|
|
139
148
|
else:
|
|
140
149
|
logger.info(f"{self.name}: PDF URLs exist ({len(paper.metadata.url.pdfs)})")
|
|
141
150
|
|
|
142
|
-
async def _step_07_download_pdf(
|
|
151
|
+
async def _step_07_download_pdf(
|
|
152
|
+
self, paper, io, context, auth_gateway, force, browser_manager=None
|
|
153
|
+
):
|
|
143
154
|
if (not io.has_pdf() or force) and paper.metadata.url.pdfs:
|
|
144
155
|
logger.info(f"{self.name}: Downloading PDF...")
|
|
145
156
|
from scitex.scholar.pdf_download import ScholarPDFDownloader
|
|
146
157
|
|
|
147
158
|
downloader = ScholarPDFDownloader(context)
|
|
148
|
-
downloaded, temp_path = await self._download_pdf_from_url(
|
|
159
|
+
downloaded, temp_path, download_method = await self._download_pdf_from_url(
|
|
149
160
|
paper, io, context, auth_gateway, downloader
|
|
150
161
|
)
|
|
151
162
|
if downloaded:
|
|
152
|
-
self._handle_downloaded_pdf(
|
|
163
|
+
self._handle_downloaded_pdf(
|
|
164
|
+
paper, io, downloaded, temp_path, download_method
|
|
165
|
+
)
|
|
153
166
|
else:
|
|
154
|
-
self.
|
|
167
|
+
await self._capture_screenshot(
|
|
168
|
+
browser_manager, context, io, "pdf_download_failed"
|
|
169
|
+
)
|
|
170
|
+
self._check_manual_download(io, paper)
|
|
155
171
|
elif io.has_pdf():
|
|
156
172
|
logger.info(f"{self.name}: PDF already exists, skipping download")
|
|
157
173
|
|
|
@@ -211,24 +227,35 @@ class PipelineStepsMixin:
|
|
|
211
227
|
downloaded_file = await downloader.download_from_url(
|
|
212
228
|
pdf_url, output_path=temp_pdf_path, doi=paper.metadata.id.doi
|
|
213
229
|
)
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
230
|
+
# Track download method based on context flags
|
|
231
|
+
download_method = "unknown"
|
|
232
|
+
if downloaded_file:
|
|
233
|
+
is_manual = getattr(context, "_scitex_is_manual_mode", False)
|
|
234
|
+
download_method = "manual_download" if is_manual else "automated"
|
|
235
|
+
return downloaded_file, temp_pdf_path, download_method
|
|
236
|
+
|
|
237
|
+
def _handle_downloaded_pdf(
|
|
238
|
+
self, paper, io, downloaded_file, temp_pdf_path, download_method="unknown"
|
|
239
|
+
):
|
|
217
240
|
import shutil
|
|
218
241
|
|
|
219
242
|
if downloaded_file == temp_pdf_path and temp_pdf_path.exists():
|
|
220
243
|
main_pdf = io.get_pdf_path()
|
|
221
244
|
shutil.move(str(temp_pdf_path), str(main_pdf))
|
|
222
245
|
paper.metadata.path.pdfs = [str(main_pdf)]
|
|
246
|
+
paper.metadata.path.pdfs_engines = [download_method]
|
|
223
247
|
paper.container.pdf_size_bytes = main_pdf.stat().st_size
|
|
224
248
|
io.save_metadata()
|
|
225
|
-
logger.success(
|
|
249
|
+
logger.success(
|
|
250
|
+
f"{self.name}: PDF downloaded to MASTER via {download_method}"
|
|
251
|
+
)
|
|
226
252
|
else:
|
|
227
253
|
io.save_pdf(downloaded_file)
|
|
254
|
+
paper.metadata.path.pdfs_engines = [download_method]
|
|
228
255
|
io.save_metadata()
|
|
229
256
|
logger.info(f"{self.name}: PDF saved ({str(downloaded_file)})")
|
|
230
257
|
|
|
231
|
-
def _check_manual_download(self, io):
|
|
258
|
+
def _check_manual_download(self, io, paper=None):
|
|
232
259
|
import time
|
|
233
260
|
|
|
234
261
|
from scitex.scholar import ScholarConfig
|
|
@@ -249,6 +276,9 @@ class PipelineStepsMixin:
|
|
|
249
276
|
latest_pdf = recent_pdfs[0][0]
|
|
250
277
|
logger.info(f"{self.name}: Found recent PDF: {latest_pdf.name}")
|
|
251
278
|
io.save_pdf(latest_pdf)
|
|
279
|
+
# Track as manual download
|
|
280
|
+
if paper:
|
|
281
|
+
paper.metadata.path.pdfs_engines = ["manual_download"]
|
|
252
282
|
io.save_metadata()
|
|
253
283
|
logger.success(f"{self.name}: Manual PDF saved to MASTER")
|
|
254
284
|
else:
|
|
@@ -258,39 +288,44 @@ class PipelineStepsMixin:
|
|
|
258
288
|
class PipelineHelpersMixin:
|
|
259
289
|
"""Mixin containing helper methods for single paper pipeline."""
|
|
260
290
|
|
|
291
|
+
async def _capture_screenshot(self, browser_manager, context, io, description):
|
|
292
|
+
"""Capture screenshot for debugging when issues occur."""
|
|
293
|
+
if not browser_manager or not context:
|
|
294
|
+
return
|
|
295
|
+
try:
|
|
296
|
+
from datetime import datetime
|
|
297
|
+
|
|
298
|
+
screenshots_dir = io.paper_dir / "screenshots"
|
|
299
|
+
screenshots_dir.mkdir(parents=True, exist_ok=True)
|
|
300
|
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
301
|
+
screenshot_path = screenshots_dir / f"{timestamp}_{description}.png"
|
|
302
|
+
pages = context.pages
|
|
303
|
+
if pages:
|
|
304
|
+
page = pages[0]
|
|
305
|
+
await browser_manager.take_screenshot_async(
|
|
306
|
+
page, str(screenshot_path), full_page=True
|
|
307
|
+
)
|
|
308
|
+
logger.info(f"{self.name}: Screenshot saved: {screenshot_path.name}")
|
|
309
|
+
except Exception as e:
|
|
310
|
+
logger.debug(f"{self.name}: Screenshot capture failed: {e}")
|
|
311
|
+
|
|
261
312
|
def _generate_paper_id(self, doi: str) -> str:
|
|
262
313
|
"""Generate 8-digit library ID from DOI."""
|
|
263
314
|
return hashlib.md5(f"DOI:{doi}".encode()).hexdigest()[:8].upper()
|
|
264
315
|
|
|
265
316
|
def _link_to_project(self, paper: Paper, project: str, io: PaperIO) -> Path:
|
|
266
|
-
"""Create human-readable symlink in project directory."""
|
|
267
|
-
from scitex.scholar import
|
|
317
|
+
"""Create human-readable symlink in project directory using LibraryManager."""
|
|
318
|
+
from scitex.scholar.storage import LibraryManager
|
|
268
319
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
n_pdfs=len(pdf_files),
|
|
274
|
-
citation_count=paper.metadata.citation_count.total or 0,
|
|
275
|
-
impact_factor=int(paper.metadata.publication.impact_factor or 0),
|
|
276
|
-
year=paper.metadata.basic.year or 0,
|
|
277
|
-
first_author=(
|
|
278
|
-
paper.metadata.basic.authors[0].split()[-1]
|
|
279
|
-
if paper.metadata.basic.authors
|
|
280
|
-
else "Unknown"
|
|
281
|
-
),
|
|
282
|
-
journal_name=(
|
|
283
|
-
paper.metadata.publication.short_journal
|
|
284
|
-
or paper.metadata.publication.journal
|
|
285
|
-
or "Unknown"
|
|
286
|
-
),
|
|
320
|
+
library_manager = LibraryManager()
|
|
321
|
+
symlink_path = library_manager.update_symlink(
|
|
322
|
+
master_storage_path=io.paper_dir,
|
|
323
|
+
project=project,
|
|
287
324
|
)
|
|
288
|
-
symlink_path
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
symlink_path.symlink_to(target_path)
|
|
293
|
-
logger.success(f"{self.name}: Created symlink: {project}/{entry_name}")
|
|
325
|
+
if symlink_path:
|
|
326
|
+
logger.success(
|
|
327
|
+
f"{self.name}: Created symlink: {project}/{symlink_path.name}"
|
|
328
|
+
)
|
|
294
329
|
return symlink_path
|
|
295
330
|
|
|
296
331
|
def _enrich_impact_factor(self, paper: Paper) -> None:
|