scitex-2.14.0-py3-none-any.whl → scitex-2.15.1-py3-none-any.whl
This diff compares the contents of two package versions as released to their public registry and is provided for informational purposes only.
- scitex/__init__.py +47 -0
- scitex/_env_loader.py +156 -0
- scitex/_mcp_resources/__init__.py +37 -0
- scitex/_mcp_resources/_cheatsheet.py +135 -0
- scitex/_mcp_resources/_figrecipe.py +138 -0
- scitex/_mcp_resources/_formats.py +102 -0
- scitex/_mcp_resources/_modules.py +337 -0
- scitex/_mcp_resources/_session.py +149 -0
- scitex/_mcp_tools/__init__.py +4 -0
- scitex/_mcp_tools/audio.py +66 -0
- scitex/_mcp_tools/diagram.py +11 -95
- scitex/_mcp_tools/introspect.py +191 -0
- scitex/_mcp_tools/plt.py +260 -305
- scitex/_mcp_tools/scholar.py +74 -0
- scitex/_mcp_tools/social.py +244 -0
- scitex/_mcp_tools/writer.py +21 -204
- scitex/ai/_gen_ai/_PARAMS.py +10 -7
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
- scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
- scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
- scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
- scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
- scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
- scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
- scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
- scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
- scitex/audio/README.md +40 -36
- scitex/audio/__init__.py +127 -59
- scitex/audio/_branding.py +185 -0
- scitex/audio/_mcp/__init__.py +32 -0
- scitex/audio/_mcp/handlers.py +59 -6
- scitex/audio/_mcp/speak_handlers.py +238 -0
- scitex/audio/_relay.py +225 -0
- scitex/audio/engines/elevenlabs_engine.py +6 -1
- scitex/audio/mcp_server.py +228 -75
- scitex/canvas/README.md +1 -1
- scitex/canvas/editor/_dearpygui/__init__.py +25 -0
- scitex/canvas/editor/_dearpygui/_editor.py +147 -0
- scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
- scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
- scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
- scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
- scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
- scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
- scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
- scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
- scitex/canvas/editor/_dearpygui/_selection.py +295 -0
- scitex/canvas/editor/_dearpygui/_state.py +93 -0
- scitex/canvas/editor/_dearpygui/_utils.py +61 -0
- scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
- scitex/cli/__init__.py +38 -43
- scitex/cli/audio.py +76 -27
- scitex/cli/capture.py +13 -20
- scitex/cli/introspect.py +443 -0
- scitex/cli/main.py +198 -109
- scitex/cli/mcp.py +60 -34
- scitex/cli/scholar/__init__.py +8 -0
- scitex/cli/scholar/_crossref_scitex.py +296 -0
- scitex/cli/scholar/_fetch.py +25 -3
- scitex/cli/social.py +314 -0
- scitex/cli/writer.py +117 -0
- scitex/config/README.md +1 -1
- scitex/config/__init__.py +16 -2
- scitex/config/_env_registry.py +191 -0
- scitex/diagram/__init__.py +42 -19
- scitex/diagram/mcp_server.py +13 -125
- scitex/introspect/__init__.py +75 -0
- scitex/introspect/_call_graph.py +303 -0
- scitex/introspect/_class_hierarchy.py +163 -0
- scitex/introspect/_core.py +42 -0
- scitex/introspect/_docstring.py +131 -0
- scitex/introspect/_examples.py +113 -0
- scitex/introspect/_imports.py +271 -0
- scitex/introspect/_mcp/__init__.py +37 -0
- scitex/introspect/_mcp/handlers.py +208 -0
- scitex/introspect/_members.py +151 -0
- scitex/introspect/_resolve.py +89 -0
- scitex/introspect/_signature.py +131 -0
- scitex/introspect/_source.py +80 -0
- scitex/introspect/_type_hints.py +172 -0
- scitex/io/bundle/README.md +1 -1
- scitex/mcp_server.py +98 -5
- scitex/plt/__init__.py +248 -550
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
- scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/plt/gallery/README.md +1 -1
- scitex/plt/utils/_hitmap/__init__.py +82 -0
- scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
- scitex/plt/utils/_hitmap/_color_application.py +346 -0
- scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
- scitex/plt/utils/_hitmap/_constants.py +40 -0
- scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
- scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
- scitex/plt/utils/_hitmap/_query.py +113 -0
- scitex/plt/utils/_hitmap.py +46 -1616
- scitex/plt/utils/_metadata/__init__.py +80 -0
- scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
- scitex/plt/utils/_metadata/_artists/_base.py +195 -0
- scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
- scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
- scitex/plt/utils/_metadata/_artists/_images.py +80 -0
- scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
- scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
- scitex/plt/utils/_metadata/_artists/_text.py +106 -0
- scitex/plt/utils/_metadata/_csv.py +416 -0
- scitex/plt/utils/_metadata/_detect.py +225 -0
- scitex/plt/utils/_metadata/_legend.py +127 -0
- scitex/plt/utils/_metadata/_rounding.py +117 -0
- scitex/plt/utils/_metadata/_verification.py +202 -0
- scitex/schema/README.md +1 -1
- scitex/scholar/__init__.py +8 -0
- scitex/scholar/_mcp/crossref_handlers.py +265 -0
- scitex/scholar/core/Scholar.py +63 -1700
- scitex/scholar/core/_mixins/__init__.py +36 -0
- scitex/scholar/core/_mixins/_enrichers.py +270 -0
- scitex/scholar/core/_mixins/_library_handlers.py +100 -0
- scitex/scholar/core/_mixins/_loaders.py +103 -0
- scitex/scholar/core/_mixins/_pdf_download.py +375 -0
- scitex/scholar/core/_mixins/_pipeline.py +312 -0
- scitex/scholar/core/_mixins/_project_handlers.py +125 -0
- scitex/scholar/core/_mixins/_savers.py +69 -0
- scitex/scholar/core/_mixins/_search.py +103 -0
- scitex/scholar/core/_mixins/_services.py +88 -0
- scitex/scholar/core/_mixins/_url_finding.py +105 -0
- scitex/scholar/crossref_scitex.py +367 -0
- scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/scholar/examples/00_run_all.sh +120 -0
- scitex/scholar/jobs/_executors.py +27 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
- scitex/scholar/pdf_download/_cli.py +154 -0
- scitex/scholar/pdf_download/strategies/__init__.py +11 -8
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
- scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
- scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
- scitex/scholar/pipelines/_single_steps.py +71 -36
- scitex/scholar/storage/_LibraryManager.py +97 -1695
- scitex/scholar/storage/_mixins/__init__.py +30 -0
- scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
- scitex/scholar/storage/_mixins/_library_operations.py +218 -0
- scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
- scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
- scitex/scholar/storage/_mixins/_resolution.py +376 -0
- scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
- scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
- scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +462 -0
- scitex/scholar/url_finder/.tmp/open_url/README.md +223 -0
- scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +694 -0
- scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +1160 -0
- scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +344 -0
- scitex/scholar/url_finder/.tmp/open_url/__init__.py +24 -0
- scitex/security/README.md +3 -3
- scitex/session/README.md +1 -1
- scitex/sh/README.md +1 -1
- scitex/social/__init__.py +153 -0
- scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/template/README.md +1 -1
- scitex/template/clone_writer_directory.py +5 -5
- scitex/writer/README.md +1 -1
- scitex/writer/_mcp/handlers.py +11 -744
- scitex/writer/_mcp/tool_schemas.py +5 -335
- scitex-2.15.1.dist-info/METADATA +648 -0
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/RECORD +166 -111
- scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
- scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
- scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +0 -90
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +0 -1571
- scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +0 -6262
- scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +0 -1274
- scitex/dev/plt/data/mpl/dir_ax.txt +0 -459
- scitex/diagram/_compile.py +0 -312
- scitex/diagram/_diagram.py +0 -355
- scitex/diagram/_mcp/__init__.py +0 -4
- scitex/diagram/_mcp/handlers.py +0 -400
- scitex/diagram/_mcp/tool_schemas.py +0 -157
- scitex/diagram/_presets.py +0 -173
- scitex/diagram/_schema.py +0 -182
- scitex/diagram/_split.py +0 -278
- scitex/plt/_mcp/__init__.py +0 -4
- scitex/plt/_mcp/_handlers_annotation.py +0 -102
- scitex/plt/_mcp/_handlers_figure.py +0 -195
- scitex/plt/_mcp/_handlers_plot.py +0 -252
- scitex/plt/_mcp/_handlers_style.py +0 -219
- scitex/plt/_mcp/handlers.py +0 -74
- scitex/plt/_mcp/tool_schemas.py +0 -497
- scitex/plt/mcp_server.py +0 -231
- scitex/scholar/data/.gitkeep +0 -0
- scitex/scholar/data/README.md +0 -44
- scitex/scholar/data/bib_files/bibliography.bib +0 -1952
- scitex/scholar/data/bib_files/neurovista.bib +0 -277
- scitex/scholar/data/bib_files/neurovista_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +0 -441
- scitex/scholar/data/bib_files/neurovista_processed.bib +0 -338
- scitex/scholar/data/bib_files/openaccess.bib +0 -89
- scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +0 -2178
- scitex/scholar/data/bib_files/pac.bib +0 -698
- scitex/scholar/data/bib_files/pac_enriched.bib +0 -1061
- scitex/scholar/data/bib_files/pac_processed.bib +0 -0
- scitex/scholar/data/bib_files/pac_titles.txt +0 -75
- scitex/scholar/data/bib_files/paywalled.bib +0 -98
- scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +0 -58
- scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +0 -87
- scitex/scholar/data/bib_files/seizure_prediction.bib +0 -694
- scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
- scitex/scholar/data/bib_files/test_complete_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_final_enriched.bib +0 -437
- scitex/scholar/data/bib_files/test_seizure.bib +0 -46
- scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
- scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
- scitex/scholar/data/impact_factor.db +0 -0
- scitex/scholar/examples/SUGGESTIONS.md +0 -865
- scitex/scholar/examples/dev.py +0 -38
- scitex-2.14.0.dist-info/METADATA +0 -1238
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/WHEEL +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/entry_points.txt +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/licenses/LICENSE +0 -0
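The recurring refactor in this release is the decomposition of monolithic classes into mixin packages: `_SingleClassificationReporter.py` (+45 -1603), `Scholar.py` (+63 -1700), and `_LibraryManager.py` (+97 -1695) each shrink to a thin shell whose behavior moves into a sibling `_mixins/` package. A minimal sketch of the pattern, assuming plain cooperative mixins that share state through the host's attributes (the `EnricherMixin`, `SearchMixin`, and `Scholar` names follow the Scholar.py diff below; the method bodies are illustrative, not the package's real code):

    # Sketch only: mirrors the shape of scitex/scholar/core/_mixins/, not its contents.
    from typing import Optional


    class EnricherMixin:
        """Enrichment verbs; expects the host to define self.project and self.config."""

        def enrich_papers(self, papers: Optional[list] = None) -> list:
            print(f"enriching papers in project {self.project!r}")
            return papers or []


    class SearchMixin:
        """Search verbs; same contract as above."""

        def search_library(self, query: str) -> list:
            print(f"searching {self.project!r} for {query!r}")
            return []


    class Scholar(EnricherMixin, SearchMixin):
        """Thin shell: owns the shared state, inherits all behavior."""

        def __init__(self, project: str = "default", config=None):
            self.project = project
            self.config = config


    scholar = Scholar(project="demo")
    scholar.enrich_papers()           # provided by EnricherMixin
    scholar.search_library("pac")     # provided by SearchMixin

The shell keeps `__init__` and the shared attributes; each mixin stays importable and testable on its own, which is what lets the old 1700-line file collapse to roughly 100 lines, as the diff below shows.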
scitex/scholar/core/Scholar.py
CHANGED
|
@@ -1,16 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
# Timestamp: "
|
|
3
|
-
# File: /home/ywatanabe/proj/
|
|
4
|
-
# ----------------------------------------
|
|
5
|
-
from __future__ import annotations
|
|
6
|
-
|
|
7
|
-
import os
|
|
8
|
-
|
|
9
|
-
__FILE__ = "./src/scitex/scholar/core/Scholar.py"
|
|
10
|
-
__DIR__ = os.path.dirname(__FILE__)
|
|
11
|
-
# ----------------------------------------
|
|
12
|
-
|
|
13
|
-
__FILE__ = __file__
|
|
2
|
+
# Timestamp: "2026-01-24 (ywatanabe)"
|
|
3
|
+
# File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/core/Scholar.py
|
|
14
4
|
|
|
15
5
|
"""
|
|
16
6
|
Unified Scholar class for scientific literature management.
|
|
@@ -22,43 +12,42 @@ This is the main entry point for all scholar functionality, providing:
|
|
|
22
12
|
- Progressive disclosure of advanced features
|
|
23
13
|
"""
|
|
24
14
|
|
|
25
|
-
import
|
|
26
|
-
import json
|
|
27
|
-
import shutil
|
|
28
|
-
from copy import deepcopy
|
|
29
|
-
from datetime import datetime
|
|
30
|
-
from pathlib import Path
|
|
31
|
-
from typing import Any, Dict, List, Optional, Union
|
|
15
|
+
from __future__ import annotations
|
|
32
16
|
|
|
33
|
-
import
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Optional, Union
|
|
34
19
|
|
|
35
20
|
from scitex import logging
|
|
36
|
-
|
|
37
|
-
# PDF extraction is now handled by scitex.io
|
|
38
|
-
from scitex.logging import ScholarError
|
|
39
|
-
|
|
40
|
-
# Updated imports for current architecture
|
|
41
|
-
from scitex.scholar.auth import ScholarAuthManager
|
|
42
|
-
from scitex.scholar.auth.core.AuthenticationGateway import (
|
|
43
|
-
AuthenticationGateway,
|
|
44
|
-
)
|
|
45
|
-
from scitex.scholar.browser import ScholarBrowserManager
|
|
46
21
|
from scitex.scholar.config import ScholarConfig
|
|
47
|
-
from scitex.scholar.impact_factor.ImpactFactorEngine import ImpactFactorEngine
|
|
48
|
-
from scitex.scholar.metadata_engines.ScholarEngine import ScholarEngine
|
|
49
|
-
from scitex.scholar.pdf_download.ScholarPDFDownloader import (
|
|
50
|
-
ScholarPDFDownloader,
|
|
51
|
-
)
|
|
52
|
-
from scitex.scholar.storage import LibraryManager, ScholarLibrary
|
|
53
|
-
from scitex.scholar.url_finder.ScholarURLFinder import ScholarURLFinder
|
|
54
22
|
|
|
55
|
-
from .
|
|
56
|
-
|
|
23
|
+
from ._mixins import (
|
|
24
|
+
EnricherMixin,
|
|
25
|
+
LibraryHandlerMixin,
|
|
26
|
+
LoaderMixin,
|
|
27
|
+
PDFDownloadMixin,
|
|
28
|
+
PipelineMixin,
|
|
29
|
+
ProjectHandlerMixin,
|
|
30
|
+
SaverMixin,
|
|
31
|
+
SearchMixin,
|
|
32
|
+
ServiceMixin,
|
|
33
|
+
URLFindingMixin,
|
|
34
|
+
)
|
|
57
35
|
|
|
58
36
|
logger = logging.getLogger(__name__)
|
|
59
37
|
|
|
60
38
|
|
|
61
|
-
class Scholar
|
|
39
|
+
class Scholar(
|
|
40
|
+
EnricherMixin,
|
|
41
|
+
URLFindingMixin,
|
|
42
|
+
PDFDownloadMixin,
|
|
43
|
+
LoaderMixin,
|
|
44
|
+
SearchMixin,
|
|
45
|
+
SaverMixin,
|
|
46
|
+
ProjectHandlerMixin,
|
|
47
|
+
LibraryHandlerMixin,
|
|
48
|
+
PipelineMixin,
|
|
49
|
+
ServiceMixin,
|
|
50
|
+
):
|
|
62
51
|
"""
|
|
63
52
|
Main interface for SciTeX Scholar - scientific literature management made simple.
|
|
64
53
|
|
|
@@ -103,8 +92,7 @@ class Scholar:
|
|
|
103
92
|
project_description: Optional[str] = None,
|
|
104
93
|
browser_mode: Optional[str] = None,
|
|
105
94
|
):
|
|
106
|
-
"""
|
|
107
|
-
Initialize Scholar with configuration.
|
|
95
|
+
"""Initialize Scholar with configuration.
|
|
108
96
|
|
|
109
97
|
Args:
|
|
110
98
|
config: Can be:
|
|
@@ -115,29 +103,15 @@ class Scholar:
|
|
|
115
103
|
project_description: Optional description for the project
|
|
116
104
|
browser_mode: Browser mode ('stealth', 'interactive', 'manual')
|
|
117
105
|
"""
|
|
118
|
-
|
|
119
106
|
self.config = self._init_config(config)
|
|
120
|
-
|
|
121
|
-
# Store browser mode for later use
|
|
122
107
|
self.browser_mode = browser_mode or "stealth"
|
|
123
108
|
|
|
124
|
-
# Set project and workspace
|
|
125
109
|
self.project = self.config.resolve("project", project, "default")
|
|
126
110
|
self.workspace_dir = self.config.path_manager.get_workspace_dir()
|
|
127
111
|
|
|
128
|
-
# Auto-create project directory if it doesn't exist
|
|
129
112
|
if project:
|
|
130
113
|
self._ensure_project_exists(project, project_description)
|
|
131
114
|
|
|
132
|
-
# Initialize service components (lazy loading for better performance)
|
|
133
|
-
# Use mangled names for private properties
|
|
134
|
-
self._Scholar__scholar_engine = None # Replaces DOIResolver and LibraryEnricher
|
|
135
|
-
self._Scholar__auth_manager = None
|
|
136
|
-
self._Scholar__browser_manager = None
|
|
137
|
-
self._Scholar__library_manager = None
|
|
138
|
-
self._Scholar__library = None # ScholarLibrary for high-level operations
|
|
139
|
-
|
|
140
|
-
# Show user-friendly initialization message with library location
|
|
141
115
|
library_path = self.config.get_library_project_dir(self.project)
|
|
142
116
|
if project:
|
|
143
117
|
project_path = library_path / project
|
|
@@ -147,1711 +121,100 @@ class Scholar:
|
|
|
147
121
|
else:
|
|
148
122
|
logger.info(f"{self.name}: Scholar initialized (library: {library_path})")
|
|
149
123
|
|
|
150
|
-
# ----------------------------------------
|
|
151
|
-
# Enrichers
|
|
152
|
-
# ----------------------------------------
|
|
153
|
-
async def enrich_papers_async(self, papers: Papers) -> Papers:
|
|
154
|
-
"""Async version of enrich_papers for use in async contexts.
|
|
155
|
-
|
|
156
|
-
Args:
|
|
157
|
-
papers: Papers collection to enrich.
|
|
158
|
-
|
|
159
|
-
Returns:
|
|
160
|
-
Enriched Papers collection
|
|
161
|
-
"""
|
|
162
|
-
enriched_list = []
|
|
163
|
-
|
|
164
|
-
for paper in papers:
|
|
165
|
-
try:
|
|
166
|
-
# Use ScholarEngine to search and enrich
|
|
167
|
-
results = await self._scholar_engine.search_async(
|
|
168
|
-
title=paper.metadata.basic.title,
|
|
169
|
-
year=paper.metadata.basic.year,
|
|
170
|
-
authors=(
|
|
171
|
-
paper.metadata.basic.authors[0]
|
|
172
|
-
if paper.metadata.basic.authors
|
|
173
|
-
else None
|
|
174
|
-
),
|
|
175
|
-
)
|
|
176
|
-
|
|
177
|
-
# Create a copy to avoid modifying original
|
|
178
|
-
enriched_paper = self._merge_enrichment_data(paper, results)
|
|
179
|
-
enriched_list.append(enriched_paper)
|
|
180
|
-
title = paper.metadata.basic.title or "No title"
|
|
181
|
-
logger.info(f"{self.name}: Enriched: {title[:50]}...")
|
|
182
|
-
|
|
183
|
-
except Exception as e:
|
|
184
|
-
title = paper.metadata.basic.title or "No title"
|
|
185
|
-
logger.warning(
|
|
186
|
-
f"{self.name}: Failed to enrich paper '{title[:50]}...': {e}"
|
|
187
|
-
)
|
|
188
|
-
enriched_list.append(paper) # Keep original if enrichment fails
|
|
189
|
-
|
|
190
|
-
enriched_papers = Papers(enriched_list, project=self.project)
|
|
191
|
-
|
|
192
|
-
# Add impact factors as post-processing step
|
|
193
|
-
if self.config.resolve("enrich_impact_factors", None, True):
|
|
194
|
-
enriched_papers = self._enrich_impact_factors(enriched_papers)
|
|
195
|
-
|
|
196
|
-
return enriched_papers
|
|
197
|
-
|
|
198
|
-
def enrich_papers(
|
|
199
|
-
self, papers: Optional[Papers] = None
|
|
200
|
-
) -> Union[Papers, Dict[str, int]]:
|
|
201
|
-
"""Enrich papers with metadata from multiple sources.
|
|
202
|
-
|
|
203
|
-
Args:
|
|
204
|
-
papers: Papers collection to enrich. If None, enriches all papers in current project.
|
|
205
|
-
|
|
206
|
-
Returns:
|
|
207
|
-
- If papers provided: Returns enriched Papers collection
|
|
208
|
-
- If no papers: Returns dict with enrichment statistics for project
|
|
209
|
-
"""
|
|
210
|
-
|
|
211
|
-
# If no papers provided, enrich entire project
|
|
212
|
-
if papers is None:
|
|
213
|
-
return self._enrich_current_project()
|
|
214
|
-
|
|
215
|
-
# Enrich the provided papers collection
|
|
216
|
-
enriched_list = []
|
|
217
|
-
|
|
218
|
-
nest_asyncio.apply() # Allow nested event loops
|
|
219
|
-
|
|
220
|
-
for paper in papers:
|
|
221
|
-
try:
|
|
222
|
-
# Use ScholarEngine to search and enrich
|
|
223
|
-
results = asyncio.run(
|
|
224
|
-
self._scholar_engine.search_async(
|
|
225
|
-
title=paper.metadata.basic.title,
|
|
226
|
-
year=paper.metadata.basic.year,
|
|
227
|
-
authors=(
|
|
228
|
-
paper.metadata.basic.authors[0]
|
|
229
|
-
if paper.metadata.basic.authors
|
|
230
|
-
else None
|
|
231
|
-
),
|
|
232
|
-
)
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
# Create a copy to avoid modifying original
|
|
236
|
-
enriched_paper = self._merge_enrichment_data(paper, results)
|
|
237
|
-
enriched_list.append(enriched_paper)
|
|
238
|
-
title = paper.metadata.basic.title or "No title"
|
|
239
|
-
logger.info(f"{self.name}: Enriched: {title[:50]}...")
|
|
240
|
-
|
|
241
|
-
except Exception as e:
|
|
242
|
-
title = paper.metadata.basic.title or "No title"
|
|
243
|
-
logger.warning(
|
|
244
|
-
f"{self.name}: Failed to enrich paper '{title[:50]}...': {e}"
|
|
245
|
-
)
|
|
246
|
-
enriched_list.append(paper) # Keep original if enrichment fails
|
|
247
|
-
|
|
248
|
-
enriched_papers = Papers(enriched_list, project=self.project)
|
|
249
|
-
|
|
250
|
-
# Add impact factors as post-processing step
|
|
251
|
-
if self.config.resolve("enrich_impact_factors", None, True):
|
|
252
|
-
enriched_papers = self._enrich_impact_factors(enriched_papers)
|
|
253
|
-
|
|
254
|
-
return enriched_papers
|
|
255
|
-
|
|
256
|
-
def _enrich_impact_factors(self, papers: Papers) -> Papers:
|
|
257
|
-
"""Add journal impact factors to papers.
|
|
258
|
-
|
|
259
|
-
Args:
|
|
260
|
-
papers: Papers collection to enrich with impact factors
|
|
261
|
-
|
|
262
|
-
Returns:
|
|
263
|
-
Papers collection with impact factors added where available
|
|
264
|
-
"""
|
|
265
|
-
try:
|
|
266
|
-
# Try JCR database first (fast)
|
|
267
|
-
jcr_engine = ImpactFactorEngine()
|
|
268
|
-
papers = jcr_engine.enrich_papers(papers)
|
|
269
|
-
return papers
|
|
270
|
-
except Exception as e:
|
|
271
|
-
logger.debug(
|
|
272
|
-
f"{self.name}: JCR engine unavailable: {e}, falling back to calculation method"
|
|
273
|
-
)
|
|
274
|
-
|
|
275
|
-
return papers
|
|
276
|
-
|
|
277
|
-
def _merge_enrichment_data(self, paper: Paper, results: Dict) -> Paper:
|
|
278
|
-
"""Merge enrichment results into paper object.
|
|
279
|
-
|
|
280
|
-
Creates a new Paper object with merged data to avoid modifying the original.
|
|
281
|
-
"""
|
|
282
|
-
# Import here to avoid circular dependency
|
|
283
|
-
|
|
284
|
-
enriched = deepcopy(paper)
|
|
285
|
-
|
|
286
|
-
# Results from ScholarEngine is already combined metadata, not individual engine results
|
|
287
|
-
if not results:
|
|
288
|
-
return enriched
|
|
289
|
-
|
|
290
|
-
# Extract from the combined metadata structure
|
|
291
|
-
# ID section
|
|
292
|
-
if "id" in results:
|
|
293
|
-
if results["id"].get("doi") and not enriched.metadata.id.doi:
|
|
294
|
-
enriched.metadata.set_doi(results["id"]["doi"])
|
|
295
|
-
if results["id"].get("pmid") and not enriched.metadata.id.pmid:
|
|
296
|
-
enriched.metadata.id.pmid = results["id"]["pmid"]
|
|
297
|
-
if results["id"].get("arxiv_id") and not enriched.metadata.id.arxiv_id:
|
|
298
|
-
enriched.metadata.id.arxiv_id = results["id"]["arxiv_id"]
|
|
299
|
-
# Note: corpus_id, semantic_id, ieee_id are in results but not in Paper dataclass
|
|
300
|
-
|
|
301
|
-
# Basic metadata section
|
|
302
|
-
if "basic" in results:
|
|
303
|
-
# Always update abstract if found (key enrichment goal)
|
|
304
|
-
if results["basic"].get("abstract"):
|
|
305
|
-
enriched.metadata.basic.abstract = results["basic"]["abstract"]
|
|
306
|
-
|
|
307
|
-
# Update title if more complete
|
|
308
|
-
if results["basic"].get("title"):
|
|
309
|
-
new_title = results["basic"]["title"]
|
|
310
|
-
current_title = enriched.metadata.basic.title or ""
|
|
311
|
-
if not current_title or len(new_title) > len(current_title):
|
|
312
|
-
enriched.metadata.basic.title = new_title
|
|
313
|
-
|
|
314
|
-
# Update authors if found
|
|
315
|
-
if results["basic"].get("authors") and not enriched.metadata.basic.authors:
|
|
316
|
-
enriched.metadata.basic.authors = results["basic"]["authors"]
|
|
317
|
-
|
|
318
|
-
# Update year if found
|
|
319
|
-
if results["basic"].get("year") and not enriched.metadata.basic.year:
|
|
320
|
-
enriched.metadata.basic.year = results["basic"]["year"]
|
|
321
|
-
|
|
322
|
-
# Update keywords if found
|
|
323
|
-
if (
|
|
324
|
-
results["basic"].get("keywords")
|
|
325
|
-
and not enriched.metadata.basic.keywords
|
|
326
|
-
):
|
|
327
|
-
enriched.metadata.basic.keywords = results["basic"]["keywords"]
|
|
328
|
-
|
|
329
|
-
# Publication metadata
|
|
330
|
-
if "publication" in results:
|
|
331
|
-
if (
|
|
332
|
-
results["publication"].get("journal")
|
|
333
|
-
and not enriched.metadata.publication.journal
|
|
334
|
-
):
|
|
335
|
-
enriched.metadata.publication.journal = results["publication"][
|
|
336
|
-
"journal"
|
|
337
|
-
]
|
|
338
|
-
if (
|
|
339
|
-
results["publication"].get("publisher")
|
|
340
|
-
and not enriched.metadata.publication.publisher
|
|
341
|
-
):
|
|
342
|
-
enriched.metadata.publication.publisher = results["publication"][
|
|
343
|
-
"publisher"
|
|
344
|
-
]
|
|
345
|
-
if (
|
|
346
|
-
results["publication"].get("volume")
|
|
347
|
-
and not enriched.metadata.publication.volume
|
|
348
|
-
):
|
|
349
|
-
enriched.metadata.publication.volume = results["publication"]["volume"]
|
|
350
|
-
if (
|
|
351
|
-
results["publication"].get("issue")
|
|
352
|
-
and not enriched.metadata.publication.issue
|
|
353
|
-
):
|
|
354
|
-
enriched.metadata.publication.issue = results["publication"]["issue"]
|
|
355
|
-
if (
|
|
356
|
-
results["publication"].get("pages")
|
|
357
|
-
and not enriched.metadata.publication.pages
|
|
358
|
-
):
|
|
359
|
-
enriched.metadata.publication.pages = results["publication"]["pages"]
|
|
360
|
-
|
|
361
|
-
# Citation metadata
|
|
362
|
-
if "citation_count" in results:
|
|
363
|
-
# Try both "count" and "total" fields
|
|
364
|
-
count = results["citation_count"].get("count") or results[
|
|
365
|
-
"citation_count"
|
|
366
|
-
].get("total")
|
|
367
|
-
if count:
|
|
368
|
-
# Always take the maximum citation count
|
|
369
|
-
current_count = enriched.metadata.citation_count.total or 0
|
|
370
|
-
if not current_count or count > current_count:
|
|
371
|
-
enriched.metadata.citation_count.total = count
|
|
372
|
-
# Note: influential_citation_count is in results but not in Paper dataclass
|
|
373
|
-
|
|
374
|
-
# URL metadata
|
|
375
|
-
if "url" in results:
|
|
376
|
-
if results["url"].get("pdf"):
|
|
377
|
-
# Check if this PDF is not already in the list
|
|
378
|
-
pdf_url = results["url"]["pdf"]
|
|
379
|
-
if not any(p.get("url") == pdf_url for p in enriched.metadata.url.pdfs):
|
|
380
|
-
enriched.metadata.url.pdfs.append(
|
|
381
|
-
{"url": pdf_url, "source": "enrichment"}
|
|
382
|
-
)
|
|
383
|
-
if results["url"].get("url") and not enriched.metadata.url.publisher:
|
|
384
|
-
enriched.metadata.url.publisher = results["url"]["url"]
|
|
385
|
-
|
|
386
|
-
# Note: Metrics section (journal_impact_factor, h_index) not stored in Paper dataclass
|
|
387
|
-
|
|
388
|
-
return enriched
|
|
389
|
-
|
|
390
|
-
def _enrich_current_project(self) -> Dict[str, int]:
|
|
391
|
-
"""Enrich all papers in the current project.
|
|
392
|
-
|
|
393
|
-
Returns:
|
|
394
|
-
Dictionary with enrichment statistics
|
|
395
|
-
"""
|
|
396
|
-
if not self.project:
|
|
397
|
-
raise ValueError(
|
|
398
|
-
"No project specified. Use Scholar(project='name') or provide papers to enrich()."
|
|
399
|
-
)
|
|
400
|
-
|
|
401
|
-
# Load papers from project library
|
|
402
|
-
papers = self.load_project(self.project)
|
|
403
|
-
logger.info(
|
|
404
|
-
f"{self.name}: Enriching {len(papers)} papers in project '{self.project}'"
|
|
405
|
-
)
|
|
406
|
-
|
|
407
|
-
# Enrich the papers
|
|
408
|
-
enriched_papers = self.enrich_papers(papers)
|
|
409
|
-
|
|
410
|
-
# Count successes
|
|
411
|
-
enriched_count = sum(
|
|
412
|
-
1
|
|
413
|
-
for i, p in enumerate(enriched_papers)
|
|
414
|
-
if p.abstract and not papers[i].abstract # Check if abstract was added
|
|
415
|
-
)
|
|
416
|
-
|
|
417
|
-
# Save enriched papers back to library
|
|
418
|
-
saved_ids = self.save_papers_to_library(enriched_papers)
|
|
419
|
-
|
|
420
|
-
return {
|
|
421
|
-
"enriched": enriched_count,
|
|
422
|
-
"failed": len(papers) - enriched_count,
|
|
423
|
-
"total": len(papers),
|
|
424
|
-
"saved": len(saved_ids),
|
|
425
|
-
}
|
|
426
|
-
|
|
427
|
-
# ----------------------------------------
|
|
428
|
-
# URL Finding (Orchestration)
|
|
429
|
-
# ----------------------------------------
|
|
430
|
-
async def _find_urls_for_doi_async(self, doi: str, context) -> Dict[str, Any]:
|
|
431
|
-
"""Find all URLs for a DOI (orchestration layer).
|
|
432
|
-
|
|
433
|
-
Workflow:
|
|
434
|
-
DOI → Publisher URL → PDF URLs → OpenURL (fallback)
|
|
435
|
-
|
|
436
|
-
Args:
|
|
437
|
-
doi: DOI string
|
|
438
|
-
context: Authenticated browser context
|
|
439
|
-
|
|
440
|
-
Returns:
|
|
441
|
-
Dictionary with URL information: {
|
|
442
|
-
"url_doi": "https://doi.org/...",
|
|
443
|
-
"url_publisher": "https://publisher.com/...",
|
|
444
|
-
"urls_pdf": [{"url": "...", "source": "zotero_translator"}],
|
|
445
|
-
"url_openurl_resolved": "..." (if fallback used)
|
|
446
|
-
}
|
|
447
|
-
"""
|
|
448
|
-
from scitex.scholar.auth.gateway import (
|
|
449
|
-
OpenURLResolver,
|
|
450
|
-
normalize_doi_as_http,
|
|
451
|
-
resolve_publisher_url_by_navigating_to_doi_page,
|
|
452
|
-
)
|
|
453
|
-
|
|
454
|
-
# Initialize result
|
|
455
|
-
urls = {"url_doi": normalize_doi_as_http(doi)}
|
|
456
|
-
|
|
457
|
-
# Step 1: Resolve publisher URL
|
|
458
|
-
page = await context.new_page()
|
|
459
|
-
try:
|
|
460
|
-
url_publisher = await resolve_publisher_url_by_navigating_to_doi_page(
|
|
461
|
-
doi, page
|
|
462
|
-
)
|
|
463
|
-
urls["url_publisher"] = url_publisher
|
|
464
|
-
finally:
|
|
465
|
-
await page.close()
|
|
466
|
-
|
|
467
|
-
# Step 2: Find PDF URLs from publisher URL
|
|
468
|
-
url_finder = ScholarURLFinder(context, config=self.config)
|
|
469
|
-
urls_pdf = []
|
|
470
|
-
|
|
471
|
-
if url_publisher:
|
|
472
|
-
urls_pdf = await url_finder.find_pdf_urls(url_publisher)
|
|
473
|
-
|
|
474
|
-
# Step 3: Try OpenURL fallback if no PDFs found
|
|
475
|
-
if not urls_pdf:
|
|
476
|
-
openurl_resolver = OpenURLResolver(config=self.config)
|
|
477
|
-
page = await context.new_page()
|
|
478
|
-
try:
|
|
479
|
-
url_openurl_resolved = await openurl_resolver.resolve_doi(doi, page)
|
|
480
|
-
urls["url_openurl_resolved"] = url_openurl_resolved
|
|
481
|
-
|
|
482
|
-
if url_openurl_resolved and url_openurl_resolved != "skipped":
|
|
483
|
-
urls_pdf = await url_finder.find_pdf_urls(url_openurl_resolved)
|
|
484
|
-
finally:
|
|
485
|
-
await page.close()
|
|
486
|
-
|
|
487
|
-
# Deduplicate and store
|
|
488
|
-
urls["urls_pdf"] = self._deduplicate_pdf_urls(urls_pdf) if urls_pdf else []
|
|
489
|
-
|
|
490
|
-
return urls
|
|
491
|
-
|
|
492
|
-
def _deduplicate_pdf_urls(self, urls_pdf: List[Dict]) -> List[Dict]:
|
|
493
|
-
"""Remove duplicate PDF URLs.
|
|
494
|
-
|
|
495
|
-
Args:
|
|
496
|
-
urls_pdf: List of PDF URL dicts
|
|
497
|
-
|
|
498
|
-
Returns:
|
|
499
|
-
Deduplicated list of PDF URL dicts
|
|
500
|
-
"""
|
|
501
|
-
seen = set()
|
|
502
|
-
unique = []
|
|
503
|
-
for pdf in urls_pdf:
|
|
504
|
-
url = pdf.get("url") if isinstance(pdf, dict) else pdf
|
|
505
|
-
if url not in seen:
|
|
506
|
-
seen.add(url)
|
|
507
|
-
unique.append(pdf)
|
|
508
|
-
return unique
|
|
509
|
-
|
|
510
|
-
# ----------------------------------------
|
|
511
|
-
# PDF Downloaders
|
|
512
|
-
# ----------------------------------------
|
|
513
|
-
async def download_pdfs_from_dois_async(
|
|
514
|
-
self,
|
|
515
|
-
dois: List[str],
|
|
516
|
-
output_dir: Optional[Path] = None,
|
|
517
|
-
max_concurrent: int = 1,
|
|
518
|
-
) -> Dict[str, int]:
|
|
519
|
-
"""Download PDFs for given DOIs using ScholarPDFDownloader.
|
|
520
|
-
|
|
521
|
-
Args:
|
|
522
|
-
dois: List of DOI strings
|
|
523
|
-
output_dir: Output directory (not used - downloads to library MASTER)
|
|
524
|
-
max_concurrent: Maximum concurrent downloads (default: 1 for sequential)
|
|
525
|
-
|
|
526
|
-
Returns:
|
|
527
|
-
Dictionary with download statistics
|
|
528
|
-
"""
|
|
529
|
-
if not dois:
|
|
530
|
-
return {"downloaded": 0, "failed": 0, "errors": 0}
|
|
531
|
-
|
|
532
|
-
# Get authenticated browser context
|
|
533
|
-
(
|
|
534
|
-
browser,
|
|
535
|
-
context,
|
|
536
|
-
) = await self._browser_manager.get_authenticated_browser_and_context_async()
|
|
537
|
-
|
|
538
|
-
try:
|
|
539
|
-
# Initialize PDF downloader with browser context
|
|
540
|
-
pdf_downloader = ScholarPDFDownloader(
|
|
541
|
-
context=context,
|
|
542
|
-
config=self.config,
|
|
543
|
-
)
|
|
544
|
-
|
|
545
|
-
# Use download_from_dois from ScholarPDFDownloader
|
|
546
|
-
# This handles parallel downloads with semaphore control
|
|
547
|
-
logger.info(
|
|
548
|
-
f"{self.name}: Starting PDF download for {len(dois)} DOIs (max_concurrent={max_concurrent})"
|
|
549
|
-
)
|
|
550
|
-
|
|
551
|
-
results = await pdf_downloader.download_from_dois(
|
|
552
|
-
dois=dois,
|
|
553
|
-
output_dir=str(output_dir) if output_dir else "/tmp/",
|
|
554
|
-
max_concurrent=max_concurrent,
|
|
555
|
-
)
|
|
556
|
-
|
|
557
|
-
# Process results and organize in library
|
|
558
|
-
stats = {"downloaded": 0, "failed": 0, "errors": 0}
|
|
559
|
-
library_dir = self.config.path_manager.library_dir
|
|
560
|
-
master_dir = library_dir / "MASTER"
|
|
561
|
-
master_dir.mkdir(parents=True, exist_ok=True)
|
|
562
|
-
|
|
563
|
-
for doi, downloaded_paths in zip(dois, results):
|
|
564
|
-
try:
|
|
565
|
-
if downloaded_paths and len(downloaded_paths) > 0:
|
|
566
|
-
# PDF was downloaded successfully
|
|
567
|
-
# Take the first downloaded PDF (if multiple)
|
|
568
|
-
temp_pdf_path = downloaded_paths[0]
|
|
569
|
-
|
|
570
|
-
# Generate paper ID and create storage
|
|
571
|
-
paper_id = self.config.path_manager._generate_paper_id(doi=doi)
|
|
572
|
-
storage_path = master_dir / paper_id
|
|
573
|
-
storage_path.mkdir(parents=True, exist_ok=True)
|
|
574
|
-
|
|
575
|
-
# Move PDF to MASTER library
|
|
576
|
-
pdf_filename = (
|
|
577
|
-
f"DOI_{doi.replace('/', '_').replace(':', '_')}.pdf"
|
|
578
|
-
)
|
|
579
|
-
master_pdf_path = storage_path / pdf_filename
|
|
580
|
-
shutil.move(str(temp_pdf_path), str(master_pdf_path))
|
|
581
|
-
|
|
582
|
-
# Create/update metadata
|
|
583
|
-
metadata_file = storage_path / "metadata.json"
|
|
584
|
-
if metadata_file.exists():
|
|
585
|
-
with open(metadata_file) as f:
|
|
586
|
-
metadata = json.load(f)
|
|
587
|
-
else:
|
|
588
|
-
metadata = {
|
|
589
|
-
"doi": doi,
|
|
590
|
-
"scitex_id": paper_id,
|
|
591
|
-
"created_at": datetime.now().isoformat(),
|
|
592
|
-
"created_by": "SciTeX Scholar",
|
|
593
|
-
}
|
|
594
|
-
|
|
595
|
-
# Update metadata with PDF info
|
|
596
|
-
metadata["pdf_path"] = str(
|
|
597
|
-
master_pdf_path.relative_to(library_dir)
|
|
598
|
-
)
|
|
599
|
-
metadata["pdf_downloaded_at"] = datetime.now().isoformat()
|
|
600
|
-
metadata["pdf_size_bytes"] = master_pdf_path.stat().st_size
|
|
601
|
-
metadata["updated_at"] = datetime.now().isoformat()
|
|
602
|
-
|
|
603
|
-
with open(metadata_file, "w") as f:
|
|
604
|
-
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
|
605
|
-
|
|
606
|
-
# Update symlink using LibraryManager
|
|
607
|
-
if self.project not in ["master", "MASTER"]:
|
|
608
|
-
self._library_manager.update_symlink(
|
|
609
|
-
master_storage_path=storage_path,
|
|
610
|
-
project=self.project,
|
|
611
|
-
)
|
|
612
|
-
|
|
613
|
-
logger.success(
|
|
614
|
-
f"{self.name}: Downloaded and organized PDF for {doi}: {master_pdf_path}"
|
|
615
|
-
)
|
|
616
|
-
stats["downloaded"] += 1
|
|
617
|
-
else:
|
|
618
|
-
logger.warning(f"{self.name}: No PDF downloaded for DOI: {doi}")
|
|
619
|
-
stats["failed"] += 1
|
|
620
|
-
|
|
621
|
-
except Exception as e:
|
|
622
|
-
logger.error(f"{self.name}: Failed to organize PDF for {doi}: {e}")
|
|
623
|
-
stats["errors"] += 1
|
|
624
|
-
stats["failed"] += 1
|
|
625
|
-
|
|
626
|
-
return stats
|
|
627
|
-
|
|
628
|
-
finally:
|
|
629
|
-
# Always close browser
|
|
630
|
-
await self._browser_manager.close()
|
|
631
|
-
|
|
632
|
-
async def _download_pdfs_sequential(
|
|
633
|
-
self, dois: List[str], output_dir: Optional[Path] = None
|
|
634
|
-
) -> Dict[str, int]:
|
|
635
|
-
"""Sequential PDF download with authentication gateway."""
|
|
636
|
-
results = {"downloaded": 0, "failed": 0, "errors": 0}
|
|
637
|
-
|
|
638
|
-
# Get authenticated browser context
|
|
639
|
-
(
|
|
640
|
-
browser,
|
|
641
|
-
context,
|
|
642
|
-
) = await self._browser_manager.get_authenticated_browser_and_context_async()
|
|
643
|
-
|
|
644
|
-
# Initialize authentication gateway (NEW)
|
|
645
|
-
auth_gateway = AuthenticationGateway(
|
|
646
|
-
auth_manager=self._auth_manager,
|
|
647
|
-
browser_manager=self._browser_manager,
|
|
648
|
-
config=self.config,
|
|
649
|
-
)
|
|
650
|
-
|
|
651
|
-
# Use simple downloader for sequential downloads
|
|
652
|
-
pdf_downloader = ScholarPDFDownloader(
|
|
653
|
-
context=context,
|
|
654
|
-
config=self.config,
|
|
655
|
-
)
|
|
656
|
-
|
|
657
|
-
library_dir = self.config.path_manager.library_dir
|
|
658
|
-
master_dir = library_dir / "MASTER"
|
|
659
|
-
project_dir = library_dir / self.project
|
|
660
|
-
master_dir.mkdir(parents=True, exist_ok=True)
|
|
661
|
-
project_dir.mkdir(parents=True, exist_ok=True)
|
|
662
|
-
|
|
663
|
-
for doi in dois:
|
|
664
|
-
try:
|
|
665
|
-
logger.info(f"{self.name}: Processing DOI: {doi}")
|
|
666
|
-
|
|
667
|
-
# NEW: Prepare authentication context BEFORE URL finding
|
|
668
|
-
# This establishes publisher-specific cookies if needed
|
|
669
|
-
_url_context = await auth_gateway.prepare_context_async(
|
|
670
|
-
doi=doi, context=context
|
|
671
|
-
)
|
|
672
|
-
|
|
673
|
-
# Step 1: Find URLs for the DOI (orchestration)
|
|
674
|
-
urls = await self._find_urls_for_doi_async(doi, context)
|
|
675
|
-
|
|
676
|
-
# Step 2: Get PDF URLs
|
|
677
|
-
pdf_urls = urls.get("urls_pdf", [])
|
|
678
|
-
|
|
679
|
-
if not pdf_urls:
|
|
680
|
-
logger.warning(f"{self.name}: No PDF URLs found for DOI: {doi}")
|
|
681
|
-
results["failed"] += 1
|
|
682
|
-
continue
|
|
683
|
-
|
|
684
|
-
# Step 3: Try to download from each PDF URL
|
|
685
|
-
downloaded_path = None
|
|
686
|
-
for pdf_entry in pdf_urls:
|
|
687
|
-
# Handle both dict and string formats
|
|
688
|
-
pdf_url = (
|
|
689
|
-
pdf_entry.get("url")
|
|
690
|
-
if isinstance(pdf_entry, dict)
|
|
691
|
-
else pdf_entry
|
|
692
|
-
)
|
|
693
|
-
|
|
694
|
-
if not pdf_url:
|
|
695
|
-
continue
|
|
696
|
-
|
|
697
|
-
# Download to temp location first
|
|
698
|
-
temp_output = (
|
|
699
|
-
Path("/tmp") / f"{doi.replace('/', '_').replace(':', '_')}.pdf"
|
|
700
|
-
)
|
|
701
|
-
|
|
702
|
-
# Download PDF using simple downloader
|
|
703
|
-
result = await pdf_downloader.download_from_url(
|
|
704
|
-
pdf_url=pdf_url, output_path=temp_output
|
|
705
|
-
)
|
|
706
|
-
|
|
707
|
-
if result and result.exists():
|
|
708
|
-
downloaded_path = result
|
|
709
|
-
break
|
|
710
|
-
|
|
711
|
-
if downloaded_path:
|
|
712
|
-
# Step 4: Store PDF in MASTER library with proper organization
|
|
713
|
-
|
|
714
|
-
# Generate unique ID from DOI using PathManager
|
|
715
|
-
paper_id = self.config.path_manager._generate_paper_id(doi=doi)
|
|
716
|
-
|
|
717
|
-
# Create MASTER storage directory
|
|
718
|
-
storage_path = master_dir / paper_id
|
|
719
|
-
storage_path.mkdir(parents=True, exist_ok=True)
|
|
720
|
-
|
|
721
|
-
# Try to get paper metadata to generate readable name
|
|
722
|
-
readable_name = None
|
|
723
|
-
temp_paper = None
|
|
724
|
-
try:
|
|
725
|
-
# Try to load paper from DOI to get metadata
|
|
726
|
-
from scitex.scholar.core.Paper import Paper
|
|
727
|
-
from scitex.scholar.core.Papers import Papers
|
|
728
|
-
|
|
729
|
-
temp_paper = Paper()
|
|
730
|
-
temp_paper.metadata.id.doi = doi
|
|
731
|
-
# Try to enrich to get author/year/journal using async method
|
|
732
|
-
temp_papers = Papers([temp_paper])
|
|
733
|
-
enriched = await self.enrich_papers_async(temp_papers)
|
|
734
|
-
if enriched and len(enriched) > 0:
|
|
735
|
-
temp_paper = enriched[0]
|
|
736
|
-
|
|
737
|
-
# Generate readable name from metadata
|
|
738
|
-
first_author = "Unknown"
|
|
739
|
-
authors = temp_paper.metadata.basic.authors
|
|
740
|
-
if authors and len(authors) > 0:
|
|
741
|
-
author_parts = authors[0].split()
|
|
742
|
-
if len(author_parts) > 1:
|
|
743
|
-
first_author = author_parts[-1] # Last name
|
|
744
|
-
else:
|
|
745
|
-
first_author = author_parts[0]
|
|
746
|
-
|
|
747
|
-
year = temp_paper.metadata.basic.year
|
|
748
|
-
year_str = str(year) if year else "Unknown"
|
|
749
|
-
|
|
750
|
-
journal_clean = "Unknown"
|
|
751
|
-
journal = temp_paper.metadata.publication.journal
|
|
752
|
-
if journal:
|
|
753
|
-
# Clean journal name - remove special chars, keep alphanumeric
|
|
754
|
-
journal_clean = "".join(
|
|
755
|
-
c for c in journal if c.isalnum() or c in " "
|
|
756
|
-
).replace(" ", "")
|
|
757
|
-
if not journal_clean:
|
|
758
|
-
journal_clean = "Unknown"
|
|
759
|
-
|
|
760
|
-
# Format: Author-Year-Journal
|
|
761
|
-
readable_name = f"{first_author}-{year_str}-{journal_clean}"
|
|
762
|
-
except:
|
|
763
|
-
pass
|
|
764
|
-
|
|
765
|
-
# Fallback to DOI if metadata extraction failed
|
|
766
|
-
if not readable_name:
|
|
767
|
-
readable_name = f"DOI_{doi.replace('/', '_').replace(':', '_')}"
|
|
768
|
-
|
|
769
|
-
# Copy PDF to MASTER storage with ORIGINAL filename to track how downloaded
|
|
770
|
-
# The PDF filename preserves the DOI format for tracking
|
|
771
|
-
pdf_filename = f"DOI_{doi.replace('/', '_').replace(':', '_')}.pdf"
|
|
772
|
-
master_pdf_path = storage_path / pdf_filename
|
|
773
|
-
shutil.copy2(downloaded_path, master_pdf_path)
|
|
774
|
-
|
|
775
|
-
# Load existing metadata or create minimal new metadata
|
|
776
|
-
metadata_file = storage_path / "metadata.json"
|
|
777
|
-
if metadata_file.exists():
|
|
778
|
-
# Load existing rich metadata - DO NOT OVERWRITE IT
|
|
779
|
-
with open(metadata_file) as f:
|
|
780
|
-
metadata = json.load(f)
|
|
781
|
-
logger.debug(
|
|
782
|
-
f"{self.name}: Loaded existing metadata for {paper_id}"
|
|
783
|
-
)
|
|
784
|
-
else:
|
|
785
|
-
# Create new minimal metadata only if none exists
|
|
786
|
-
metadata = {
|
|
787
|
-
"doi": doi,
|
|
788
|
-
"scitex_id": paper_id,
|
|
789
|
-
"created_at": datetime.now().isoformat(),
|
|
790
|
-
"created_by": "SciTeX Scholar",
|
|
791
|
-
}
|
|
792
|
-
|
|
793
|
-
# Add enriched paper metadata for new papers only
|
|
794
|
-
if temp_paper:
|
|
795
|
-
# Use Pydantic to_dict() for Paper
|
|
796
|
-
paper_dict = temp_paper.to_dict()
|
|
797
|
-
# Merge paper metadata
|
|
798
|
-
for key, value in paper_dict.items():
|
|
799
|
-
if value is not None and key not in [
|
|
800
|
-
"doi",
|
|
801
|
-
"scitex_id",
|
|
802
|
-
]:
|
|
803
|
-
metadata[key] = value
|
|
804
|
-
|
|
805
|
-
# Add PDF information
|
|
806
|
-
metadata["pdf_path"] = str(master_pdf_path.relative_to(library_dir))
|
|
807
|
-
metadata["pdf_downloaded_at"] = datetime.now().isoformat()
|
|
808
|
-
metadata["pdf_size_bytes"] = master_pdf_path.stat().st_size
|
|
809
|
-
metadata["updated_at"] = datetime.now().isoformat()
|
|
810
|
-
|
|
811
|
-
# Save updated metadata
|
|
812
|
-
with open(metadata_file, "w") as f:
|
|
813
|
-
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
|
814
|
-
|
|
815
|
-
# Update symlink using LibraryManager
|
|
816
|
-
if self.project not in ["master", "MASTER"]:
|
|
817
|
-
self._library_manager.update_symlink(
|
|
818
|
-
master_storage_path=storage_path,
|
|
819
|
-
project=self.project,
|
|
820
|
-
)
|
|
821
|
-
|
|
822
|
-
# Clean up temp file
|
|
823
|
-
downloaded_path.unlink()
|
|
824
|
-
|
|
825
|
-
logger.success(
|
|
826
|
-
f"{self.name}: Downloaded PDF for {doi}: MASTER/{paper_id}/{pdf_filename}"
|
|
827
|
-
)
|
|
828
|
-
results["downloaded"] += 1
|
|
829
|
-
else:
|
|
830
|
-
logger.warning(
|
|
831
|
-
f"{self.name}: Failed to download any PDF for DOI: {doi}"
|
|
832
|
-
)
|
|
833
|
-
results["failed"] += 1
|
|
834
|
-
|
|
835
|
-
except Exception as e:
|
|
836
|
-
logger.error(f"{self.name}: Failed to process {doi}: {e}")
|
|
837
|
-
results["errors"] += 1
|
|
838
|
-
results["failed"] += 1
|
|
839
|
-
|
|
840
|
-
await self._browser_manager.close()
|
|
841
|
-
logger.info(f"{self.name}: PDF download complete: {results}")
|
|
842
|
-
return results
|
|
843
|
-
|
|
844
|
-
def download_pdfs_from_dois(
|
|
845
|
-
self, dois: List[str], output_dir: Optional[Path] = None
|
|
846
|
-
) -> Dict[str, int]:
|
|
847
|
-
"""Download PDFs for given DOIs.
|
|
848
|
-
|
|
849
|
-
Args:
|
|
850
|
-
dois: List of DOI strings
|
|
851
|
-
output_dir: Output directory (uses config default if None)
|
|
852
|
-
|
|
853
|
-
Returns:
|
|
854
|
-
Dictionary with download statistics
|
|
855
|
-
"""
|
|
856
|
-
import asyncio
|
|
857
|
-
|
|
858
|
-
return asyncio.run(self.download_pdfs_from_dois_async(dois, output_dir))
|
|
859
|
-
|
|
860
|
-
def download_pdfs_from_bibtex(
|
|
861
|
-
self,
|
|
862
|
-
bibtex_input: Union[str, Path, Papers],
|
|
863
|
-
output_dir: Optional[Path] = None,
|
|
864
|
-
) -> Dict[str, int]:
|
|
865
|
-
"""Download PDFs from BibTeX file or Papers collection.
|
|
866
|
-
|
|
867
|
-
Args:
|
|
868
|
-
bibtex_input: BibTeX file path, content string, or Papers collection
|
|
869
|
-
output_dir: Output directory (uses config default if None)
|
|
870
|
-
|
|
871
|
-
Returns:
|
|
872
|
-
Dictionary with download statistics
|
|
873
|
-
"""
|
|
874
|
-
# Load papers if bibtex_input is not already Papers
|
|
875
|
-
if isinstance(bibtex_input, Papers):
|
|
876
|
-
papers = bibtex_input
|
|
877
|
-
else:
|
|
878
|
-
papers = self.load_bibtex(bibtex_input)
|
|
879
|
-
|
|
880
|
-
# Extract DOIs from papers
|
|
881
|
-
dois = [paper.metadata.id.doi for paper in papers if paper.metadata.id.doi]
|
|
882
|
-
|
|
883
|
-
if not dois:
|
|
884
|
-
logger.warning(f"{self.name}: No papers with DOIs found in BibTeX input")
|
|
885
|
-
return {"downloaded": 0, "failed": 0, "errors": 0}
|
|
886
|
-
|
|
887
|
-
logger.info(
|
|
888
|
-
f"{self.name}: Found {len(dois)} papers with DOIs out of {len(papers)} total papers"
|
|
889
|
-
)
|
|
890
|
-
|
|
891
|
-
# Download PDFs using DOI method
|
|
892
|
-
return self.download_pdfs_from_dois(dois, output_dir)
|
|
893
|
-
|
|
894
|
-
# ----------------------------------------
|
|
895
|
-
# Loaders
|
|
896
|
-
# ----------------------------------------
|
|
897
|
-
def load_project(self, project: Optional[str] = None) -> Papers:
|
|
898
|
-
"""Load papers from a project using library manager service.
|
|
899
|
-
|
|
900
|
-
Args:
|
|
901
|
-
project: Project name (uses self.project if None)
|
|
902
|
-
|
|
903
|
-
Returns:
|
|
904
|
-
Papers collection from the project
|
|
905
|
-
"""
|
|
906
|
-
project_name = project or self.project
|
|
907
|
-
if not project_name:
|
|
908
|
-
raise ValueError("No project specified")
|
|
909
|
-
|
|
910
|
-
# Load papers from library by reading symlinks in project directory
|
|
911
|
-
import json
|
|
912
|
-
|
|
913
|
-
from ..core.Paper import Paper
|
|
914
|
-
from ..core.Papers import Papers
|
|
915
|
-
|
|
916
|
-
logger.info(f"{self.name}: Loading papers from project: {project_name}")
|
|
917
|
-
|
|
918
|
-
library_dir = self.config.path_manager.library_dir
|
|
919
|
-
project_dir = library_dir / project_name
|
|
920
|
-
|
|
921
|
-
if not project_dir.exists():
|
|
922
|
-
logger.warning(
|
|
923
|
-
f"{self.name}: Project directory does not exist: {project_dir}"
|
|
924
|
-
)
|
|
925
|
-
return Papers([], project=project_name)
|
|
926
|
-
|
|
927
|
-
papers = []
|
|
928
|
-
for item in project_dir.iterdir():
|
|
929
|
-
# Skip info directory and metadata files
|
|
930
|
-
if item.name in ["info", "project_metadata.json", "README.md"]:
|
|
931
|
-
continue
|
|
932
|
-
|
|
933
|
-
# Follow symlink to MASTER directory
|
|
934
|
-
if item.is_symlink():
|
|
935
|
-
master_path = item.resolve()
|
|
936
|
-
if master_path.exists():
|
|
937
|
-
# Load metadata.json from MASTER directory
|
|
938
|
-
metadata_file = master_path / "metadata.json"
|
|
939
|
-
if metadata_file.exists():
|
|
940
|
-
try:
|
|
941
|
-
with open(metadata_file) as f:
|
|
942
|
-
metadata = json.load(f)
|
|
943
|
-
|
|
944
|
-
# Create Paper object using from_dict class method
|
|
945
|
-
paper = Paper.from_dict(metadata)
|
|
946
|
-
|
|
947
|
-
papers.append(paper)
|
|
948
|
-
except Exception as e:
|
|
949
|
-
logger.warning(
|
|
950
|
-
f"{self.name}: Failed to load metadata from {metadata_file}: {e}"
|
|
951
|
-
)
|
|
952
|
-
|
|
953
|
-
logger.info(
|
|
954
|
-
f"{self.name}: Loaded {len(papers)} papers from project: {project_name}"
|
|
955
|
-
)
|
|
956
|
-
return Papers(papers, project=project_name)
|
|
957
|
-
|
|
958
|
-
def load_bibtex(self, bibtex_input: Union[str, Path]) -> Papers:
|
|
959
|
-
"""Load Papers collection from BibTeX file or content.
|
|
960
|
-
|
|
961
|
-
Args:
|
|
962
|
-
bibtex_input: BibTeX file path or content string
|
|
963
|
-
|
|
964
|
-
Returns:
|
|
965
|
-
Papers collection
|
|
966
|
-
"""
|
|
967
|
-
# Use the internal library to load papers
|
|
968
|
-
papers = self._library.papers_from_bibtex(bibtex_input)
|
|
969
|
-
|
|
970
|
-
# Convert to Papers collection
|
|
971
|
-
from .Papers import Papers
|
|
972
|
-
|
|
973
|
-
papers_collection = Papers(papers, config=self.config, project=self.project)
|
|
974
|
-
papers_collection.library = self._library # Attach library for save operations
|
|
975
124
|
|
|
976
|
-
return papers_collection
|
|
977
|
-
|
|
978
|
-
# ----------------------------------------
|
|
979
|
-
# Searchers
|
|
980
|
-
# ----------------------------------------
|
|
981
|
-
def search_library(self, query: str, project: Optional[str] = None) -> Papers:
|
|
982
|
-
"""
|
|
983
|
-
Search papers in local library.
|
|
984
|
-
|
|
985
|
-
For new literature search (not in library), use AI2 Scholar QA:
|
|
986
|
-
https://scholarqa.allen.ai/chat/ then process with:
|
|
987
|
-
papers = scholar.load_bibtex('file.bib') followed by scholar.enrich(papers)
|
|
988
|
-
|
|
989
|
-
Args:
|
|
990
|
-
query: Search query
|
|
991
|
-
project: Project filter (uses self.project if None)
|
|
992
|
-
|
|
993
|
-
Returns:
|
|
994
|
-
Papers collection matching the query
|
|
995
|
-
"""
|
|
996
|
-
# For now, return empty Papers until search is implemented
|
|
997
|
-
from ..core.Papers import Papers
|
|
998
|
-
|
|
999
|
-
logger.info(f"{self.name}: Searching library for: {query}")
|
|
1000
|
-
return Papers([], project=project or self.project)
|
|
1001
|
-
|
|
1002
|
-
def search_across_projects(
|
|
1003
|
-
self, query: str, projects: Optional[List[str]] = None
|
|
1004
|
-
) -> Papers:
|
|
1005
|
-
"""Search for papers across multiple projects or the entire library.
|
|
1006
|
-
|
|
1007
|
-
Args:
|
|
1008
|
-
query: Search query
|
|
1009
|
-
projects: List of project names to search (None for all)
|
|
1010
|
-
|
|
1011
|
-
Returns:
|
|
1012
|
-
Papers collection with search results
|
|
1013
|
-
"""
|
|
1014
|
-
if projects is None:
|
|
1015
|
-
# Search all projects
|
|
1016
|
-
all_projects = [p["name"] for p in self.list_projects()]
|
|
1017
|
-
else:
|
|
1018
|
-
all_projects = projects
|
|
1019
|
-
|
|
1020
|
-
all_papers = []
|
|
1021
|
-
for project in all_projects:
|
|
1022
|
-
try:
|
|
1023
|
-
project_dir = self.config.get_library_project_dir(project)
|
|
1024
|
-
# Load papers from metadata.json files in project directory
|
|
1025
|
-
for item in project_dir.iterdir():
|
|
1026
|
-
if item.is_symlink() or item.is_dir():
|
|
1027
|
-
# Follow symlink to MASTER or use direct dir
|
|
1028
|
-
paper_dir = item.resolve() if item.is_symlink() else item
|
|
1029
|
-
metadata_file = paper_dir / "metadata.json"
|
|
1030
|
-
if metadata_file.exists():
|
|
1031
|
-
try:
|
|
1032
|
-
paper = Paper.model_validate_json(
|
|
1033
|
-
metadata_file.read_text()
|
|
1034
|
-
)
|
|
1035
|
-
# Simple text search
|
|
1036
|
-
query_lower = query.lower()
|
|
1037
|
-
title = (paper.metadata.basic.title or "").lower()
|
|
1038
|
-
abstract = (paper.metadata.basic.abstract or "").lower()
|
|
1039
|
-
authors = paper.metadata.basic.authors or []
|
|
1040
|
-
if (
|
|
1041
|
-
query_lower in title
|
|
1042
|
-
or query_lower in abstract
|
|
1043
|
-
or any(
|
|
1044
|
-
query_lower in (a or "").lower()
|
|
1045
|
-
for a in authors
|
|
1046
|
-
)
|
|
1047
|
-
):
|
|
1048
|
-
all_papers.append(paper)
|
|
1049
|
-
except Exception as e:
|
|
1050
|
-
logger.debug(
|
|
1051
|
-
f"{self.name}: Failed to load {metadata_file}: {e}"
|
|
1052
|
-
)
|
|
1053
|
-
except Exception as e:
|
|
1054
|
-
logger.debug(f"{self.name}: Failed to search project {project}: {e}")
|
|
1055
|
-
|
|
1056
|
-
return Papers(all_papers, config=self.config, project="search_results")
|
|
1057
|
-
|
|
1058
|
-
# ----------------------------------------
|
|
1059
|
-
# Savers
|
|
1060
|
-
# ----------------------------------------
|
|
1061
|
-
def save_papers_to_library(self, papers: Papers) -> List[str]:
|
|
1062
|
-
"""Save papers collection to library.
|
|
1063
|
-
|
|
1064
|
-
Args:
|
|
1065
|
-
papers: Papers collection to save
|
|
1066
|
-
|
|
1067
|
-
Returns:
|
|
1068
|
-
List of paper IDs saved
|
|
1069
|
-
"""
|
|
1070
|
-
saved_ids = []
|
|
1071
|
-
for paper in papers:
|
|
1072
|
-
try:
|
|
1073
|
-
paper_id = self._library.save_paper(paper)
|
|
1074
|
-
saved_ids.append(paper_id)
|
|
1075
|
-
except Exception as e:
|
|
1076
|
-
logger.warning(f"{self.name}: Failed to save paper: {e}")
|
|
1077
|
-
|
|
1078
|
-
logger.info(
|
|
1079
|
-
f"{self.name}: Saved {len(saved_ids)}/{len(papers)} papers to library"
|
|
1080
|
-
)
|
|
1081
|
-
return saved_ids
|
|
1082
|
-
|
|
1083
|
-
def save_papers_as_bibtex(
|
|
1084
|
-
self, papers: Papers, output_path: Optional[Union[str, Path]] = None
|
|
1085
|
-
) -> str:
|
|
1086
|
-
"""Save papers to BibTeX format with enrichment metadata.
|
|
1087
|
-
|
|
1088
|
-
Args:
|
|
1089
|
-
papers: Papers collection to save
|
|
1090
|
-
output_path: Optional path to save the BibTeX file
|
|
1091
|
-
|
|
1092
|
-
Returns:
|
|
1093
|
-
BibTeX content as string with enrichment metadata included
|
|
1094
|
-
"""
|
|
1095
|
-
from ..storage.BibTeXHandler import BibTeXHandler
|
|
1096
|
-
|
|
1097
|
-
bibtex_handler = BibTeXHandler(project=self.project, config=self.config)
|
|
1098
|
-
return bibtex_handler.papers_to_bibtex(papers, output_path)
|
|
1099
|
-
|
|
1100
|
-
# ----------------------------------------
|
|
1101
|
-
# Project Handlers
|
|
1102
|
-
# ----------------------------------------
|
|
1103
|
-
def _ensure_project_exists(
|
|
1104
|
-
self, project: str, description: Optional[str] = None
|
|
1105
|
-
) -> Path:
|
|
1106
|
-
"""Ensure project directory exists, create if needed (PRIVATE).
|
|
1107
|
-
|
|
1108
|
-
Args:
|
|
1109
|
-
project: Project name
|
|
1110
|
-
description: Optional project description
|
|
1111
|
-
|
|
1112
|
-
Returns:
|
|
1113
|
-
Path to the project directory
|
|
1114
|
-
"""
|
|
1115
|
-
project_dir = self.config.get_library_project_dir(project)
|
|
1116
|
-
info_dir = project_dir / "info"
|
|
1117
|
-
|
|
1118
|
-
# Create project and info directories
|
|
1119
|
-
if not project_dir.exists():
|
|
1120
|
-
project_dir.mkdir(parents=True, exist_ok=True)
|
|
1121
|
-
logger.info(f"{self.name}: Auto-created project directory: {project}")
|
|
1122
|
-
|
|
1123
|
-
# Ensure info directory exists
|
|
1124
|
-
info_dir.mkdir(parents=True, exist_ok=True)
|
|
1125
|
-
|
|
1126
|
-
# Create/move metadata file to info directory
|
|
1127
|
-
old_metadata_file = project_dir / "project_metadata.json" # Old location
|
|
1128
|
-
metadata_file = info_dir / "project_metadata.json" # New location
|
|
1129
|
-
|
|
1130
|
-
# Move existing metadata file if it exists in old location
|
|
1131
|
-
if old_metadata_file.exists() and not metadata_file.exists():
|
|
1132
|
-
shutil.move(str(old_metadata_file), str(metadata_file))
|
|
1133
|
-
logger.info(f"{self.name}: Moved project metadata to info directory")
|
|
1134
|
-
|
|
1135
|
-
# Create metadata file if it doesn't exist
|
|
1136
|
-
if not metadata_file.exists():
|
|
1137
|
-
metadata = {
|
|
1138
|
-
"name": project,
|
|
1139
|
-
"description": description or f"Papers for {project} project",
|
|
1140
|
-
"created": datetime.now().isoformat(),
|
|
1141
|
-
"created_by": "SciTeX Scholar",
|
|
1142
|
-
"auto_created": True,
|
|
1143
|
-
}
|
|
1144
|
-
|
|
1145
|
-
with open(metadata_file, "w") as f:
|
|
1146
|
-
json.dump(metadata, f, indent=2)
|
|
1147
|
-
|
|
1148
|
-
logger.info(
|
|
1149
|
-
f"{self.name}: Created project metadata in info directory: {project}"
|
|
1150
|
-
)
|
|
1151
|
-
|
|
1152
|
-
return project_dir
-
-    def _create_project_metadata(
-        self, project: str, description: Optional[str] = None
-    ) -> Path:
-        """Create project directory and metadata (PRIVATE).
-
-        DEPRECATED: Use _ensure_project_exists instead.
-
-        Args:
-            project: Project name
-            description: Optional project description
-
-        Returns:
-            Path to the created project directory
-        """
-        # Just use the new method that puts metadata in info directory
-        return self._ensure_project_exists(project, description)
-
-    def list_projects(self) -> List[Dict[str, Any]]:
-        """List all projects in the Scholar library.
-
-        Returns:
-            List of project information dictionaries
-        """
-        library_dir = self.config.path_manager.library_dir
-        projects = []
-
-        for item in library_dir.iterdir():
-            if item.is_dir() and item.name != "MASTER":
-                project_info = {
-                    "name": item.name,
-                    "path": str(item),
-                    "papers_count": len(list(item.glob("*"))),
-                    "created": None,
-                    "description": None,
-                }
-
-                # Load metadata if exists
-                metadata_file = item / "project_metadata.json"
-                if metadata_file.exists():
-                    try:
-                        with open(metadata_file) as f:
-                            metadata = json.load(f)
-                        project_info.update(metadata)
-                    except Exception as e:
-                        logger.debug(f"Failed to load metadata for {item.name}: {e}")
-
-                projects.append(project_info)
-
-        return sorted(projects, key=lambda x: x["name"])
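One quirk worth noting in the removed code: `_ensure_project_exists` wrote (or moved) metadata into `<project>/info/project_metadata.json`, while `list_projects` above still read `<project>/project_metadata.json`, so descriptions saved by the writer would not surface in the listing. A minimal usage sketch (the import path is an assumption):

```python
# Sketch only: the Scholar import path is an assumption.
from scitex.scholar import Scholar

scholar = Scholar(project="demo_project")
for proj in scholar.list_projects():
    # keys taken from the removed project_info dict above
    print(f"{proj['name']}: {proj['papers_count']} entries at {proj['path']}")
```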
-
-    # ----------------------------------------
-    # Library Handlers
-    # ----------------------------------------
-    def get_library_statistics(self) -> Dict[str, Any]:
-        """Get comprehensive statistics for the entire Scholar library.
-
-        Returns:
-            Dictionary with library-wide statistics
-        """
-        master_dir = self.config.get_library_master_dir()
-        projects = self.list_projects()
-
-        stats = {
-            "total_projects": len(projects),
-            "total_papers": (
-                len(list(master_dir.glob("*"))) if master_dir.exists() else 0
-            ),
-            "projects": projects,
-            "library_path": str(self.config.path_manager.library_dir),
-            "master_path": str(master_dir),
-        }
-
-        # Calculate storage usage
-        if master_dir.exists():
-            total_size = sum(
-                f.stat().st_size for f in master_dir.rglob("*") if f.is_file()
-            )
-            stats["storage_mb"] = total_size / (1024 * 1024)
-        else:
-            stats["storage_mb"] = 0
-
-        return stats
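For reference, the dict returned by the removed `get_library_statistics` had this shape (keys from the code above; values and paths are illustrative):

```python
# Illustrative return value of get_library_statistics().
stats = {
    "total_projects": 4,
    "total_papers": 128,
    "projects": [{"name": "neural_networks_2024", "papers_count": 12}],
    "library_path": "/home/user/.scitex/scholar/library",
    "master_path": "/home/user/.scitex/scholar/library/MASTER",
    "storage_mb": 512.3,
}
```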
-
-    def backup_library(self, backup_path: Union[str, Path]) -> Dict[str, Any]:
-        """Create a backup of the Scholar library.
-
-        Args:
-            backup_path: Path for the backup
-
-        Returns:
-            Dictionary with backup information
-        """
-        backup_path = Path(backup_path)
-        library_path = self.config.path_manager.library_dir
-
-        if not library_path.exists():
-            raise ScholarError("Library directory does not exist")
-
-        # Create timestamped backup
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        backup_dir = backup_path / f"scholar_library_backup_{timestamp}"
-
-        logger.info(f"{self.name}: Creating library backup at {backup_dir}")
-        shutil.copytree(library_path, backup_dir)
-
-        # Create backup metadata
-        backup_info = {
-            "timestamp": timestamp,
-            "source": str(library_path),
-            "backup": str(backup_dir),
-            "size_mb": sum(
-                f.stat().st_size for f in backup_dir.rglob("*") if f.is_file()
-            )
-            / (1024 * 1024),
-        }
-
-        metadata_file = backup_dir / "backup_metadata.json"
-        with open(metadata_file, "w") as f:
-            json.dump(backup_info, f, indent=2)
-
-        logger.info(
-            f"{self.name}: Library backup completed: {backup_info['size_mb']:.2f} MB"
-        )
-        return backup_info
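A usage sketch for the removed `backup_library`, assuming the top-level import path; it copies the whole library into a timestamped directory and returns the same metadata it writes to `backup_metadata.json`:

```python
# Sketch only: import path and backup destination are assumptions.
import tempfile
from pathlib import Path

from scitex.scholar import Scholar

scholar = Scholar(project="demo_project")
backup_info = scholar.backup_library(Path(tempfile.mkdtemp()))
print(backup_info["backup"], f"{backup_info['size_mb']:.2f} MB")
```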
-
-    # =========================================================================
-    # PIPELINE METHODS (Phase 2)
-    # =========================================================================
-
-    async def process_paper_async(
-        self,
-        title: Optional[str] = None,
-        doi: Optional[str] = None,
-        project: Optional[str] = None,
-    ) -> Paper:
-        """
-        Complete sequential pipeline for processing a single paper.
-
-        Accepts either title OR doi. Uses storage-first approach:
-        each stage checks storage before processing.
-
-        Workflow:
-            Stage 0: Resolve DOI from title (if needed)
-            Stage 1: Load or create Paper from storage
-            Stage 2: Find PDF URLs → save to storage
-            Stage 3: Download PDF → save to storage
-            Stage 4: Update project symlinks
-
-        Args:
-            title: Paper title (will resolve DOI using engine)
-            doi: DOI of the paper (preferred if available)
-            project: Project name (uses self.project if None)
-
-        Returns:
-            Fully processed Paper object
-
-        Examples:
-            # With DOI (direct)
-            paper = await scholar.process_paper_async(doi="10.1038/s41598-017-02626-y")
-
-            # With title (resolves DOI first)
-            paper = await scholar.process_paper_async(
-                title="Attention Is All You Need"
-            )
-        """
-        from scitex.scholar.core.Paper import Paper
-
-        # Validate input
-        if not title and not doi:
-            raise ValueError("Must provide either title or doi")
-
-        project = project or self.project
-
-        logger.info(f"{'=' * 60}")
-        logger.info("Processing paper")
-        if title:
-            logger.info(f"Title: {title[:50]}...")
-        if doi:
-            logger.info(f"DOI: {doi}")
-        logger.info(f"{'=' * 60}")
-
-        # Stage 0: Resolve DOI from title (if needed)
-        if not doi and title:
-            logger.info("Stage 0: Resolving DOI from title...")
-
-            # Use ScholarEngine to search and get DOI
-            results = await self._scholar_engine.search_async(title=title)
-
-            if results and results.get("id", {}).get("doi"):
-                doi = results["id"]["doi"]
-                logger.success(f"Resolved DOI: {doi}")
-            else:
-                logger.error(f"Could not resolve DOI from title: {title}")
-                raise ValueError(f"Could not resolve DOI from title: {title}")
-
-        # Generate paper ID from DOI
-        paper_id = self.config.path_manager._generate_paper_id(doi=doi)
-        storage_path = self.config.get_library_master_dir() / paper_id
-
-        logger.info(f"Paper ID: {paper_id}")
-        logger.info(f"Storage: {storage_path}")
-
-        # Stage 1: Load or create Paper from storage
-        logger.info("\nStage 1: Loading/creating metadata...")
-        if self._library_manager.has_metadata(paper_id):
-            # Load existing from storage
-            paper = self._library_manager.load_paper_from_id(paper_id)
-            logger.info("Loaded existing metadata from storage")
-        else:
-            # Create new Paper
-            paper = Paper()
-            paper.metadata.set_doi(doi)
-            paper.container.scitex_id = paper_id
-
-            # If we have title, save it
-            if title:
-                paper.metadata.basic.title = title
-
-            # Create storage and save
-            self._library_manager.save_paper_incremental(paper_id, paper)
-            logger.success("Created new paper entry in storage")
-
-        # Stage 2: Check/find URLs
-        logger.info("\nStage 2: Checking/finding PDF URLs...")
-        if not self._library_manager.has_urls(paper_id):
-            logger.info(f"Finding PDF URLs for DOI: {doi}")
-            (
-                browser,
-                context,
-            ) = await self._browser_manager.get_authenticated_browser_and_context_async()
-            try:
-                url_finder = ScholarURLFinder(context, config=self.config)
-                urls = await url_finder.find_pdf_urls(doi)
-
-                paper.metadata.url.pdfs = urls
-                self._library_manager.save_paper_incremental(paper_id, paper)
-                logger.success(f"Found {len(urls)} PDF URLs, saved to storage")
-            finally:
-                await self._browser_manager.close()
-        else:
-            logger.info(
-                f"PDF URLs already in storage ({len(paper.metadata.url.pdfs)} URLs)"
-            )
-
-        # Stage 3: Check/download PDF
-        logger.info("\nStage 3: Checking/downloading PDF...")
-        if not self._library_manager.has_pdf(paper_id):
-            logger.info("Downloading PDF...")
-            if paper.metadata.url.pdfs:
-                (
-                    browser,
-                    context,
-                ) = await self._browser_manager.get_authenticated_browser_and_context_async()
-                try:
-                    downloader = ScholarPDFDownloader(context, config=self.config)
-
-                    pdf_url = (
-                        paper.metadata.url.pdfs[0]["url"]
-                        if isinstance(paper.metadata.url.pdfs[0], dict)
-                        else paper.metadata.url.pdfs[0]
-                    )
-                    temp_path = storage_path / "main.pdf"
-
-                    result = await downloader.download_from_url(
-                        pdf_url, temp_path, doi=doi
-                    )
-                    if result and result.exists():
-                        paper.metadata.path.pdfs.append(str(result))
-                        self._library_manager.save_paper_incremental(paper_id, paper)
-                        logger.success(f"{self.name}: Downloaded PDF, saved to storage")
-                    else:
-                        logger.warning(f"{self.name}: Failed to download PDF")
-                finally:
-                    await self._browser_manager.close()
-            else:
-                logger.warning(f"{self.name}: No PDF URLs available for download")
-        else:
-            logger.info(f"{self.name}: PDF already in storage")
-
-        # Stage 4: Update project symlinks
-        if project and project not in ["master", "MASTER"]:
-            logger.info(f"{self.name}: \nStage 4: Updating project symlinks...")
-            self._library_manager.update_symlink(
-                master_storage_path=storage_path,
-                project=project,
-            )
-            logger.success(f"{self.name}: Updated symlink in project: {project}")
-
-        logger.info(f"\n{'=' * 60}")
-        logger.success(f"{self.name}: Paper processing complete")
-        logger.info(f"{'=' * 60}\n")
-
-        return paper
-
-    def process_paper(
-        self,
-        title: Optional[str] = None,
-        doi: Optional[str] = None,
-        project: Optional[str] = None,
-    ) -> Paper:
-        """
-        Synchronous wrapper for process_paper_async.
-
-        See process_paper_async() for full documentation.
-        """
-        return asyncio.run(
-            self.process_paper_async(title=title, doi=doi, project=project)
-        )
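The synchronous wrapper makes the storage-first pipeline callable outside an event loop. A minimal sketch, assuming the top-level import path and reusing the DOI from the docstring example:

```python
# Sketch only: the import path is an assumption; the DOI comes from the
# docstring example above. Each stage is skipped if storage already
# holds its result (metadata, URLs, PDF).
from scitex.scholar import Scholar

scholar = Scholar(project="demo_project")
paper = scholar.process_paper(doi="10.1038/s41598-017-02626-y")
print(paper.metadata.basic.title, paper.metadata.path.pdfs)
```

Note that `asyncio.run` raises if called from an already-running event loop, so this wrapper is for scripts, not for use inside async code.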
-
-    # =========================================================================
-    # PIPELINE METHODS (Phase 3) - Parallel Papers Processing
-    # =========================================================================
-
-    async def process_papers_async(
-        self,
-        papers: Union[Papers, List[str]],
-        project: Optional[str] = None,
-        max_concurrent: int = 3,
-    ) -> Papers:
-        """
-        Process multiple papers with controlled parallelism.
-
-        Each paper goes through the complete sequential pipeline.
-        A semaphore controls how many papers process concurrently.
-
-        Architecture:
-            - Parallel papers (max_concurrent at a time)
-            - Sequential stages per paper
-            - Storage checks before each stage
-
-        Args:
-            papers: Papers collection or list of DOIs
-            project: Project name (uses self.project if None)
-            max_concurrent: Maximum concurrent papers (default: 3)
-                Set to 1 for purely sequential processing
-
-        Returns:
-            Papers collection with processed papers
-
-        Examples:
-            # Process Papers collection (parallel)
-            papers = scholar.load_bibtex("papers.bib")
-            processed = await scholar.process_papers_async(papers, max_concurrent=3)
-
-            # Process DOI list (sequential)
-            dois = ["10.1038/...", "10.1016/...", "10.1109/..."]
-            processed = await scholar.process_papers_async(dois, max_concurrent=1)
-        """
-        from scitex.scholar.core.Papers import Papers
-
-        project = project or self.project
-
-        # Convert input to Papers collection
-        if isinstance(papers, list):
-            # List of DOI strings
-            papers_list = []
-            for doi in papers:
-                from scitex.scholar.core.Paper import Paper
-
-                p = Paper()
-                p.metadata.set_doi(doi)
-                papers_list.append(p)
-            papers = Papers(papers_list, project=project, config=self.config)
-
-        total = len(papers)
-        logger.info(f"{self.name}: \n{'=' * 60}")
-        logger.info(
-            f"{self.name}: Processing {total} papers (max_concurrent={max_concurrent})"
-        )
-        logger.info(f"{self.name}: Project: {project}")
-        logger.info(f"{self.name}: {'=' * 60}\n")
-
-        # Use semaphore for controlled parallelism
-        semaphore = asyncio.Semaphore(max_concurrent)
-
-        async def process_with_semaphore(paper, index):
-            """Process one paper with semaphore control."""
-            async with semaphore:
-                logger.info(f"{self.name}: \n[{index}/{total}] Starting paper...")
-                try:
-                    result = await self.process_paper_async(
-                        title=paper.metadata.basic.title,
-                        doi=paper.metadata.id.doi,
-                        project=project,
-                    )
-                    logger.success(f"{self.name}: [{index}/{total}] Completed")
-                    return result
-                except Exception as e:
-                    logger.error(f"{self.name}: [{index}/{total}] Failed: {e}")
-                    return None
-
-        # Create tasks for all papers
-        tasks = [process_with_semaphore(paper, i + 1) for i, paper in enumerate(papers)]
-
-        # Process with controlled parallelism
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-
-        # Filter successful results
-        processed_papers = []
-        errors = 0
-        for i, result in enumerate(results):
-            if isinstance(result, Exception):
-                logger.error(f"{self.name}: Paper {i + 1} raised exception: {result}")
-                errors += 1
-            elif result is not None:
-                processed_papers.append(result)
-
-        # Summary
-        logger.info(f"{self.name}: \n{'=' * 60}")
-        logger.info(f"{self.name}: Batch Processing Complete")
-        logger.info(f"{self.name}: Total: {total}")
-        logger.info(f"{self.name}: Successful: {len(processed_papers)}")
-        logger.info(f"{self.name}: Failed: {total - len(processed_papers)}")
-        logger.info(f"{self.name}: Errors: {errors}")
-        logger.info(f"{self.name}: {'=' * 60}\n")
-
-        return Papers(processed_papers, project=project, config=self.config)
-
-    def process_papers(
-        self,
-        papers: Union[Papers, List[str]],
-        project: Optional[str] = None,
-        max_concurrent: int = 3,
-    ) -> Papers:
-        """
-        Synchronous wrapper for process_papers_async.
-
-        See process_papers_async() for full documentation.
-        """
-        return asyncio.run(
-            self.process_papers_async(
-                papers=papers,
-                project=project,
-                max_concurrent=max_concurrent,
-            )
-        )
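And the batch counterpart: a sketch that processes a DOI list with at most two papers in flight. Per `process_with_semaphore`, failures come back as None and are filtered out of the returned collection (import path assumed; the DOIs are the ones used in the demo below):

```python
# Sketch only: the import path is an assumption.
from scitex.scholar import Scholar

scholar = Scholar(project="demo_project")
dois = ["10.48550/arXiv.2010.11929", "10.48550/arXiv.2001.08361"]
processed = scholar.process_papers(dois, max_concurrent=2)
print(f"{len(processed)} of {len(dois)} papers processed")
```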
-
-    # =========================================================================
-    # INTERNAL SERVICES (PRIVATE - users should not access these directly)
-    # =========================================================================
-    def _init_config(self, config):
-        # Handle different config input types
-        if config is None:
-            return ScholarConfig.load()  # Auto-detect config
-        elif isinstance(config, (str, Path)):
-            return ScholarConfig.from_yaml(config)
-        elif isinstance(config, ScholarConfig):
-            return config
-        else:
-            raise TypeError(f"Invalid config type: {type(config)}")
-
-    @property
-    def _scholar_engine(self) -> ScholarEngine:
-        """Get Scholar engine for search and enrichment (PRIVATE)."""
-        if not hasattr(self, "__scholar_engine") or self.__scholar_engine is None:
-            self.__scholar_engine = ScholarEngine(config=self.config)
-        return self.__scholar_engine
-
-    @property
-    def _auth_manager(self) -> ScholarAuthManager:
-        """Get authentication manager service (PRIVATE)."""
-        if not hasattr(self, "__auth_manager") or self.__auth_manager is None:
-            self.__auth_manager = ScholarAuthManager()
-        return self.__auth_manager
-
-    @property
-    def _browser_manager(self) -> ScholarBrowserManager:
-        """Get browser manager service (PRIVATE)."""
-        if not hasattr(self, "__browser_manager") or self.__browser_manager is None:
-            self.__browser_manager = ScholarBrowserManager(
-                auth_manager=self._auth_manager,
-                chrome_profile_name="system",
-                browser_mode=self.browser_mode,
-            )
-        return self.__browser_manager
-
-    @property
-    def _library_manager(self) -> LibraryManager:
-        """Get library manager service - low-level operations (PRIVATE)."""
-        if not hasattr(self, "__library_manager") or self.__library_manager is None:
-            self.__library_manager = LibraryManager(
-                project=self.project, config=self.config
-            )
-        return self.__library_manager
-
-    @property
-    def _library(self) -> ScholarLibrary:
-        """Get Scholar library service - high-level operations (PRIVATE)."""
-        if not hasattr(self, "__library") or self.__library is None:
-            self.__library = ScholarLibrary(project=self.project, config=self.config)
-        return self.__library
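A caveat in the removed lazy properties: inside the class body, `self.__scholar_engine` is name-mangled to `self._Scholar__scholar_engine`, but the string literal passed to `hasattr` is not, so `hasattr(self, "__scholar_engine")` is always False and every access rebuilt the service instead of caching it. A minimal corrected sketch of the intended pattern (class and attribute names are illustrative):

```python
# Corrected lazy-caching sketch: read and write the same attribute name
# via getattr with a default, avoiding the hasattr/name-mangling mismatch.
class Service:
    pass


class Owner:
    @property
    def _service(self) -> "Service":
        if getattr(self, "_cached_service", None) is None:
            self._cached_service = Service()  # built once, then reused
        return self._cached_service


owner = Owner()
assert owner._service is owner._service  # same instance on repeat access
```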
-
-
-# Export all classes and functions
 __all__ = ["Scholar"]
 
+
 if __name__ == "__main__":
-    from
-    from
+    from .Paper import Paper
+    from .Papers import Papers
 
     def main():
         """Demonstrate Scholar class usage - Clean API Demo."""
         print("\n" + "=" * 60)
-        print("
+        print("Scholar Module Demo - Clean API")
         print("=" * 60 + "\n")
 
-        # ----------------------------------------
         # 1. Initialize Scholar
-
-        print("1️⃣ Initialize Scholar")
+        print("1. Initialize Scholar")
         print("-" * 60)
         scholar = Scholar(
             project="demo_project",
             project_description="Demo project for testing Scholar API",
         )
-        print("
+        print("Scholar initialized")
         print(f" Project: {scholar.project}")
-        print(f" Workspace: {scholar.get_workspace_dir()}")
         print()
 
-        #
+        # 2. Project Management
         print("2. Project Management:")
         try:
-            # Create a new project
            project_dir = scholar._create_project_metadata(
                "neural_networks_2024",
                description="Collection of neural network papers from 2024",
            )
-            print("
-            print(f"
+            print(" Created project: neural_networks_2024")
+            print(f" Project directory: {project_dir}")
 
-            # List all projects
             projects = scholar.list_projects()
-            print(f"
-            for
-            print(
-                f" - {project['name']}: {project.get('description', 'No description')}"
-            )
+            print(f" Total projects in library: {len(projects)}")
+            for proj in projects[:3]:
+                print(f" - {proj['name']}: {proj.get('description', 'No desc')}")
             if len(projects) > 3:
                 print(f" ... and {len(projects) - 3} more")
 
         except Exception as e:
-            print(f"
+            print(f" Project management demo skipped: {e}")
         print()
 
-        #
+        # 3. Library Statistics
         print("3. Library Statistics:")
         try:
             stats = scholar.get_library_statistics()
-            print(f"
-            print(f"
-            print(f"
-            print(f"
+            print(f" Total projects: {stats['total_projects']}")
+            print(f" Total papers: {stats['total_papers']}")
+            print(f" Storage usage: {stats['storage_mb']:.2f} MB")
+            print(f" Library path: {stats['library_path']}")
 
         except Exception as e:
-            print(f"
+            print(f" Library statistics demo skipped: {e}")
         print()
 
-        #
+        # 4. Working with Papers
         print("4. Working with Papers:")
-
-        # Create some sample papers with Pydantic structure
         p1 = Paper()
         p1.metadata.basic.title = "Vision Transformer: An Image Is Worth 16x16 Words"
         p1.metadata.basic.authors = ["Dosovitskiy, Alexey", "Beyer, Lucas"]
         p1.metadata.basic.year = 2021
-        p1.metadata.basic.keywords = [
-            "vision transformer",
-            "computer vision",
-            "attention",
-        ]
         p1.metadata.publication.journal = "ICLR"
         p1.metadata.set_doi("10.48550/arXiv.2010.11929")
-        p1.container.projects = ["neural_networks_2024"]
 
-
-        p2.metadata.basic.title = "Scaling Laws for Neural Language Models"
-        p2.metadata.basic.authors = ["Kaplan, Jared", "McCandlish, Sam"]
-        p2.metadata.basic.year = 2020
-        p2.metadata.basic.keywords = ["scaling laws", "language models", "GPT"]
-        p2.metadata.publication.journal = "arXiv preprint"
-        p2.metadata.set_doi("10.48550/arXiv.2001.08361")
-        p2.container.projects = ["neural_networks_2024"]
-
-        sample_papers = [p1, p2]
-
-        # Create Papers collection
+        sample_papers = [p1]
         papers = Papers(
             sample_papers,
             project="neural_networks_2024",
             config=scholar.config,
         )
-        print(f"
-
-        # Use Scholar to work with the collection
-        # Switch project by creating new instance (cleaner pattern)
-        scholar = Scholar(project="neural_networks_2024")
-        print(f" 🎯 Set Scholar project to: {scholar.project}")
-        print()
-
-        # Demonstrate DOI resolution workflow
-        print("5. Scholar Workflow Integration:")
-        try:
-            # Create a sample BibTeX content for demonstration
-            sample_bibtex = """
-@article{sample2024,
-    title = {Sample Paper for Demo},
-    author = {Demo, Author},
-    year = {2024},
-    journal = {Demo Journal}
-}
-"""
-
-            # Demonstrate BibTeX loading
-            papers_from_bibtex = scholar.load_bibtex(sample_bibtex.strip())
-            print(f" 📄 Loaded {len(papers_from_bibtex)} papers from BibTeX")
-
-            # Demonstrate project loading
-            if scholar.project:
-                try:
-                    project_papers = scholar.load_project()
-                    print(
-                        f" 📂 Loaded {len(project_papers)} papers from current project"
-                    )
-                except:
-                    print(" 📂 Current project is empty or doesn't exist yet")
-
-        except Exception as e:
-            print(f" ⚠️ Workflow demo partially skipped: {e}")
-        print()
-
-        # Demonstrate search capabilities
-        print("6. Search Capabilities:")
-        try:
-            # Search across projects
-            search_results = scholar.search_across_projects("transformer")
-            print(
-                f" 🔍 Search for 'transformer': {len(search_results)} results across all projects"
-            )
-
-            # Search in current library (existing papers)
-            library_search = scholar.search_library("vision")
-            print(f" 🔍 Library search for 'vision': {len(library_search)} results")
-
-        except Exception as e:
-            print(f" ⚠️ Search demo skipped: {e}")
-        print()
-
-        # Demonstrate configuration access
-        print("7. Configuration Management:")
-        print(f" ⚙️ Scholar directory: {scholar.config.paths.scholar_dir}")
-        print(f" ⚙️ Library directory: {scholar.config.get_library_project_dir()}")
-        print(
-            f" ⚙️ Debug mode: {scholar.config.resolve('debug_mode', default=False)}"
-        )
+        print(f" Created collection with {len(papers)} papers")
         print()
 
-        #
-        print("
-        print(f"
-        print(f"
-        print(f" 🔧 Browser Manager: {type(scholar._browser_manager).__name__}")
-        print(f" 🔧 Library Manager: {type(scholar._library_manager).__name__}")
+        # 5. Configuration
+        print("5. Configuration Management:")
+        print(f" Scholar directory: {scholar.config.paths.scholar_dir}")
+        print(f" Library directory: {scholar.config.get_library_project_dir()}")
         print()
 
-        #
-        print("
-
-
-
-
-            backup_dir = Path(tempfile.mkdtemp()) / "scholar_backup"
-            backup_info = scholar.backup_library(backup_dir)
-            print(" 💾 Library backup created:")
-            print(f" 📁 Location: {backup_info['backup']}")
-            print(f" 📊 Size: {backup_info['size_mb']:.2f} MB")
-            print(f" 🕐 Timestamp: {backup_info['timestamp']}")
-
-            # Clean up
-            import shutil
-
-            shutil.rmtree(backup_dir, ignore_errors=True)
-
-        except Exception as e:
-            print(f" ⚠️ Backup demo skipped: {e}")
+        # 6. Service Components
+        print("6. Service Components (Internal):")
+        print(f" Scholar Engine: {type(scholar._scholar_engine).__name__}")
+        print(f" Auth Manager: {type(scholar._auth_manager).__name__}")
+        print(f" Browser Manager: {type(scholar._browser_manager).__name__}")
+        print(f" Library Manager: {type(scholar._library_manager).__name__}")
         print()
 
-        print("Scholar
-        print()
-        print("💡 Key Scholar Capabilities:")
-        print(" • Global library management and statistics")
-        print(" • Project creation and organization")
-        print(" • Cross-project search and analysis")
-        print(" • Integration with Paper and Papers classes")
-        print(" • DOI resolution and metadata enrichment")
-        print(" • PDF download and browser automation")
-        print(" • Backup and maintenance operations")
-        print()
+        print("Scholar demo completed!")
 
     main()
 
-# python -m scitex.scholar.core.Scholar
 
 # EOF