scitex 2.14.0__py3-none-any.whl → 2.15.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +71 -17
- scitex/_env_loader.py +156 -0
- scitex/_mcp_resources/__init__.py +37 -0
- scitex/_mcp_resources/_cheatsheet.py +135 -0
- scitex/_mcp_resources/_figrecipe.py +138 -0
- scitex/_mcp_resources/_formats.py +102 -0
- scitex/_mcp_resources/_modules.py +337 -0
- scitex/_mcp_resources/_session.py +149 -0
- scitex/_mcp_tools/__init__.py +4 -0
- scitex/_mcp_tools/audio.py +66 -0
- scitex/_mcp_tools/diagram.py +11 -95
- scitex/_mcp_tools/introspect.py +210 -0
- scitex/_mcp_tools/plt.py +260 -305
- scitex/_mcp_tools/scholar.py +74 -0
- scitex/_mcp_tools/social.py +27 -0
- scitex/_mcp_tools/template.py +24 -0
- scitex/_mcp_tools/writer.py +17 -210
- scitex/ai/_gen_ai/_PARAMS.py +10 -7
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
- scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
- scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
- scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
- scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
- scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
- scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
- scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
- scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +30 -1550
- scitex/ai/classification/timeseries/_sliding_window_core.py +467 -0
- scitex/ai/classification/timeseries/_sliding_window_plotting.py +369 -0
- scitex/audio/README.md +40 -36
- scitex/audio/__init__.py +129 -61
- scitex/audio/_branding.py +185 -0
- scitex/audio/_mcp/__init__.py +32 -0
- scitex/audio/_mcp/handlers.py +59 -6
- scitex/audio/_mcp/speak_handlers.py +238 -0
- scitex/audio/_relay.py +225 -0
- scitex/audio/_tts.py +18 -10
- scitex/audio/engines/base.py +17 -10
- scitex/audio/engines/elevenlabs_engine.py +7 -2
- scitex/audio/mcp_server.py +228 -75
- scitex/canvas/README.md +1 -1
- scitex/canvas/editor/_dearpygui/__init__.py +25 -0
- scitex/canvas/editor/_dearpygui/_editor.py +147 -0
- scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
- scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
- scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
- scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
- scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
- scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
- scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
- scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
- scitex/canvas/editor/_dearpygui/_selection.py +295 -0
- scitex/canvas/editor/_dearpygui/_state.py +93 -0
- scitex/canvas/editor/_dearpygui/_utils.py +61 -0
- scitex/canvas/editor/flask_editor/_core/__init__.py +27 -0
- scitex/canvas/editor/flask_editor/_core/_bbox_extraction.py +200 -0
- scitex/canvas/editor/flask_editor/_core/_editor.py +173 -0
- scitex/canvas/editor/flask_editor/_core/_export_helpers.py +353 -0
- scitex/canvas/editor/flask_editor/_core/_routes_basic.py +190 -0
- scitex/canvas/editor/flask_editor/_core/_routes_export.py +332 -0
- scitex/canvas/editor/flask_editor/_core/_routes_panels.py +252 -0
- scitex/canvas/editor/flask_editor/_core/_routes_save.py +218 -0
- scitex/canvas/editor/flask_editor/_core.py +25 -1684
- scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
- scitex/cli/__init__.py +38 -43
- scitex/cli/audio.py +160 -41
- scitex/cli/capture.py +133 -20
- scitex/cli/introspect.py +488 -0
- scitex/cli/main.py +200 -109
- scitex/cli/mcp.py +60 -34
- scitex/cli/plt.py +414 -0
- scitex/cli/repro.py +15 -8
- scitex/cli/resource.py +15 -8
- scitex/cli/scholar/__init__.py +154 -8
- scitex/cli/scholar/_crossref_scitex.py +296 -0
- scitex/cli/scholar/_fetch.py +25 -3
- scitex/cli/social.py +355 -0
- scitex/cli/stats.py +136 -11
- scitex/cli/template.py +129 -12
- scitex/cli/tex.py +15 -8
- scitex/cli/writer.py +49 -299
- scitex/cloud/__init__.py +41 -2
- scitex/config/README.md +1 -1
- scitex/config/__init__.py +16 -2
- scitex/config/_env_registry.py +256 -0
- scitex/context/__init__.py +22 -0
- scitex/dev/__init__.py +20 -1
- scitex/diagram/__init__.py +42 -19
- scitex/diagram/mcp_server.py +13 -125
- scitex/gen/__init__.py +50 -14
- scitex/gen/_list_packages.py +4 -4
- scitex/introspect/__init__.py +82 -0
- scitex/introspect/_call_graph.py +303 -0
- scitex/introspect/_class_hierarchy.py +163 -0
- scitex/introspect/_core.py +41 -0
- scitex/introspect/_docstring.py +131 -0
- scitex/introspect/_examples.py +113 -0
- scitex/introspect/_imports.py +271 -0
- scitex/{gen/_inspect_module.py → introspect/_list_api.py} +48 -56
- scitex/introspect/_mcp/__init__.py +41 -0
- scitex/introspect/_mcp/handlers.py +233 -0
- scitex/introspect/_members.py +155 -0
- scitex/introspect/_resolve.py +89 -0
- scitex/introspect/_signature.py +131 -0
- scitex/introspect/_source.py +80 -0
- scitex/introspect/_type_hints.py +172 -0
- scitex/io/_save.py +1 -2
- scitex/io/bundle/README.md +1 -1
- scitex/logging/_formatters.py +19 -9
- scitex/mcp_server.py +98 -5
- scitex/os/__init__.py +4 -0
- scitex/{gen → os}/_check_host.py +4 -5
- scitex/plt/__init__.py +245 -550
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
- scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/plt/gallery/README.md +1 -1
- scitex/plt/utils/_hitmap/__init__.py +82 -0
- scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
- scitex/plt/utils/_hitmap/_color_application.py +346 -0
- scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
- scitex/plt/utils/_hitmap/_constants.py +40 -0
- scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
- scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
- scitex/plt/utils/_hitmap/_query.py +113 -0
- scitex/plt/utils/_hitmap.py +46 -1616
- scitex/plt/utils/_metadata/__init__.py +80 -0
- scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
- scitex/plt/utils/_metadata/_artists/_base.py +195 -0
- scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
- scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
- scitex/plt/utils/_metadata/_artists/_images.py +80 -0
- scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
- scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
- scitex/plt/utils/_metadata/_artists/_text.py +106 -0
- scitex/plt/utils/_metadata/_csv.py +416 -0
- scitex/plt/utils/_metadata/_detect.py +225 -0
- scitex/plt/utils/_metadata/_legend.py +127 -0
- scitex/plt/utils/_metadata/_rounding.py +117 -0
- scitex/plt/utils/_metadata/_verification.py +202 -0
- scitex/schema/README.md +1 -1
- scitex/scholar/__init__.py +8 -0
- scitex/scholar/_mcp/crossref_handlers.py +265 -0
- scitex/scholar/core/Scholar.py +63 -1700
- scitex/scholar/core/_mixins/__init__.py +36 -0
- scitex/scholar/core/_mixins/_enrichers.py +270 -0
- scitex/scholar/core/_mixins/_library_handlers.py +100 -0
- scitex/scholar/core/_mixins/_loaders.py +103 -0
- scitex/scholar/core/_mixins/_pdf_download.py +375 -0
- scitex/scholar/core/_mixins/_pipeline.py +312 -0
- scitex/scholar/core/_mixins/_project_handlers.py +125 -0
- scitex/scholar/core/_mixins/_savers.py +69 -0
- scitex/scholar/core/_mixins/_search.py +103 -0
- scitex/scholar/core/_mixins/_services.py +88 -0
- scitex/scholar/core/_mixins/_url_finding.py +105 -0
- scitex/scholar/crossref_scitex.py +367 -0
- scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/scholar/examples/00_run_all.sh +120 -0
- scitex/scholar/jobs/_executors.py +27 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
- scitex/scholar/pdf_download/_cli.py +154 -0
- scitex/scholar/pdf_download/strategies/__init__.py +11 -8
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
- scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
- scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
- scitex/scholar/pipelines/_single_steps.py +71 -36
- scitex/scholar/storage/_LibraryManager.py +97 -1695
- scitex/scholar/storage/_mixins/__init__.py +30 -0
- scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
- scitex/scholar/storage/_mixins/_library_operations.py +218 -0
- scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
- scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
- scitex/scholar/storage/_mixins/_resolution.py +376 -0
- scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
- scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
- scitex/security/README.md +3 -3
- scitex/session/README.md +1 -1
- scitex/session/__init__.py +26 -7
- scitex/session/_decorator.py +1 -1
- scitex/sh/README.md +1 -1
- scitex/sh/__init__.py +7 -4
- scitex/social/__init__.py +155 -0
- scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
- scitex/stats/_mcp/_handlers/__init__.py +31 -0
- scitex/stats/_mcp/_handlers/_corrections.py +113 -0
- scitex/stats/_mcp/_handlers/_descriptive.py +78 -0
- scitex/stats/_mcp/_handlers/_effect_size.py +106 -0
- scitex/stats/_mcp/_handlers/_format.py +94 -0
- scitex/stats/_mcp/_handlers/_normality.py +110 -0
- scitex/stats/_mcp/_handlers/_posthoc.py +224 -0
- scitex/stats/_mcp/_handlers/_power.py +247 -0
- scitex/stats/_mcp/_handlers/_recommend.py +102 -0
- scitex/stats/_mcp/_handlers/_run_test.py +279 -0
- scitex/stats/_mcp/_handlers/_stars.py +48 -0
- scitex/stats/_mcp/handlers.py +19 -1171
- scitex/stats/auto/_stat_style.py +175 -0
- scitex/stats/auto/_style_definitions.py +411 -0
- scitex/stats/auto/_styles.py +22 -620
- scitex/stats/descriptive/__init__.py +11 -8
- scitex/stats/descriptive/_ci.py +39 -0
- scitex/stats/power/_power.py +15 -4
- scitex/str/__init__.py +2 -1
- scitex/str/_title_case.py +63 -0
- scitex/template/README.md +1 -1
- scitex/template/__init__.py +25 -10
- scitex/template/_code_templates.py +147 -0
- scitex/template/_mcp/handlers.py +81 -0
- scitex/template/_mcp/tool_schemas.py +55 -0
- scitex/template/_templates/__init__.py +51 -0
- scitex/template/_templates/audio.py +233 -0
- scitex/template/_templates/canvas.py +312 -0
- scitex/template/_templates/capture.py +268 -0
- scitex/template/_templates/config.py +43 -0
- scitex/template/_templates/diagram.py +294 -0
- scitex/template/_templates/io.py +107 -0
- scitex/template/_templates/module.py +53 -0
- scitex/template/_templates/plt.py +202 -0
- scitex/template/_templates/scholar.py +267 -0
- scitex/template/_templates/session.py +130 -0
- scitex/template/_templates/session_minimal.py +43 -0
- scitex/template/_templates/session_plot.py +67 -0
- scitex/template/_templates/session_stats.py +77 -0
- scitex/template/_templates/stats.py +323 -0
- scitex/template/_templates/writer.py +296 -0
- scitex/template/clone_writer_directory.py +5 -5
- scitex/ui/_backends/_email.py +10 -2
- scitex/ui/_backends/_webhook.py +5 -1
- scitex/web/_search_pubmed.py +10 -6
- scitex/writer/README.md +1 -1
- scitex/writer/__init__.py +43 -34
- scitex/writer/_mcp/handlers.py +11 -744
- scitex/writer/_mcp/tool_schemas.py +5 -335
- scitex-2.15.3.dist-info/METADATA +667 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/RECORD +241 -120
- scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
- scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
- scitex/diagram/_compile.py +0 -312
- scitex/diagram/_diagram.py +0 -355
- scitex/diagram/_mcp/__init__.py +0 -4
- scitex/diagram/_mcp/handlers.py +0 -400
- scitex/diagram/_mcp/tool_schemas.py +0 -157
- scitex/diagram/_presets.py +0 -173
- scitex/diagram/_schema.py +0 -182
- scitex/diagram/_split.py +0 -278
- scitex/gen/_ci.py +0 -12
- scitex/gen/_title_case.py +0 -89
- scitex/plt/_mcp/__init__.py +0 -4
- scitex/plt/_mcp/_handlers_annotation.py +0 -102
- scitex/plt/_mcp/_handlers_figure.py +0 -195
- scitex/plt/_mcp/_handlers_plot.py +0 -252
- scitex/plt/_mcp/_handlers_style.py +0 -219
- scitex/plt/_mcp/handlers.py +0 -74
- scitex/plt/_mcp/tool_schemas.py +0 -497
- scitex/plt/mcp_server.py +0 -231
- scitex/scholar/examples/SUGGESTIONS.md +0 -865
- scitex/scholar/examples/dev.py +0 -38
- scitex-2.14.0.dist-info/METADATA +0 -1238
- /scitex/{gen → context}/_detect_environment.py +0 -0
- /scitex/{gen → context}/_get_notebook_path.py +0 -0
- /scitex/{gen/_shell.py → sh/_shell_legacy.py} +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/WHEEL +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.14.0.dist-info → scitex-2.15.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Timestamp: "2026-01-24 (ywatanabe)"
|
|
3
|
+
# File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/storage/_mixins/_resolution.py
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
DOI resolution mixin for LibraryManager.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
from scitex import logging
|
|
17
|
+
from scitex.scholar.utils import TextNormalizer
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ResolutionMixin:
|
|
23
|
+
"""Mixin providing DOI resolution methods."""
|
|
24
|
+
|
|
25
|
+
def check_library_for_doi(
    self, title: str, year: Optional[int] = None
) -> Optional[str]:
    """Look up an already-stored DOI for a paper in the master library.

    Scans every sub-directory of ``self.library_master_dir`` for a
    ``metadata.json`` file and returns the DOI of the first entry whose
    title is similar (per ``self._is_title_similar``) and whose year is
    compatible with the requested one.

    Years are compatible when either side is missing, or when both can be
    read as integers that differ by at most one.  If one of them cannot be
    converted to an integer the two values are compared for equality.

    Args:
        title: Title of the paper being looked up.
        year: Publication year, if known.

    Returns:
        The stored DOI of the first matching entry, or ``None`` when no
        entry matches or the library directory cannot be scanned.
    """
    try:
        for paper_dir in self.library_master_dir.iterdir():
            if not paper_dir.is_dir():
                continue

            metadata_file = paper_dir / "metadata.json"
            if not metadata_file.exists():
                continue

            try:
                with open(metadata_file) as file_:
                    metadata = json.load(file_)

                stored_title = metadata.get("title", "")
                stored_year = metadata.get("year")
                stored_doi = metadata.get("doi")
            except (
                OSError,
                json.JSONDecodeError,
                KeyError,
                ValueError,
                AttributeError,
            ) as exc_:
                # One unreadable or non-object metadata file must not stop
                # the scan of the remaining entries.
                logger.debug(f"Error reading metadata from {metadata_file}: {exc_}")
                continue

            title_match = self._is_title_similar(title, stored_title)

            # A missing year on either side never rules a candidate out.
            if not year or not stored_year:
                year_match = True
            else:
                try:
                    year_match = abs(int(stored_year) - int(year)) <= 1
                except (TypeError, ValueError):
                    # Non-numeric year (e.g. "in press"): exact match only.
                    year_match = stored_year == year

            if title_match and year_match and stored_doi:
                logger.info(
                    f"DOI found in master Scholar library: {stored_doi} (paper_id: {paper_dir.name})"
                )
                return stored_doi

        return None

    except Exception as exc_:
        # Best effort: a library that cannot be listed means "not found".
        logger.debug(f"Error checking master Scholar library: {exc_}")
        return None
|
|
71
|
+
|
|
72
|
+
async def resolve_and_create_library_structure_async(
    self,
    papers: List[Dict[str, Any]],
    project: str,
    sources: Optional[List[str]] = None,
) -> Dict[str, Dict[str, str]]:
    """Resolve a DOI for each paper and build its library entry.

    For every paper that has a title, this method asks
    ``self.single_doi_resolver`` for a DOI, requests storage paths for the
    "MASTER" collection, writes ``metadata.json`` into the storage path,
    and then creates the project symlink and the BibTeX info structure.
    Papers without a title are skipped with a warning.  Any exception
    raised while handling one paper is logged and the loop moves on.

    Args:
        papers: Paper dictionaries; ``title``, ``year`` and ``authors`` are
            the keys read here.
        project: Project name handed to the symlink / BibTeX helpers.
        sources: Optional list of sources forwarded to the resolver.

    Returns:
        Mapping from paper title to a dictionary holding the ids and
        paths created for that paper.

    Raises:
        ValueError: If ``self.single_doi_resolver`` is not set.
    """
    if not self.single_doi_resolver:
        raise ValueError("SingleDOIResolver is required for resolving DOIs")

    results = {}
    for paper in papers:
        title = paper.get("title")
        if not title:
            logger.warning(f"Skipping paper without title: {paper}")
            continue

        logger.info(f"Processing: {title[:50]}...")

        try:
            resolver = self.single_doi_resolver
            doi_result = await resolver.metadata2doi_async(
                title=title,
                year=paper.get("year"),
                authors=paper.get("authors"),
                sources=sources,
            )
            enhanced_metadata = self._extract_enhanced_metadata(doi_result, paper)

            # Resolved values take precedence over the caller's values.
            storage_paths = self._call_path_manager_get_storage_paths(
                paper_info={**paper, **enhanced_metadata},
                collection_name="MASTER",
            )
            paper_id = storage_paths["unique_id"]
            storage_path = storage_paths["storage_path"]
            metadata_file = storage_path / "metadata.json"

            complete_metadata = self._create_complete_metadata(
                paper, doi_result, paper_id, enhanced_metadata
            )
            with open(metadata_file, "w") as file_:
                json.dump(complete_metadata, file_, indent=2)
            logger.success(
                f"Saved metadata.json for {paper_id} ({len(complete_metadata)} fields)"
            )

            project_symlink_path = self._create_project_symlink(
                master_storage_path=storage_path,
                project=project,
                readable_name=storage_paths["readable_name"],
            )

            # The "_with_source" entry point stores the filename on self;
            # fall back to "papers" when it was never set.
            bibtex_source_filename = getattr(self, "_source_filename", "papers")
            info_dir = self._create_bibtex_info_structure(
                project=project,
                paper_info={**paper, **enhanced_metadata},
                complete_metadata=complete_metadata,
                bibtex_source_filename=bibtex_source_filename,
            )

            if project_symlink_path:
                symlink_text = str(project_symlink_path)
            else:
                symlink_text = None
            if info_dir:
                info_dir_text = str(info_dir)
            else:
                info_dir_text = None

            results[title] = {
                "scitex_id": paper_id,
                "scholar_id": paper_id,
                "doi": complete_metadata.get("doi"),
                "master_storage_path": str(storage_path),
                "project_symlink_path": symlink_text,
                "readable_name": storage_paths["readable_name"],
                "metadata_file": str(metadata_file),
                "info_dir": info_dir_text,
            }

            logger.info(f"Created library entry: {paper_id}")
            if complete_metadata.get("doi"):
                logger.info(f"   DOI: {complete_metadata['doi']}")
            logger.info(f"   Storage: {storage_path}")

        except Exception as exc_:
            logger.error(f"Error processing '{title[:30]}...': {exc_}")

    logger.success(
        f"Created Scholar library entries for {len(results)}/{len(papers)} papers"
    )
    return results
|
|
159
|
+
|
|
160
|
+
async def resolve_and_create_library_structure_with_source_async(
    self,
    papers: List[Dict[str, Any]],
    project: str,
    sources: Optional[List[str]] = None,
    bibtex_source_filename: str = "papers",
) -> Dict[str, Dict[str, str]]:
    """Run the library-structure resolution with a BibTeX source filename.

    The filename is stored on ``self._source_filename`` and the work is
    then delegated to ``resolve_and_create_library_structure_async``.

    Args:
        papers: Paper dictionaries to process.
        project: Project name.
        sources: Optional list of sources forwarded to the delegate.
        bibtex_source_filename: Value stored on ``self._source_filename``.

    Returns:
        Whatever ``resolve_and_create_library_structure_async`` returns.
    """
    # Stored on the instance (not passed as an argument) so the delegate
    # can pick it up via getattr.
    self._source_filename = bibtex_source_filename
    delegate = self.resolve_and_create_library_structure_async
    outcome = await delegate(papers=papers, project=project, sources=sources)
    return outcome
|
|
172
|
+
|
|
173
|
+
def _extract_enhanced_metadata(
|
|
174
|
+
self, doi_result: Optional[Dict], paper: Dict
|
|
175
|
+
) -> Dict[str, Any]:
|
|
176
|
+
"""Extract enhanced metadata from DOI resolution result."""
|
|
177
|
+
enhanced = {}
|
|
178
|
+
if doi_result and isinstance(doi_result, dict):
|
|
179
|
+
metadata_source = doi_result.get("metadata", {})
|
|
180
|
+
enhanced.update(
|
|
181
|
+
{
|
|
182
|
+
"doi": doi_result.get("doi"),
|
|
183
|
+
"journal": metadata_source.get("journal")
|
|
184
|
+
or doi_result.get("journal")
|
|
185
|
+
or paper.get("journal"),
|
|
186
|
+
"authors": metadata_source.get("authors")
|
|
187
|
+
or doi_result.get("authors")
|
|
188
|
+
or paper.get("authors"),
|
|
189
|
+
"year": metadata_source.get("year")
|
|
190
|
+
or doi_result.get("year")
|
|
191
|
+
or paper.get("year"),
|
|
192
|
+
"title": metadata_source.get("title")
|
|
193
|
+
or doi_result.get("title")
|
|
194
|
+
or paper.get("title"),
|
|
195
|
+
"abstract": metadata_source.get("abstract")
|
|
196
|
+
or doi_result.get("abstract"),
|
|
197
|
+
"publisher": metadata_source.get("publisher")
|
|
198
|
+
or doi_result.get("publisher"),
|
|
199
|
+
"volume": metadata_source.get("volume") or doi_result.get("volume"),
|
|
200
|
+
"issue": metadata_source.get("issue") or doi_result.get("issue"),
|
|
201
|
+
"pages": metadata_source.get("pages") or doi_result.get("pages"),
|
|
202
|
+
"issn": metadata_source.get("issn") or doi_result.get("issn"),
|
|
203
|
+
"short_journal": metadata_source.get("short_journal")
|
|
204
|
+
or doi_result.get("short_journal"),
|
|
205
|
+
}
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
if doi_result.get("doi"):
|
|
209
|
+
logger.success(
|
|
210
|
+
f"Enhanced metadata from DOI source: {dict(metadata_source)}"
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
return enhanced
|
|
214
|
+
|
|
215
|
+
def _create_complete_metadata(
    self,
    paper: Dict,
    doi_result: Optional[Dict],
    paper_id: str,
    enhanced_metadata: Dict,
) -> Dict[str, Any]:
    """Build the metadata dictionary for a paper, with per-field sources.

    ``title``, ``authors``, ``year`` and ``journal`` each get a companion
    ``<field>_source`` entry.  The source is the normalised DOI source when
    the resolver supplied a value that differs from the caller's, and
    ``"manual"`` when the value comes from ``paper``.  When there is no
    value at all the source is ``None`` (``"manual"`` for the title).

    Args:
        paper: The caller-supplied paper dictionary.
        doi_result: Resolver output, or ``None`` when resolution failed.
        paper_id: Identifier stored as ``scitex_id``.
        enhanced_metadata: Output of ``_extract_enhanced_metadata``.

    Returns:
        The metadata dictionary, including DOI status, the standard
        fields added by ``_add_standard_fields`` and the storage paths for
        the "MASTER" collection.
    """
    doi_source_value = self._get_doi_source_value(doi_result)

    def value_and_source(field, source_when_absent):
        enhanced_value = enhanced_metadata.get(field)
        paper_value = paper.get(field)
        if enhanced_value and enhanced_value != paper_value:
            # The resolver contributed something new for this field.
            return enhanced_value, doi_source_value
        # Otherwise the value (if any) is the caller's own, so it must not
        # be attributed to the DOI source.
        source = "manual" if paper_value else source_when_absent
        return enhanced_value or paper_value, source

    raw_title, title_source = value_and_source("title", "manual")
    authors, authors_source = value_and_source("authors", None)
    year, year_source = value_and_source("year", None)
    journal, journal_source = value_and_source("journal", None)

    clean_title = TextNormalizer.clean_metadata_text(raw_title) if raw_title else ""

    # Only a resolver-provided abstract is stored.
    raw_abstract = None
    if enhanced_metadata.get("abstract"):
        raw_abstract = TextNormalizer.clean_metadata_text(
            enhanced_metadata["abstract"]
        )

    complete_metadata = {
        "title": clean_title,
        "title_source": title_source,
        "authors": authors,
        "authors_source": authors_source,
        "year": year,
        "year_source": year_source,
        "journal": journal,
        "journal_source": journal_source,
        "abstract": raw_abstract,
        "abstract_source": doi_source_value
        if enhanced_metadata.get("abstract")
        else None,
        "scitex_id": paper_id,
        "created_at": datetime.now().isoformat(),
        "created_by": "SciTeX Scholar",
    }

    if doi_result and isinstance(doi_result, dict):
        safe_fields = [
            "publisher",
            "volume",
            "issue",
            "pages",
            "issn",
            "short_journal",
        ]
        for field in safe_fields:
            value = enhanced_metadata.get(field)
            if value is not None:
                complete_metadata[field] = value
                complete_metadata[f"{field}_source"] = (
                    doi_source_value or "unknown_api"
                )

    if doi_result and doi_result.get("doi"):
        complete_metadata.update(
            {"doi": doi_result["doi"], "doi_source": doi_source_value}
        )
        logger.success(f"DOI resolved for {paper_id}: {doi_result['doi']}")
    else:
        complete_metadata.update(
            {"doi": None, "doi_source": None, "doi_resolution_failed": True}
        )
        # ``or ''`` also covers an explicit ``"title": None`` entry.
        logger.warning(
            f"DOI resolution failed for {paper_id}: {(paper.get('title') or '')[:40]}..."
        )

    self._add_standard_fields(complete_metadata)

    storage_paths = self._call_path_manager_get_storage_paths(
        paper_info={**paper, **enhanced_metadata}, collection_name="MASTER"
    )
    storage_path = storage_paths["storage_path"]

    complete_metadata.update(
        {
            "master_storage_path": str(storage_path),
            "readable_name": storage_paths["readable_name"],
            "metadata_file": str(storage_path / "metadata.json"),
        }
    )

    return complete_metadata
|
|
305
|
+
|
|
306
|
+
def _get_doi_source_value(self, doi_result: Optional[Dict]) -> Optional[str]:
|
|
307
|
+
"""Get normalized DOI source value."""
|
|
308
|
+
if not doi_result or not doi_result.get("source"):
|
|
309
|
+
return None
|
|
310
|
+
|
|
311
|
+
source = doi_result["source"]
|
|
312
|
+
if "crossref" in source.lower():
|
|
313
|
+
return "crossref"
|
|
314
|
+
elif "semantic" in source.lower():
|
|
315
|
+
return "semantic_scholar"
|
|
316
|
+
elif "pubmed" in source.lower():
|
|
317
|
+
return "pubmed"
|
|
318
|
+
elif "openalex" in source.lower():
|
|
319
|
+
return "openalex"
|
|
320
|
+
return source
|
|
321
|
+
|
|
322
|
+
def _add_standard_fields(self, complete_metadata: Dict) -> None:
|
|
323
|
+
"""Add standard fields with None defaults."""
|
|
324
|
+
standard_fields = {
|
|
325
|
+
"keywords": None,
|
|
326
|
+
"references": None,
|
|
327
|
+
"venue": None,
|
|
328
|
+
"publisher": None,
|
|
329
|
+
"volume": None,
|
|
330
|
+
"issue": None,
|
|
331
|
+
"pages": None,
|
|
332
|
+
"issn": None,
|
|
333
|
+
"short_journal": None,
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
missing_fields = []
|
|
337
|
+
for field, default_value in standard_fields.items():
|
|
338
|
+
if field not in complete_metadata or complete_metadata[field] is None:
|
|
339
|
+
complete_metadata[field] = default_value
|
|
340
|
+
missing_fields.append(field)
|
|
341
|
+
|
|
342
|
+
if missing_fields:
|
|
343
|
+
logger.info(
|
|
344
|
+
f"Missing fields for future enhancement: {', '.join(missing_fields)}"
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
def _is_title_similar(
|
|
348
|
+
self, title1: str, title2: str, threshold: float = 0.7
|
|
349
|
+
) -> bool:
|
|
350
|
+
"""Check if two titles are similar enough to be considered the same paper."""
|
|
351
|
+
if not title1 or not title2:
|
|
352
|
+
return False
|
|
353
|
+
|
|
354
|
+
def normalize_title(title: str) -> str:
|
|
355
|
+
title = title.lower()
|
|
356
|
+
title = re.sub(r"[^\w\s]", " ", title)
|
|
357
|
+
title = re.sub(r"\s+", " ", title)
|
|
358
|
+
return title.strip()
|
|
359
|
+
|
|
360
|
+
norm_title1 = normalize_title(title1)
|
|
361
|
+
norm_title2 = normalize_title(title2)
|
|
362
|
+
|
|
363
|
+
words1 = set(norm_title1.split())
|
|
364
|
+
words2 = set(norm_title2.split())
|
|
365
|
+
|
|
366
|
+
if not words1 or not words2:
|
|
367
|
+
return False
|
|
368
|
+
|
|
369
|
+
intersection = len(words1.intersection(words2))
|
|
370
|
+
union = len(words1.union(words2))
|
|
371
|
+
similarity = intersection / union if union > 0 else 0.0
|
|
372
|
+
|
|
373
|
+
return similarity >= threshold
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# EOF
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Timestamp: "2026-01-24 (ywatanabe)"
|
|
3
|
+
# File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/storage/_mixins/_storage_helpers.py
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Storage helper mixin for LibraryManager.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from typing import TYPE_CHECKING, Dict, Optional
|
|
14
|
+
|
|
15
|
+
from scitex import logging
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from scitex.scholar.core.Paper import Paper
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class StorageHelpersMixin:
|
|
24
|
+
"""Mixin providing storage helper methods."""
|
|
25
|
+
|
|
26
|
+
def has_metadata(self, paper_id: str) -> bool:
    """Report whether ``metadata.json`` exists for the given paper.

    Args:
        paper_id: Name of the paper's directory under
            ``self.library_master_dir``.

    Returns:
        ``True`` when ``<library_master_dir>/<paper_id>/metadata.json``
        exists.
    """
    paper_dir = self.library_master_dir / paper_id
    return (paper_dir / "metadata.json").exists()
|
|
30
|
+
|
|
31
|
+
def has_urls(self, paper_id: str) -> bool:
    """Report whether the paper's metadata lists at least one PDF URL.

    The URLs are read from ``metadata -> url -> pdfs`` inside the paper's
    ``metadata.json``.

    Args:
        paper_id: Name of the paper's directory under
            ``self.library_master_dir``.

    Returns:
        ``True`` when the ``pdfs`` entry is non-empty; ``False`` when the
        metadata file is missing, cannot be read, or has no such entry.
    """
    if self.has_metadata(paper_id):
        metadata_file = self.library_master_dir / paper_id / "metadata.json"
        try:
            with open(metadata_file) as handle:
                payload = json.load(handle)

            url_section = payload.get("metadata", {}).get("url", {})
            pdf_urls = url_section.get("pdfs", [])
            return len(pdf_urls) > 0
        except Exception:
            # Any read or structure problem counts as "no URLs".
            return False
    return False
|
|
45
|
+
|
|
46
|
+
def has_pdf(self, paper_id: str) -> bool:
    """Report whether the paper's directory contains a ``*.pdf`` file.

    Only the directory itself is searched, not its sub-directories.

    Args:
        paper_id: Name of the paper's directory under
            ``self.library_master_dir``.

    Returns:
        ``True`` when the directory exists and at least one entry matches
        ``*.pdf``.
    """
    paper_dir = self.library_master_dir / paper_id
    if paper_dir.exists():
        return any(True for _ in paper_dir.glob("*.pdf"))
    return False
|
|
54
|
+
|
|
55
|
+
def load_paper_from_id(self, paper_id: str) -> Optional[Paper]:
    """Load a ``Paper`` from the stored ``metadata.json`` of a paper.

    Args:
        paper_id: Name of the paper's directory under
            ``self.library_master_dir``.

    Returns:
        The object built by ``Paper.from_dict`` from the file content, or
        ``None`` when the file is missing or loading fails (the failure is
        logged as an error).
    """
    # Imported here rather than at module level; the module-level import
    # is only active under TYPE_CHECKING.
    from scitex.scholar.core.Paper import Paper

    metadata_file = self.library_master_dir / paper_id / "metadata.json"
    if metadata_file.exists():
        try:
            with open(metadata_file) as handle:
                data = json.load(handle)
            return Paper.from_dict(data)
        except Exception as e:
            logger.error(f"Failed to load paper {paper_id}: {e}")
    return None
|
|
74
|
+
|
|
75
|
+
def save_paper_incremental(self, paper_id: str, paper: Paper) -> None:
    """Merge a ``Paper`` into the stored ``metadata.json`` of a paper.

    The paper directory is created if needed.  Existing metadata is read,
    merged with ``paper.model_dump()`` via ``self._merge_metadata`` and
    written back with ``container.updated_at`` set to the current time.

    The merged data is serialised *before* the file is opened for
    writing, so a value that cannot be serialised raises without
    truncating the metadata already on disk.

    Args:
        paper_id: Name of the paper's directory under
            ``self.library_master_dir``.
        paper: Object providing ``model_dump()``.

    Raises:
        TypeError: If the merged data contains a value ``json`` cannot
            serialise.
    """
    storage_path = self.library_master_dir / paper_id
    storage_path.mkdir(parents=True, exist_ok=True)

    metadata_file = storage_path / "metadata.json"

    existing_data = {}
    if metadata_file.exists():
        try:
            with open(metadata_file) as f:
                loaded = json.load(f)
        except Exception as e:
            # Still best effort, but no longer silent: the unreadable
            # content is about to be replaced.
            logger.warning(
                f"Ignoring unreadable metadata for paper {paper_id}: {e}"
            )
        else:
            if isinstance(loaded, dict):
                existing_data = loaded
            else:
                logger.warning(
                    f"Ignoring non-object metadata for paper {paper_id}"
                )

    new_data = paper.model_dump()
    merged_data = self._merge_metadata(existing_data, new_data)

    # "container" can be missing or present but null; either way it must
    # be a dict before the timestamp is stored in it.
    if not isinstance(merged_data.get("container"), dict):
        merged_data["container"] = {}
    merged_data["container"]["updated_at"] = datetime.now().isoformat()

    serialized = json.dumps(merged_data, indent=2, ensure_ascii=False)
    with open(metadata_file, "w") as f:
        f.write(serialized)

    logger.debug(f"Saved paper {paper_id} to storage")
|
|
101
|
+
|
|
102
|
+
def _merge_metadata(self, existing: Dict, new: Dict) -> Dict:
|
|
103
|
+
"""Recursively merge metadata dicts, preferring new non-None values."""
|
|
104
|
+
result = existing.copy()
|
|
105
|
+
|
|
106
|
+
for key, new_value in new.items():
|
|
107
|
+
if key not in result:
|
|
108
|
+
result[key] = new_value
|
|
109
|
+
elif new_value is None:
|
|
110
|
+
pass
|
|
111
|
+
elif isinstance(new_value, dict) and isinstance(result[key], dict):
|
|
112
|
+
result[key] = self._merge_metadata(result[key], new_value)
|
|
113
|
+
elif isinstance(new_value, list) and len(new_value) > 0:
|
|
114
|
+
result[key] = new_value
|
|
115
|
+
elif new_value:
|
|
116
|
+
result[key] = new_value
|
|
117
|
+
|
|
118
|
+
return result
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# EOF
|