scitex 2.14.0__py3-none-any.whl → 2.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. scitex/__init__.py +47 -0
  2. scitex/_env_loader.py +156 -0
  3. scitex/_mcp_resources/__init__.py +37 -0
  4. scitex/_mcp_resources/_cheatsheet.py +135 -0
  5. scitex/_mcp_resources/_figrecipe.py +138 -0
  6. scitex/_mcp_resources/_formats.py +102 -0
  7. scitex/_mcp_resources/_modules.py +337 -0
  8. scitex/_mcp_resources/_session.py +149 -0
  9. scitex/_mcp_tools/__init__.py +4 -0
  10. scitex/_mcp_tools/audio.py +66 -0
  11. scitex/_mcp_tools/diagram.py +11 -95
  12. scitex/_mcp_tools/introspect.py +191 -0
  13. scitex/_mcp_tools/plt.py +260 -305
  14. scitex/_mcp_tools/scholar.py +74 -0
  15. scitex/_mcp_tools/social.py +244 -0
  16. scitex/_mcp_tools/writer.py +21 -204
  17. scitex/ai/_gen_ai/_PARAMS.py +10 -7
  18. scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
  19. scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
  20. scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
  21. scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
  22. scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
  23. scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
  24. scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
  25. scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
  26. scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
  27. scitex/audio/README.md +40 -36
  28. scitex/audio/__init__.py +127 -59
  29. scitex/audio/_branding.py +185 -0
  30. scitex/audio/_mcp/__init__.py +32 -0
  31. scitex/audio/_mcp/handlers.py +59 -6
  32. scitex/audio/_mcp/speak_handlers.py +238 -0
  33. scitex/audio/_relay.py +225 -0
  34. scitex/audio/engines/elevenlabs_engine.py +6 -1
  35. scitex/audio/mcp_server.py +228 -75
  36. scitex/canvas/README.md +1 -1
  37. scitex/canvas/editor/_dearpygui/__init__.py +25 -0
  38. scitex/canvas/editor/_dearpygui/_editor.py +147 -0
  39. scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
  40. scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
  41. scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
  42. scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
  43. scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
  44. scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
  45. scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
  46. scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
  47. scitex/canvas/editor/_dearpygui/_selection.py +295 -0
  48. scitex/canvas/editor/_dearpygui/_state.py +93 -0
  49. scitex/canvas/editor/_dearpygui/_utils.py +61 -0
  50. scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
  51. scitex/cli/__init__.py +38 -43
  52. scitex/cli/audio.py +76 -27
  53. scitex/cli/capture.py +13 -20
  54. scitex/cli/introspect.py +443 -0
  55. scitex/cli/main.py +198 -109
  56. scitex/cli/mcp.py +60 -34
  57. scitex/cli/scholar/__init__.py +8 -0
  58. scitex/cli/scholar/_crossref_scitex.py +296 -0
  59. scitex/cli/scholar/_fetch.py +25 -3
  60. scitex/cli/social.py +314 -0
  61. scitex/cli/writer.py +117 -0
  62. scitex/config/README.md +1 -1
  63. scitex/config/__init__.py +16 -2
  64. scitex/config/_env_registry.py +191 -0
  65. scitex/diagram/__init__.py +42 -19
  66. scitex/diagram/mcp_server.py +13 -125
  67. scitex/introspect/__init__.py +75 -0
  68. scitex/introspect/_call_graph.py +303 -0
  69. scitex/introspect/_class_hierarchy.py +163 -0
  70. scitex/introspect/_core.py +42 -0
  71. scitex/introspect/_docstring.py +131 -0
  72. scitex/introspect/_examples.py +113 -0
  73. scitex/introspect/_imports.py +271 -0
  74. scitex/introspect/_mcp/__init__.py +37 -0
  75. scitex/introspect/_mcp/handlers.py +208 -0
  76. scitex/introspect/_members.py +151 -0
  77. scitex/introspect/_resolve.py +89 -0
  78. scitex/introspect/_signature.py +131 -0
  79. scitex/introspect/_source.py +80 -0
  80. scitex/introspect/_type_hints.py +172 -0
  81. scitex/io/bundle/README.md +1 -1
  82. scitex/mcp_server.py +98 -5
  83. scitex/plt/__init__.py +248 -550
  84. scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
  85. scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  86. scitex/plt/gallery/README.md +1 -1
  87. scitex/plt/utils/_hitmap/__init__.py +82 -0
  88. scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
  89. scitex/plt/utils/_hitmap/_color_application.py +346 -0
  90. scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
  91. scitex/plt/utils/_hitmap/_constants.py +40 -0
  92. scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
  93. scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
  94. scitex/plt/utils/_hitmap/_query.py +113 -0
  95. scitex/plt/utils/_hitmap.py +46 -1616
  96. scitex/plt/utils/_metadata/__init__.py +80 -0
  97. scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
  98. scitex/plt/utils/_metadata/_artists/_base.py +195 -0
  99. scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
  100. scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
  101. scitex/plt/utils/_metadata/_artists/_images.py +80 -0
  102. scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
  103. scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
  104. scitex/plt/utils/_metadata/_artists/_text.py +106 -0
  105. scitex/plt/utils/_metadata/_csv.py +416 -0
  106. scitex/plt/utils/_metadata/_detect.py +225 -0
  107. scitex/plt/utils/_metadata/_legend.py +127 -0
  108. scitex/plt/utils/_metadata/_rounding.py +117 -0
  109. scitex/plt/utils/_metadata/_verification.py +202 -0
  110. scitex/schema/README.md +1 -1
  111. scitex/scholar/__init__.py +8 -0
  112. scitex/scholar/_mcp/crossref_handlers.py +265 -0
  113. scitex/scholar/core/Scholar.py +63 -1700
  114. scitex/scholar/core/_mixins/__init__.py +36 -0
  115. scitex/scholar/core/_mixins/_enrichers.py +270 -0
  116. scitex/scholar/core/_mixins/_library_handlers.py +100 -0
  117. scitex/scholar/core/_mixins/_loaders.py +103 -0
  118. scitex/scholar/core/_mixins/_pdf_download.py +375 -0
  119. scitex/scholar/core/_mixins/_pipeline.py +312 -0
  120. scitex/scholar/core/_mixins/_project_handlers.py +125 -0
  121. scitex/scholar/core/_mixins/_savers.py +69 -0
  122. scitex/scholar/core/_mixins/_search.py +103 -0
  123. scitex/scholar/core/_mixins/_services.py +88 -0
  124. scitex/scholar/core/_mixins/_url_finding.py +105 -0
  125. scitex/scholar/crossref_scitex.py +367 -0
  126. scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  127. scitex/scholar/examples/00_run_all.sh +120 -0
  128. scitex/scholar/jobs/_executors.py +27 -3
  129. scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
  130. scitex/scholar/pdf_download/_cli.py +154 -0
  131. scitex/scholar/pdf_download/strategies/__init__.py +11 -8
  132. scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
  133. scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
  134. scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
  135. scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
  136. scitex/scholar/pipelines/_single_steps.py +71 -36
  137. scitex/scholar/storage/_LibraryManager.py +97 -1695
  138. scitex/scholar/storage/_mixins/__init__.py +30 -0
  139. scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
  140. scitex/scholar/storage/_mixins/_library_operations.py +218 -0
  141. scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
  142. scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
  143. scitex/scholar/storage/_mixins/_resolution.py +376 -0
  144. scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
  145. scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
  146. scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +462 -0
  147. scitex/scholar/url_finder/.tmp/open_url/README.md +223 -0
  148. scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +694 -0
  149. scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +1160 -0
  150. scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +344 -0
  151. scitex/scholar/url_finder/.tmp/open_url/__init__.py +24 -0
  152. scitex/security/README.md +3 -3
  153. scitex/session/README.md +1 -1
  154. scitex/sh/README.md +1 -1
  155. scitex/social/__init__.py +153 -0
  156. scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  157. scitex/template/README.md +1 -1
  158. scitex/template/clone_writer_directory.py +5 -5
  159. scitex/writer/README.md +1 -1
  160. scitex/writer/_mcp/handlers.py +11 -744
  161. scitex/writer/_mcp/tool_schemas.py +5 -335
  162. scitex-2.15.1.dist-info/METADATA +648 -0
  163. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/RECORD +166 -111
  164. scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
  165. scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
  166. scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +0 -90
  167. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +0 -1571
  168. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +0 -6262
  169. scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +0 -1274
  170. scitex/dev/plt/data/mpl/dir_ax.txt +0 -459
  171. scitex/diagram/_compile.py +0 -312
  172. scitex/diagram/_diagram.py +0 -355
  173. scitex/diagram/_mcp/__init__.py +0 -4
  174. scitex/diagram/_mcp/handlers.py +0 -400
  175. scitex/diagram/_mcp/tool_schemas.py +0 -157
  176. scitex/diagram/_presets.py +0 -173
  177. scitex/diagram/_schema.py +0 -182
  178. scitex/diagram/_split.py +0 -278
  179. scitex/plt/_mcp/__init__.py +0 -4
  180. scitex/plt/_mcp/_handlers_annotation.py +0 -102
  181. scitex/plt/_mcp/_handlers_figure.py +0 -195
  182. scitex/plt/_mcp/_handlers_plot.py +0 -252
  183. scitex/plt/_mcp/_handlers_style.py +0 -219
  184. scitex/plt/_mcp/handlers.py +0 -74
  185. scitex/plt/_mcp/tool_schemas.py +0 -497
  186. scitex/plt/mcp_server.py +0 -231
  187. scitex/scholar/data/.gitkeep +0 -0
  188. scitex/scholar/data/README.md +0 -44
  189. scitex/scholar/data/bib_files/bibliography.bib +0 -1952
  190. scitex/scholar/data/bib_files/neurovista.bib +0 -277
  191. scitex/scholar/data/bib_files/neurovista_enriched.bib +0 -441
  192. scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +0 -441
  193. scitex/scholar/data/bib_files/neurovista_processed.bib +0 -338
  194. scitex/scholar/data/bib_files/openaccess.bib +0 -89
  195. scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +0 -2178
  196. scitex/scholar/data/bib_files/pac.bib +0 -698
  197. scitex/scholar/data/bib_files/pac_enriched.bib +0 -1061
  198. scitex/scholar/data/bib_files/pac_processed.bib +0 -0
  199. scitex/scholar/data/bib_files/pac_titles.txt +0 -75
  200. scitex/scholar/data/bib_files/paywalled.bib +0 -98
  201. scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +0 -58
  202. scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +0 -87
  203. scitex/scholar/data/bib_files/seizure_prediction.bib +0 -694
  204. scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
  205. scitex/scholar/data/bib_files/test_complete_enriched.bib +0 -437
  206. scitex/scholar/data/bib_files/test_final_enriched.bib +0 -437
  207. scitex/scholar/data/bib_files/test_seizure.bib +0 -46
  208. scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
  209. scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
  210. scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
  211. scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
  212. scitex/scholar/data/impact_factor.db +0 -0
  213. scitex/scholar/examples/SUGGESTIONS.md +0 -865
  214. scitex/scholar/examples/dev.py +0 -38
  215. scitex-2.14.0.dist-info/METADATA +0 -1238
  216. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/WHEEL +0 -0
  217. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/entry_points.txt +0 -0
  218. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,376 @@
1
+ #!/usr/bin/env python3
2
+ # Timestamp: "2026-01-24 (ywatanabe)"
3
+ # File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/storage/_mixins/_resolution.py
4
+
5
+ """
6
+ DOI resolution mixin for LibraryManager.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import re
13
+ from datetime import datetime
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from scitex import logging
17
+ from scitex.scholar.utils import TextNormalizer
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class ResolutionMixin:
    """Mixin providing DOI resolution methods.

    The mixin does not define its own state. It relies on the host class for
    ``library_master_dir``, ``single_doi_resolver``,
    ``_call_path_manager_get_storage_paths``, ``_create_project_symlink`` and
    ``_create_bibtex_info_structure``, all of which are referenced below but
    defined elsewhere.
    """

    # Ordered (substring, canonical name) pairs used to normalize the free-form
    # "source" string of a DOI resolution result. The first match wins.
    _DOI_SOURCE_ALIASES = (
        ("crossref", "crossref"),
        ("semantic", "semantic_scholar"),
        ("pubmed", "pubmed"),
        ("openalex", "openalex"),
    )

    def check_library_for_doi(
        self, title: str, year: Optional[int] = None
    ) -> Optional[str]:
        """Check if DOI already exists in master Scholar library.

        Scans every sub-directory of ``self.library_master_dir`` for a
        ``metadata.json`` and returns the first stored DOI whose title is
        similar to ``title`` (see ``_is_title_similar``) and whose year is
        compatible with ``year`` (see ``_years_match``).

        Parameters
        ----------
        title : str
            Title of the paper to look up.
        year : Optional[int]
            Publication year; when missing, the year is not used as a filter.

        Returns
        -------
        Optional[str]
            The stored DOI, or None when nothing matches or the library
            cannot be scanned.
        """
        try:
            for paper_dir in self.library_master_dir.iterdir():
                if not paper_dir.is_dir():
                    continue

                metadata_file = paper_dir / "metadata.json"
                if not metadata_file.exists():
                    continue

                try:
                    with open(metadata_file, encoding="utf-8") as file_:
                        metadata = json.load(file_)
                except (OSError, ValueError) as exc_:
                    # One unreadable or corrupt entry must not abort the scan
                    # (json.JSONDecodeError and UnicodeDecodeError are
                    # ValueError subclasses).
                    logger.debug(
                        f"Error reading metadata from {metadata_file}: {exc_}"
                    )
                    continue

                if not isinstance(metadata, dict):
                    logger.debug(
                        f"Error reading metadata from {metadata_file}: "
                        "top-level JSON value is not an object"
                    )
                    continue

                stored_doi = metadata.get("doi")
                if not stored_doi:
                    continue
                if not self._is_title_similar(title, metadata.get("title", "")):
                    continue
                if not self._years_match(year, metadata.get("year")):
                    continue

                logger.info(
                    f"DOI found in master Scholar library: {stored_doi} (paper_id: {paper_dir.name})"
                )
                return stored_doi

            return None

        except Exception as exc_:
            # Best-effort lookup: any failure to walk the library is treated
            # as "not found" rather than propagated to the caller.
            logger.debug(f"Error checking master Scholar library: {exc_}")
            return None

    @staticmethod
    def _years_match(year: Any, stored_year: Any, tolerance: int = 1) -> bool:
        """Return True when two publication years are compatible.

        A missing year on either side never rules out a match. Numeric years
        (ints or numeric strings) match when they differ by at most
        ``tolerance``; anything that cannot be converted to ``int`` falls
        back to plain equality.
        """
        if not year or not stored_year:
            return True
        try:
            return abs(int(stored_year) - int(year)) <= tolerance
        except (TypeError, ValueError):
            return stored_year == year

    async def resolve_and_create_library_structure_async(
        self,
        papers: List[Dict[str, Any]],
        project: str,
        sources: Optional[List[str]] = None,
    ) -> Dict[str, Dict[str, str]]:
        """Resolve DOIs and create full Scholar library structure with proper paths.

        For every paper that has a title, this resolves a DOI through
        ``self.single_doi_resolver``, writes ``metadata.json`` into the
        storage path returned by the path manager, creates the project
        symlink and the BibTeX info structure, and records the resulting
        paths.

        Parameters
        ----------
        papers : List[Dict[str, Any]]
            Paper dicts; ``title`` is required, ``year`` and ``authors`` are
            forwarded to the resolver when present.
        project : str
            Project name passed to the symlink / BibTeX helpers.
        sources : Optional[List[str]]
            Forwarded unchanged to the resolver.

        Returns
        -------
        Dict[str, Dict[str, str]]
            Mapping from the paper's input title to a dict of ids and paths.
            Papers that fail are logged and omitted. Entries are keyed by
            title, so a later paper with the same title replaces an earlier
            one.

        Raises
        ------
        ValueError
            If ``self.single_doi_resolver`` is not set.
        """
        if not self.single_doi_resolver:
            raise ValueError("SingleDOIResolver is required for resolving DOIs")

        results = {}
        for paper in papers:
            title = paper.get("title")
            if not title:
                logger.warning(f"Skipping paper without title: {paper}")
                continue

            logger.info(f"Processing: {title[:50]}...")

            try:
                doi_result = await self.single_doi_resolver.metadata2doi_async(
                    title=title,
                    year=paper.get("year"),
                    authors=paper.get("authors"),
                    sources=sources,
                )

                enhanced_metadata = self._extract_enhanced_metadata(doi_result, paper)
                paper_info = {**paper, **enhanced_metadata}

                storage_paths = self._call_path_manager_get_storage_paths(
                    paper_info=paper_info, collection_name="MASTER"
                )
                paper_id = storage_paths["unique_id"]
                storage_path = storage_paths["storage_path"]
                metadata_file = storage_path / "metadata.json"

                complete_metadata = self._create_complete_metadata(
                    paper, doi_result, paper_id, enhanced_metadata
                )

                # Serialize before opening the file: opening with "w"
                # truncates, so a value json cannot encode would otherwise
                # leave a partial metadata.json behind.
                payload = json.dumps(complete_metadata, indent=2)
                with open(metadata_file, "w", encoding="utf-8") as file_:
                    file_.write(payload)

                logger.success(
                    f"Saved metadata.json for {paper_id} ({len(complete_metadata)} fields)"
                )

                project_symlink_path = self._create_project_symlink(
                    master_storage_path=storage_path,
                    project=project,
                    readable_name=storage_paths["readable_name"],
                )

                # Set by resolve_and_create_library_structure_with_source_async
                # for the duration of its call; "papers" otherwise.
                bibtex_source_filename = getattr(self, "_source_filename", "papers")
                info_dir = self._create_bibtex_info_structure(
                    project=project,
                    paper_info=paper_info,
                    complete_metadata=complete_metadata,
                    bibtex_source_filename=bibtex_source_filename,
                )

                results[title] = {
                    "scitex_id": paper_id,
                    "scholar_id": paper_id,
                    "doi": complete_metadata.get("doi"),
                    "master_storage_path": str(storage_path),
                    "project_symlink_path": str(project_symlink_path)
                    if project_symlink_path
                    else None,
                    "readable_name": storage_paths["readable_name"],
                    "metadata_file": str(metadata_file),
                    "info_dir": str(info_dir) if info_dir else None,
                }

                logger.info(f"Created library entry: {paper_id}")
                if complete_metadata.get("doi"):
                    logger.info(f"  DOI: {complete_metadata['doi']}")
                logger.info(f"  Storage: {storage_path}")

            except Exception as exc_:
                # One failing paper must not stop the batch; it is simply
                # missing from the returned mapping.
                logger.error(f"Error processing '{title[:30]}...': {exc_}")

        logger.success(
            f"Created Scholar library entries for {len(results)}/{len(papers)} papers"
        )
        return results

    async def resolve_and_create_library_structure_with_source_async(
        self,
        papers: List[Dict[str, Any]],
        project: str,
        sources: Optional[List[str]] = None,
        bibtex_source_filename: str = "papers",
    ) -> Dict[str, Dict[str, str]]:
        """Enhanced version that passes source filename for BibTeX structure.

        ``bibtex_source_filename`` is exposed to
        ``resolve_and_create_library_structure_async`` through
        ``self._source_filename`` only while this call is running; the
        previous value (or absence) of the attribute is restored afterwards
        so later calls do not inherit a stale filename.
        """
        unset = object()
        previous = getattr(self, "_source_filename", unset)
        self._source_filename = bibtex_source_filename
        try:
            return await self.resolve_and_create_library_structure_async(
                papers=papers, project=project, sources=sources
            )
        finally:
            if previous is unset:
                try:
                    del self._source_filename
                except AttributeError:
                    pass  # already removed by someone else; nothing to undo
            else:
                self._source_filename = previous

    def _extract_enhanced_metadata(
        self, doi_result: Optional[Dict], paper: Dict
    ) -> Dict[str, Any]:
        """Extract enhanced metadata from DOI resolution result.

        Each field is taken from ``doi_result["metadata"]`` first, then from
        the top level of ``doi_result``. ``journal``, ``authors``, ``year``
        and ``title`` additionally fall back to ``paper``. Returns an empty
        dict when ``doi_result`` is empty or not a dict.
        """
        enhanced: Dict[str, Any] = {}
        if not doi_result or not isinstance(doi_result, dict):
            return enhanced

        metadata_source = doi_result.get("metadata")
        if not isinstance(metadata_source, dict):
            # The key may be present with a None (or otherwise unusable) value.
            metadata_source = {}

        def pick(field: str, fall_back_to_paper: bool = False) -> Any:
            value = metadata_source.get(field) or doi_result.get(field)
            if not value and fall_back_to_paper:
                value = paper.get(field)
            return value

        enhanced["doi"] = doi_result.get("doi")
        for field in ("journal", "authors", "year", "title"):
            enhanced[field] = pick(field, fall_back_to_paper=True)
        for field in (
            "abstract",
            "publisher",
            "volume",
            "issue",
            "pages",
            "issn",
            "short_journal",
        ):
            enhanced[field] = pick(field)

        if doi_result.get("doi"):
            logger.success(
                f"Enhanced metadata from DOI source: {dict(metadata_source)}"
            )

        return enhanced

    @staticmethod
    def _field_source(
        resolved_value: Any, manual_value: Any, doi_source: Optional[str]
    ) -> Optional[str]:
        """Return the provenance label for one metadata field.

        The resolver is credited only when it supplied a value that differs
        from the caller's; otherwise the stored value is the caller's own and
        is labelled "manual" (or None when there is no value at all).
        """
        if resolved_value and resolved_value != manual_value:
            return doi_source
        return "manual" if manual_value else None

    def _create_complete_metadata(
        self,
        paper: Dict,
        doi_result: Optional[Dict],
        paper_id: str,
        enhanced_metadata: Dict,
    ) -> Dict[str, Any]:
        """Create complete metadata dictionary with source tracking.

        Combines the caller's ``paper`` with ``enhanced_metadata``, records
        a ``*_source`` label next to each tracked field, stores the DOI (or
        flags ``doi_resolution_failed``), fills the standard optional fields
        and appends the storage paths reported by the path manager.
        """
        paper_title = paper.get("title")
        resolved_title = enhanced_metadata.get("title")
        raw_title = resolved_title or paper_title
        clean_title = TextNormalizer.clean_metadata_text(raw_title) if raw_title else ""

        raw_abstract = None
        if enhanced_metadata.get("abstract"):
            raw_abstract = TextNormalizer.clean_metadata_text(
                enhanced_metadata["abstract"]
            )

        doi_source_value = self._get_doi_source_value(doi_result)
        has_result = bool(doi_result) and isinstance(doi_result, dict)

        complete_metadata = {
            "title": clean_title,
            "title_source": doi_source_value
            if resolved_title and resolved_title != paper_title
            else "manual",
            "authors": enhanced_metadata.get("authors") or paper.get("authors"),
            "authors_source": self._field_source(
                enhanced_metadata.get("authors"),
                paper.get("authors"),
                doi_source_value,
            ),
            "year": enhanced_metadata.get("year") or paper.get("year"),
            "year_source": self._field_source(
                enhanced_metadata.get("year"), paper.get("year"), doi_source_value
            ),
            "journal": enhanced_metadata.get("journal") or paper.get("journal"),
            "journal_source": self._field_source(
                enhanced_metadata.get("journal"),
                paper.get("journal"),
                doi_source_value,
            ),
            "abstract": raw_abstract,
            "abstract_source": doi_source_value
            if enhanced_metadata.get("abstract")
            else None,
            "scitex_id": paper_id,
            "created_at": datetime.now().isoformat(),
            "created_by": "SciTeX Scholar",
        }

        if has_result:
            safe_fields = [
                "publisher",
                "volume",
                "issue",
                "pages",
                "issn",
                "short_journal",
            ]
            for field in safe_fields:
                value = enhanced_metadata.get(field)
                if value is not None:
                    complete_metadata[field] = value
                    complete_metadata[f"{field}_source"] = (
                        doi_source_value or "unknown_api"
                    )

        resolved_doi = doi_result.get("doi") if has_result else None
        if resolved_doi:
            complete_metadata.update(
                {"doi": resolved_doi, "doi_source": doi_source_value}
            )
            logger.success(f"DOI resolved for {paper_id}: {resolved_doi}")
        else:
            complete_metadata.update(
                {"doi": None, "doi_source": None, "doi_resolution_failed": True}
            )
            # `or ""` covers a title key that is present but None.
            logger.warning(
                f"DOI resolution failed for {paper_id}: {(paper.get('title') or '')[:40]}..."
            )

        self._add_standard_fields(complete_metadata)

        storage_paths = self._call_path_manager_get_storage_paths(
            paper_info={**paper, **enhanced_metadata}, collection_name="MASTER"
        )
        storage_path = storage_paths["storage_path"]

        complete_metadata.update(
            {
                "master_storage_path": str(storage_path),
                "readable_name": storage_paths["readable_name"],
                "metadata_file": str(storage_path / "metadata.json"),
            }
        )

        return complete_metadata

    def _get_doi_source_value(self, doi_result: Optional[Dict]) -> Optional[str]:
        """Get normalized DOI source value.

        Maps the free-form ``doi_result["source"]`` onto "crossref",
        "semantic_scholar", "pubmed" or "openalex" by case-insensitive
        substring match; an unrecognized source is returned unchanged and a
        missing one yields None.
        """
        if not isinstance(doi_result, dict):
            return None

        source = doi_result.get("source")
        if not source:
            return None

        lowered = str(source).lower()
        for needle, canonical in self._DOI_SOURCE_ALIASES:
            if needle in lowered:
                return canonical
        return source

    def _add_standard_fields(self, complete_metadata: Dict) -> None:
        """Add standard fields with None defaults.

        Mutates ``complete_metadata`` in place so every standard key exists,
        and logs the keys that are absent or None.
        """
        standard_fields = (
            "keywords",
            "references",
            "venue",
            "publisher",
            "volume",
            "issue",
            "pages",
            "issn",
            "short_journal",
        )

        missing_fields = []
        for field in standard_fields:
            if complete_metadata.get(field) is None:
                complete_metadata[field] = None
                missing_fields.append(field)

        if missing_fields:
            logger.info(
                f"Missing fields for future enhancement: {', '.join(missing_fields)}"
            )

    def _is_title_similar(
        self, title1: str, title2: str, threshold: float = 0.7
    ) -> bool:
        """Check if two titles are similar enough to be considered the same paper.

        Titles are lower-cased, punctuation is replaced by spaces, and the
        Jaccard similarity of the resulting word sets is compared against
        ``threshold``. Empty or non-string titles never match.
        """
        if not title1 or not title2:
            return False
        if not isinstance(title1, str) or not isinstance(title2, str):
            return False

        def normalize_title(title: str) -> str:
            title = title.lower()
            title = re.sub(r"[^\w\s]", " ", title)
            title = re.sub(r"\s+", " ", title)
            return title.strip()

        words1 = set(normalize_title(title1).split())
        words2 = set(normalize_title(title2).split())

        if not words1 or not words2:
            return False

        similarity = len(words1 & words2) / len(words1 | words2)
        return similarity >= threshold
374
+
375
+
376
+ # EOF
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env python3
2
+ # Timestamp: "2026-01-24 (ywatanabe)"
3
+ # File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/storage/_mixins/_storage_helpers.py
4
+
5
+ """
6
+ Storage helper mixin for LibraryManager.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from datetime import datetime
13
+ from typing import TYPE_CHECKING, Dict, Optional
14
+
15
+ from scitex import logging
16
+
17
+ if TYPE_CHECKING:
18
+ from scitex.scholar.core.Paper import Paper
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class StorageHelpersMixin:
    """Mixin providing storage helper methods.

    Every method resolves a paper's files under
    ``self.library_master_dir / paper_id``; ``library_master_dir`` is
    provided by the host class, not by this mixin.
    """

    def has_metadata(self, paper_id: str) -> bool:
        """Check if metadata.json exists for paper."""
        metadata_file = self.library_master_dir / paper_id / "metadata.json"
        return metadata_file.exists()

    def has_urls(self, paper_id: str) -> bool:
        """Check if PDF URLs exist in metadata.

        Returns True only when ``metadata.url.pdfs`` in the paper's
        ``metadata.json`` is a non-empty collection. A missing, unreadable
        or unexpectedly shaped file yields False.
        """
        if not self.has_metadata(paper_id):
            return False

        metadata_file = self.library_master_dir / paper_id / "metadata.json"
        try:
            with open(metadata_file, encoding="utf-8") as f:
                data = json.load(f)

            urls = data.get("metadata", {}).get("url", {}).get("pdfs", [])
            return len(urls) > 0
        except (OSError, ValueError, AttributeError, TypeError):
            # Unreadable file, invalid JSON, or a document whose nesting is
            # not dict/dict/sized: treated as "no URLs" by design.
            return False

    def has_pdf(self, paper_id: str) -> bool:
        """Check if PDF file exists in storage."""
        paper_dir = self.library_master_dir / paper_id
        if not paper_dir.exists():
            return False

        # Stops at the first hit instead of listing every PDF.
        return any(paper_dir.glob("*.pdf"))

    def load_paper_from_id(self, paper_id: str) -> Optional["Paper"]:
        """Load Paper object from storage by ID.

        Returns None when ``metadata.json`` does not exist or cannot be
        turned into a ``Paper`` (the failure is logged).
        """
        # Imported here rather than at module level (the module imports
        # Paper only under TYPE_CHECKING).
        from scitex.scholar.core.Paper import Paper

        metadata_file = self.library_master_dir / paper_id / "metadata.json"

        if not metadata_file.exists():
            return None

        try:
            with open(metadata_file, encoding="utf-8") as f:
                data = json.load(f)

            return Paper.from_dict(data)

        except Exception as e:
            logger.error(f"Failed to load paper {paper_id}: {e}")
            return None

    def save_paper_incremental(self, paper_id: str, paper: "Paper") -> None:
        """Save Paper object to storage (incremental update).

        Merges ``paper.model_dump()`` into any existing ``metadata.json``
        (see ``_merge_metadata``), stamps ``container.updated_at`` with the
        current time and writes the result back as UTF-8 JSON. Existing
        metadata that cannot be read as a JSON object is logged and
        replaced.
        """
        storage_path = self.library_master_dir / paper_id
        storage_path.mkdir(parents=True, exist_ok=True)

        metadata_file = storage_path / "metadata.json"

        existing_data = {}
        if metadata_file.exists():
            try:
                with open(metadata_file, encoding="utf-8") as f:
                    loaded = json.load(f)
            except Exception as e:
                # Still best-effort, but no longer silent: the old content
                # is about to be overwritten.
                logger.warning(
                    f"Ignoring unreadable metadata for {paper_id} ({metadata_file}): {e}"
                )
            else:
                if isinstance(loaded, dict):
                    existing_data = loaded
                else:
                    logger.warning(
                        f"Ignoring non-object metadata for {paper_id} ({metadata_file})"
                    )

        new_data = paper.model_dump()
        merged_data = self._merge_metadata(existing_data, new_data)

        if not isinstance(merged_data.get("container"), dict):
            merged_data["container"] = {}
        merged_data["container"]["updated_at"] = datetime.now().isoformat()

        # Serialize before opening: "w" truncates the file, so a value json
        # cannot encode must fail before the old metadata is destroyed.
        payload = json.dumps(merged_data, indent=2, ensure_ascii=False)
        # ensure_ascii=False emits raw non-ASCII text, so the encoding must
        # be explicit instead of depending on the process locale.
        with open(metadata_file, "w", encoding="utf-8") as f:
            f.write(payload)

        logger.debug(f"Saved paper {paper_id} to storage")

    def _merge_metadata(self, existing: Dict, new: Dict) -> Dict:
        """Recursively merge metadata dicts, preferring new non-empty values.

        Rules, per key of ``new``:
        - key absent from ``existing``: the new value is taken as is;
        - both values are dicts: they are merged recursively;
        - otherwise the new value replaces the old one only when it is
          truthy, so None, "", 0, False and empty containers never
          overwrite stored data.

        ``existing`` itself is not modified; a shallow copy is returned.
        """
        result = existing.copy()

        for key, new_value in new.items():
            if key not in result:
                result[key] = new_value
            elif isinstance(new_value, dict) and isinstance(result[key], dict):
                result[key] = self._merge_metadata(result[key], new_value)
            elif new_value:
                result[key] = new_value

        return result
119
+
120
+
121
+ # EOF