scitex 2.14.0__py3-none-any.whl → 2.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. scitex/__init__.py +47 -0
  2. scitex/_env_loader.py +156 -0
  3. scitex/_mcp_resources/__init__.py +37 -0
  4. scitex/_mcp_resources/_cheatsheet.py +135 -0
  5. scitex/_mcp_resources/_figrecipe.py +138 -0
  6. scitex/_mcp_resources/_formats.py +102 -0
  7. scitex/_mcp_resources/_modules.py +337 -0
  8. scitex/_mcp_resources/_session.py +149 -0
  9. scitex/_mcp_tools/__init__.py +4 -0
  10. scitex/_mcp_tools/audio.py +66 -0
  11. scitex/_mcp_tools/diagram.py +11 -95
  12. scitex/_mcp_tools/introspect.py +191 -0
  13. scitex/_mcp_tools/plt.py +260 -305
  14. scitex/_mcp_tools/scholar.py +74 -0
  15. scitex/_mcp_tools/social.py +244 -0
  16. scitex/_mcp_tools/writer.py +21 -204
  17. scitex/ai/_gen_ai/_PARAMS.py +10 -7
  18. scitex/ai/classification/reporters/_SingleClassificationReporter.py +45 -1603
  19. scitex/ai/classification/reporters/_mixins/__init__.py +36 -0
  20. scitex/ai/classification/reporters/_mixins/_constants.py +67 -0
  21. scitex/ai/classification/reporters/_mixins/_cv_summary.py +387 -0
  22. scitex/ai/classification/reporters/_mixins/_feature_importance.py +119 -0
  23. scitex/ai/classification/reporters/_mixins/_metrics.py +275 -0
  24. scitex/ai/classification/reporters/_mixins/_plotting.py +179 -0
  25. scitex/ai/classification/reporters/_mixins/_reports.py +153 -0
  26. scitex/ai/classification/reporters/_mixins/_storage.py +160 -0
  27. scitex/audio/README.md +40 -36
  28. scitex/audio/__init__.py +127 -59
  29. scitex/audio/_branding.py +185 -0
  30. scitex/audio/_mcp/__init__.py +32 -0
  31. scitex/audio/_mcp/handlers.py +59 -6
  32. scitex/audio/_mcp/speak_handlers.py +238 -0
  33. scitex/audio/_relay.py +225 -0
  34. scitex/audio/engines/elevenlabs_engine.py +6 -1
  35. scitex/audio/mcp_server.py +228 -75
  36. scitex/canvas/README.md +1 -1
  37. scitex/canvas/editor/_dearpygui/__init__.py +25 -0
  38. scitex/canvas/editor/_dearpygui/_editor.py +147 -0
  39. scitex/canvas/editor/_dearpygui/_handlers.py +476 -0
  40. scitex/canvas/editor/_dearpygui/_panels/__init__.py +17 -0
  41. scitex/canvas/editor/_dearpygui/_panels/_control.py +119 -0
  42. scitex/canvas/editor/_dearpygui/_panels/_element_controls.py +190 -0
  43. scitex/canvas/editor/_dearpygui/_panels/_preview.py +43 -0
  44. scitex/canvas/editor/_dearpygui/_panels/_sections.py +390 -0
  45. scitex/canvas/editor/_dearpygui/_plotting.py +187 -0
  46. scitex/canvas/editor/_dearpygui/_rendering.py +504 -0
  47. scitex/canvas/editor/_dearpygui/_selection.py +295 -0
  48. scitex/canvas/editor/_dearpygui/_state.py +93 -0
  49. scitex/canvas/editor/_dearpygui/_utils.py +61 -0
  50. scitex/canvas/editor/flask_editor/templates/__init__.py +32 -70
  51. scitex/cli/__init__.py +38 -43
  52. scitex/cli/audio.py +76 -27
  53. scitex/cli/capture.py +13 -20
  54. scitex/cli/introspect.py +443 -0
  55. scitex/cli/main.py +198 -109
  56. scitex/cli/mcp.py +60 -34
  57. scitex/cli/scholar/__init__.py +8 -0
  58. scitex/cli/scholar/_crossref_scitex.py +296 -0
  59. scitex/cli/scholar/_fetch.py +25 -3
  60. scitex/cli/social.py +314 -0
  61. scitex/cli/writer.py +117 -0
  62. scitex/config/README.md +1 -1
  63. scitex/config/__init__.py +16 -2
  64. scitex/config/_env_registry.py +191 -0
  65. scitex/diagram/__init__.py +42 -19
  66. scitex/diagram/mcp_server.py +13 -125
  67. scitex/introspect/__init__.py +75 -0
  68. scitex/introspect/_call_graph.py +303 -0
  69. scitex/introspect/_class_hierarchy.py +163 -0
  70. scitex/introspect/_core.py +42 -0
  71. scitex/introspect/_docstring.py +131 -0
  72. scitex/introspect/_examples.py +113 -0
  73. scitex/introspect/_imports.py +271 -0
  74. scitex/introspect/_mcp/__init__.py +37 -0
  75. scitex/introspect/_mcp/handlers.py +208 -0
  76. scitex/introspect/_members.py +151 -0
  77. scitex/introspect/_resolve.py +89 -0
  78. scitex/introspect/_signature.py +131 -0
  79. scitex/introspect/_source.py +80 -0
  80. scitex/introspect/_type_hints.py +172 -0
  81. scitex/io/bundle/README.md +1 -1
  82. scitex/mcp_server.py +98 -5
  83. scitex/plt/__init__.py +248 -550
  84. scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +5 -10
  85. scitex/plt/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  86. scitex/plt/gallery/README.md +1 -1
  87. scitex/plt/utils/_hitmap/__init__.py +82 -0
  88. scitex/plt/utils/_hitmap/_artist_extraction.py +343 -0
  89. scitex/plt/utils/_hitmap/_color_application.py +346 -0
  90. scitex/plt/utils/_hitmap/_color_conversion.py +121 -0
  91. scitex/plt/utils/_hitmap/_constants.py +40 -0
  92. scitex/plt/utils/_hitmap/_hitmap_core.py +334 -0
  93. scitex/plt/utils/_hitmap/_path_extraction.py +357 -0
  94. scitex/plt/utils/_hitmap/_query.py +113 -0
  95. scitex/plt/utils/_hitmap.py +46 -1616
  96. scitex/plt/utils/_metadata/__init__.py +80 -0
  97. scitex/plt/utils/_metadata/_artists/__init__.py +25 -0
  98. scitex/plt/utils/_metadata/_artists/_base.py +195 -0
  99. scitex/plt/utils/_metadata/_artists/_collections.py +356 -0
  100. scitex/plt/utils/_metadata/_artists/_extract.py +57 -0
  101. scitex/plt/utils/_metadata/_artists/_images.py +80 -0
  102. scitex/plt/utils/_metadata/_artists/_lines.py +261 -0
  103. scitex/plt/utils/_metadata/_artists/_patches.py +247 -0
  104. scitex/plt/utils/_metadata/_artists/_text.py +106 -0
  105. scitex/plt/utils/_metadata/_csv.py +416 -0
  106. scitex/plt/utils/_metadata/_detect.py +225 -0
  107. scitex/plt/utils/_metadata/_legend.py +127 -0
  108. scitex/plt/utils/_metadata/_rounding.py +117 -0
  109. scitex/plt/utils/_metadata/_verification.py +202 -0
  110. scitex/schema/README.md +1 -1
  111. scitex/scholar/__init__.py +8 -0
  112. scitex/scholar/_mcp/crossref_handlers.py +265 -0
  113. scitex/scholar/core/Scholar.py +63 -1700
  114. scitex/scholar/core/_mixins/__init__.py +36 -0
  115. scitex/scholar/core/_mixins/_enrichers.py +270 -0
  116. scitex/scholar/core/_mixins/_library_handlers.py +100 -0
  117. scitex/scholar/core/_mixins/_loaders.py +103 -0
  118. scitex/scholar/core/_mixins/_pdf_download.py +375 -0
  119. scitex/scholar/core/_mixins/_pipeline.py +312 -0
  120. scitex/scholar/core/_mixins/_project_handlers.py +125 -0
  121. scitex/scholar/core/_mixins/_savers.py +69 -0
  122. scitex/scholar/core/_mixins/_search.py +103 -0
  123. scitex/scholar/core/_mixins/_services.py +88 -0
  124. scitex/scholar/core/_mixins/_url_finding.py +105 -0
  125. scitex/scholar/crossref_scitex.py +367 -0
  126. scitex/scholar/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  127. scitex/scholar/examples/00_run_all.sh +120 -0
  128. scitex/scholar/jobs/_executors.py +27 -3
  129. scitex/scholar/pdf_download/ScholarPDFDownloader.py +38 -416
  130. scitex/scholar/pdf_download/_cli.py +154 -0
  131. scitex/scholar/pdf_download/strategies/__init__.py +11 -8
  132. scitex/scholar/pdf_download/strategies/manual_download_fallback.py +80 -3
  133. scitex/scholar/pipelines/ScholarPipelineBibTeX.py +73 -121
  134. scitex/scholar/pipelines/ScholarPipelineParallel.py +80 -138
  135. scitex/scholar/pipelines/ScholarPipelineSingle.py +43 -63
  136. scitex/scholar/pipelines/_single_steps.py +71 -36
  137. scitex/scholar/storage/_LibraryManager.py +97 -1695
  138. scitex/scholar/storage/_mixins/__init__.py +30 -0
  139. scitex/scholar/storage/_mixins/_bibtex_handlers.py +128 -0
  140. scitex/scholar/storage/_mixins/_library_operations.py +218 -0
  141. scitex/scholar/storage/_mixins/_metadata_conversion.py +226 -0
  142. scitex/scholar/storage/_mixins/_paper_saving.py +456 -0
  143. scitex/scholar/storage/_mixins/_resolution.py +376 -0
  144. scitex/scholar/storage/_mixins/_storage_helpers.py +121 -0
  145. scitex/scholar/storage/_mixins/_symlink_handlers.py +226 -0
  146. scitex/scholar/url_finder/.tmp/open_url/KNOWN_RESOLVERS.py +462 -0
  147. scitex/scholar/url_finder/.tmp/open_url/README.md +223 -0
  148. scitex/scholar/url_finder/.tmp/open_url/_DOIToURLResolver.py +694 -0
  149. scitex/scholar/url_finder/.tmp/open_url/_OpenURLResolver.py +1160 -0
  150. scitex/scholar/url_finder/.tmp/open_url/_ResolverLinkFinder.py +344 -0
  151. scitex/scholar/url_finder/.tmp/open_url/__init__.py +24 -0
  152. scitex/security/README.md +3 -3
  153. scitex/session/README.md +1 -1
  154. scitex/sh/README.md +1 -1
  155. scitex/social/__init__.py +153 -0
  156. scitex/social/docs/EXTERNAL_PACKAGE_BRANDING.md +149 -0
  157. scitex/template/README.md +1 -1
  158. scitex/template/clone_writer_directory.py +5 -5
  159. scitex/writer/README.md +1 -1
  160. scitex/writer/_mcp/handlers.py +11 -744
  161. scitex/writer/_mcp/tool_schemas.py +5 -335
  162. scitex-2.15.1.dist-info/METADATA +648 -0
  163. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/RECORD +166 -111
  164. scitex/canvas/editor/flask_editor/templates/_scripts.py +0 -4933
  165. scitex/canvas/editor/flask_editor/templates/_styles.py +0 -1658
  166. scitex/dev/plt/data/mpl/PLOTTING_FUNCTIONS.yaml +0 -90
  167. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES.yaml +0 -1571
  168. scitex/dev/plt/data/mpl/PLOTTING_SIGNATURES_DETAILED.yaml +0 -6262
  169. scitex/dev/plt/data/mpl/SIGNATURES_FLATTENED.yaml +0 -1274
  170. scitex/dev/plt/data/mpl/dir_ax.txt +0 -459
  171. scitex/diagram/_compile.py +0 -312
  172. scitex/diagram/_diagram.py +0 -355
  173. scitex/diagram/_mcp/__init__.py +0 -4
  174. scitex/diagram/_mcp/handlers.py +0 -400
  175. scitex/diagram/_mcp/tool_schemas.py +0 -157
  176. scitex/diagram/_presets.py +0 -173
  177. scitex/diagram/_schema.py +0 -182
  178. scitex/diagram/_split.py +0 -278
  179. scitex/plt/_mcp/__init__.py +0 -4
  180. scitex/plt/_mcp/_handlers_annotation.py +0 -102
  181. scitex/plt/_mcp/_handlers_figure.py +0 -195
  182. scitex/plt/_mcp/_handlers_plot.py +0 -252
  183. scitex/plt/_mcp/_handlers_style.py +0 -219
  184. scitex/plt/_mcp/handlers.py +0 -74
  185. scitex/plt/_mcp/tool_schemas.py +0 -497
  186. scitex/plt/mcp_server.py +0 -231
  187. scitex/scholar/data/.gitkeep +0 -0
  188. scitex/scholar/data/README.md +0 -44
  189. scitex/scholar/data/bib_files/bibliography.bib +0 -1952
  190. scitex/scholar/data/bib_files/neurovista.bib +0 -277
  191. scitex/scholar/data/bib_files/neurovista_enriched.bib +0 -441
  192. scitex/scholar/data/bib_files/neurovista_enriched_enriched.bib +0 -441
  193. scitex/scholar/data/bib_files/neurovista_processed.bib +0 -338
  194. scitex/scholar/data/bib_files/openaccess.bib +0 -89
  195. scitex/scholar/data/bib_files/pac-seizure_prediction_enriched.bib +0 -2178
  196. scitex/scholar/data/bib_files/pac.bib +0 -698
  197. scitex/scholar/data/bib_files/pac_enriched.bib +0 -1061
  198. scitex/scholar/data/bib_files/pac_processed.bib +0 -0
  199. scitex/scholar/data/bib_files/pac_titles.txt +0 -75
  200. scitex/scholar/data/bib_files/paywalled.bib +0 -98
  201. scitex/scholar/data/bib_files/related-papers-by-coauthors.bib +0 -58
  202. scitex/scholar/data/bib_files/related-papers-by-coauthors_enriched.bib +0 -87
  203. scitex/scholar/data/bib_files/seizure_prediction.bib +0 -694
  204. scitex/scholar/data/bib_files/seizure_prediction_processed.bib +0 -0
  205. scitex/scholar/data/bib_files/test_complete_enriched.bib +0 -437
  206. scitex/scholar/data/bib_files/test_final_enriched.bib +0 -437
  207. scitex/scholar/data/bib_files/test_seizure.bib +0 -46
  208. scitex/scholar/data/impact_factor/JCR_IF_2022.xlsx +0 -0
  209. scitex/scholar/data/impact_factor/JCR_IF_2024.db +0 -0
  210. scitex/scholar/data/impact_factor/JCR_IF_2024.xlsx +0 -0
  211. scitex/scholar/data/impact_factor/JCR_IF_2024_v01.db +0 -0
  212. scitex/scholar/data/impact_factor.db +0 -0
  213. scitex/scholar/examples/SUGGESTIONS.md +0 -865
  214. scitex/scholar/examples/dev.py +0 -38
  215. scitex-2.14.0.dist-info/METADATA +0 -1238
  216. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/WHEEL +0 -0
  217. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/entry_points.txt +0 -0
  218. {scitex-2.14.0.dist-info → scitex-2.15.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,37 +1,110 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # Timestamp: "2025-10-08 05:41:15 (ywatanabe)"
4
- # File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/storage/_LibraryManager.py
5
- # ----------------------------------------
6
- from __future__ import annotations
7
- import os
8
-
9
- __FILE__ = "./src/scitex/scholar/storage/_LibraryManager.py"
10
- __DIR__ = os.path.dirname(__FILE__)
11
- # ----------------------------------------
2
+ # Timestamp: "2026-01-24 (ywatanabe)"
3
+ # File: /home/ywatanabe/proj/scitex-python/src/scitex/scholar/storage/_LibraryManager.py
4
+
5
+ """
6
+ Unified manager for Scholar library structure and paper storage.
7
+
8
+ This module provides a comprehensive library manager that:
9
+ - Manages paper storage in the master library
10
+ - Handles metadata conversion and standardization
11
+ - Creates project symlinks for organization
12
+ - Generates BibTeX entries and structures
13
+ - Resolves DOIs and updates library metadata
14
+
15
+ The main class inherits from multiple mixins for modular functionality:
16
+ - StorageHelpersMixin: Storage helper methods (has_*, load, save)
17
+ - MetadataConversionMixin: Metadata conversion utilities
18
+ - PaperSavingMixin: Paper saving methods
19
+ - ResolutionMixin: DOI resolution and library structure creation
20
+ - SymlinkHandlersMixin: Symlink generation and management
21
+ - BibtexHandlersMixin: BibTeX structure and entry generation
22
+ - LibraryOperationsMixin: Library operations (update, validate)
23
+ """
12
24
 
13
- __FILE__ = __file__
25
+ from __future__ import annotations
14
26
 
15
- import asyncio
16
- import copy
17
- import json
18
- import re
19
- from collections import OrderedDict
20
- from datetime import datetime
21
- from pathlib import Path
22
- from typing import Any, Dict, List, Optional
27
+ from typing import Optional
23
28
 
24
29
  from scitex import logging
25
30
  from scitex.scholar.config import ScholarConfig
26
- from scitex.scholar.metadata_engines.utils import BASE_STRUCTURE, standardize_metadata
27
31
  from scitex.scholar.storage._DeduplicationManager import DeduplicationManager
28
- from scitex.scholar.utils import TextNormalizer
32
+
33
+ from ._mixins import (
34
+ BibtexHandlersMixin,
35
+ LibraryOperationsMixin,
36
+ MetadataConversionMixin,
37
+ PaperSavingMixin,
38
+ ResolutionMixin,
39
+ StorageHelpersMixin,
40
+ SymlinkHandlersMixin,
41
+ )
29
42
 
30
43
  logger = logging.getLogger(__name__)
31
44
 
32
45
 
33
- class LibraryManager:
34
- """Unified manager for Scholar library structure and paper storage."""
46
+ class LibraryManager(
47
+ StorageHelpersMixin,
48
+ MetadataConversionMixin,
49
+ PaperSavingMixin,
50
+ ResolutionMixin,
51
+ SymlinkHandlersMixin,
52
+ BibtexHandlersMixin,
53
+ LibraryOperationsMixin,
54
+ ):
55
+ """
56
+ Unified manager for Scholar library structure and paper storage.
57
+
58
+ This class provides comprehensive functionality for managing the Scholar
59
+ library, including:
60
+ - Storage helpers (check metadata, URLs, PDFs)
61
+ - Paper loading and incremental saving
62
+ - Metadata conversion to standardized format
63
+ - DOI resolution and library structure creation
64
+ - Project symlink management
65
+ - BibTeX entry generation
66
+ - Library validation and updates
67
+
68
+ Parameters
69
+ ----------
70
+ project : str, optional
71
+ Project name for organizing papers
72
+ single_doi_resolver : object, optional
73
+ DOI resolver instance for resolving DOIs from metadata
74
+ config : ScholarConfig, optional
75
+ Configuration object for Scholar settings
76
+
77
+ Attributes
78
+ ----------
79
+ config : ScholarConfig
80
+ Configuration object
81
+ project : str
82
+ Current project name
83
+ library_master_dir : Path
84
+ Path to the master library directory
85
+ single_doi_resolver : object
86
+ DOI resolver instance
87
+ dedup_manager : DeduplicationManager
88
+ Deduplication manager instance
89
+
90
+ Examples
91
+ --------
92
+ >>> # Basic usage
93
+ >>> manager = LibraryManager(project="my_research")
94
+ >>> paper_id = manager.save_resolved_paper(
95
+ ... title="My Paper",
96
+ ... doi="10.1234/example",
97
+ ... authors=["Author One", "Author Two"],
98
+ ... year=2024,
99
+ ... )
100
+
101
+ >>> # Check if paper exists
102
+ >>> has_pdf = manager.has_pdf(paper_id)
103
+ >>> has_meta = manager.has_metadata(paper_id)
104
+
105
+ >>> # Load paper
106
+ >>> paper = manager.load_paper_from_id(paper_id)
107
+ """
35
108
 
36
109
  def __init__(
37
110
  self,
@@ -47,1679 +120,8 @@ class LibraryManager:
47
120
  self._source_filename = "papers"
48
121
  self.dedup_manager = DeduplicationManager(config=self.config)
49
122
 
50
- # =========================================================================
51
- # Storage Helper Methods (Phase 1)
52
- # =========================================================================
53
-
54
- def has_metadata(self, paper_id: str) -> bool:
55
- """Check if metadata.json exists for paper.
56
-
57
- Args:
58
- paper_id: 8-digit paper ID
59
-
60
- Returns:
61
- True if metadata.json exists, False otherwise
62
- """
63
- metadata_file = self.library_master_dir / paper_id / "metadata.json"
64
- return metadata_file.exists()
65
-
66
- def has_urls(self, paper_id: str) -> bool:
67
- """Check if PDF URLs exist in metadata.
68
-
69
- Args:
70
- paper_id: 8-digit paper ID
71
-
72
- Returns:
73
- True if metadata has PDF URLs, False otherwise
74
- """
75
- if not self.has_metadata(paper_id):
76
- return False
77
-
78
- metadata_file = self.library_master_dir / paper_id / "metadata.json"
79
- try:
80
- with open(metadata_file, "r") as f:
81
- data = json.load(f)
82
-
83
- # Check nested structure: metadata.url.pdfs
84
- urls = data.get("metadata", {}).get("url", {}).get("pdfs", [])
85
- return len(urls) > 0
86
- except Exception:
87
- return False
88
-
89
- def has_pdf(self, paper_id: str) -> bool:
90
- """Check if PDF file exists in storage.
91
-
92
- Args:
93
- paper_id: 8-digit paper ID
94
-
95
- Returns:
96
- True if any PDF file exists, False otherwise
97
- """
98
- paper_dir = self.library_master_dir / paper_id
99
- if not paper_dir.exists():
100
- return False
101
-
102
- # Check for any PDF files
103
- pdf_files = list(paper_dir.glob("*.pdf"))
104
- return len(pdf_files) > 0
105
-
106
- def load_paper_from_id(self, paper_id: str) -> Optional["Paper"]:
107
- """Load Paper object from storage by ID.
108
-
109
- Args:
110
- paper_id: 8-digit paper ID
111
-
112
- Returns:
113
- Paper object if found, None otherwise
114
- """
115
- from scitex.scholar.core.Paper import Paper
116
-
117
- metadata_file = self.library_master_dir / paper_id / "metadata.json"
118
-
119
- if not metadata_file.exists():
120
- return None
121
-
122
- try:
123
- with open(metadata_file, "r") as f:
124
- data = json.load(f)
125
-
126
- # Use Paper.from_dict() which handles Pydantic validation
127
- paper = Paper.from_dict(data)
128
- return paper
129
-
130
- except Exception as e:
131
- logger.error(f"Failed to load paper {paper_id}: {e}")
132
- return None
133
-
134
- def save_paper_incremental(self, paper_id: str, paper: "Paper") -> None:
135
- """Save Paper object to storage (incremental update).
136
-
137
- This saves the complete Paper object to metadata.json,
138
- preserving existing data and updating with new fields.
139
-
140
- Args:
141
- paper_id: 8-digit paper ID
142
- paper: Paper object to save
143
- """
144
- storage_path = self.library_master_dir / paper_id
145
- storage_path.mkdir(parents=True, exist_ok=True)
146
-
147
- metadata_file = storage_path / "metadata.json"
148
-
149
- # Load existing metadata if it exists
150
- existing_data = {}
151
- if metadata_file.exists():
152
- try:
153
- with open(metadata_file, "r") as f:
154
- existing_data = json.load(f)
155
- except Exception:
156
- pass
157
-
158
- # Get new data from Paper object
159
- new_data = paper.model_dump()
160
-
161
- # Merge: new data takes precedence for non-None values
162
- merged_data = self._merge_metadata(existing_data, new_data)
163
-
164
- # Update timestamps
165
- if "container" not in merged_data:
166
- merged_data["container"] = {}
167
- merged_data["container"]["updated_at"] = datetime.now().isoformat()
168
-
169
- # Save to file
170
- with open(metadata_file, "w") as f:
171
- json.dump(merged_data, f, indent=2, ensure_ascii=False)
172
-
173
- logger.debug(f"Saved paper {paper_id} to storage")
174
-
175
- def _merge_metadata(self, existing: Dict, new: Dict) -> Dict:
176
- """Recursively merge metadata dicts, preferring new non-None values."""
177
- result = existing.copy()
178
-
179
- for key, new_value in new.items():
180
- if key not in result:
181
- result[key] = new_value
182
- elif new_value is None:
183
- # Keep existing value if new is None
184
- pass
185
- elif isinstance(new_value, dict) and isinstance(result[key], dict):
186
- # Recursively merge nested dicts
187
- result[key] = self._merge_metadata(result[key], new_value)
188
- elif isinstance(new_value, list) and len(new_value) > 0:
189
- # Update lists if new list is not empty
190
- result[key] = new_value
191
- elif new_value:
192
- # Update with new non-empty value
193
- result[key] = new_value
194
-
195
- return result
196
-
197
- # =========================================================================
198
- # Existing Methods
199
- # =========================================================================
200
-
201
- def _dotdict_to_dict(self, obj):
202
- """Recursively convert DotDict to plain dict for JSON serialization."""
203
- from scitex.dict import DotDict
204
-
205
- if isinstance(obj, DotDict):
206
- return {k: self._dotdict_to_dict(v) for k, v in obj._data.items()}
207
- elif isinstance(obj, dict):
208
- return {k: self._dotdict_to_dict(v) for k, v in obj.items()}
209
- elif isinstance(obj, list):
210
- return [self._dotdict_to_dict(item) for item in obj]
211
- else:
212
- return obj
213
-
214
- def _add_engine_to_list(self, engines_list: list, source: str) -> None:
215
- """Helper to add source to engines list if not already present."""
216
- if source and source not in engines_list:
217
- engines_list.append(source)
218
-
219
- def _convert_to_standardized_metadata(self, flat_metadata: Dict) -> OrderedDict:
220
- """Convert flat metadata dict to standardized nested structure with _engines tracking."""
221
- standardized = copy.deepcopy(BASE_STRUCTURE)
222
-
223
- # Map flat fields to standardized structure
224
- # ID section
225
- if "doi" in flat_metadata:
226
- standardized["id"]["doi"] = flat_metadata["doi"]
227
- self._add_engine_to_list(
228
- standardized["id"]["doi_engines"],
229
- flat_metadata.get("doi_source"),
230
- )
231
- if "scitex_id" in flat_metadata:
232
- standardized["id"]["scholar_id"] = flat_metadata["scitex_id"]
233
-
234
- # Basic section
235
- if "title" in flat_metadata:
236
- standardized["basic"]["title"] = flat_metadata["title"]
237
- self._add_engine_to_list(
238
- standardized["basic"]["title_engines"],
239
- flat_metadata.get("title_source"),
240
- )
241
- if "authors" in flat_metadata:
242
- standardized["basic"]["authors"] = flat_metadata["authors"]
243
- self._add_engine_to_list(
244
- standardized["basic"]["authors_engines"],
245
- flat_metadata.get("authors_source"),
246
- )
247
- if "year" in flat_metadata:
248
- standardized["basic"]["year"] = flat_metadata["year"]
249
- self._add_engine_to_list(
250
- standardized["basic"]["year_engines"],
251
- flat_metadata.get("year_source"),
252
- )
253
- if "abstract" in flat_metadata:
254
- standardized["basic"]["abstract"] = flat_metadata["abstract"]
255
- self._add_engine_to_list(
256
- standardized["basic"]["abstract_engines"],
257
- flat_metadata.get("abstract_source"),
258
- )
259
-
260
- # Citation count section
261
- if "citation_count" in flat_metadata:
262
- cc_value = flat_metadata["citation_count"]
263
- # Handle both scalar (4) and dict ({"total": 4}) formats
264
- if isinstance(cc_value, dict):
265
- # If it's a dict, extract the total value
266
- standardized["citation_count"]["total"] = cc_value.get("total")
267
- self._add_engine_to_list(
268
- standardized["citation_count"]["total_engines"],
269
- cc_value.get("total_source"),
270
- )
271
- # Copy yearly breakdowns if present
272
- for year in [
273
- "2025",
274
- "2024",
275
- "2023",
276
- "2022",
277
- "2021",
278
- "2020",
279
- "2019",
280
- "2018",
281
- "2017",
282
- "2016",
283
- "2015",
284
- ]:
285
- if year in cc_value:
286
- standardized["citation_count"][year] = cc_value[year]
287
- if f"{year}_source" in cc_value:
288
- self._add_engine_to_list(
289
- standardized["citation_count"][f"{year}_engines"],
290
- cc_value.get(f"{year}_source"),
291
- )
292
- else:
293
- # If it's a scalar, just assign it to total
294
- standardized["citation_count"]["total"] = cc_value
295
- self._add_engine_to_list(
296
- standardized["citation_count"]["total_engines"],
297
- flat_metadata.get("citation_count_source"),
298
- )
299
-
300
- # Publication section
301
- if "journal" in flat_metadata:
302
- standardized["publication"]["journal"] = flat_metadata["journal"]
303
- self._add_engine_to_list(
304
- standardized["publication"]["journal_engines"],
305
- flat_metadata.get("journal_source"),
306
- )
307
- if "short_journal" in flat_metadata:
308
- standardized["publication"]["short_journal"] = flat_metadata[
309
- "short_journal"
310
- ]
311
- if "impact_factor" in flat_metadata:
312
- standardized["publication"]["impact_factor"] = flat_metadata[
313
- "impact_factor"
314
- ]
315
- if "issn" in flat_metadata:
316
- standardized["publication"]["issn"] = flat_metadata["issn"]
317
- if "volume" in flat_metadata:
318
- standardized["publication"]["volume"] = flat_metadata["volume"]
319
- if "issue" in flat_metadata:
320
- standardized["publication"]["issue"] = flat_metadata["issue"]
321
- if "pages" in flat_metadata:
322
- # Split pages into first_page and last_page if needed
323
- pages = flat_metadata["pages"]
324
- if pages and "-" in str(pages):
325
- first, last = str(pages).split("-", 1)
326
- standardized["publication"]["first_page"] = first.strip()
327
- standardized["publication"]["last_page"] = last.strip()
328
- if "publisher" in flat_metadata:
329
- standardized["publication"]["publisher"] = flat_metadata["publisher"]
330
-
331
- # URL section
332
- if "url_doi" in flat_metadata:
333
- standardized["url"]["doi"] = flat_metadata["url_doi"]
334
- if "url_publisher" in flat_metadata:
335
- standardized["url"]["publisher"] = flat_metadata["url_publisher"]
336
- self._add_engine_to_list(
337
- standardized["url"]["publisher_engines"], "ScholarURLFinder"
338
- )
339
- if "url_openurl_query" in flat_metadata:
340
- standardized["url"]["openurl_query"] = flat_metadata["url_openurl_query"]
341
- if "url_openurl_resolved" in flat_metadata:
342
- standardized["url"]["openurl_resolved"] = flat_metadata[
343
- "url_openurl_resolved"
344
- ]
345
- self._add_engine_to_list(
346
- standardized["url"]["openurl_resolved_engines"],
347
- "ScholarURLFinder",
348
- )
349
- if "urls_pdf" in flat_metadata:
350
- standardized["url"]["pdfs"] = flat_metadata["urls_pdf"]
351
- self._add_engine_to_list(
352
- standardized["url"]["pdfs_engines"], "ScholarURLFinder"
353
- )
354
-
355
- # Path section
356
- if "pdf_path" in flat_metadata:
357
- standardized["path"]["pdfs"] = [flat_metadata["pdf_path"]]
358
- self._add_engine_to_list(
359
- standardized["path"]["pdfs_engines"],
360
- "ScholarPDFDownloaderWithScreenshotsParallel",
361
- )
362
-
363
- return standardized
364
-
365
- def _call_path_manager_get_storage_paths(
366
- self, paper_info: Dict, collection_name: str = "MASTER"
367
- ) -> Dict[str, Any]:
368
- """Helper to call PathManager's get_paper_storage_paths with proper parameters."""
369
- # Extract parameters from paper_info dict
370
- doi = paper_info.get("doi")
371
- title = paper_info.get("title")
372
- authors = paper_info.get("authors", [])
373
- year = paper_info.get("year")
374
- journal = paper_info.get("journal")
375
-
376
- # Call PathManager with individual parameters
377
- storage_path, readable_name, paper_id = (
378
- self.config.path_manager.get_paper_storage_paths(
379
- doi=doi,
380
- title=title,
381
- authors=authors,
382
- year=year,
383
- journal=journal,
384
- project=collection_name,
385
- )
386
- )
387
-
388
- # Return in the expected dict format
389
- return {
390
- "storage_path": storage_path,
391
- "readable_name": readable_name,
392
- "unique_id": paper_id,
393
- }
394
-
395
- def check_library_for_doi(
396
- self, title: str, year: Optional[int] = None
397
- ) -> Optional[str]:
398
- """Check if DOI already exists in master Scholar library."""
399
-
400
- try:
401
- for paper_dir in self.library_master_dir.iterdir():
402
- if not paper_dir.is_dir():
403
- continue
404
-
405
- metadata_file = paper_dir / "metadata.json"
406
- if metadata_file.exists():
407
- try:
408
- with open(metadata_file, "r") as file_:
409
- metadata = json.load(file_)
410
-
411
- stored_title = metadata.get("title", "")
412
- stored_year = metadata.get("year")
413
- stored_doi = metadata.get("doi")
414
-
415
- title_match = self._is_title_similar(title, stored_title)
416
- year_match = (
417
- not year
418
- or not stored_year
419
- or abs(int(stored_year) - int(year)) <= 1
420
- if isinstance(stored_year, (int, str))
421
- and str(stored_year).isdigit()
422
- else stored_year == year
423
- )
424
-
425
- if title_match and year_match and stored_doi:
426
- logger.info(
427
- f"DOI found in master Scholar library: {stored_doi} (paper_id: {paper_dir.name})"
428
- )
429
- return stored_doi
430
-
431
- except (
432
- json.JSONDecodeError,
433
- KeyError,
434
- ValueError,
435
- ) as exc_:
436
- logger.debug(
437
- f"Error reading metadata from {metadata_file}: {exc_}"
438
- )
439
- continue
440
-
441
- return None
442
-
443
- except Exception as exc_:
444
- logger.debug(f"Error checking master Scholar library: {exc_}")
445
- return None
446
-
447
def save_resolved_paper(
    self,
    # Can accept either a Paper object or individual fields
    paper_data: Optional["Paper"] = None,
    # Required bibliographic fields (if not providing paper_data)
    title: Optional[str] = None,
    doi: Optional[str] = None,
    # Optional bibliographic fields
    authors: Optional[List[str]] = None,
    year: Optional[int] = None,
    journal: Optional[str] = None,
    abstract: Optional[str] = None,
    # Additional bibliographic fields
    volume: Optional[str] = None,
    issue: Optional[str] = None,
    pages: Optional[str] = None,
    publisher: Optional[str] = None,
    issn: Optional[str] = None,
    short_journal: Optional[str] = None,
    # Enrichment fields
    citation_count: Optional[int] = None,
    impact_factor: Optional[float] = None,
    # Source tracking (which engine/database provided this info)
    doi_source: Optional[str] = None,
    title_source: Optional[str] = None,
    abstract_source: Optional[str] = None,
    authors_source: Optional[str] = None,
    year_source: Optional[str] = None,
    journal_source: Optional[str] = None,
    # Library management
    library_id: Optional[str] = None,
    project: Optional[str] = None,
    # Legacy support (will be removed)
    metadata: Optional[Dict] = None,
    bibtex_source: Optional[str] = None,
    source: Optional[str] = None,  # Legacy doi_source
    paper_id: Optional[str] = None,  # Legacy library_id
    **kwargs,  # For backward compatibility
) -> str:
    """Save a resolved paper to the MASTER Scholar library.

    Fields can be given individually, via ``paper_data`` (an object exposing
    ``.metadata``/``.container`` or a plain dict), or via the legacy
    ``metadata`` dict. Explicit arguments win over ``paper_data``, which wins
    over ``metadata``.

    The method asks ``self.dedup_manager`` whether the paper already exists,
    otherwise obtains a new storage location from
    ``self.config.path_manager``; writes ``metadata.json`` there; and, when
    ``self.project`` is a non-master project, creates a project symlink.
    Symlink failures are logged and do not abort the save.

    Returns:
        The paper ID: ``library_id`` when one is known, otherwise the name
        of the existing directory or the ID generated by the path manager.
    """
    # Fill every field that was not passed explicitly from paper_data.
    if paper_data is not None:
        if hasattr(paper_data, "metadata"):
            # Pydantic Paper object
            title = title or (paper_data.metadata.basic.title or "")
            doi = doi or (paper_data.metadata.id.doi or "")
            authors = authors or paper_data.metadata.basic.authors
            year = year or paper_data.metadata.basic.year
            journal = journal or paper_data.metadata.publication.journal
            abstract = abstract or paper_data.metadata.basic.abstract
            publisher = publisher or paper_data.metadata.publication.publisher
            impact_factor = (
                impact_factor or paper_data.metadata.publication.impact_factor
            )
            library_id = library_id or paper_data.container.library_id
        elif isinstance(paper_data, dict):
            # Dict paper object
            title = title or paper_data.get("title", "")
            doi = doi or paper_data.get("doi", "")
            authors = authors or paper_data.get("authors", [])
            year = year or paper_data.get("year")
            journal = journal or paper_data.get("journal")
            abstract = abstract or paper_data.get("abstract")
            publisher = publisher or paper_data.get("publisher")
            impact_factor = impact_factor or paper_data.get("impact_factor")
            library_id = (
                library_id
                or paper_data.get("scitex_id")
                or paper_data.get("scholar_id")
            )

    # Handle legacy parameters
    if paper_id and not library_id:
        library_id = paper_id
    if source and not doi_source:
        # Normalize the legacy engine label to a canonical source name.
        # This has to happen here: once doi_source is populated, no later
        # step sees the raw legacy value.
        lowered_source = source.lower()
        for needle, canonical in (
            ("crossref", "crossref"),
            ("semantic", "semantic_scholar"),
            ("pubmed", "pubmed"),
            ("openalex", "openalex"),
        ):
            if needle in lowered_source:
                doi_source = canonical
                break
        else:
            doi_source = source

    # Only use metadata dict as fallback for backward compatibility
    if metadata:
        journal = journal or metadata.get("journal")
        year = year or metadata.get("year")
        authors = authors or metadata.get("authors")

    # Check for existing paper first (deduplication)
    check_metadata = {
        "doi": doi,
        "title": title,
        "authors": authors or [],
        "year": year,
    }
    existing_paper_dir = self.dedup_manager.check_for_existing_paper(check_metadata)

    if existing_paper_dir:
        logger.info(f"Found existing paper: {existing_paper_dir.name}")
        # Update existing paper instead of creating new
        master_storage_path = existing_paper_dir
        paper_id = existing_paper_dir.name
        readable_name = None  # Will be determined from existing symlinks
    else:
        # Call PathManager with individual parameters for new paper
        storage_path, readable_name, paper_id = (
            self.config.path_manager.get_paper_storage_paths(
                doi=doi,
                title=title,
                authors=authors or [],
                year=year,
                journal=journal,
                project="MASTER",
            )
        )
        master_storage_path = storage_path

    # Use provided library_id if available, otherwise use generated paper_id
    if library_id:
        paper_id = library_id

    master_metadata_file = master_storage_path / "metadata.json"

    # Previously saved values; an unreadable or corrupt file counts as empty.
    # NOTE(review): the file written below is nested ({"metadata", "container"}),
    # while the lookups further down read flat keys from it — confirm the
    # flat lookups are still expected to hit.
    existing_metadata = {}
    if master_metadata_file.exists():
        try:
            with open(master_metadata_file, "r") as file_:
                existing_metadata = json.load(file_)
        except (json.JSONDecodeError, IOError):
            existing_metadata = {}

    # Clean text fields
    clean_title = TextNormalizer.clean_metadata_text(
        existing_metadata.get("title", title)
    )

    # Use explicit abstract parameter first, then metadata dict, then existing
    clean_abstract = None
    if abstract:
        clean_abstract = TextNormalizer.clean_metadata_text(abstract)
    elif metadata and metadata.get("abstract"):
        clean_abstract = TextNormalizer.clean_metadata_text(metadata["abstract"])
    elif existing_metadata.get("abstract"):
        clean_abstract = TextNormalizer.clean_metadata_text(
            existing_metadata["abstract"]
        )

    # Explicit (or normalized legacy) source first, then the stored one.
    doi_source_value = doi_source or existing_metadata.get("doi_source")

    # Same rule as the symlink step below: master projects (either spelling)
    # and an unset project are not recorded as project members.
    belongs_to_project = bool(self.project) and self.project not in (
        "master",
        "MASTER",
    )
    default_projects = [self.project] if belongs_to_project else []

    comprehensive_metadata = {
        # Core bibliographic fields
        "title": clean_title,
        "title_source": title_source
        or existing_metadata.get("title_source", "input"),
        "doi": existing_metadata.get("doi", doi),
        "doi_source": doi_source_value,
        "year": existing_metadata.get("year", year),
        "year_source": year_source
        or existing_metadata.get("year_source", "input" if year else None),
        "authors": existing_metadata.get("authors", authors or []),
        "authors_source": authors_source
        or existing_metadata.get("authors_source", "input" if authors else None),
        "journal": existing_metadata.get("journal", journal),
        "journal_source": journal_source
        or existing_metadata.get("journal_source", "input" if journal else None),
        # Additional bibliographic fields from explicit parameters
        "volume": existing_metadata.get("volume", volume),
        "issue": existing_metadata.get("issue", issue),
        "pages": existing_metadata.get("pages", pages),
        "publisher": existing_metadata.get("publisher", publisher),
        "issn": existing_metadata.get("issn", issn),
        "short_journal": existing_metadata.get("short_journal", short_journal),
        # Abstract with source tracking
        "abstract": existing_metadata.get("abstract", clean_abstract),
        "abstract_source": abstract_source
        or existing_metadata.get("abstract_source", "input" if abstract else None),
        # Enrichment fields
        "citation_count": existing_metadata.get("citation_count", citation_count),
        "impact_factor": existing_metadata.get("impact_factor", impact_factor),
        "scitex_id": existing_metadata.get(
            "scitex_id", existing_metadata.get("scholar_id", paper_id)
        ),
        "created_at": existing_metadata.get(
            "created_at", datetime.now().isoformat()
        ),
        "created_by": existing_metadata.get("created_by", "SciTeX Scholar"),
        "updated_at": datetime.now().isoformat(),
        "projects": existing_metadata.get("projects", default_projects),
        "master_storage_path": str(master_storage_path),
        "readable_name": readable_name,
        "metadata_file": str(master_metadata_file),
    }

    # Store plain dict version for JSON serialization
    comprehensive_metadata_plain = self._dotdict_to_dict(comprehensive_metadata)

    # Convert to standardized format before saving
    standardized_metadata = self._convert_to_standardized_metadata(
        comprehensive_metadata_plain
    )

    # Wrap with Paper container properties
    container = OrderedDict(
        [
            ("scitex_id", comprehensive_metadata_plain.get("scitex_id")),
            ("library_id", paper_id),
            ("created_at", comprehensive_metadata_plain.get("created_at")),
            ("created_by", comprehensive_metadata_plain.get("created_by")),
            ("updated_at", comprehensive_metadata_plain.get("updated_at")),
            ("projects", comprehensive_metadata_plain.get("projects", [])),
            ("master_storage_path", str(master_storage_path)),
            ("readable_name", readable_name),
            ("metadata_file", str(master_metadata_file)),
            (
                "pdf_downloaded_at",
                comprehensive_metadata_plain.get("pdf_downloaded_at"),
            ),
            ("pdf_size_bytes", comprehensive_metadata_plain.get("pdf_size_bytes")),
        ]
    )
    final_structure = OrderedDict(
        [
            ("metadata", standardized_metadata),
            ("container", container),
        ]
    )

    with open(master_metadata_file, "w") as file_:
        json.dump(final_structure, file_, indent=2, ensure_ascii=False)

    logger.success(f"Saved paper to MASTER Scholar library: {paper_id}")

    # Create project symlink if project is specified and not MASTER
    if belongs_to_project:
        try:
            # Use centralized naming logic - use original comprehensive_metadata (not plain)
            readable_name = self._generate_readable_name(
                comprehensive_metadata=comprehensive_metadata,
                master_storage_path=master_storage_path,
                authors=authors,
                year=year,
                journal=journal,
            )

            self._create_project_symlink(
                master_storage_path=master_storage_path,
                project=self.project,
                readable_name=readable_name,
            )
        except Exception as exc_:
            # Best effort: the paper itself has already been saved.
            logger.error(f"Failed to create symlink for {paper_id}: {exc_}")

    return paper_id
746
-
747
def save_unresolved_paper(
    self,
    title: str,
    year: Optional[int] = None,
    authors: Optional[List[str]] = None,
    reason: str = "DOI not found",
    bibtex_source: Optional[str] = None,
) -> None:
    """Record a paper whose DOI could not be resolved.

    Writes one JSON file to ``<scholar library>/<self.project>/unresolved/``.
    The file name is built from a sanitized, truncated title plus a
    second-resolution timestamp.

    Args:
        title: Paper title; a missing or unusable title becomes "untitled"
            in the file name.
        year: Publication year, if known.
        authors: Author names, if known.
        reason: Why resolution failed.
        bibtex_source: Origin of the entry, stored verbatim.
    """
    now = datetime.now()  # one clock reading for both record and file name
    clean_title = TextNormalizer.clean_metadata_text(title) if title else ""
    unresolved_info = {
        "title": clean_title,
        "year": year,
        "authors": authors or [],
        "reason": reason,
        "bibtex_source": bibtex_source,
        "project": self.project,
        "created_at": now.isoformat(),
        "created_by": "SciTeX Scholar",
    }

    project_lib_path = (
        self.config.path_manager.get_scholar_library_path() / self.project
    )
    unresolved_dir = project_lib_path / "unresolved"
    unresolved_dir.mkdir(parents=True, exist_ok=True)

    safe_title = re.sub(r"[^\w\s-]", "", title or "untitled")[:50]
    safe_title = re.sub(r"[-\s]+", "_", safe_title)
    if not safe_title:
        # A title made only of punctuation sanitizes to nothing; keep the
        # file name readable instead of starting with a bare underscore.
        safe_title = "untitled"
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    unresolved_file = unresolved_dir / f"{safe_title}_{timestamp}.json"

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # not depend on the platform default.
    with open(unresolved_file, "w", encoding="utf-8") as file_:
        json.dump(unresolved_info, file_, indent=2, ensure_ascii=False)

    logger.warning(f"Saved unresolved entry: {unresolved_file.name}")
784
-
785
async def resolve_and_create_library_structure_async(
    self,
    papers: List[Dict[str, Any]],
    project: str,
    sources: Optional[List[str]] = None,
) -> Dict[str, Dict[str, str]]:
    """Resolve a DOI for each paper and build its library entry.

    For every paper dict that has a title: look up the DOI through
    ``self.single_doi_resolver``, write ``metadata.json`` under the MASTER
    storage path, create the project symlink and update the BibTeX info
    structure. A failure on one paper is logged and the loop moves on.

    Args:
        papers: Paper dicts; ``title`` is required, ``year`` and ``authors``
            are passed to the resolver when present.
        project: Project that receives the symlink and BibTeX info.
        sources: Optional resolver sources, forwarded unchanged.

    Returns:
        A dict keyed by paper title describing each created entry.

    Raises:
        ValueError: If no ``single_doi_resolver`` is configured.
    """
    if not self.single_doi_resolver:
        raise ValueError("SingleDOIResolver is required for resolving DOIs")

    results = {}
    for paper in papers:
        title = paper.get("title")
        if title:
            logger.info(f"Processing: {title[:50]}...")
        else:
            logger.warning(f"Skipping paper without title: {paper}")
            continue

        try:
            doi_result = await self.single_doi_resolver.metadata2doi_async(
                title=title,
                year=paper.get("year"),
                authors=paper.get("authors"),
                sources=sources,
            )

            # Resolver-provided values take priority over the input paper.
            enhanced_metadata = self._extract_enhanced_metadata(doi_result, paper)
            merged_info = {**paper, **enhanced_metadata}

            locations = self._call_path_manager_get_storage_paths(
                paper_info=merged_info, collection_name="MASTER"
            )
            paper_id = locations["unique_id"]
            storage_path = locations["storage_path"]
            metadata_file = storage_path / "metadata.json"

            complete_metadata = self._create_complete_metadata(
                paper, doi_result, paper_id, enhanced_metadata
            )

            with open(metadata_file, "w") as file_:
                json.dump(complete_metadata, file_, indent=2)

            field_count = len(complete_metadata)
            logger.success(
                f"Saved metadata.json for {paper_id} ({field_count} fields)"
            )

            symlink = self._create_project_symlink(
                master_storage_path=storage_path,
                project=project,
                readable_name=locations["readable_name"],
            )

            # Set by the *_with_source_async wrapper; "papers" otherwise.
            source_name = getattr(self, "_source_filename", "papers")
            info_dir = self._create_bibtex_info_structure(
                project=project,
                paper_info={**paper, **enhanced_metadata},
                complete_metadata=complete_metadata,
                bibtex_source_filename=source_name,
            )

            entry = {
                "scitex_id": paper_id,
                "scholar_id": paper_id,
                "doi": complete_metadata.get("doi"),
                "master_storage_path": str(storage_path),
                "project_symlink_path": str(symlink) if symlink else None,
                "readable_name": locations["readable_name"],
                "metadata_file": str(metadata_file),
                "info_dir": str(info_dir) if info_dir else None,
            }
            results[title] = entry

            logger.info(f"Created library entry: {paper_id}")
            if complete_metadata.get("doi"):
                logger.info(f" DOI: {complete_metadata['doi']}")
            logger.info(f" Storage: {storage_path}")

        except Exception as exc_:
            logger.error(f"❌ Error processing '{title[:30]}...': {exc_}")

    done, total = len(results), len(papers)
    logger.success(f"Created Scholar library entries for {done}/{total} papers")
    return results
872
-
873
async def resolve_and_create_library_structure_with_source_async(
    self,
    papers: List[Dict[str, Any]],
    project: str,
    sources: Optional[List[str]] = None,
    bibtex_source_filename: str = "papers",
) -> Dict[str, Dict[str, str]]:
    """Run the library build with a specific BibTeX source file name.

    The name is handed to ``resolve_and_create_library_structure_async``
    through the ``_source_filename`` instance attribute. The attribute is put
    back to its previous state afterwards so that a later direct call to the
    plain method does not silently inherit this call's file name.

    Args:
        papers: Paper dicts, forwarded unchanged.
        project: Target project, forwarded unchanged.
        sources: Optional resolver sources, forwarded unchanged.
        bibtex_source_filename: Base name used for the BibTeX info files.

    Returns:
        Whatever ``resolve_and_create_library_structure_async`` returns.
    """
    not_set = object()
    previous = getattr(self, "_source_filename", not_set)
    self._source_filename = bibtex_source_filename
    try:
        return await self.resolve_and_create_library_structure_async(
            papers=papers, project=project, sources=sources
        )
    finally:
        # Undo the temporary override even when the build raises.
        if previous is not_set:
            if hasattr(self, "_source_filename"):
                del self._source_filename
        else:
            self._source_filename = previous
885
-
886
- def _extract_enhanced_metadata(
887
- self, doi_result: Optional[Dict], paper: Dict
888
- ) -> Dict[str, Any]:
889
- """Extract enhanced metadata from DOI resolution result."""
890
- enhanced = {}
891
- if doi_result and isinstance(doi_result, dict):
892
- metadata_source = doi_result.get("metadata", {})
893
- enhanced.update(
894
- {
895
- "doi": doi_result.get("doi"),
896
- "journal": metadata_source.get("journal")
897
- or doi_result.get("journal")
898
- or paper.get("journal"),
899
- "authors": metadata_source.get("authors")
900
- or doi_result.get("authors")
901
- or paper.get("authors"),
902
- "year": metadata_source.get("year")
903
- or doi_result.get("year")
904
- or paper.get("year"),
905
- "title": metadata_source.get("title")
906
- or doi_result.get("title")
907
- or paper.get("title"),
908
- "abstract": metadata_source.get("abstract")
909
- or doi_result.get("abstract"),
910
- "publisher": metadata_source.get("publisher")
911
- or doi_result.get("publisher"),
912
- "volume": metadata_source.get("volume") or doi_result.get("volume"),
913
- "issue": metadata_source.get("issue") or doi_result.get("issue"),
914
- "pages": metadata_source.get("pages") or doi_result.get("pages"),
915
- "issn": metadata_source.get("issn") or doi_result.get("issn"),
916
- "short_journal": metadata_source.get("short_journal")
917
- or doi_result.get("short_journal"),
918
- }
919
- )
920
-
921
- if doi_result.get("doi"):
922
- logger.success(
923
- f"Enhanced metadata from DOI source: {dict(metadata_source)}"
924
- )
925
-
926
- return enhanced
927
-
928
def _create_complete_metadata(
    self,
    paper: Dict,
    doi_result: Optional[Dict],
    paper_id: str,
    enhanced_metadata: Dict,
) -> Dict[str, Any]:
    """Build the flat metadata dict for a paper, with per-field sources.

    Values come from ``enhanced_metadata`` first and from ``paper``
    otherwise. A ``*_source`` is the normalized resolver source only when the
    enrichment actually supplied a value different from the input; a value
    that came from the input paper is marked ``"manual"``.

    Args:
        paper: The original input paper dict.
        doi_result: Resolver output, or ``None`` when resolution failed.
        paper_id: Identifier stored as ``scitex_id``.
        enhanced_metadata: Output of ``_extract_enhanced_metadata``.

    Returns:
        The metadata dict, including storage path information obtained from
        ``_call_path_manager_get_storage_paths``.
    """
    raw_title = enhanced_metadata.get("title") or paper.get("title")
    clean_title = TextNormalizer.clean_metadata_text(raw_title) if raw_title else ""
    clean_abstract = None
    if enhanced_metadata.get("abstract"):
        clean_abstract = TextNormalizer.clean_metadata_text(
            enhanced_metadata["abstract"]
        )

    # Map the resolver's free-form source label to a canonical name.
    doi_source_value = None
    if doi_result and doi_result.get("source"):
        source = doi_result["source"]
        lowered_source = source.lower()
        if "crossref" in lowered_source:
            doi_source_value = "crossref"
        elif "semantic" in lowered_source:
            doi_source_value = "semantic_scholar"
        elif "pubmed" in lowered_source:
            doi_source_value = "pubmed"
        elif "openalex" in lowered_source:
            doi_source_value = "openalex"
        else:
            doi_source_value = source

    def _source_for(field, always_manual=False):
        """Return where the stored value of ``field`` came from."""
        enhanced_value = enhanced_metadata.get(field)
        input_value = paper.get(field)
        if enhanced_value and enhanced_value != input_value:
            return doi_source_value
        # No enrichment (or it agrees with the input): the stored value is
        # the input paper's own, so it must not be credited to the resolver.
        if input_value or always_manual:
            return "manual"
        return None

    complete_metadata = {
        "title": clean_title,
        "title_source": _source_for("title", always_manual=True),
        "authors": enhanced_metadata.get("authors") or paper.get("authors"),
        "authors_source": _source_for("authors"),
        "year": enhanced_metadata.get("year") or paper.get("year"),
        "year_source": _source_for("year"),
        "journal": enhanced_metadata.get("journal") or paper.get("journal"),
        "journal_source": _source_for("journal"),
        "abstract": clean_abstract,
        "abstract_source": (
            doi_source_value if enhanced_metadata.get("abstract") else None
        ),
        "scitex_id": paper_id,
        "created_at": datetime.now().isoformat(),
        "created_by": "SciTeX Scholar",
    }

    if doi_result and isinstance(doi_result, dict):
        safe_fields = [
            "publisher",
            "volume",
            "issue",
            "pages",
            "issn",
            "short_journal",
        ]
        for field in safe_fields:
            value = enhanced_metadata.get(field)
            if value is not None:
                complete_metadata[field] = value
                complete_metadata[f"{field}_source"] = (
                    doi_source_value or "unknown_api"
                )

    if doi_result and doi_result.get("doi"):
        complete_metadata.update(
            {"doi": doi_result["doi"], "doi_source": doi_source_value}
        )
        logger.success(f"DOI resolved for {paper_id}: {doi_result['doi']}")
    else:
        complete_metadata.update(
            {
                "doi": None,
                "doi_source": None,
                "doi_resolution_failed": True,
            }
        )
        # The title key may be present but null; never slice None.
        title_preview = (paper.get("title") or "")[:40]
        logger.warning(f"DOI resolution failed for {paper_id}: {title_preview}...")

    # Make sure every standard key exists, defaulting to None.
    standard_fields = {
        "keywords": None,
        "references": None,
        "venue": None,
        "publisher": None,
        "volume": None,
        "issue": None,
        "pages": None,
        "issn": None,
        "short_journal": None,
    }

    missing_fields = []
    for field, default_value in standard_fields.items():
        if complete_metadata.get(field) is None:
            complete_metadata[field] = default_value
            missing_fields.append(field)

    if missing_fields:
        logger.info(
            f"Missing fields for future enhancement: {', '.join(missing_fields)}"
        )

    storage_paths = self._call_path_manager_get_storage_paths(
        paper_info={**paper, **enhanced_metadata}, collection_name="MASTER"
    )
    storage_path = storage_paths["storage_path"]

    complete_metadata.update(
        {
            "master_storage_path": str(storage_path),
            "readable_name": storage_paths["readable_name"],
            "metadata_file": str(storage_path / "metadata.json"),
        }
    )

    return complete_metadata
1063
-
1064
- def _generate_readable_name(
1065
- self,
1066
- comprehensive_metadata: Dict,
1067
- master_storage_path: Path,
1068
- authors: Optional[List[str]] = None,
1069
- year: Optional[int] = None,
1070
- journal: Optional[str] = None,
1071
- ) -> str:
1072
- """Generate readable symlink name from metadata.
1073
-
1074
- Single source of truth for symlink naming format.
1075
- """
1076
- # Extract author
1077
- first_author = "Unknown"
1078
- if authors and len(authors) > 0:
1079
- author_parts = authors[0].split()
1080
- first_author = (
1081
- author_parts[-1] if len(author_parts) > 1 else author_parts[0]
1082
- )
1083
- first_author = "".join(c for c in first_author if c.isalnum() or c == "-")[
1084
- :20
1085
- ]
1086
-
1087
- # Format year (handle DotDict and other non-int types)
1088
- from scitex.dict import DotDict
1089
-
1090
- if isinstance(year, DotDict):
1091
- # Extract value if it's a DotDict
1092
- year = None # Can't extract year from DotDict structure, use Unknown
1093
-
1094
- # Convert to int if it's a string representation
1095
- if isinstance(year, str) and year.isdigit():
1096
- year = int(year)
1097
-
1098
- # Only use year if it's actually an int
1099
- if isinstance(year, int):
1100
- year_str = f"{year:04d}"
1101
- else:
1102
- year_str = "0000"
1103
-
1104
- # Clean journal name using PathManager (single source of truth)
1105
- journal_clean = "Unknown"
1106
- if journal:
1107
- journal_clean = self.config.path_manager._sanitize_filename(journal)[:30]
1108
- if not journal_clean:
1109
- journal_clean = "Unknown"
1110
-
1111
- # Get citation count and impact factor (handle both flat and nested formats)
1112
- # Check if this is the nested structure from file (has "metadata" key)
1113
- if "metadata" in comprehensive_metadata:
1114
- # Nested structure from file
1115
- metadata_section = comprehensive_metadata.get("metadata", {})
1116
-
1117
- # Extract citation count from nested structure
1118
- cc_val = metadata_section.get("citation_count", {})
1119
- if isinstance(cc_val, dict):
1120
- cc = cc_val.get("total", 0) or 0
1121
- else:
1122
- cc = cc_val or 0
1123
-
1124
- # Extract impact factor from nested structure
1125
- publication_section = metadata_section.get("publication", {})
1126
- if_val = publication_section.get("impact_factor", 0.0) or 0.0
1127
-
1128
- else:
1129
- # Flat structure (during initial save)
1130
- cc_val = comprehensive_metadata.get("citation_count", 0)
1131
- if isinstance(cc_val, dict):
1132
- cc = cc_val.get("total", 0) or 0
1133
- else:
1134
- cc = cc_val or 0
1135
-
1136
- # Try multiple paths for impact_factor
1137
- if_val = (
1138
- comprehensive_metadata.get("journal_impact_factor")
1139
- or comprehensive_metadata.get("impact_factor")
1140
- or comprehensive_metadata.get("publication", {}).get("impact_factor")
1141
- )
1142
- if isinstance(if_val, dict):
1143
- if_val = if_val.get("value", 0.0) or 0.0
1144
- else:
1145
- if_val = if_val or 0.0
1146
-
1147
- # Check PDF status with more granular states
1148
- pdf_files = list(master_storage_path.glob("*.pdf"))
1149
- screenshot_dir = master_storage_path / "screenshots"
1150
- has_screenshots = screenshot_dir.exists() and any(screenshot_dir.iterdir())
1151
- downloading_marker = master_storage_path / ".downloading"
1152
- attempted_marker = master_storage_path / ".download_attempted"
1153
-
1154
- # Extract DOI from metadata to check availability
1155
- doi = None
1156
- if "metadata" in comprehensive_metadata:
1157
- # Nested structure from file
1158
- doi = comprehensive_metadata.get("metadata", {}).get("id", {}).get("doi")
1159
- else:
1160
- # Flat structure (during initial save)
1161
- doi = comprehensive_metadata.get("doi")
1162
-
1163
- if downloading_marker.exists():
1164
- # Download in progress
1165
- pdf_status_letter = "r"
1166
- elif pdf_files:
1167
- # Has PDF = Successful
1168
- pdf_status_letter = "s"
1169
- elif has_screenshots:
1170
- # Has screenshots but no PDF = Failed (attempted but failed)
1171
- pdf_status_letter = "f"
1172
- elif attempted_marker.exists():
1173
- # Download was attempted but failed early (before screenshots)
1174
- pdf_status_letter = "f"
1175
- elif not doi:
1176
- # No DOI = Failed (cannot download without identifier)
1177
- pdf_status_letter = "f"
1178
- else:
1179
- # No PDF, no screenshots, no attempts, has DOI = Pending (not attempted yet)
1180
- pdf_status_letter = "p"
1181
-
1182
- pdf_status_id_map = {
1183
- "p": 0,
1184
- "r": 1,
1185
- "f": 2,
1186
- "s": 3,
1187
- }
1188
- pdf_status_str = f"{pdf_status_id_map[pdf_status_letter]}{pdf_status_letter}"
1189
- # Format: CC_000000-PDF_s-IF_032-2016-Author-Journal
1190
- # PDF status: r=running, s=successful, f=failed, p=pending
1191
- # readable_name = f"CC_{cc:06d}-PDF_{pdf_status_letter}-IF_{int(if_val):03d}-{year_str}-{first_author}-{journal_clean}"
1192
- readable_name = f"PDF-{pdf_status_str}_CC-{cc:06d}_IF-{int(if_val):03d}_{year_str}_{first_author}_{journal_clean}"
1193
-
1194
- return readable_name
1195
-
1196
def update_symlink(
    self,
    master_storage_path: Path,
    project: str,
    metadata: Optional[Dict] = None,
) -> Optional[Path]:
    """Update project symlink to reflect current paper status.

    This should be called whenever paper status changes (pending → running → success/failed).
    Generates new readable name based on current state (checking .downloading marker, PDFs, etc.)
    and updates the symlink accordingly.

    Args:
        master_storage_path: Path to paper in master library
        project: Project name
        metadata: Optional metadata dict (if not provided, will read from file)

    Returns:
        Path to the created symlink, or None if failed
    """
    try:
        # Load metadata if not provided
        if metadata is None:
            metadata_file = master_storage_path / "metadata.json"
            if not metadata_file.exists():
                logger.warning(f"No metadata found for {master_storage_path.name}")
                return None

            import json

            with open(metadata_file, "r") as f:
                metadata = json.load(f)

        # Metadata file has structure:
        # {"metadata": {"basic": {...}, "id": {...}, ...}, "container": {...}}
        if "metadata" in metadata:
            # Nested structure from file. Sections can be stored as null, so
            # fall back to empty dicts instead of failing the whole update.
            meta_section = metadata.get("metadata") or {}
            basic_section = meta_section.get("basic") or {}
            pub_section = meta_section.get("publication") or {}

            authors = basic_section.get("authors")
            year = basic_section.get("year")
            journal = pub_section.get("journal")
        else:
            # Flat structure (should not happen when reading from file, but handle it)
            authors = metadata.get("authors")
            year = metadata.get("year")
            journal = metadata.get("journal")

        # Generate readable name based on current state
        readable_name = self._generate_readable_name(
            comprehensive_metadata=metadata,
            master_storage_path=master_storage_path,
            authors=authors,
            year=year,
            journal=journal,
        )

        # Create/update symlink
        return self._create_project_symlink(
            master_storage_path=master_storage_path,
            project=project,
            readable_name=readable_name,
        )
    except Exception as exc_:
        # Best effort: a failed refresh is reported, never raised.
        logger.error(
            f"Failed to update symlink for {master_storage_path.name}: {exc_}"
        )
        return None
1266
-
1267
def _create_project_symlink(
    self, master_storage_path: Path, project: str, readable_name: str
) -> Optional[Path]:
    """Create symlink in project directory pointing to master storage.

    Removes old symlinks for the same paper with different statuses
    (e.g., removes PDF_p when creating PDF_s). A dangling symlink that
    already carries the requested name is replaced rather than left broken.

    Args:
        master_storage_path: Paper directory in the master library.
        project: Project whose directory receives the link.
        readable_name: File name of the symlink.

    Returns:
        Path of the symlink, or None when anything fails (logged as a
        warning).
    """
    try:
        project_dir = self.config.path_manager.get_library_project_dir(project)
        symlink_path = project_dir / readable_name

        # Old links are recognised by the name of the directory they point to.
        master_id = master_storage_path.name

        # Remove old symlinks pointing to the same master entry
        # but with different statuses (PDF_p, PDF_f, PDF_s)
        for existing_link in project_dir.iterdir():
            if not existing_link.is_symlink():
                continue

            try:
                target = existing_link.resolve()
                if target.name == master_id and existing_link.name != readable_name:
                    # This is an old symlink for the same paper
                    logger.debug(f"Removing old symlink: {existing_link.name}")
                    existing_link.unlink()
            except Exception as e:
                # Handle broken symlinks
                logger.debug(f"Skipping broken symlink {existing_link.name}: {e}")
                continue

        # exists() follows the link, so a dangling link reports False while
        # still occupying the name; drop it so symlink_to() can succeed.
        if symlink_path.is_symlink() and not symlink_path.exists():
            logger.debug(f"Replacing dangling symlink: {symlink_path}")
            symlink_path.unlink()

        # Create new symlink
        if not symlink_path.exists():
            # Relative target keeps the library relocatable as a whole.
            relative_path = os.path.relpath(master_storage_path, project_dir)
            symlink_path.symlink_to(relative_path)
            logger.success(
                f"Created project symlink: {symlink_path} -> {relative_path}"
            )
        else:
            logger.debug(f"Project symlink already exists: {symlink_path}")

        return symlink_path

    except Exception as exc_:
        logger.warning(f"Failed to create project symlink: {exc_}")
        return None
1316
-
1317
def _create_bibtex_info_structure(
    self,
    project: str,
    paper_info: Dict[str, Any],
    complete_metadata: Dict[str, Any],
    bibtex_source_filename: str = "papers",
) -> Optional[Path]:
    """Append the paper to ``info/<name>_bib/<name>.bib`` in the project.

    Also writes ``unresolved/<entry_key>.json`` next to the BibTeX file when
    the metadata has no DOI. The entry key is the lowercased last word of
    the first author followed by the year.

    Args:
        project: Project whose directory holds the info structure.
        paper_info: Merged paper information (not read by this method).
        complete_metadata: Flat metadata dict for the paper.
        bibtex_source_filename: Base name for the directory and .bib file.

    Returns:
        The info directory, or None when anything fails (logged as a
        warning).
    """
    try:
        project_dir = self.config.path_manager.get_library_project_dir(project)
        info_dir = project_dir / "info" / f"{bibtex_source_filename}_bib"
        info_dir.mkdir(parents=True, exist_ok=True)

        bibtex_file = info_dir / f"{bibtex_source_filename}.bib"
        unresolved_dir = info_dir / "unresolved"
        unresolved_dir.mkdir(parents=True, exist_ok=True)

        # Last word of the first author; blank names keep the placeholder
        # instead of raising on an empty split.
        first_author = "unknown"
        authors = complete_metadata.get("authors")
        if isinstance(authors, list) and authors:
            name_tokens = str(authors[0]).split()
        elif isinstance(authors, str):
            name_tokens = authors.split()
        else:
            name_tokens = []
        if name_tokens:
            first_author = name_tokens[-1].lower()

        # The "year" key may exist with a null value; avoid keys like "doeNone".
        year = complete_metadata.get("year") or "unknown"
        entry_key = f"{first_author}{year}"

        bibtex_entry = self._generate_bibtex_entry(complete_metadata, entry_key)

        # Append mode creates the file when needed; only an existing file
        # gets a separating newline.
        separator = "\n" if bibtex_file.exists() else ""
        with open(bibtex_file, "a", encoding="utf-8") as file_:
            file_.write(f"{separator}{bibtex_entry}")

        if not complete_metadata.get("doi"):
            unresolved_file = unresolved_dir / f"{entry_key}.json"
            unresolved_data = {
                "title": complete_metadata.get("title", ""),
                "authors": complete_metadata.get("authors", []),
                "year": complete_metadata.get("year", ""),
                "journal": complete_metadata.get("journal", ""),
                # Metadata built in this file stores the ID as "scitex_id".
                "scholar_id": complete_metadata.get("scholar_id")
                or complete_metadata.get("scitex_id", ""),
                "resolution_failed": True,
                "timestamp": complete_metadata.get("created_at", ""),
            }
            with open(unresolved_file, "w", encoding="utf-8") as file_:
                json.dump(unresolved_data, file_, indent=2)
            logger.info(f"Added unresolved entry: {unresolved_file}")

        logger.success(f"Updated BibTeX info structure: {bibtex_file}")
        return info_dir

    except Exception as exc_:
        logger.warning(f"Failed to create BibTeX info structure: {exc_}")
        return None
1375
-
1376
- def _generate_bibtex_entry(self, metadata: Dict[str, Any], entry_key: str) -> str:
1377
- """Generate BibTeX entry from metadata."""
1378
- entry_type = "article"
1379
- if metadata.get("journal"):
1380
- entry_type = "article"
1381
- elif metadata.get("booktitle"):
1382
- entry_type = "inproceedings"
1383
- elif metadata.get("publisher") and not metadata.get("journal"):
1384
- entry_type = "book"
1385
-
1386
- bibtex = f"@{entry_type}{{{entry_key},\n"
1387
-
1388
- field_mappings = {
1389
- "title": "title",
1390
- "authors": "author",
1391
- "year": "year",
1392
- "journal": "journal",
1393
- "doi": "doi",
1394
- "volume": "volume",
1395
- "issue": "number",
1396
- "pages": "pages",
1397
- "publisher": "publisher",
1398
- "booktitle": "booktitle",
1399
- "abstract": "abstract",
1400
- }
1401
-
1402
- for meta_field, bibtex_field in field_mappings.items():
1403
- value = metadata.get(meta_field)
1404
- if value:
1405
- if isinstance(value, list):
1406
- value = " and ".join(str(val_) for val_ in value)
1407
- value_escaped = str(value).replace("{", "\\{").replace("}", "\\}")
1408
- bibtex += f" {bibtex_field} = {{{value_escaped}}},\n"
1409
-
1410
- source_field = f"{meta_field}_source"
1411
- if source_field in metadata:
1412
- bibtex += f" % {bibtex_field}_source = {metadata[source_field]}\n"
1413
-
1414
- bibtex += f" % scholar_id = {metadata.get('scholar_id', 'unknown')},\n"
1415
- bibtex += f" % created_at = {metadata.get('created_at', 'unknown')},\n"
1416
- bibtex += f" % created_by = {metadata.get('created_by', 'unknown')},\n"
1417
- bibtex += "}\n"
1418
-
1419
- return bibtex
1420
-
1421
- # def _ensure_project_symlink(
1422
- # self,
1423
- # title: str,
1424
- # year: Optional[int] = None,
1425
- # authors: Optional[List[str]] = None,
1426
- # paper_id: str = None,
1427
- # master_storage_path: Path = None,
1428
- # ) -> None:
1429
- # """Ensure project symlink exists for paper in master library."""
1430
- # try:
1431
- # if not paper_id or not master_storage_path:
1432
- # return
1433
-
1434
- # project_lib_path = (
1435
- # self.config.path_manager.get_scholar_library_path()
1436
- # / self.project
1437
- # )
1438
- # project_lib_path.mkdir(parents=True, exist_ok=True)
1439
-
1440
- # paper_info = {
1441
- # "title": title,
1442
- # "year": year,
1443
- # "authors": authors or [],
1444
- # }
1445
- # readable_paths = self.config.path_manager.get_paper_storage_paths(
1446
- # paper_info=paper_info, collection_name=self.project
1447
- # )
1448
- # readable_name = readable_paths["readable_name"]
1449
- # symlink_path = project_lib_path / readable_name
1450
-
1451
- # relative_path = f"../MASTER/{paper_id}"
1452
- # if not symlink_path.exists():
1453
- # symlink_path.symlink_to(relative_path)
1454
- # logger.info(
1455
- # f"Created project symlink: {readable_name} -> {relative_path}"
1456
- # )
1457
-
1458
- # except Exception as exc_:
1459
- # logger.debug(f"Error creating project symlink: {exc_}")
1460
-
1461
- def _ensure_project_symlink(
1462
- self,
1463
- title: str,
1464
- year: Optional[int] = None,
1465
- authors: Optional[List[str]] = None,
1466
- paper_id: str = None,
1467
- master_storage_path: Path = None,
1468
- ) -> None:
1469
- try:
1470
- if not paper_id or not master_storage_path:
1471
- return
1472
-
1473
- project_lib_path = (
1474
- self.config.path_manager.get_scholar_library_path() / self.project
1475
- )
1476
- project_lib_path.mkdir(parents=True, exist_ok=True)
1477
-
1478
- paper_info = {
1479
- "title": title,
1480
- "year": year,
1481
- "authors": authors or [],
1482
- }
1483
- readable_paths = self._call_path_manager_get_storage_paths(
1484
- paper_info=paper_info, collection_name=self.project
1485
- )
1486
- readable_name = readable_paths["readable_name"]
1487
- symlink_path = project_lib_path / readable_name
1488
- relative_path = f"../MASTER/{paper_id}"
1489
-
1490
- if not symlink_path.exists():
1491
- symlink_path.symlink_to(relative_path)
1492
- logger.info(
1493
- f"Created project symlink: {readable_name} -> {relative_path}"
1494
- )
1495
- except Exception as exc_:
1496
- logger.debug(f"Error creating project symlink: {exc_}")
1497
-
1498
- def _is_title_similar(
1499
- self, title1: str, title2: str, threshold: float = 0.7
1500
- ) -> bool:
1501
- """Check if two titles are similar enough to be considered the same paper."""
1502
- if not title1 or not title2:
1503
- return False
1504
-
1505
- def normalize_title(title: str) -> str:
1506
- title = title.lower()
1507
- title = re.sub(r"[^\w\s]", " ", title)
1508
- title = re.sub(r"\s+", " ", title)
1509
- return title.strip()
1510
-
1511
- norm_title1 = normalize_title(title1)
1512
- norm_title2 = normalize_title(title2)
1513
-
1514
- words1 = set(norm_title1.split())
1515
- words2 = set(norm_title2.split())
1516
-
1517
- if not words1 or not words2:
1518
- return False
1519
-
1520
- intersection = len(words1.intersection(words2))
1521
- union = len(words1.union(words2))
1522
- similarity = intersection / union if union > 0 else 0.0
1523
-
1524
- return similarity >= threshold
1525
-
1526
def update_library_metadata(
    self,
    paper_id: str,
    project: str,
    doi: str,
    metadata: Dict[str, Any],
    create_structure: bool = True,
) -> bool:
    """Merge a resolved DOI and extra metadata into a paper's metadata.json.

    The file lives at ``<library_dir>/<project>/<paper_id>/metadata.json``.
    Existing content is loaded, overlaid with ``metadata``, and then stamped
    with ``doi``, ``doi_resolved_at`` (current local time, ISO format) and
    ``doi_source`` ("batch_doi_resolver") before being written back.

    Args:
        paper_id: Directory name of the paper inside the project.
        project: Project directory name inside the library.
        doi: DOI to record; always overrides any ``doi`` key in ``metadata``.
        metadata: Additional fields; they override existing values.
        create_structure: Create the paper directory when it is missing.

    Returns:
        True when the file was written, False when any step failed (the
        error is logged, not raised).
    """
    try:
        library_path = self.config.path_manager.library_dir
        paper_dir = library_path / project / paper_id
        metadata_file = paper_dir / "metadata.json"

        if create_structure and not paper_dir.exists():
            self.config.path_manager._ensure_directory(paper_dir)
            logger.info(f"Created Scholar library structure: {paper_dir}")

        existing_metadata: Dict[str, Any] = {}
        if metadata_file.exists():
            try:
                # Explicit UTF-8 so reading does not depend on the platform's
                # default encoding.
                with open(metadata_file, "r", encoding="utf-8") as file_:
                    loaded = json.load(file_)
            except Exception as exc_:
                logger.warning(f"Error loading existing metadata: {exc_}")
            else:
                if isinstance(loaded, dict):
                    existing_metadata = loaded
                else:
                    # Valid JSON but not an object: treat it like an unreadable
                    # file instead of letting the merge below raise.
                    logger.warning(
                        "Error loading existing metadata: expected a JSON "
                        f"object, got {type(loaded).__name__}"
                    )

        updated_metadata = {
            **existing_metadata,
            **metadata,
            "doi": doi,
            "doi_resolved_at": datetime.now().isoformat(),
            "doi_source": "batch_doi_resolver",
        }

        with open(metadata_file, "w", encoding="utf-8") as file_:
            json.dump(updated_metadata, file_, indent=2)

        logger.success(f"Updated metadata for {paper_id}: DOI {doi}")
        return True

    except Exception as exc_:
        logger.error(f"Error updating library metadata for {paper_id}: {exc_}")
        return False
1569
-
1570
def create_writer_directory_structure(self, paper_id: str, project: str) -> Path:
    """Create ``<library_dir>/<project>/<paper_id>`` and its standard subfolders.

    The paper directory is ensured first, followed by its ``attachments`` and
    ``screenshots`` children, all through the path manager's
    ``_ensure_directory``.

    Returns:
        The paper directory path.
    """
    path_manager = self.config.path_manager
    paper_root = path_manager.library_dir / project / paper_id

    path_manager._ensure_directory(paper_root)
    for child_name in ("attachments", "screenshots"):
        path_manager._ensure_directory(paper_root / child_name)

    logger.info(f"Created Scholar library structure: {paper_root}")
    return paper_root
1583
-
1584
def validate_library_structure(self, project: str) -> Dict[str, Any]:
    """Inspect ``<library_dir>/<project>`` and report on its paper folders.

    A child counts as a paper when it is a directory whose name is exactly
    eight characters long. Each paper lacking a ``metadata.json`` is listed
    in ``missing_metadata`` and produces a warning.

    Returns:
        A dict with keys ``valid``, ``warnings``, ``errors``, ``paper_count``
        and ``missing_metadata``. ``valid`` is False only when the project
        directory does not exist.
    """
    report: Dict[str, Any] = {
        "valid": True,
        "warnings": [],
        "errors": [],
        "paper_count": 0,
        "missing_metadata": [],
    }

    project_root = self.config.path_manager.library_dir / project
    if not project_root.exists():
        report["errors"].append(f"Project directory does not exist: {project_root}")
        report["valid"] = False
        return report

    for entry in project_root.iterdir():
        # Skip anything that is not an eight-character directory.
        if not entry.is_dir() or len(entry.name) != 8:
            continue

        report["paper_count"] += 1
        if (entry / "metadata.json").exists():
            continue

        report["missing_metadata"].append(entry.name)
        report["warnings"].append(f"Missing metadata.json: {entry.name}")

    return report
1616
-
1617
def resolve_and_update_library(
    self,
    papers_with_ids: List[Dict[str, Any]],
    project: str,
    sources: Optional[List[str]] = None,
) -> Dict[str, str]:
    """Resolve a DOI for each paper and record it in the library metadata.

    Papers are processed one at a time. Each resolution runs
    ``single_doi_resolver.metadata2doi_async`` to completion with
    ``asyncio.run``; a resolved DOI is then stored through
    ``update_library_metadata``. Failures for one paper are logged and do
    not stop the remaining papers.

    Args:
        papers_with_ids: Paper dicts. ``paper_id`` and ``title`` are
            required (papers lacking either are skipped with a warning);
            ``year``, ``authors`` and ``journal`` are optional.
        project: Library project the papers belong to.
        sources: Forwarded unchanged to the resolver.

    Returns:
        Mapping of ``paper_id`` to DOI for every paper whose DOI was both
        resolved and written to metadata.

    Raises:
        ValueError: If no ``single_doi_resolver`` is configured.
    """
    if not self.single_doi_resolver:
        raise ValueError("SingleDOIResolver is required for resolving DOIs")

    results: Dict[str, str] = {}
    for paper in papers_with_ids:
        paper_id = paper.get("paper_id")
        if not paper_id:
            logger.warning(
                f"Skipping paper without paper_id: {paper.get('title', 'Unknown')}"
            )
            continue

        title = paper.get("title")
        if not title:
            logger.warning(f"Skipping paper {paper_id} without title")
            continue

        logger.info(f"Resolving DOI for {paper_id}: {title[:50]}...")

        try:
            result = asyncio.run(
                self.single_doi_resolver.metadata2doi_async(
                    title=title,
                    year=paper.get("year"),
                    authors=paper.get("authors"),
                    sources=sources,
                )
            )

            if not (result and isinstance(result, dict) and result.get("doi")):
                logger.warning(f"⚠️ {paper_id}: No DOI found")
                continue

            doi = result["doi"]

            # Only forward fields the caller actually supplied. The metadata
            # dict is overlaid on the stored metadata.json, so passing None
            # for an absent field would erase a previously stored value.
            metadata: Dict[str, Any] = {"title": title, "title_source": "input"}
            for field in ("year", "authors", "journal"):
                value = paper.get(field)
                if value:
                    metadata[field] = value
                    metadata[f"{field}_source"] = "input"
            metadata["doi_resolution_source"] = result.get("source")

            success = self.update_library_metadata(
                paper_id=paper_id,
                project=project,
                doi=doi,
                metadata=metadata,
            )

            if success:
                results[paper_id] = doi
                logger.success(f"✅ {paper_id}: {doi}")
            else:
                logger.error(
                    f"❌ {paper_id}: DOI resolved but metadata update failed"
                )

        except Exception as exc_:
            logger.error(f"❌ {paper_id}: Error during resolution: {exc_}")

    logger.success(
        f"Resolved {len(results)}/{len(papers_with_ids)} DOIs and updated library metadata"
    )
    return results
1694
-
1695
def resolve_and_create_library_structure(
    self,
    papers: List[Dict[str, Any]],
    project: str,
    sources: Optional[List[str]] = None,
) -> Dict[str, Dict[str, str]]:
    """Synchronous wrapper for resolve_and_create_library_structure_async.

    Runs the async implementation to completion on a fresh event loop and
    returns its result.

    Args:
        papers: Paper dicts, forwarded unchanged.
        project: Project name, forwarded unchanged.
        sources: Optional source list, forwarded unchanged.

    Raises:
        RuntimeError: If called while an event loop is already running in
            this thread; use the ``_async`` variant there instead. Errors
            raised by the async implementation propagate unchanged.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: safe to drive one ourselves.
        pass
    else:
        # Checked before the coroutine object is created so nothing is left
        # un-awaited when we refuse to run.
        raise RuntimeError(
            "Cannot run synchronous version in async context. "
            "Use resolve_and_create_library_structure_async() instead."
        )

    # Run exactly once; a RuntimeError coming out of the coroutine must not
    # trigger a second full resolution pass.
    return asyncio.run(
        self.resolve_and_create_library_structure_async(papers, project, sources)
    )
1721
-
1722
123
 
1723
124
  __all__ = ["LibraryManager"]
1724
125
 
126
+
1725
127
  # EOF