academic_refchecker-1.2.65-py3-none-any.whl → academic_refchecker-1.2.66-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/METADATA +72 -7
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/RECORD +28 -18
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/entry_points.txt +1 -0
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/top_level.txt +1 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +56 -0
- backend/concurrency.py +100 -0
- backend/database.py +686 -0
- backend/main.py +1266 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__version__.py +2 -2
- refchecker/checkers/crossref.py +15 -6
- refchecker/checkers/enhanced_hybrid_checker.py +18 -4
- refchecker/checkers/local_semantic_scholar.py +2 -2
- refchecker/checkers/openalex.py +15 -6
- refchecker/checkers/semantic_scholar.py +15 -6
- refchecker/core/refchecker.py +17 -6
- refchecker/utils/__init__.py +2 -1
- refchecker/utils/arxiv_utils.py +18 -60
- refchecker/utils/doi_utils.py +32 -1
- refchecker/utils/error_utils.py +20 -9
- refchecker/utils/text_utils.py +143 -27
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/licenses/LICENSE +0 -0
backend/thumbnail.py
ADDED
@@ -0,0 +1,517 @@
+"""
+Thumbnail generation utilities for PDF and web page previews.
+
+Uses PyMuPDF (fitz) to extract the first page of PDFs as thumbnails.
+Thumbnails are cached on disk to avoid regeneration.
+"""
+import os
+import hashlib
+import logging
+import tempfile
+from pathlib import Path
+from typing import Optional
+import asyncio
+
+logger = logging.getLogger(__name__)
+
+# Default thumbnail cache directory
+THUMBNAIL_CACHE_DIR = Path(tempfile.gettempdir()) / "refchecker_thumbnails"
+
+# Thumbnail settings
+THUMBNAIL_WIDTH = 200  # Target width in pixels for small thumbnails
+THUMBNAIL_DPI = 150  # Higher DPI for sharper text rendering
+
+# Preview settings (larger image for overlay view)
+PREVIEW_WIDTH = 1600  # Target width in pixels for preview/overlay
+
+
+def get_thumbnail_cache_path(source_identifier: str, check_id: Optional[int] = None) -> Path:
+    """
+    Get the cache path for a thumbnail.
+
+    Args:
+        source_identifier: A unique identifier for the source (URL, file path, or hash)
+        check_id: Optional check ID for more unique naming
+
+    Returns:
+        Path to the thumbnail file (may not exist yet)
+    """
+    # Create cache directory if it doesn't exist
+    THUMBNAIL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Create a hash of the source for the filename
+    source_hash = hashlib.md5(source_identifier.encode()).hexdigest()[:12]
+
+    if check_id:
+        filename = f"thumb_{check_id}_{source_hash}.png"
+    else:
+        filename = f"thumb_{source_hash}.png"
+
+    return THUMBNAIL_CACHE_DIR / filename
+
+
+def get_preview_cache_path(source_identifier: str, check_id: Optional[int] = None) -> Path:
+    """
+    Get the cache path for a preview (larger image for overlay).
+
+    Args:
+        source_identifier: A unique identifier for the source (URL, file path, or hash)
+        check_id: Optional check ID for more unique naming
+
+    Returns:
+        Path to the preview file (may not exist yet)
+    """
+    # Create cache directory if it doesn't exist
+    THUMBNAIL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Create a hash of the source for the filename
+    source_hash = hashlib.md5(source_identifier.encode()).hexdigest()[:12]
+
+    if check_id:
+        filename = f"preview_{check_id}_{source_hash}.png"
+    else:
+        filename = f"preview_{source_hash}.png"
+
+    return THUMBNAIL_CACHE_DIR / filename
+
+
+def generate_pdf_thumbnail(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+    """
+    Generate a thumbnail from the first page of a PDF.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_path: Optional path for the output thumbnail. If not provided,
+                     uses the cache directory.
+
+    Returns:
+        Path to the generated thumbnail, or None if generation failed
+    """
+    try:
+        import fitz  # PyMuPDF
+
+        if not os.path.exists(pdf_path):
+            logger.error(f"PDF file not found: {pdf_path}")
+            return None
+
+        # Determine output path
+        if output_path is None:
+            output_path = str(get_thumbnail_cache_path(pdf_path))
+
+        # Check if thumbnail already exists
+        if os.path.exists(output_path):
+            logger.debug(f"Thumbnail already exists: {output_path}")
+            return output_path
+
+        # Open the PDF
+        doc = fitz.open(pdf_path)
+
+        if len(doc) == 0:
+            logger.warning(f"PDF has no pages: {pdf_path}")
+            doc.close()
+            return None
+
+        # Get the first page
+        page = doc[0]
+
+        # Calculate zoom factor to get desired width
+        page_width = page.rect.width
+        zoom = THUMBNAIL_WIDTH / page_width
+
+        # Create transformation matrix
+        mat = fitz.Matrix(zoom, zoom)
+
+        # Render page to pixmap with higher quality
+        pix = page.get_pixmap(matrix=mat, alpha=False)
+
+        # Enhance contrast to make text darker/more readable
+        try:
+            from PIL import Image, ImageEnhance
+            import io
+
+            # Convert pixmap to PIL Image
+            img_data = pix.tobytes("png")
+            img = Image.open(io.BytesIO(img_data))
+
+            # Increase contrast (1.3 = 30% more contrast)
+            enhancer = ImageEnhance.Contrast(img)
+            img = enhancer.enhance(1.3)
+
+            # Slightly increase sharpness
+            enhancer = ImageEnhance.Sharpness(img)
+            img = enhancer.enhance(1.2)
+
+            # Save enhanced image
+            img.save(output_path, "PNG")
+        except ImportError:
+            # Fallback: save without enhancement if PIL not available
+            pix.save(output_path)
+
+        doc.close()
+
+        logger.info(f"Generated thumbnail: {output_path} ({pix.width}x{pix.height})")
+        return output_path
+
+    except ImportError:
+        logger.error("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")
+        return None
+    except Exception as e:
+        logger.error(f"Error generating PDF thumbnail: {e}")
+        return None
+
+
+async def generate_pdf_thumbnail_async(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+    """
+    Async wrapper for PDF thumbnail generation.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_path: Optional path for the output thumbnail
+
+    Returns:
+        Path to the generated thumbnail, or None if generation failed
+    """
+    return await asyncio.to_thread(generate_pdf_thumbnail, pdf_path, output_path)
+
+
+def generate_pdf_preview(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+    """
+    Generate a high-resolution preview from the first page of a PDF.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_path: Optional path for the output preview. If not provided,
+                     uses the cache directory.
+
+    Returns:
+        Path to the generated preview, or None if generation failed
+    """
+    try:
+        import fitz  # PyMuPDF
+
+        if not os.path.exists(pdf_path):
+            logger.error(f"PDF file not found: {pdf_path}")
+            return None
+
+        # Determine output path
+        if output_path is None:
+            output_path = str(get_preview_cache_path(pdf_path))
+
+        # Check if preview already exists
+        if os.path.exists(output_path):
+            logger.debug(f"Preview already exists: {output_path}")
+            return output_path
+
+        # Open the PDF
+        doc = fitz.open(pdf_path)
+
+        if len(doc) == 0:
+            logger.warning(f"PDF has no pages: {pdf_path}")
+            doc.close()
+            return None
+
+        # Get the first page
+        page = doc[0]
+
+        # Calculate zoom factor to get desired width (larger for preview)
+        page_width = page.rect.width
+        zoom = PREVIEW_WIDTH / page_width
+
+        # Create transformation matrix
+        mat = fitz.Matrix(zoom, zoom)
+
+        # Render page to pixmap with higher quality
+        pix = page.get_pixmap(matrix=mat, alpha=False)
+
+        # Enhance contrast to make text darker/more readable
+        try:
+            from PIL import Image, ImageEnhance
+            import io
+
+            # Convert pixmap to PIL Image
+            img_data = pix.tobytes("png")
+            img = Image.open(io.BytesIO(img_data))
+
+            # Increase contrast (1.2 = 20% more contrast)
+            enhancer = ImageEnhance.Contrast(img)
+            img = enhancer.enhance(1.2)
+
+            # Slightly increase sharpness
+            enhancer = ImageEnhance.Sharpness(img)
+            img = enhancer.enhance(1.1)
+
+            # Save enhanced image
+            img.save(output_path, "PNG", optimize=True)
+        except ImportError:
+            # Fallback: save without enhancement if PIL not available
+            pix.save(output_path)
+
+        doc.close()
+
+        logger.info(f"Generated preview: {output_path} ({pix.width}x{pix.height})")
+        return output_path
+
+    except ImportError:
+        logger.error("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")
+        return None
+    except Exception as e:
+        logger.error(f"Error generating PDF preview: {e}")
+        return None
+
+
+async def generate_pdf_preview_async(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+    """
+    Async wrapper for PDF preview generation.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_path: Optional path for the output preview
+
+    Returns:
+        Path to the generated preview, or None if generation failed
+    """
+    return await asyncio.to_thread(generate_pdf_preview, pdf_path, output_path)
+
+
+def generate_arxiv_thumbnail(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+    """
+    Generate a thumbnail for an ArXiv paper.
+
+    Downloads the PDF and generates a thumbnail of the first page.
+
+    Args:
+        arxiv_id: ArXiv paper ID (e.g., "2311.12022")
+        check_id: Optional check ID for cache naming
+
+    Returns:
+        Path to the generated thumbnail, or None if generation failed
+    """
+    try:
+        import arxiv as arxiv_lib
+
+        # Check if thumbnail already exists
+        output_path = get_thumbnail_cache_path(f"arxiv_{arxiv_id}", check_id)
+        if output_path.exists():
+            logger.debug(f"ArXiv thumbnail already exists: {output_path}")
+            return str(output_path)
+
+        # Download the PDF to a temporary location
+        pdf_dir = Path(tempfile.gettempdir()) / "refchecker_pdfs"
+        pdf_dir.mkdir(parents=True, exist_ok=True)
+        pdf_path = pdf_dir / f"arxiv_{arxiv_id}.pdf"
+
+        # Check if PDF is already downloaded
+        if not pdf_path.exists():
+            logger.info(f"Downloading ArXiv PDF: {arxiv_id}")
+            search = arxiv_lib.Search(id_list=[arxiv_id])
+            paper = next(search.results())
+            paper.download_pdf(filename=str(pdf_path))
+
+        # Generate thumbnail from the PDF
+        return generate_pdf_thumbnail(str(pdf_path), str(output_path))
+
+    except Exception as e:
+        logger.error(f"Error generating ArXiv thumbnail: {e}")
+        return None
+
+
+async def generate_arxiv_thumbnail_async(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+    """
+    Async wrapper for ArXiv thumbnail generation.
+
+    Args:
+        arxiv_id: ArXiv paper ID
+        check_id: Optional check ID for cache naming
+
+    Returns:
+        Path to the generated thumbnail, or None if generation failed
+    """
+    return await asyncio.to_thread(generate_arxiv_thumbnail, arxiv_id, check_id)
+
+
+def generate_arxiv_preview(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+    """
+    Generate a high-resolution preview for an ArXiv paper.
+
+    Downloads the PDF and generates a preview of the first page.
+
+    Args:
+        arxiv_id: ArXiv paper ID (e.g., "2311.12022")
+        check_id: Optional check ID for cache naming
+
+    Returns:
+        Path to the generated preview, or None if generation failed
+    """
+    try:
+        import arxiv as arxiv_lib
+
+        # Check if preview already exists
+        output_path = get_preview_cache_path(f"arxiv_{arxiv_id}", check_id)
+        if output_path.exists():
+            logger.debug(f"ArXiv preview already exists: {output_path}")
+            return str(output_path)
+
+        # Download the PDF to a temporary location
+        pdf_dir = Path(tempfile.gettempdir()) / "refchecker_pdfs"
+        pdf_dir.mkdir(parents=True, exist_ok=True)
+        pdf_path = pdf_dir / f"arxiv_{arxiv_id}.pdf"
+
+        # Check if PDF is already downloaded
+        if not pdf_path.exists():
+            logger.info(f"Downloading ArXiv PDF: {arxiv_id}")
+            search = arxiv_lib.Search(id_list=[arxiv_id])
+            paper = next(search.results())
+            paper.download_pdf(filename=str(pdf_path))
+
+        # Generate preview from the PDF
+        return generate_pdf_preview(str(pdf_path), str(output_path))
+
+    except Exception as e:
+        logger.error(f"Error generating ArXiv preview: {e}")
+        return None
+
+
+async def generate_arxiv_preview_async(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+    """
+    Async wrapper for ArXiv preview generation.
+
+    Args:
+        arxiv_id: ArXiv paper ID
+        check_id: Optional check ID for cache naming
+
+    Returns:
+        Path to the generated preview, or None if generation failed
+    """
+    return await asyncio.to_thread(generate_arxiv_preview, arxiv_id, check_id)
+
+
+def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+    """
+    Generate a thumbnail for pasted text showing actual content.
+
+    Creates an image with the first few lines of the text content.
+
+    Args:
+        check_id: Check ID for naming
+        text_preview: Optional first few lines of text to display
+        text_file_path: Optional path to the text file to read content from
+
+    Returns:
+        Path to the generated thumbnail, or None if generation failed
+    """
+    try:
+        import fitz
+
+        output_path = get_thumbnail_cache_path(f"text_{check_id}", check_id)
+
+        if output_path.exists():
+            return str(output_path)
+
+        # Try to read text content from file
+        text_content = text_preview
+        if text_file_path and os.path.exists(text_file_path):
+            try:
+                with open(text_file_path, 'r', encoding='utf-8') as f:
+                    text_content = f.read()
+            except Exception as e:
+                logger.warning(f"Could not read text file: {e}")
+
+        # Create a document-like image with actual text content
+        doc = fitz.open()
+        page = doc.new_page(width=THUMBNAIL_WIDTH, height=int(THUMBNAIL_WIDTH * 1.4))
+
+        # Fill with white/off-white background
+        page.draw_rect(page.rect, color=(0.95, 0.95, 0.95), fill=(0.99, 0.99, 0.99))
+
+        # Draw border
+        page.draw_rect(page.rect, color=(0.8, 0.8, 0.8), width=1)
+
+        # Draw actual text content if available
+        margin = 10
+        if text_content:
+            # Create a text box for the content
+            text_rect = fitz.Rect(margin, margin, THUMBNAIL_WIDTH - margin, int(THUMBNAIL_WIDTH * 1.4) - margin)
+
+            # Truncate to first ~500 chars for thumbnail
+            display_text = text_content[:500]
+            if len(text_content) > 500:
+                display_text += "..."
+
+            # Insert text with small font
+            page.insert_textbox(
+                text_rect,
+                display_text,
+                fontsize=6,
+                color=(0.2, 0.2, 0.2),
+                fontname="helv"
+            )
+        else:
+            # Fallback: Draw placeholder lines
+            line_height = 12
+            y = margin + 30
+
+            # Draw a "T" icon at top
+            text_rect = fitz.Rect(margin, margin, margin + 30, margin + 25)
+            page.insert_textbox(text_rect, "T", fontsize=20, color=(0.4, 0.4, 0.6))
+
+            for i in range(10):
+                line_width = THUMBNAIL_WIDTH - 2 * margin
+                if i % 3 == 2:
+                    line_width = line_width * 0.7
+
+                page.draw_line(
+                    fitz.Point(margin, y),
+                    fitz.Point(margin + line_width, y),
+                    color=(0.7, 0.7, 0.7),
+                    width=2
+                )
+                y += line_height
+
+        # Render to pixmap and save
+        pix = page.get_pixmap(alpha=False)
+        pix.save(str(output_path))
+        doc.close()
+
+        logger.info(f"Generated text thumbnail: {output_path}")
+        return str(output_path)
+
+    except ImportError:
+        logger.error("PyMuPDF (fitz) is not installed")
+        return None
+    except Exception as e:
+        logger.error(f"Error generating text thumbnail: {e}")
+        return None
+
+
+async def get_text_thumbnail_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+    """Async wrapper for text thumbnail generation."""
+    return await asyncio.to_thread(get_text_thumbnail, check_id, text_preview, text_file_path)
+
+
+def cleanup_old_thumbnails(max_age_days: int = 30):
+    """
+    Clean up old thumbnails from the cache.
+
+    Args:
+        max_age_days: Maximum age in days before thumbnails are deleted
+    """
+    try:
+        import time
+
+        if not THUMBNAIL_CACHE_DIR.exists():
+            return
+
+        max_age_seconds = max_age_days * 24 * 60 * 60
+        current_time = time.time()
+
+        for thumb_path in THUMBNAIL_CACHE_DIR.glob("thumb_*.png"):
+            try:
+                file_age = current_time - thumb_path.stat().st_mtime
+                if file_age > max_age_seconds:
+                    thumb_path.unlink()
+                    logger.debug(f"Deleted old thumbnail: {thumb_path}")
+            except Exception as e:
+                logger.warning(f"Error deleting thumbnail {thumb_path}: {e}")
+
+    except Exception as e:
+        logger.error(f"Error cleaning up thumbnails: {e}")
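
Side note: the module above exposes *_async wrappers that push the blocking PyMuPDF work onto a thread via asyncio.to_thread. A minimal usage sketch (the PDF path and check_id values here are made-up for illustration, not part of the diff):

import asyncio
from backend.thumbnail import generate_pdf_thumbnail_async, generate_arxiv_preview_async

async def demo() -> None:
    # Thumbnail for a local PDF; the cached path is returned on repeat calls
    thumb = await generate_pdf_thumbnail_async("/tmp/paper.pdf")  # hypothetical path
    # High-resolution first-page preview for an arXiv paper (ID taken from the docstring example)
    preview = await generate_arxiv_preview_async("2311.12022", check_id=1)
    print(thumb, preview)

asyncio.run(demo())
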
backend/websocket_manager.py
ADDED
@@ -0,0 +1,104 @@
+"""
+WebSocket connection manager for real-time updates
+"""
+import asyncio
+import json
+from typing import Dict, Set
+from fastapi import WebSocket
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class ConnectionManager:
+    """Manages WebSocket connections for real-time updates"""
+
+    def __init__(self):
+        # Map of session_id -> set of websocket connections
+        self.active_connections: Dict[str, Set[WebSocket]] = {}
+
+    async def connect(self, websocket: WebSocket, session_id: str):
+        """Accept a new WebSocket connection"""
+        await websocket.accept()
+        if session_id not in self.active_connections:
+            self.active_connections[session_id] = set()
+        self.active_connections[session_id].add(websocket)
+        logger.info(f"WebSocket connected for session: {session_id}")
+
+    def disconnect(self, websocket: WebSocket, session_id: str):
+        """Remove a WebSocket connection"""
+        if session_id in self.active_connections:
+            self.active_connections[session_id].discard(websocket)
+            if not self.active_connections[session_id]:
+                del self.active_connections[session_id]
+        logger.info(f"WebSocket disconnected for session: {session_id}")
+
+    async def send_message(self, session_id: str, message_type: str, data: dict):
+        """Send a message to all connections for a session"""
+        if session_id not in self.active_connections:
+            logger.debug(f"No active connections for session: {session_id}")
+            return
+
+        # Flatten structure: frontend expects {type, session_id, ...data}
+        # Include session_id so the client can ignore stale messages from old sessions
+        message = {"type": message_type, "session_id": session_id, **data}
+        message_json = json.dumps(message)
+
+        logger.debug(f"Sending {message_type} to session {session_id}: {message_json[:200]}...")
+
+        # Send to all connections for this session
+        disconnected = set()
+        for websocket in self.active_connections[session_id]:
+            try:
+                await websocket.send_text(message_json)
+            except Exception as e:
+                logger.error(f"Error sending message to websocket: {e}")
+                disconnected.add(websocket)
+
+        # Clean up disconnected websockets
+        for ws in disconnected:
+            self.disconnect(ws, session_id)
+
+    async def broadcast_started(self, session_id: str, paper_title: str, paper_source: str):
+        """Broadcast that checking has started"""
+        await self.send_message(session_id, "started", {
+            "paper_title": paper_title,
+            "paper_source": paper_source
+        })
+
+    async def broadcast_extracting(self, session_id: str):
+        """Broadcast that references are being extracted"""
+        await self.send_message(session_id, "extracting", {
+            "message": "Extracting references from paper..."
+        })
+
+    async def broadcast_progress(self, session_id: str, current: int, total: int):
+        """Broadcast progress update"""
+        await self.send_message(session_id, "progress", {
+            "current": current,
+            "total": total,
+            "percent": round((current / total * 100) if total > 0 else 0, 1)
+        })
+
+    async def broadcast_reference_result(self, session_id: str, reference_data: dict):
+        """Broadcast a reference checking result"""
+        await self.send_message(session_id, "reference_result", reference_data)
+
+    async def broadcast_summary_update(self, session_id: str, summary: dict):
+        """Broadcast updated summary statistics"""
+        await self.send_message(session_id, "summary_update", summary)
+
+    async def broadcast_completed(self, session_id: str, final_summary: dict):
+        """Broadcast that checking is complete"""
+        await self.send_message(session_id, "completed", final_summary)
+
+    async def broadcast_error(self, session_id: str, error_message: str, error_details: str = ""):
+        """Broadcast an error"""
+        await self.send_message(session_id, "error", {
+            "message": error_message,
+            "details": error_details
+        })
+
+
+# Global connection manager instance
+manager = ConnectionManager()
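
For orientation, a sketch of how an app might expose this manager on a WebSocket route. The route path and FastAPI wiring are assumptions for illustration (this diff does not show backend/main.py's routes); only ConnectionManager and manager come from the file above:

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from backend.websocket_manager import manager

app = FastAPI()

@app.websocket("/ws/{session_id}")  # hypothetical route, not confirmed by the diff
async def updates(websocket: WebSocket, session_id: str):
    await manager.connect(websocket, session_id)
    try:
        while True:
            # The server only pushes; reads just keep the connection alive
            await websocket.receive_text()
    except WebSocketDisconnect:
        manager.disconnect(websocket, session_id)
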
refchecker/__version__.py
CHANGED
refchecker/checkers/crossref.py
CHANGED
@@ -487,13 +487,22 @@ class CrossRefReferenceChecker:
         work_doi = work_data.get('DOI')
         if doi and work_doi:
             # Compare DOIs using the proper comparison function
-            from refchecker.utils.doi_utils import compare_dois
+            from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
             if not compare_dois(doi, work_doi):
-
-
-
-
-
+                # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                # Treat as warning instead of error
+                if validate_doi_resolves(doi):
+                    errors.append({
+                        'warning_type': 'doi',
+                        'warning_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
+                else:
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
 
         # Extract URL from work data
         work_url = self.extract_url_from_work(work_data)
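
The behavioral change: a DOI mismatch is now reported as a warning rather than an error whenever the cited DOI still resolves, on the theory that a resolving DOI is likely a valid alternate identifier. For illustration only, validate_doi_resolves might look roughly like the sketch below; the real implementation lives in refchecker/utils/doi_utils.py (+32 -1 above), which this diff does not display:

import requests

def validate_doi_resolves(doi: str, timeout: float = 10.0) -> bool:
    # Sketch: treat a successful redirect chain from doi.org as "resolves"
    try:
        resp = requests.head(f"https://doi.org/{doi}", allow_redirects=True, timeout=timeout)
        return resp.status_code < 400
    except requests.RequestException:
        return False
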
refchecker/checkers/enhanced_hybrid_checker.py
CHANGED
@@ -256,6 +256,21 @@ class EnhancedHybridReferenceChecker:
         Returns:
             Tuple of (verified_data, errors, url)
         """
+        # Check if this is a URL-only reference (should skip verification)
+        authors = reference.get('authors', [])
+        if authors and "URL Reference" in authors:
+            # Skip verification for URL references - they're just links, not papers
+            logger.debug("Enhanced Hybrid: Skipping verification for URL reference")
+            return None, [], reference.get('cited_url') or reference.get('url')
+
+        # Also check if it looks like a URL-only reference (no title, just URL)
+        title = reference.get('title', '').strip()
+        cited_url = reference.get('cited_url') or reference.get('url')
+        if not title and cited_url:
+            # This is a URL-only reference without a title
+            logger.debug(f"Enhanced Hybrid: Skipping verification for URL-only reference: {cited_url}")
+            return None, [], cited_url
+
         # Track all APIs that failed and could be retried
         failed_apis = []
 
@@ -533,10 +548,9 @@
         if self.semantic_scholar:
             return self.semantic_scholar.normalize_paper_title(title)
         else:
-            #
-            import
-
-            return re.sub(r'[^\w\s]', '', title)
+            # Use the centralized normalization function from text_utils
+            from refchecker.utils.text_utils import normalize_paper_title as normalize_title
+            return normalize_title(title)
 
     def compare_authors(self, cited_authors: List[str], correct_authors: List[Any]) -> Tuple[bool, str]:
         """
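
To make the first hunk's early-exit concrete, the skip condition can be read as a standalone predicate. This is an illustrative mirror of the added lines, not an API the package exports:

def is_url_only_reference(reference: dict) -> bool:
    # Explicitly tagged URL references are skipped outright
    authors = reference.get('authors', [])
    if authors and "URL Reference" in authors:
        return True
    # A reference with no title but a URL is also treated as URL-only
    title = reference.get('title', '').strip()
    cited_url = reference.get('cited_url') or reference.get('url')
    return bool(not title and cited_url)

# Example: a bare link is returned unverified instead of being sent to the paper APIs
assert is_url_only_reference({"title": "", "authors": [], "url": "https://example.com/spec"})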