academic-refchecker 1.2.64__py3-none-any.whl → 1.2.66__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
backend/thumbnail.py ADDED
@@ -0,0 +1,517 @@
+ """
+ Thumbnail generation utilities for PDF and web page previews.
+
+ Uses PyMuPDF (fitz) to extract the first page of PDFs as thumbnails.
+ Thumbnails are cached on disk to avoid regeneration.
+ """
+ import os
+ import hashlib
+ import logging
+ import tempfile
+ from pathlib import Path
+ from typing import Optional
+ import asyncio
+
+ logger = logging.getLogger(__name__)
+
+ # Default thumbnail cache directory
+ THUMBNAIL_CACHE_DIR = Path(tempfile.gettempdir()) / "refchecker_thumbnails"
+
+ # Thumbnail settings
+ THUMBNAIL_WIDTH = 200  # Target width in pixels for small thumbnails
+ THUMBNAIL_DPI = 150  # Higher DPI for sharper text rendering
+
+ # Preview settings (larger image for overlay view)
+ PREVIEW_WIDTH = 1600  # Target width in pixels for preview/overlay
+
+
+ def get_thumbnail_cache_path(source_identifier: str, check_id: Optional[int] = None) -> Path:
+     """
+     Get the cache path for a thumbnail.
+
+     Args:
+         source_identifier: A unique identifier for the source (URL, file path, or hash)
+         check_id: Optional check ID for more unique naming
+
+     Returns:
+         Path to the thumbnail file (may not exist yet)
+     """
+     # Create cache directory if it doesn't exist
+     THUMBNAIL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+     # Create a hash of the source for the filename
+     source_hash = hashlib.md5(source_identifier.encode()).hexdigest()[:12]
+
+     if check_id:
+         filename = f"thumb_{check_id}_{source_hash}.png"
+     else:
+         filename = f"thumb_{source_hash}.png"
+
+     return THUMBNAIL_CACHE_DIR / filename
+
+
+ def get_preview_cache_path(source_identifier: str, check_id: Optional[int] = None) -> Path:
+     """
+     Get the cache path for a preview (larger image for overlay).
+
+     Args:
+         source_identifier: A unique identifier for the source (URL, file path, or hash)
+         check_id: Optional check ID for more unique naming
+
+     Returns:
+         Path to the preview file (may not exist yet)
+     """
+     # Create cache directory if it doesn't exist
+     THUMBNAIL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+     # Create a hash of the source for the filename
+     source_hash = hashlib.md5(source_identifier.encode()).hexdigest()[:12]
+
+     if check_id:
+         filename = f"preview_{check_id}_{source_hash}.png"
+     else:
+         filename = f"preview_{source_hash}.png"
+
+     return THUMBNAIL_CACHE_DIR / filename
+
+
+ def generate_pdf_thumbnail(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+     """
+     Generate a thumbnail from the first page of a PDF.
+
+     Args:
+         pdf_path: Path to the PDF file
+         output_path: Optional path for the output thumbnail. If not provided,
+             uses the cache directory.
+
+     Returns:
+         Path to the generated thumbnail, or None if generation failed
+     """
+     try:
+         import fitz  # PyMuPDF
+
+         if not os.path.exists(pdf_path):
+             logger.error(f"PDF file not found: {pdf_path}")
+             return None
+
+         # Determine output path
+         if output_path is None:
+             output_path = str(get_thumbnail_cache_path(pdf_path))
+
+         # Check if thumbnail already exists
+         if os.path.exists(output_path):
+             logger.debug(f"Thumbnail already exists: {output_path}")
+             return output_path
+
+         # Open the PDF
+         doc = fitz.open(pdf_path)
+
+         if len(doc) == 0:
+             logger.warning(f"PDF has no pages: {pdf_path}")
+             doc.close()
+             return None
+
+         # Get the first page
+         page = doc[0]
+
+         # Calculate zoom factor to get desired width
+         page_width = page.rect.width
+         zoom = THUMBNAIL_WIDTH / page_width
+
+         # Create transformation matrix
+         mat = fitz.Matrix(zoom, zoom)
+
+         # Render page to pixmap with higher quality
+         pix = page.get_pixmap(matrix=mat, alpha=False)
+
+         # Enhance contrast to make text darker/more readable
+         try:
+             from PIL import Image, ImageEnhance
+             import io
+
+             # Convert pixmap to PIL Image
+             img_data = pix.tobytes("png")
+             img = Image.open(io.BytesIO(img_data))
+
+             # Increase contrast (1.3 = 30% more contrast)
+             enhancer = ImageEnhance.Contrast(img)
+             img = enhancer.enhance(1.3)
+
+             # Slightly increase sharpness
+             enhancer = ImageEnhance.Sharpness(img)
+             img = enhancer.enhance(1.2)
+
+             # Save enhanced image
+             img.save(output_path, "PNG")
+         except ImportError:
+             # Fallback: save without enhancement if PIL not available
+             pix.save(output_path)
+
+         doc.close()
+
+         logger.info(f"Generated thumbnail: {output_path} ({pix.width}x{pix.height})")
+         return output_path
+
+     except ImportError:
+         logger.error("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")
+         return None
+     except Exception as e:
+         logger.error(f"Error generating PDF thumbnail: {e}")
+         return None
+
+
+ async def generate_pdf_thumbnail_async(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+     """
+     Async wrapper for PDF thumbnail generation.
+
+     Args:
+         pdf_path: Path to the PDF file
+         output_path: Optional path for the output thumbnail
+
+     Returns:
+         Path to the generated thumbnail, or None if generation failed
+     """
+     return await asyncio.to_thread(generate_pdf_thumbnail, pdf_path, output_path)
+
+
+ def generate_pdf_preview(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+     """
+     Generate a high-resolution preview from the first page of a PDF.
+
+     Args:
+         pdf_path: Path to the PDF file
+         output_path: Optional path for the output preview. If not provided,
+             uses the cache directory.
+
+     Returns:
+         Path to the generated preview, or None if generation failed
+     """
+     try:
+         import fitz  # PyMuPDF
+
+         if not os.path.exists(pdf_path):
+             logger.error(f"PDF file not found: {pdf_path}")
+             return None
+
+         # Determine output path
+         if output_path is None:
+             output_path = str(get_preview_cache_path(pdf_path))
+
+         # Check if preview already exists
+         if os.path.exists(output_path):
+             logger.debug(f"Preview already exists: {output_path}")
+             return output_path
+
+         # Open the PDF
+         doc = fitz.open(pdf_path)
+
+         if len(doc) == 0:
+             logger.warning(f"PDF has no pages: {pdf_path}")
+             doc.close()
+             return None
+
+         # Get the first page
+         page = doc[0]
+
+         # Calculate zoom factor to get desired width (larger for preview)
+         page_width = page.rect.width
+         zoom = PREVIEW_WIDTH / page_width
+
+         # Create transformation matrix
+         mat = fitz.Matrix(zoom, zoom)
+
+         # Render page to pixmap with higher quality
+         pix = page.get_pixmap(matrix=mat, alpha=False)
+
+         # Enhance contrast to make text darker/more readable
+         try:
+             from PIL import Image, ImageEnhance
+             import io
+
+             # Convert pixmap to PIL Image
+             img_data = pix.tobytes("png")
+             img = Image.open(io.BytesIO(img_data))
+
+             # Increase contrast (1.2 = 20% more contrast)
+             enhancer = ImageEnhance.Contrast(img)
+             img = enhancer.enhance(1.2)
+
+             # Slightly increase sharpness
+             enhancer = ImageEnhance.Sharpness(img)
+             img = enhancer.enhance(1.1)
+
+             # Save enhanced image
+             img.save(output_path, "PNG", optimize=True)
+         except ImportError:
+             # Fallback: save without enhancement if PIL not available
+             pix.save(output_path)
+
+         doc.close()
+
+         logger.info(f"Generated preview: {output_path} ({pix.width}x{pix.height})")
+         return output_path
+
+     except ImportError:
+         logger.error("PyMuPDF (fitz) is not installed. Install with: pip install pymupdf")
+         return None
+     except Exception as e:
+         logger.error(f"Error generating PDF preview: {e}")
+         return None
+
+
+ async def generate_pdf_preview_async(pdf_path: str, output_path: Optional[str] = None) -> Optional[str]:
+     """
+     Async wrapper for PDF preview generation.
+
+     Args:
+         pdf_path: Path to the PDF file
+         output_path: Optional path for the output preview
+
+     Returns:
+         Path to the generated preview, or None if generation failed
+     """
+     return await asyncio.to_thread(generate_pdf_preview, pdf_path, output_path)
+
+
+ def generate_arxiv_thumbnail(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+     """
+     Generate a thumbnail for an ArXiv paper.
+
+     Downloads the PDF and generates a thumbnail of the first page.
+
+     Args:
+         arxiv_id: ArXiv paper ID (e.g., "2311.12022")
+         check_id: Optional check ID for cache naming
+
+     Returns:
+         Path to the generated thumbnail, or None if generation failed
+     """
+     try:
+         import arxiv as arxiv_lib
+
+         # Check if thumbnail already exists
+         output_path = get_thumbnail_cache_path(f"arxiv_{arxiv_id}", check_id)
+         if output_path.exists():
+             logger.debug(f"ArXiv thumbnail already exists: {output_path}")
+             return str(output_path)
+
+         # Download the PDF to a temporary location
+         pdf_dir = Path(tempfile.gettempdir()) / "refchecker_pdfs"
+         pdf_dir.mkdir(parents=True, exist_ok=True)
+         pdf_path = pdf_dir / f"arxiv_{arxiv_id}.pdf"
+
+         # Check if PDF is already downloaded
+         if not pdf_path.exists():
+             logger.info(f"Downloading ArXiv PDF: {arxiv_id}")
+             search = arxiv_lib.Search(id_list=[arxiv_id])
+             paper = next(search.results())
+             paper.download_pdf(filename=str(pdf_path))
+
+         # Generate thumbnail from the PDF
+         return generate_pdf_thumbnail(str(pdf_path), str(output_path))
+
+     except Exception as e:
+         logger.error(f"Error generating ArXiv thumbnail: {e}")
+         return None
+
+
+ async def generate_arxiv_thumbnail_async(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+     """
+     Async wrapper for ArXiv thumbnail generation.
+
+     Args:
+         arxiv_id: ArXiv paper ID
+         check_id: Optional check ID for cache naming
+
+     Returns:
+         Path to the generated thumbnail, or None if generation failed
+     """
+     return await asyncio.to_thread(generate_arxiv_thumbnail, arxiv_id, check_id)
+
+
+ def generate_arxiv_preview(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+     """
+     Generate a high-resolution preview for an ArXiv paper.
+
+     Downloads the PDF and generates a preview of the first page.
+
+     Args:
+         arxiv_id: ArXiv paper ID (e.g., "2311.12022")
+         check_id: Optional check ID for cache naming
+
+     Returns:
+         Path to the generated preview, or None if generation failed
+     """
+     try:
+         import arxiv as arxiv_lib
+
+         # Check if preview already exists
+         output_path = get_preview_cache_path(f"arxiv_{arxiv_id}", check_id)
+         if output_path.exists():
+             logger.debug(f"ArXiv preview already exists: {output_path}")
+             return str(output_path)
+
+         # Download the PDF to a temporary location
+         pdf_dir = Path(tempfile.gettempdir()) / "refchecker_pdfs"
+         pdf_dir.mkdir(parents=True, exist_ok=True)
+         pdf_path = pdf_dir / f"arxiv_{arxiv_id}.pdf"
+
+         # Check if PDF is already downloaded
+         if not pdf_path.exists():
+             logger.info(f"Downloading ArXiv PDF: {arxiv_id}")
+             search = arxiv_lib.Search(id_list=[arxiv_id])
+             paper = next(search.results())
+             paper.download_pdf(filename=str(pdf_path))
+
+         # Generate preview from the PDF
+         return generate_pdf_preview(str(pdf_path), str(output_path))
+
+     except Exception as e:
+         logger.error(f"Error generating ArXiv preview: {e}")
+         return None
+
+
+ async def generate_arxiv_preview_async(arxiv_id: str, check_id: Optional[int] = None) -> Optional[str]:
+     """
+     Async wrapper for ArXiv preview generation.
+
+     Args:
+         arxiv_id: ArXiv paper ID
+         check_id: Optional check ID for cache naming
+
+     Returns:
+         Path to the generated preview, or None if generation failed
+     """
+     return await asyncio.to_thread(generate_arxiv_preview, arxiv_id, check_id)
+
+
+ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+     """
+     Generate a thumbnail for pasted text showing actual content.
+
+     Creates an image with the first few lines of the text content.
+
+     Args:
+         check_id: Check ID for naming
+         text_preview: Optional first few lines of text to display
+         text_file_path: Optional path to the text file to read content from
+
+     Returns:
+         Path to the generated thumbnail, or None if generation failed
+     """
+     try:
+         import fitz
+
+         output_path = get_thumbnail_cache_path(f"text_{check_id}", check_id)
+
+         if output_path.exists():
+             return str(output_path)
+
+         # Try to read text content from file
+         text_content = text_preview
+         if text_file_path and os.path.exists(text_file_path):
+             try:
+                 with open(text_file_path, 'r', encoding='utf-8') as f:
+                     text_content = f.read()
+             except Exception as e:
+                 logger.warning(f"Could not read text file: {e}")
+
+         # Create a document-like image with actual text content
+         doc = fitz.open()
+         page = doc.new_page(width=THUMBNAIL_WIDTH, height=int(THUMBNAIL_WIDTH * 1.4))
+
+         # Fill with white/off-white background
+         page.draw_rect(page.rect, color=(0.95, 0.95, 0.95), fill=(0.99, 0.99, 0.99))
+
+         # Draw border
+         page.draw_rect(page.rect, color=(0.8, 0.8, 0.8), width=1)
+
+         # Draw actual text content if available
+         margin = 10
+         if text_content:
+             # Create a text box for the content
+             text_rect = fitz.Rect(margin, margin, THUMBNAIL_WIDTH - margin, int(THUMBNAIL_WIDTH * 1.4) - margin)
+
+             # Truncate to first ~500 chars for thumbnail
+             display_text = text_content[:500]
+             if len(text_content) > 500:
+                 display_text += "..."
+
+             # Insert text with small font
+             page.insert_textbox(
+                 text_rect,
+                 display_text,
+                 fontsize=6,
+                 color=(0.2, 0.2, 0.2),
+                 fontname="helv"
+             )
+         else:
+             # Fallback: Draw placeholder lines
+             line_height = 12
+             y = margin + 30
+
+             # Draw a "T" icon at top
+             text_rect = fitz.Rect(margin, margin, margin + 30, margin + 25)
+             page.insert_textbox(text_rect, "T", fontsize=20, color=(0.4, 0.4, 0.6))
+
+             for i in range(10):
+                 line_width = THUMBNAIL_WIDTH - 2 * margin
+                 if i % 3 == 2:
+                     line_width = line_width * 0.7
+
+                 page.draw_line(
+                     fitz.Point(margin, y),
+                     fitz.Point(margin + line_width, y),
+                     color=(0.7, 0.7, 0.7),
+                     width=2
+                 )
+                 y += line_height
+
+         # Render to pixmap and save
+         pix = page.get_pixmap(alpha=False)
+         pix.save(str(output_path))
+         doc.close()
+
+         logger.info(f"Generated text thumbnail: {output_path}")
+         return str(output_path)
+
+     except ImportError:
+         logger.error("PyMuPDF (fitz) is not installed")
+         return None
+     except Exception as e:
+         logger.error(f"Error generating text thumbnail: {e}")
+         return None
+
+
+ async def get_text_thumbnail_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+     """Async wrapper for text thumbnail generation."""
+     return await asyncio.to_thread(get_text_thumbnail, check_id, text_preview, text_file_path)
+
+
+ def cleanup_old_thumbnails(max_age_days: int = 30):
+     """
+     Clean up old thumbnails from the cache.
+
+     Args:
+         max_age_days: Maximum age in days before thumbnails are deleted
+     """
+     try:
+         import time
+
+         if not THUMBNAIL_CACHE_DIR.exists():
+             return
+
+         max_age_seconds = max_age_days * 24 * 60 * 60
+         current_time = time.time()
+
+         for thumb_path in THUMBNAIL_CACHE_DIR.glob("thumb_*.png"):
+             try:
+                 file_age = current_time - thumb_path.stat().st_mtime
+                 if file_age > max_age_seconds:
+                     thumb_path.unlink()
+                     logger.debug(f"Deleted old thumbnail: {thumb_path}")
+             except Exception as e:
+                 logger.warning(f"Error deleting thumbnail {thumb_path}: {e}")
+
+     except Exception as e:
+         logger.error(f"Error cleaning up thumbnails: {e}")
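The new module's public entry points all return a filesystem path (or None on failure), with blocking PyMuPDF work pushed to a thread via asyncio.to_thread. As a minimal usage sketch only: the route path, app object, and error handling below are illustrative assumptions, not part of this diff; only backend.thumbnail and its functions come from the added file.

# Illustrative only: serving a cached arXiv thumbnail from a FastAPI route.
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse

from backend.thumbnail import generate_arxiv_thumbnail_async

app = FastAPI()

@app.get("/thumbnails/arxiv/{arxiv_id}")  # hypothetical route, not defined in this package diff
async def arxiv_thumbnail(arxiv_id: str):
    # Generation runs in a worker thread; repeated calls hit the on-disk cache
    path = await generate_arxiv_thumbnail_async(arxiv_id)
    if path is None:
        raise HTTPException(status_code=404, detail="thumbnail generation failed")
    return FileResponse(path, media_type="image/png")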
@@ -0,0 +1,104 @@
+ """
+ WebSocket connection manager for real-time updates
+ """
+ import asyncio
+ import json
+ from typing import Dict, Set
+ from fastapi import WebSocket
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConnectionManager:
+     """Manages WebSocket connections for real-time updates"""
+
+     def __init__(self):
+         # Map of session_id -> set of websocket connections
+         self.active_connections: Dict[str, Set[WebSocket]] = {}
+
+     async def connect(self, websocket: WebSocket, session_id: str):
+         """Accept a new WebSocket connection"""
+         await websocket.accept()
+         if session_id not in self.active_connections:
+             self.active_connections[session_id] = set()
+         self.active_connections[session_id].add(websocket)
+         logger.info(f"WebSocket connected for session: {session_id}")
+
+     def disconnect(self, websocket: WebSocket, session_id: str):
+         """Remove a WebSocket connection"""
+         if session_id in self.active_connections:
+             self.active_connections[session_id].discard(websocket)
+             if not self.active_connections[session_id]:
+                 del self.active_connections[session_id]
+         logger.info(f"WebSocket disconnected for session: {session_id}")
+
+     async def send_message(self, session_id: str, message_type: str, data: dict):
+         """Send a message to all connections for a session"""
+         if session_id not in self.active_connections:
+             logger.debug(f"No active connections for session: {session_id}")
+             return
+
+         # Flatten structure: frontend expects {type, session_id, ...data}
+         # Include session_id so the client can ignore stale messages from old sessions
+         message = {"type": message_type, "session_id": session_id, **data}
+         message_json = json.dumps(message)
+
+         logger.debug(f"Sending {message_type} to session {session_id}: {message_json[:200]}...")
+
+         # Send to all connections for this session
+         disconnected = set()
+         for websocket in self.active_connections[session_id]:
+             try:
+                 await websocket.send_text(message_json)
+             except Exception as e:
+                 logger.error(f"Error sending message to websocket: {e}")
+                 disconnected.add(websocket)
+
+         # Clean up disconnected websockets
+         for ws in disconnected:
+             self.disconnect(ws, session_id)
+
+     async def broadcast_started(self, session_id: str, paper_title: str, paper_source: str):
+         """Broadcast that checking has started"""
+         await self.send_message(session_id, "started", {
+             "paper_title": paper_title,
+             "paper_source": paper_source
+         })
+
+     async def broadcast_extracting(self, session_id: str):
+         """Broadcast that references are being extracted"""
+         await self.send_message(session_id, "extracting", {
+             "message": "Extracting references from paper..."
+         })
+
+     async def broadcast_progress(self, session_id: str, current: int, total: int):
+         """Broadcast progress update"""
+         await self.send_message(session_id, "progress", {
+             "current": current,
+             "total": total,
+             "percent": round((current / total * 100) if total > 0 else 0, 1)
+         })
+
+     async def broadcast_reference_result(self, session_id: str, reference_data: dict):
+         """Broadcast a reference checking result"""
+         await self.send_message(session_id, "reference_result", reference_data)
+
+     async def broadcast_summary_update(self, session_id: str, summary: dict):
+         """Broadcast updated summary statistics"""
+         await self.send_message(session_id, "summary_update", summary)
+
+     async def broadcast_completed(self, session_id: str, final_summary: dict):
+         """Broadcast that checking is complete"""
+         await self.send_message(session_id, "completed", final_summary)
+
+     async def broadcast_error(self, session_id: str, error_message: str, error_details: str = ""):
+         """Broadcast an error"""
+         await self.send_message(session_id, "error", {
+             "message": error_message,
+             "details": error_details
+         })
+
+
+ # Global connection manager instance
+ manager = ConnectionManager()
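The module-level manager instance is meant to be shared by the API layer. As a sketch of how it might be wired into a FastAPI WebSocket endpoint: the /ws/{session_id} path and the backend.websocket_manager import path are assumptions for illustration (the file's actual path is not shown in this diff).

# Illustrative only: endpoint path and import path are assumed, not taken from the diff.
from fastapi import FastAPI, WebSocket, WebSocketDisconnect

from backend.websocket_manager import manager  # assumed module path for the file above

app = FastAPI()

@app.websocket("/ws/{session_id}")
async def websocket_endpoint(websocket: WebSocket, session_id: str):
    await manager.connect(websocket, session_id)
    try:
        while True:
            # Keep the connection open; results are pushed via manager.broadcast_* calls
            await websocket.receive_text()
    except WebSocketDisconnect:
        manager.disconnect(websocket, session_id)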
refchecker/__version__.py CHANGED
@@ -1,5 +1,5 @@
  """Version information for RefChecker."""

- __version__ = "1.2.64"
+ __version__ = "1.2.66"

- __version__ = "1.2.64"
+ __version__ = "1.2.66"
@@ -487,13 +487,22 @@ class CrossRefReferenceChecker:
        work_doi = work_data.get('DOI')
        if doi and work_doi:
            # Compare DOIs using the proper comparison function
-           from refchecker.utils.doi_utils import compare_dois
+           from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
            if not compare_dois(doi, work_doi):
-               errors.append({
-                   'error_type': 'doi',
-                   'error_details': format_doi_mismatch(doi, work_doi),
-                   'ref_doi_correct': work_doi
-               })
+               # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+               # Treat as warning instead of error
+               if validate_doi_resolves(doi):
+                   errors.append({
+                       'warning_type': 'doi',
+                       'warning_details': format_doi_mismatch(doi, work_doi),
+                       'ref_doi_correct': work_doi
+                   })
+               else:
+                   errors.append({
+                       'error_type': 'doi',
+                       'error_details': format_doi_mismatch(doi, work_doi),
+                       'ref_doi_correct': work_doi
+                   })

        # Extract URL from work data
        work_url = self.extract_url_from_work(work_data)
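This hunk downgrades a DOI mismatch to a warning whenever the cited DOI still resolves. The diff does not show validate_doi_resolves itself; a plausible stand-in, assuming it just checks that https://doi.org/<doi> answers without an error status, could look like the hypothetical sketch below (not refchecker's actual implementation).

# Hypothetical sketch of a DOI-resolution check; the real validate_doi_resolves
# in refchecker.utils.doi_utils may differ (caching, retries, stricter status handling).
import requests

def validate_doi_resolves(doi: str, timeout: float = 10.0) -> bool:
    try:
        resp = requests.head(f"https://doi.org/{doi}", allow_redirects=True, timeout=timeout)
        return resp.status_code < 400
    except requests.RequestException:
        return False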
@@ -256,6 +256,21 @@ class EnhancedHybridReferenceChecker:
        Returns:
            Tuple of (verified_data, errors, url)
        """
+       # Check if this is a URL-only reference (should skip verification)
+       authors = reference.get('authors', [])
+       if authors and "URL Reference" in authors:
+           # Skip verification for URL references - they're just links, not papers
+           logger.debug("Enhanced Hybrid: Skipping verification for URL reference")
+           return None, [], reference.get('cited_url') or reference.get('url')
+
+       # Also check if it looks like a URL-only reference (no title, just URL)
+       title = reference.get('title', '').strip()
+       cited_url = reference.get('cited_url') or reference.get('url')
+       if not title and cited_url:
+           # This is a URL-only reference without a title
+           logger.debug(f"Enhanced Hybrid: Skipping verification for URL-only reference: {cited_url}")
+           return None, [], cited_url
+
        # Track all APIs that failed and could be retried
        failed_apis = []

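For illustration, the shape of a reference dict that the new guard short-circuits; the field names come from the diff, while the surrounding method name is not shown in this hunk and is therefore only assumed.

# Illustrative reference dict that the URL-only guard now skips.
url_only_ref = {
    "authors": ["URL Reference"],
    "title": "",
    "cited_url": "https://example.com/blog-post",
}
# The enclosing verification method (name not shown in this hunk) would return
# (None, [], "https://example.com/blog-post") instead of querying the paper APIs.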
@@ -533,10 +548,9 @@ class EnhancedHybridReferenceChecker:
        if self.semantic_scholar:
            return self.semantic_scholar.normalize_paper_title(title)
        else:
-           # Basic normalization if Semantic Scholar is not available
-           import re
-           title = re.sub(r'\s+', ' ', title.strip().lower())
-           return re.sub(r'[^\w\s]', '', title)
+           # Use the centralized normalization function from text_utils
+           from refchecker.utils.text_utils import normalize_paper_title as normalize_title
+           return normalize_title(title)

    def compare_authors(self, cited_authors: List[str], correct_authors: List[Any]) -> Tuple[bool, str]:
        """