academic-refchecker 2.0.12__tar.gz → 2.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {academic_refchecker-2.0.12/academic_refchecker.egg-info → academic_refchecker-2.0.14}/PKG-INFO +1 -1
  2. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14/academic_refchecker.egg-info}/PKG-INFO +1 -1
  3. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/main.py +33 -5
  4. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/refchecker_wrapper.py +42 -1
  5. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/thumbnail.py +117 -0
  6. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/__version__.py +1 -1
  7. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/arxiv_citation.py +181 -49
  8. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/enhanced_hybrid_checker.py +117 -4
  9. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/semantic_scholar.py +43 -1
  10. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/llm/base.py +1 -15
  11. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/llm/providers.py +102 -113
  12. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/author_utils.py +15 -2
  13. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/bibliography_utils.py +2 -2
  14. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/LICENSE +0 -0
  15. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/MANIFEST.in +0 -0
  16. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/README.md +0 -0
  17. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/SOURCES.txt +0 -0
  18. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/dependency_links.txt +0 -0
  19. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/entry_points.txt +0 -0
  20. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/requires.txt +0 -0
  21. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/top_level.txt +0 -0
  22. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/__init__.py +0 -0
  23. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/__main__.py +0 -0
  24. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/cli.py +0 -0
  25. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/concurrency.py +0 -0
  26. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/database.py +0 -0
  27. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/models.py +0 -0
  28. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/static/assets/index-2P6L_39v.css +0 -0
  29. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/static/assets/index-hk21nqxR.js +0 -0
  30. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/static/favicon.svg +0 -0
  31. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/static/index.html +0 -0
  32. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/static/vite.svg +0 -0
  33. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/backend/websocket_manager.py +0 -0
  34. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/pyproject.toml +0 -0
  35. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/requirements.txt +0 -0
  36. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/scripts/download_db.py +0 -0
  37. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/scripts/run_tests.py +0 -0
  38. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/scripts/start_vllm_server.py +0 -0
  39. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/setup.cfg +0 -0
  40. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/__init__.py +0 -0
  41. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/__main__.py +0 -0
  42. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/__init__.py +0 -0
  43. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/crossref.py +0 -0
  44. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/github_checker.py +0 -0
  45. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
  46. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/openalex.py +0 -0
  47. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/openreview_checker.py +0 -0
  48. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
  49. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/checkers/webpage_checker.py +0 -0
  50. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/config/__init__.py +0 -0
  51. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/config/logging.conf +0 -0
  52. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/config/settings.py +0 -0
  53. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/core/__init__.py +0 -0
  54. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/core/db_connection_pool.py +0 -0
  55. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/core/parallel_processor.py +0 -0
  56. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/core/refchecker.py +0 -0
  57. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/database/__init__.py +0 -0
  58. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
  59. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/llm/__init__.py +0 -0
  60. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/scripts/__init__.py +0 -0
  61. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/scripts/start_vllm_server.py +0 -0
  62. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/services/__init__.py +0 -0
  63. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/services/pdf_processor.py +0 -0
  64. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/__init__.py +0 -0
  65. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/arxiv_rate_limiter.py +0 -0
  66. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/arxiv_utils.py +0 -0
  67. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/biblatex_parser.py +0 -0
  68. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/bibtex_parser.py +0 -0
  69. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/config_validator.py +0 -0
  70. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/db_utils.py +0 -0
  71. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/doi_utils.py +0 -0
  72. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/error_utils.py +0 -0
  73. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/mock_objects.py +0 -0
  74. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/text_utils.py +0 -0
  75. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/unicode_utils.py +0 -0
  76. {academic_refchecker-2.0.12 → academic_refchecker-2.0.14}/src/refchecker/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 2.0.12
3
+ Version: 2.0.14
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 2.0.12
3
+ Version: 2.0.14
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -27,6 +27,7 @@ from .thumbnail import (
27
27
  generate_pdf_thumbnail_async,
28
28
  generate_pdf_preview_async,
29
29
  get_text_thumbnail_async,
30
+ get_text_preview_async,
30
31
  get_thumbnail_cache_path,
31
32
  get_preview_cache_path
32
33
  )
@@ -220,12 +221,15 @@ async def start_check(
220
221
  elif source_type == "text":
221
222
  if not source_text:
222
223
  raise HTTPException(status_code=400, detail="No text provided")
224
+ # Normalize line endings - remove all \r to prevent double carriage returns
225
+ # Browser may send \r\n, and Windows file writing can add extra \r
226
+ normalized_text = source_text.replace('\r\n', '\n').replace('\r', '\n')
223
227
  # Save pasted text to a file for later retrieval and thumbnail generation
224
228
  text_dir = Path(tempfile.gettempdir()) / "refchecker_texts"
225
229
  text_dir.mkdir(parents=True, exist_ok=True)
226
230
  text_file_path = text_dir / f"pasted_{session_id}.txt"
227
- with open(text_file_path, "w", encoding="utf-8") as f:
228
- f.write(source_text)
231
+ with open(text_file_path, "w", encoding="utf-8", newline='\n') as f:
232
+ f.write(normalized_text)
229
233
  paper_source = str(text_file_path)
230
234
  paper_title = "Pasted Text"
231
235
  elif source_type == "url":
@@ -646,9 +650,33 @@ async def get_preview(check_id: int):
646
650
  media_type="image/png",
647
651
  headers={"Cache-Control": "public, max-age=86400"} # Cache for 1 day
648
652
  )
649
- else:
650
- # Fall back to thumbnail if preview can't be generated
651
- raise HTTPException(status_code=404, detail="Could not generate preview")
653
+
654
+ # For text sources, generate a high-resolution text preview for overlay display
655
+ if source_type == 'text':
656
+ logger.info(f"Generating text preview for check {check_id}")
657
+ preview_path = await get_text_preview_async(check_id, "", paper_source)
658
+ if preview_path and os.path.exists(preview_path):
659
+ return FileResponse(
660
+ preview_path,
661
+ media_type="image/png",
662
+ headers={"Cache-Control": "public, max-age=86400"}
663
+ )
664
+
665
+ # For non-PDF file uploads, also generate a text preview
666
+ if source_type == 'file' and not paper_source.lower().endswith('.pdf'):
667
+ logger.info(f"Generating text preview for uploaded file check {check_id}")
668
+ if os.path.exists(paper_source):
669
+ preview_path = await get_text_preview_async(check_id, "", paper_source)
670
+ else:
671
+ preview_path = await get_text_preview_async(check_id, "Uploaded file")
672
+ if preview_path and os.path.exists(preview_path):
673
+ return FileResponse(
674
+ preview_path,
675
+ media_type="image/png",
676
+ headers={"Cache-Control": "public, max-age=86400"}
677
+ )
678
+
679
+ raise HTTPException(status_code=404, detail="Could not generate preview")
652
680
 
653
681
  except HTTPException:
654
682
  raise
@@ -3,6 +3,7 @@ Wrapper around refchecker library with progress callbacks for real-time updates
3
3
  """
4
4
  import sys
5
5
  import os
6
+ import re
6
7
  import asyncio
7
8
  import logging
8
9
  import tempfile
@@ -238,6 +239,18 @@ class ProgressRefChecker:
238
239
  if not any(u.get('url') == doi_url for u in authoritative_urls):
239
240
  authoritative_urls.append({"type": "doi", "url": doi_url})
240
241
 
242
+ # Add Semantic Scholar URL if available
243
+ s2_paper_id = external_ids.get('S2PaperId')
244
+ if s2_paper_id:
245
+ s2_url = f"https://www.semanticscholar.org/paper/{s2_paper_id}"
246
+ if not any(u.get('url') == s2_url for u in authoritative_urls):
247
+ authoritative_urls.append({"type": "semantic_scholar", "url": s2_url})
248
+
249
+ # Also check for inline S2 URL (from merged data)
250
+ s2_inline_url = verified_data.get('_semantic_scholar_url')
251
+ if s2_inline_url and not any(u.get('url') == s2_inline_url for u in authoritative_urls):
252
+ authoritative_urls.append({"type": "semantic_scholar", "url": s2_inline_url})
253
+
241
254
  # Format errors, warnings, and suggestions
242
255
  formatted_errors = []
243
256
  formatted_warnings = []
@@ -462,11 +475,20 @@ class ProgressRefChecker:
462
475
  raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
463
476
  pdf_processor = PDFProcessor()
464
477
  paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
465
- elif paper_source.lower().endswith(('.tex', '.txt')):
478
+ elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
466
479
  def read_file():
467
480
  with open(paper_source, 'r', encoding='utf-8') as f:
468
481
  return f.read()
469
482
  paper_text = await asyncio.to_thread(read_file)
483
+
484
+ # For .bib files, extract references directly using BibTeX parser
485
+ if paper_source.lower().endswith('.bib'):
486
+ logger.info("Processing uploaded .bib file as BibTeX")
487
+ refs_result = await self._extract_references_from_bibtex(paper_text)
488
+ if refs_result and refs_result[0]:
489
+ arxiv_source_references = refs_result[0]
490
+ extraction_method = 'bib'
491
+ logger.info(f"Extracted {len(arxiv_source_references)} references from .bib file")
470
492
  else:
471
493
  raise ValueError(f"Unsupported file type: {paper_source}")
472
494
  elif source_type == "text":
@@ -494,6 +516,25 @@ class ProgressRefChecker:
494
516
  arxiv_source_references = refs_result[0]
495
517
  extraction_method = 'bbl' # Mark as bbl extraction
496
518
  logger.info(f"Extracted {len(arxiv_source_references)} references from pasted .bbl content")
519
+ # Check if the pasted text is BibTeX format (@article, @misc, @inproceedings, etc.)
520
+ elif re.search(r'@\s*(article|book|inproceedings|incollection|misc|techreport|phdthesis|mastersthesis|conference|inbook|proceedings)\s*\{', paper_text, re.IGNORECASE):
521
+ logger.info("Detected BibTeX format in pasted text")
522
+ refs_result = await self._extract_references_from_bibtex(paper_text)
523
+ if refs_result and refs_result[0]:
524
+ arxiv_source_references = refs_result[0]
525
+ extraction_method = 'bib' # Mark as bib extraction
526
+ logger.info(f"Extracted {len(arxiv_source_references)} references from pasted BibTeX content")
527
+ # Fallback: Try BibTeX parsing anyway for partial/malformed content
528
+ # This handles cases like incomplete paste, or BibTeX-like content without standard entry types
529
+ elif any(marker in paper_text for marker in ['title={', 'author={', 'year={', 'eprint={', '@']):
530
+ logger.info("Detected possible BibTeX-like content, attempting parse")
531
+ refs_result = await self._extract_references_from_bibtex(paper_text)
532
+ if refs_result and refs_result[0]:
533
+ arxiv_source_references = refs_result[0]
534
+ extraction_method = 'bib'
535
+ logger.info(f"Extracted {len(arxiv_source_references)} references from partial BibTeX content")
536
+ else:
537
+ logger.warning("BibTeX-like content detected but parsing failed, will try LLM extraction")
497
538
  # Don't update title for pasted text - keep the placeholder
498
539
  else:
499
540
  raise ValueError(f"Unsupported source type: {source_type}")
@@ -416,6 +416,13 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
416
416
  except Exception as e:
417
417
  logger.warning(f"Could not read text file: {e}")
418
418
 
419
+ # Clean up text content - remove excessive blank lines that cause rendering issues
420
+ if text_content:
421
+ # Normalize line endings and remove consecutive blank lines
422
+ lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
423
+ # Keep only non-empty lines
424
+ text_content = '\n'.join(line for line in lines if line.strip())
425
+
419
426
  # Create a document-like image with actual text content
420
427
  doc = fitz.open()
421
428
  page = doc.new_page(width=THUMBNAIL_WIDTH, height=int(THUMBNAIL_WIDTH * 1.4))
@@ -483,6 +490,116 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
483
490
  return None
484
491
 
485
492
 
493
+ def get_text_preview(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
494
+ """
495
+ Generate a high-resolution preview for pasted text showing actual content.
496
+
497
+ Creates a larger image (similar to PDF previews) with the text content.
498
+
499
+ Args:
500
+ check_id: Check ID for naming
501
+ text_preview: Optional first few lines of text to display
502
+ text_file_path: Optional path to the text file to read content from
503
+
504
+ Returns:
505
+ Path to the generated preview, or None if generation failed
506
+ """
507
+ try:
508
+ import fitz
509
+
510
+ output_path = get_preview_cache_path(f"text_{check_id}", check_id)
511
+
512
+ if output_path.exists():
513
+ return str(output_path)
514
+
515
+ # Try to read text content from file
516
+ text_content = text_preview
517
+ if text_file_path and os.path.exists(text_file_path):
518
+ try:
519
+ with open(text_file_path, 'r', encoding='utf-8') as f:
520
+ text_content = f.read()
521
+ except Exception as e:
522
+ logger.warning(f"Could not read text file: {e}")
523
+
524
+ # Clean up text content - remove excessive blank lines that cause rendering issues
525
+ if text_content:
526
+ # Normalize line endings and remove consecutive blank lines
527
+ lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
528
+ # Keep only non-empty lines
529
+ text_content = '\n'.join(line for line in lines if line.strip())
530
+
531
+ # Create a document-like image with actual text content at high resolution
532
+ doc = fitz.open()
533
+ page = doc.new_page(width=PREVIEW_WIDTH, height=int(PREVIEW_WIDTH * 1.4))
534
+
535
+ # Fill with white/off-white background
536
+ page.draw_rect(page.rect, color=(0.9, 0.9, 0.9), fill=(0.98, 0.98, 0.98))
537
+
538
+ # Draw border
539
+ page.draw_rect(page.rect, color=(0.7, 0.7, 0.7), width=2)
540
+
541
+ # Draw actual text content if available
542
+ margin = 40
543
+ if text_content:
544
+ # Create a text box for the content
545
+ text_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, int(PREVIEW_WIDTH * 1.4) - margin)
546
+
547
+ # Truncate to first ~4000 chars for preview
548
+ display_text = text_content[:4000]
549
+ if len(text_content) > 4000:
550
+ display_text += "\n\n..."
551
+
552
+ # Insert text with readable font size
553
+ page.insert_textbox(
554
+ text_rect,
555
+ display_text,
556
+ fontsize=14,
557
+ color=(0.15, 0.15, 0.15),
558
+ fontname="helv"
559
+ )
560
+ else:
561
+ # Fallback: Draw placeholder
562
+ header_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, margin + 60)
563
+ page.insert_textbox(header_rect, "Pasted Text", fontsize=36, color=(0.3, 0.3, 0.5))
564
+
565
+ # Draw placeholder lines
566
+ line_height = 24
567
+ y = margin + 100
568
+
569
+ for i in range(20):
570
+ line_width = PREVIEW_WIDTH - 2 * margin
571
+ if i % 3 == 2:
572
+ line_width = line_width * 0.7
573
+
574
+ page.draw_line(
575
+ fitz.Point(margin, y),
576
+ fitz.Point(margin + line_width, y),
577
+ color=(0.7, 0.7, 0.7),
578
+ width=3
579
+ )
580
+ y += line_height
581
+
582
+ # Render to pixmap and save
583
+ pix = page.get_pixmap(alpha=False)
584
+ pix.save(str(output_path))
585
+ doc.close()
586
+
587
+ logger.info(f"Generated text preview: {output_path}")
588
+ return str(output_path)
589
+
590
+ except ImportError:
591
+ logger.error("PyMuPDF (fitz) is not installed")
592
+ return None
593
+ except Exception as e:
594
+ logger.error(f"Error generating text preview: {e}")
595
+ return None
596
+
597
+
598
+ async def get_text_preview_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
599
+ """Async wrapper for text preview generation."""
600
+ return await asyncio.to_thread(get_text_preview, check_id, text_preview, text_file_path)
601
+
602
+
486
603
  async def get_text_thumbnail_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
487
604
  """Async wrapper for text thumbnail generation."""
488
605
  return await asyncio.to_thread(get_text_thumbnail, check_id, text_preview, text_file_path)
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "2.0.12"
3
+ __version__ = "2.0.14"
@@ -8,8 +8,8 @@ for papers found on ArXiv, as it reflects the author-submitted metadata.
8
8
 
9
9
  Key features:
10
10
  - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
11
- - Always uses the latest version metadata (strips version suffixes)
12
- - Logs warnings when cited version differs from latest version
11
+ - Checks reference against all historical versions when latest doesn't match
12
+ - Annotates errors with version info when reference matches an older version
13
13
  - Parses BibTeX to extract normalized metadata matching refchecker schema
14
14
 
15
15
  Usage:
@@ -30,6 +30,7 @@ Usage:
30
30
  import re
31
31
  import logging
32
32
  import requests
33
+ import html
33
34
  from typing import Dict, List, Tuple, Optional, Any
34
35
 
35
36
  import bibtexparser
@@ -88,6 +89,8 @@ class ArXivCitationChecker:
88
89
  # export.arxiv.org URLs
89
90
  r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
90
91
  r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
92
+ # DOI format
93
+ r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"
91
94
  ]
92
95
 
93
96
  def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
@@ -107,6 +110,8 @@ class ArXivCitationChecker:
107
110
  reference.get('cited_url', ''),
108
111
  reference.get('raw_text', ''),
109
112
  reference.get('eprint', ''), # BibTeX field
113
+ reference.get('journal', ''),
114
+ reference.get('doi', ''), # DOI field (may contain arXiv ID)
110
115
  ]
111
116
 
112
117
  for source in sources:
@@ -324,35 +329,133 @@ class ArXivCitationChecker:
324
329
 
325
330
  return None
326
331
 
327
- def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
332
+ def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
328
333
  """
329
- Get the latest version number for an ArXiv paper.
334
+ Check if a reference is an ArXiv paper.
330
335
 
331
- Note: This requires fetching the abstract page, so it's optional.
332
- For now, we rely on the BibTeX always returning latest version metadata.
336
+ Args:
337
+ reference: Reference dictionary
338
+
339
+ Returns:
340
+ True if reference appears to be an ArXiv paper
341
+ """
342
+ arxiv_id, _ = self.extract_arxiv_id(reference)
343
+ return arxiv_id is not None
344
+
345
+ def _fetch_version_metadata_from_html(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
346
+ """
347
+ Fetch and parse metadata for a specific version using HTML scraping.
333
348
 
334
349
  Args:
335
350
  arxiv_id: ArXiv ID without version
351
+ version_num: Version number to fetch (1, 2, 3, etc.)
336
352
 
337
353
  Returns:
338
- Latest version string (e.g., "v3") or None if couldn't determine
354
+ Dictionary with version metadata or None if version doesn't exist
339
355
  """
340
- # The BibTeX endpoint always returns the latest version's metadata,
341
- # so we don't need to explicitly fetch version info
342
- return None
343
-
344
- def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
356
+ version_str = f"v{version_num}"
357
+ url = f"{self.abs_url}/{arxiv_id}{version_str}"
358
+
359
+ self.rate_limiter.wait()
360
+ try:
361
+ logger.debug(f"Checking historical version: {url}")
362
+ response = requests.get(url, timeout=self.timeout)
363
+ if response.status_code == 404:
364
+ return None # Version does not exist
365
+ response.raise_for_status()
366
+ html_content = response.text
367
+
368
+ # Parse meta tags for metadata
369
+ # Title
370
+ title_match = re.search(r'<meta name="citation_title" content="(.*?)"', html_content)
371
+ title = html.unescape(title_match.group(1)).strip() if title_match else ""
372
+
373
+ # Authors
374
+ authors = []
375
+ for auth in re.findall(r'<meta name="citation_author" content="(.*?)"', html_content):
376
+ authors.append(html.unescape(auth).strip())
377
+
378
+ # Date/Year
379
+ date_match = re.search(r'<meta name="citation_date" content="(.*?)"', html_content)
380
+ year = None
381
+ if date_match:
382
+ ym = re.search(r'^(\d{4})', date_match.group(1))
383
+ if ym:
384
+ year = int(ym.group(1))
385
+
386
+ return {
387
+ 'version': version_str,
388
+ 'version_num': version_num,
389
+ 'title': title,
390
+ 'authors': [{'name': a} for a in authors],
391
+ 'year': year,
392
+ 'url': url,
393
+ }
394
+ except Exception as e:
395
+ logger.warning(f"Failed to fetch history {version_str}: {e}")
396
+ return None
397
+
398
+ def _get_latest_version_number(self, arxiv_id: str) -> Optional[int]:
345
399
  """
346
- Check if a reference is an ArXiv paper.
400
+ Get the latest version number by fetching the abstract page.
347
401
 
348
402
  Args:
349
- reference: Reference dictionary
403
+ arxiv_id: ArXiv ID without version
350
404
 
351
405
  Returns:
352
- True if reference appears to be an ArXiv paper
406
+ Latest version number as integer, or None if couldn't determine
353
407
  """
354
- arxiv_id, _ = self.extract_arxiv_id(reference)
355
- return arxiv_id is not None
408
+ url = f"{self.abs_url}/{arxiv_id}"
409
+
410
+ self.rate_limiter.wait()
411
+ try:
412
+ response = requests.get(url, timeout=self.timeout)
413
+ response.raise_for_status()
414
+
415
+ # Look for version links like "[v1]", "[v2]", etc.
416
+ versions = re.findall(r'\[v(\d+)\]', response.text)
417
+ if versions:
418
+ return max(int(v) for v in versions)
419
+ return None
420
+ except Exception as e:
421
+ logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
422
+ return None
423
+
424
+ def _compare_info_match(
425
+ self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
426
+ authoritative_title: str, authoritative_authors: List[str], authoritative_year: Optional[int]) -> bool:
427
+ """
428
+ Compare the information of a cited paper with the authoritative information.
429
+
430
+ Args:
431
+ cited_title: Title from the reference
432
+ cited_authors: Authors from the reference
433
+ cited_year: Year from the reference
434
+ authoritative_title: Title from ArXiv version
435
+ authoritative_authors: Authors from ArXiv version
436
+ authoritative_year: Year from ArXiv version
437
+
438
+ Returns:
439
+ True if the information matches, False otherwise.
440
+ """
441
+ # Compare title
442
+ if cited_title and authoritative_title:
443
+ title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
444
+ if title_similarity < SIMILARITY_THRESHOLD:
445
+ return False
446
+
447
+ # Compare authors
448
+ if cited_authors and authoritative_authors:
449
+ authors_match, _ = compare_authors(cited_authors, authoritative_authors)
450
+ if not authors_match:
451
+ return False
452
+
453
+ # Compare year
454
+ if cited_year and authoritative_year:
455
+ if cited_year != authoritative_year:
456
+ return False
457
+
458
+ return True
356
459
 
357
460
  def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
358
461
  """
@@ -360,10 +463,10 @@ class ArXivCitationChecker:
360
463
 
361
464
  This method:
362
465
  1. Extracts the ArXiv ID from the reference
363
- 2. Fetches the official BibTeX from ArXiv (always latest version)
364
- 3. Parses the BibTeX to get authoritative metadata
365
- 4. Compares cited metadata against authoritative source
366
- 5. Logs warnings for version mismatches
466
+ 2. Fetches the official BibTeX from ArXiv (latest version)
467
+ 3. Compares cited metadata against latest version
468
+ 4. If errors found, checks historical versions to find a match
469
+ 5. Annotates errors with version info if reference matches an older version
367
470
 
368
471
  Args:
369
472
  reference: Reference dictionary with title, authors, year, url, etc.
@@ -385,34 +488,26 @@ class ArXivCitationChecker:
385
488
 
386
489
  logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
387
490
 
388
- # Fetch authoritative BibTeX
491
+ # Extract information from reference for comparison
492
+ cited_title = reference.get('title', '').strip()
493
+ cited_authors = reference.get('authors', [])
494
+ cited_year = reference.get('year')
495
+
496
+ # Fetch authoritative BibTeX (latest version)
389
497
  bibtex_content = self.fetch_bibtex(arxiv_id)
390
498
 
391
499
  if not bibtex_content:
392
500
  logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
393
501
  return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
394
502
 
395
- # Parse BibTeX
396
- verified_data = self.parse_bibtex(bibtex_content)
503
+ latest_data = self.parse_bibtex(bibtex_content)
397
504
 
398
- if not verified_data:
505
+ if not latest_data:
399
506
  logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
400
507
  return None, [], None
401
-
402
- # Log version mismatch warning if cited version differs from latest
403
- if cited_version:
404
- # ArXiv BibTeX always returns latest version metadata
405
- # We don't know the actual latest version number without additional API call,
406
- # but we can warn that a specific version was cited
407
- errors.append({
408
- 'warning_type': 'version',
409
- 'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
410
- })
411
- logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
412
-
413
- # Compare title
414
- cited_title = reference.get('title', '').strip()
415
- authoritative_title = verified_data.get('title', '').strip()
508
+
509
+ # Compare against latest version
510
+ authoritative_title = latest_data.get('title', '').strip()
416
511
 
417
512
  if cited_title and authoritative_title:
418
513
  title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
@@ -426,9 +521,8 @@ class ArXivCitationChecker:
426
521
  })
427
522
 
428
523
  # Compare authors
429
- cited_authors = reference.get('authors', [])
430
524
  if cited_authors:
431
- authoritative_authors = verified_data.get('authors', [])
525
+ authoritative_authors = latest_data.get('authors', [])
432
526
  authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
433
527
 
434
528
  if not authors_match:
@@ -440,9 +534,7 @@ class ArXivCitationChecker:
440
534
  })
441
535
 
442
536
  # Compare year
443
- cited_year = reference.get('year')
444
- authoritative_year = verified_data.get('year')
445
-
537
+ authoritative_year = latest_data.get('year')
446
538
  year_warning = validate_year(
447
539
  cited_year=cited_year,
448
540
  paper_year=authoritative_year,
@@ -451,10 +543,50 @@ class ArXivCitationChecker:
451
543
  )
452
544
  if year_warning:
453
545
  errors.append(year_warning)
454
-
455
- # Build URL
546
+
456
547
  paper_url = f"https://arxiv.org/abs/{arxiv_id}"
457
548
 
458
- logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
549
+ # If no errors against latest version, we're done
550
+ if len(errors) == 0:
551
+ logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with no errors")
552
+ return latest_data, errors, paper_url
553
+
554
+ # Check if reference matches a historical version
555
+ # Get latest version number first
556
+ latest_version_num = self._get_latest_version_number(arxiv_id)
557
+
558
+ if latest_version_num and latest_version_num > 1:
559
+ # Check historical versions (1 to latest-1)
560
+ for version_num in range(1, latest_version_num):
561
+ version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
562
+ if not version_data:
563
+ continue
564
+
565
+ # Check if reference matches this historical version
566
+ if self._compare_info_match(
567
+ cited_title, cited_authors, cited_year,
568
+ version_data['title'], version_data['authors'], version_data['year']):
569
+
570
+ logger.debug(f"ArXivCitationChecker: Reference matches historical version v{version_num}")
571
+
572
+ # Convert errors to warnings with version update info
573
+ # Version update issues are informational, not errors - the citation was correct for its time
574
+ version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
575
+ warnings = []
576
+ for error in errors:
577
+ warning = {
578
+ 'warning_type': error.get('error_type', 'unknown') + version_suffix,
579
+ 'warning_details': error.get('error_details', ''),
580
+ }
581
+ # Preserve correction hints
582
+ for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
583
+ if key in error:
584
+ warning[key] = error[key]
585
+ warnings.append(warning)
586
+
587
+ # Return with warnings instead of errors - URL points to the matched version
588
+ matched_url = f"https://arxiv.org/abs/{arxiv_id}v{version_num}"
589
+ return latest_data, warnings, matched_url
459
590
 
460
- return verified_data, errors, paper_url
591
+ logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
592
+ return latest_data, errors, paper_url