academic-refchecker 2.0.11__tar.gz → 2.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {academic_refchecker-2.0.11/academic_refchecker.egg-info → academic_refchecker-2.0.13}/PKG-INFO +2 -1
  2. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13/academic_refchecker.egg-info}/PKG-INFO +2 -1
  3. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/SOURCES.txt +2 -0
  4. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/requires.txt +1 -0
  5. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/pyproject.toml +1 -0
  6. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/requirements.txt +1 -0
  7. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/__version__.py +1 -1
  8. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/__init__.py +3 -1
  9. academic_refchecker-2.0.13/src/refchecker/checkers/arxiv_citation.py +460 -0
  10. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/enhanced_hybrid_checker.py +24 -0
  11. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/config/settings.py +8 -0
  12. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/llm/base.py +1 -15
  13. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/llm/providers.py +102 -94
  14. academic_refchecker-2.0.13/src/refchecker/utils/arxiv_rate_limiter.py +133 -0
  15. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/text_utils.py +32 -0
  16. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/LICENSE +0 -0
  17. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/MANIFEST.in +0 -0
  18. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/README.md +0 -0
  19. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/dependency_links.txt +0 -0
  20. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/entry_points.txt +0 -0
  21. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/top_level.txt +0 -0
  22. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/__init__.py +0 -0
  23. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/__main__.py +0 -0
  24. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/cli.py +0 -0
  25. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/concurrency.py +0 -0
  26. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/database.py +0 -0
  27. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/main.py +0 -0
  28. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/models.py +0 -0
  29. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/refchecker_wrapper.py +0 -0
  30. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/assets/index-2P6L_39v.css +0 -0
  31. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/assets/index-hk21nqxR.js +0 -0
  32. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/favicon.svg +0 -0
  33. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/index.html +0 -0
  34. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/vite.svg +0 -0
  35. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/thumbnail.py +0 -0
  36. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/websocket_manager.py +0 -0
  37. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/scripts/download_db.py +0 -0
  38. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/scripts/run_tests.py +0 -0
  39. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/scripts/start_vllm_server.py +0 -0
  40. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/setup.cfg +0 -0
  41. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/__init__.py +0 -0
  42. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/__main__.py +0 -0
  43. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/crossref.py +0 -0
  44. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/github_checker.py +0 -0
  45. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
  46. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/openalex.py +0 -0
  47. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/openreview_checker.py +0 -0
  48. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
  49. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/semantic_scholar.py +0 -0
  50. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/webpage_checker.py +0 -0
  51. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/config/__init__.py +0 -0
  52. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/config/logging.conf +0 -0
  53. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/__init__.py +0 -0
  54. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/db_connection_pool.py +0 -0
  55. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/parallel_processor.py +0 -0
  56. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/refchecker.py +0 -0
  57. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/database/__init__.py +0 -0
  58. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
  59. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/llm/__init__.py +0 -0
  60. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/scripts/__init__.py +0 -0
  61. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/scripts/start_vllm_server.py +0 -0
  62. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/services/__init__.py +0 -0
  63. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/services/pdf_processor.py +0 -0
  64. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/__init__.py +0 -0
  65. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/arxiv_utils.py +0 -0
  66. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/author_utils.py +0 -0
  67. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/biblatex_parser.py +0 -0
  68. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/bibliography_utils.py +0 -0
  69. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/bibtex_parser.py +0 -0
  70. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/config_validator.py +0 -0
  71. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/db_utils.py +0 -0
  72. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/doi_utils.py +0 -0
  73. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/error_utils.py +0 -0
  74. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/mock_objects.py +0 -0
  75. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/unicode_utils.py +0 -0
  76. {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 2.0.11
3
+ Version: 2.0.13
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
32
32
  Requires-Dist: pandas<2.4.0,>=1.3.0
33
33
  Requires-Dist: numpy<2.0.0,>=1.22.4
34
34
  Requires-Dist: pdfplumber>=0.6.0
35
+ Requires-Dist: bibtexparser>=1.4.0
35
36
  Provides-Extra: dev
36
37
  Requires-Dist: pytest>=6.0.0; extra == "dev"
37
38
  Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 2.0.11
3
+ Version: 2.0.13
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
32
32
  Requires-Dist: pandas<2.4.0,>=1.3.0
33
33
  Requires-Dist: numpy<2.0.0,>=1.22.4
34
34
  Requires-Dist: pdfplumber>=0.6.0
35
+ Requires-Dist: bibtexparser>=1.4.0
35
36
  Provides-Extra: dev
36
37
  Requires-Dist: pytest>=6.0.0; extra == "dev"
37
38
  Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
@@ -31,6 +31,7 @@ src/refchecker/__init__.py
31
31
  src/refchecker/__main__.py
32
32
  src/refchecker/__version__.py
33
33
  src/refchecker/checkers/__init__.py
34
+ src/refchecker/checkers/arxiv_citation.py
34
35
  src/refchecker/checkers/crossref.py
35
36
  src/refchecker/checkers/enhanced_hybrid_checker.py
36
37
  src/refchecker/checkers/github_checker.py
@@ -57,6 +58,7 @@ src/refchecker/scripts/start_vllm_server.py
57
58
  src/refchecker/services/__init__.py
58
59
  src/refchecker/services/pdf_processor.py
59
60
  src/refchecker/utils/__init__.py
61
+ src/refchecker/utils/arxiv_rate_limiter.py
60
62
  src/refchecker/utils/arxiv_utils.py
61
63
  src/refchecker/utils/author_utils.py
62
64
  src/refchecker/utils/biblatex_parser.py
@@ -10,6 +10,7 @@ python-Levenshtein>=0.12.0
10
10
  pandas<2.4.0,>=1.3.0
11
11
  numpy<2.0.0,>=1.22.4
12
12
  pdfplumber>=0.6.0
13
+ bibtexparser>=1.4.0
13
14
 
14
15
  [dev]
15
16
  pytest>=6.0.0
@@ -37,6 +37,7 @@ dependencies = [
37
37
  "pandas>=1.3.0,<2.4.0",
38
38
  "numpy>=1.22.4,<2.0.0",
39
39
  "pdfplumber>=0.6.0",
40
+ "bibtexparser>=1.4.0",
40
41
  ]
41
42
 
42
43
  [project.optional-dependencies]
@@ -10,6 +10,7 @@ colorama>=0.4.4
10
10
  fuzzywuzzy>=0.18.0
11
11
  python-Levenshtein>=0.12.0
12
12
  cryptography>=42.0.0 # For API key encryption in web UI
13
+ bibtexparser>=1.4.0 # For parsing ArXiv BibTeX citations
13
14
 
14
15
  # Additional core dependencies found in codebase
15
16
  pandas>=1.3.0
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "2.0.11"
3
+ __version__ = "2.0.13"
@@ -7,11 +7,13 @@ from .local_semantic_scholar import LocalNonArxivReferenceChecker
7
7
  from .enhanced_hybrid_checker import EnhancedHybridReferenceChecker
8
8
  from .openalex import OpenAlexReferenceChecker
9
9
  from .crossref import CrossRefReferenceChecker
10
+ from .arxiv_citation import ArXivCitationChecker
10
11
 
11
12
  __all__ = [
12
13
  "NonArxivReferenceChecker",
13
14
  "LocalNonArxivReferenceChecker",
14
15
  "EnhancedHybridReferenceChecker",
15
16
  "OpenAlexReferenceChecker",
16
- "CrossRefReferenceChecker"
17
+ "CrossRefReferenceChecker",
18
+ "ArXivCitationChecker",
17
19
  ]
@@ -0,0 +1,460 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ArXiv Citation Checker - Authoritative Source for ArXiv Papers
4
+
5
+ This module provides functionality to verify ArXiv papers by fetching the official
6
+ BibTeX citation directly from ArXiv. This is used as the authoritative metadata source
7
+ for papers found on ArXiv, as it reflects the author-submitted metadata.
8
+
9
+ Key features:
10
+ - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
11
+ - Always uses the latest version metadata (strips version suffixes)
12
+ - Logs warnings when cited version differs from latest version
13
+ - Parses BibTeX to extract normalized metadata matching refchecker schema
14
+
15
+ Usage:
16
+ from refchecker.checkers.arxiv_citation import ArXivCitationChecker
17
+
18
+ checker = ArXivCitationChecker()
19
+
20
+ reference = {
21
+ 'title': 'Attention Is All You Need',
22
+ 'authors': ['Ashish Vaswani', 'Noam Shazeer'],
23
+ 'year': 2017,
24
+ 'url': 'https://arxiv.org/abs/1706.03762v5',
25
+ }
26
+
27
+ verified_data, errors, url = checker.verify_reference(reference)
28
+ """
29
+
30
+ import re
31
+ import logging
32
+ import requests
33
+ from typing import Dict, List, Tuple, Optional, Any
34
+
35
+ import bibtexparser
36
+ from bibtexparser.bparser import BibTexParser
37
+ from bibtexparser.customization import convert_to_unicode
38
+
39
+ from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
40
+ from refchecker.utils.text_utils import (
41
+ normalize_text,
42
+ compare_authors,
43
+ compare_titles_with_latex_cleaning,
44
+ strip_latex_commands,
45
+ )
46
+ from refchecker.utils.error_utils import format_title_mismatch, validate_year
47
+ from refchecker.config.settings import get_config
48
+
49
+ logger = logging.getLogger(__name__)
50
+
51
+ # Get configuration
52
+ config = get_config()
53
+ SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
54
+
55
+
56
+ class ArXivCitationChecker:
57
+ """
58
+ Reference checker that uses ArXiv's official BibTeX export as the authoritative source.
59
+
60
+ This checker fetches the official BibTeX citation from ArXiv for papers identified
61
+ by their ArXiv ID. It uses the latest version's metadata as the authoritative source
62
+ and logs warnings when the cited version differs from the latest.
63
+ """
64
+
65
def __init__(self, timeout: int = 30):
    """
    Initialize the ArXiv Citation Checker.

    Stores the BibTeX and abstract endpoints, the HTTP timeout, the rate
    limiter returned by ``ArXivRateLimiter.get_instance()``, and the regex
    patterns used to locate an ArXiv ID inside URLs or free text.

    Args:
        timeout: HTTP request timeout in seconds
    """
    self.base_url = "https://arxiv.org/bibtex"
    self.abs_url = "https://arxiv.org/abs"
    self.timeout = timeout
    self.rate_limiter = ArXivRateLimiter.get_instance()

    # Each pattern captures the ID in group 1 and an optional "vN" version
    # suffix in group 2. They are kept as plain strings because callers pass
    # them to re.search() together with re.IGNORECASE.
    #
    # New-style IDs: YYMM.NNNN or YYMM.NNNNN
    new_style_id = r'([0-9]{4}\.[0-9]{4,5})(v\d+)?'
    # Old-style IDs: archive/YYMMNNN, optionally with a two-letter subject
    # class after the archive name (e.g. "math.GT/0309136"). Without the
    # optional subject class such identifiers were not recognised at all.
    old_style_id = r'([a-z-]+(?:\.[a-z]{2})?/[0-9]{7})(v\d+)?'

    self.arxiv_id_patterns = [
        # Standard arxiv.org URLs
        r'arxiv\.org/abs/' + new_style_id,
        r'arxiv\.org/pdf/' + new_style_id,
        # Old format with category
        r'arxiv\.org/abs/' + old_style_id,
        r'arxiv\.org/pdf/' + old_style_id,
        # arXiv: prefix in text
        r'arXiv:' + new_style_id,
        r'arXiv:' + old_style_id,
        # export.arxiv.org URLs
        r'export\.arxiv\.org/abs/' + new_style_id,
        r'export\.arxiv\.org/pdf/' + new_style_id,
    ]
92
+
93
+ def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
94
+ """
95
+ Extract ArXiv ID from a reference, returning both the base ID and version.
96
+
97
+ Args:
98
+ reference: Reference dictionary containing url, raw_text, etc.
99
+
100
+ Returns:
101
+ Tuple of (arxiv_id_without_version, version_string_or_None)
102
+ For example: ("2301.12345", "v2") or ("2301.12345", None)
103
+ """
104
+ # Sources to check for ArXiv ID
105
+ sources = [
106
+ reference.get('url', ''),
107
+ reference.get('cited_url', ''),
108
+ reference.get('raw_text', ''),
109
+ reference.get('eprint', ''), # BibTeX field
110
+ ]
111
+
112
+ for source in sources:
113
+ if not source:
114
+ continue
115
+
116
+ for pattern in self.arxiv_id_patterns:
117
+ match = re.search(pattern, source, re.IGNORECASE)
118
+ if match:
119
+ arxiv_id = match.group(1)
120
+ version = match.group(2) if len(match.groups()) > 1 else None
121
+ logger.debug(f"Extracted ArXiv ID: {arxiv_id}, version: {version}")
122
+ return arxiv_id, version
123
+
124
+ return None, None
125
+
126
def fetch_bibtex(self, arxiv_id: str) -> Optional[str]:
    """
    Download the BibTeX export for an ArXiv paper.

    The request goes to ``{self.base_url}/{arxiv_id}`` after waiting on the
    rate limiter. Per the original author's note, an ID without a version
    suffix yields the latest version's BibTeX (ArXiv default behavior).

    Args:
        arxiv_id: ArXiv ID without version suffix (e.g., "2301.12345")

    Returns:
        The stripped response body when it starts with ``@``; None on a
        timeout, any other request failure, or a body that does not look
        like BibTeX.
    """
    endpoint = f"{self.base_url}/{arxiv_id}"

    # Block until the rate limiter allows another request.
    self.rate_limiter.wait()

    try:
        logger.debug(f"Fetching ArXiv BibTeX from: {endpoint}")
        reply = requests.get(endpoint, timeout=self.timeout)
        reply.raise_for_status()
        payload = reply.text.strip()
    except requests.exceptions.Timeout:
        logger.warning(f"Timeout fetching ArXiv BibTeX for {arxiv_id}")
        return None
    except requests.exceptions.RequestException as e:
        logger.warning(f"Failed to fetch ArXiv BibTeX for {arxiv_id}: {e}")
        return None

    # A BibTeX entry begins with "@"; anything else (including an empty
    # body) is treated as an invalid response.
    if not payload.startswith('@'):
        logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
        return None

    logger.debug(f"Successfully fetched BibTeX for ArXiv paper {arxiv_id}")
    return payload
164
+
165
def parse_bibtex(self, bibtex_str: str) -> Optional[Dict[str, Any]]:
    """
    Parse BibTeX string and extract metadata in refchecker schema format.

    Only the first entry of the BibTeX input is used.

    Args:
        bibtex_str: BibTeX content string

    Returns:
        Dictionary with title, authors, year, venue, externalIds, URLs and
        the raw parsed entry, or None if there are no entries or parsing
        raised an exception.
    """
    try:
        # Configure parser
        parser = BibTexParser(common_strings=True)
        parser.customization = convert_to_unicode

        # Parse BibTeX
        bib_database = bibtexparser.loads(bibtex_str, parser=parser)

        if not bib_database.entries:
            logger.debug("No entries found in BibTeX")
            return None

        entry = bib_database.entries[0]

        # Braces in a BibTeX title only protect capitalization. Remove every
        # brace character so nested groups such as "{{BERT}}" are cleaned
        # completely (a single regex pass leaves "{BERT}" behind).
        title = entry.get('title', '') or ''
        title = title.replace('{', '').replace('}', '').strip()

        # Extract authors
        authors = self._parse_authors(entry.get('author', ''))

        # Prefer the year encoded in the eprint ID (original submission)
        # over the BibTeX year field (latest revision).
        arxiv_id = entry.get('eprint', '')
        year = self._extract_year_from_eprint(arxiv_id)

        # Fall back to BibTeX year field if eprint year extraction fails
        if not year and entry.get('year'):
            try:
                year = int(entry['year'])
            except (ValueError, TypeError):
                # Leave year unset when the field is not a plain integer.
                pass

        # Build result in refchecker schema format
        result = {
            'title': title,
            'authors': [{'name': author} for author in authors],
            'year': year,
            'venue': 'arXiv',
            'externalIds': {
                'ArXiv': arxiv_id,
            },
            'url': f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else None,
            'isOpenAccess': True,
            'openAccessPdf': {
                'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else None
            },
            # Store original bibtex for reference
            '_bibtex_entry': entry,
            '_source': 'ArXiv BibTeX Reference',
            '_source_url': f"https://arxiv.org/bibtex/{arxiv_id}" if arxiv_id else None,
        }

        # Add DOI if present (some ArXiv papers have DOIs)
        if entry.get('doi'):
            result['externalIds']['DOI'] = entry['doi']

        logger.debug(f"Parsed ArXiv BibTeX: title='{title[:50]}...', authors={len(authors)}, year={year}")
        return result

    except Exception as e:
        # Deliberate best-effort boundary: any parser failure is reported
        # to the caller as "no data" rather than propagated.
        logger.warning(f"Failed to parse BibTeX: {e}")
        return None
240
+
241
+ def _parse_authors(self, authors_str: str) -> List[str]:
242
+ """
243
+ Parse BibTeX author string into list of author names.
244
+
245
+ BibTeX format: "Last1, First1 and Last2, First2 and ..."
246
+
247
+ Args:
248
+ authors_str: BibTeX author field value
249
+
250
+ Returns:
251
+ List of author names in "First Last" format
252
+ """
253
+ if not authors_str:
254
+ return []
255
+
256
+ authors = []
257
+
258
+ # Split by " and " (BibTeX convention)
259
+ author_parts = re.split(r'\s+and\s+', authors_str)
260
+
261
+ for part in author_parts:
262
+ part = part.strip()
263
+ if not part:
264
+ continue
265
+
266
+ # Handle "Last, First" format
267
+ if ',' in part:
268
+ parts = part.split(',', 1)
269
+ if len(parts) == 2:
270
+ last = parts[0].strip()
271
+ first = parts[1].strip()
272
+ # Convert to "First Last" format
273
+ name = f"{first} {last}".strip()
274
+ else:
275
+ name = part
276
+ else:
277
+ # Already in "First Last" format
278
+ name = part
279
+
280
+ # Clean up the name
281
+ name = re.sub(r'\s+', ' ', name) # Normalize whitespace
282
+ name = re.sub(r'\{([^}]*)\}', r'\1', name) # Remove braces
283
+
284
+ if name:
285
+ authors.append(name)
286
+
287
+ return authors
288
+
289
+ def _extract_year_from_eprint(self, eprint: str) -> Optional[int]:
290
+ """
291
+ Extract year from ArXiv eprint ID.
292
+
293
+ New format (YYMM.NNNNN): First two digits are year
294
+ Old format (cat-name/YYMMNNN): Digits after slash, first two are year
295
+
296
+ Args:
297
+ eprint: ArXiv eprint ID
298
+
299
+ Returns:
300
+ Year as integer or None
301
+ """
302
+ if not eprint:
303
+ return None
304
+
305
+ # New format: 2301.12345
306
+ match = re.match(r'^(\d{2})\d{2}\.\d{4,5}', eprint)
307
+ if match:
308
+ yy = int(match.group(1))
309
+ # ArXiv started in 1991, new format started in 2007
310
+ if yy >= 7:
311
+ return 2000 + yy
312
+ else:
313
+ # Very early 2000s papers (unlikely in new format)
314
+ return 2000 + yy
315
+
316
+ # Old format: hep-th/9901001
317
+ match = re.match(r'^[a-z-]+/(\d{2})\d+', eprint)
318
+ if match:
319
+ yy = int(match.group(1))
320
+ if yy >= 91: # ArXiv started in 1991
321
+ return 1900 + yy
322
+ else:
323
+ return 2000 + yy
324
+
325
+ return None
326
+
327
def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
    """
    Placeholder for looking up the newest version of an ArXiv paper.

    No lookup is performed: determining the version would need a fetch of
    the abstract page, and the checker instead relies on the BibTeX export
    reflecting the latest version's metadata.

    Args:
        arxiv_id: ArXiv ID without version

    Returns:
        Always None at present; intended to be a version string such as
        "v3" once implemented.
    """
    # Intentionally unimplemented - see docstring.
    return None
343
+
344
def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
    """
    Tell whether an ArXiv ID can be extracted from the reference.

    Args:
        reference: Reference dictionary

    Returns:
        True when ``extract_arxiv_id`` yields an ID, False otherwise
    """
    extracted_id, _version = self.extract_arxiv_id(reference)
    if extracted_id is None:
        return False
    return True
356
+
357
def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
    """
    Verify a reference using ArXiv's official BibTeX as authoritative source.

    This method:
    1. Extracts the ArXiv ID from the reference
    2. Fetches the official BibTeX from ArXiv
    3. Parses the BibTeX to get authoritative metadata
    4. Adds a version warning when the reference cites a specific version
    5. Compares cited title, authors and year against the parsed metadata

    Args:
        reference: Reference dictionary with title, authors, year, url, etc.

    Returns:
        Tuple of (verified_data, errors, url)
        - verified_data: Authoritative paper metadata from ArXiv or None
        - errors: List of error/warning dictionaries
        - url: ArXiv URL for the paper
        (None, [], None) is returned when no ArXiv ID is found or the BibTeX
        cannot be parsed; a single ``api_failure`` error is returned when the
        BibTeX cannot be fetched.
    """
    errors = []

    # Extract ArXiv ID
    arxiv_id, cited_version = self.extract_arxiv_id(reference)

    if not arxiv_id:
        logger.debug("ArXivCitationChecker: No ArXiv ID found in reference")
        return None, [], None

    logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")

    # Fetch authoritative BibTeX
    bibtex_content = self.fetch_bibtex(arxiv_id)

    if not bibtex_content:
        logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
        return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None

    # Parse BibTeX
    verified_data = self.parse_bibtex(bibtex_content)

    if not verified_data:
        logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
        return None, [], None

    if cited_version:
        # The actual latest version number is not known without another
        # request, so only note that a specific version was cited.
        errors.append({
            'warning_type': 'version',
            'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
        })
        logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")

    # Compare title. "or ''" guards against a key that is present but None,
    # which would otherwise make .strip() raise AttributeError.
    cited_title = (reference.get('title') or '').strip()
    authoritative_title = (verified_data.get('title') or '').strip()

    if cited_title and authoritative_title:
        title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)

        if title_similarity < SIMILARITY_THRESHOLD:
            clean_cited_title = strip_latex_commands(cited_title)
            errors.append({
                'error_type': 'title',
                'error_details': format_title_mismatch(clean_cited_title, authoritative_title),
                'ref_title_correct': authoritative_title
            })

    # Compare authors
    cited_authors = reference.get('authors', [])
    if cited_authors:
        authoritative_authors = verified_data.get('authors', [])
        authors_match, author_error = compare_authors(cited_authors, authoritative_authors)

        if not authors_match:
            correct_author_names = ', '.join([a.get('name', '') for a in authoritative_authors])
            errors.append({
                'error_type': 'author',
                'error_details': author_error,
                'ref_authors_correct': correct_author_names
            })

    # Compare year
    cited_year = reference.get('year')
    authoritative_year = verified_data.get('year')

    year_warning = validate_year(
        cited_year=cited_year,
        paper_year=authoritative_year,
        use_flexible_validation=True,
        context={'arxiv_match': True}
    )
    if year_warning:
        errors.append(year_warning)

    # Build URL
    paper_url = f"https://arxiv.org/abs/{arxiv_id}"

    logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")

    return verified_data, errors, paper_url
@@ -43,6 +43,7 @@ class EnhancedHybridReferenceChecker:
43
43
  contact_email: Optional[str] = None,
44
44
  enable_openalex: bool = True,
45
45
  enable_crossref: bool = True,
46
+ enable_arxiv_citation: bool = True,
46
47
  debug_mode: bool = False):
47
48
  """
48
49
  Initialize the enhanced hybrid reference checker
@@ -53,11 +54,22 @@ class EnhancedHybridReferenceChecker:
53
54
  contact_email: Email for polite pool access to APIs
54
55
  enable_openalex: Whether to use OpenAlex API
55
56
  enable_crossref: Whether to use CrossRef API
57
+ enable_arxiv_citation: Whether to use ArXiv Citation checker as authoritative source
56
58
  debug_mode: Whether to enable debug logging
57
59
  """
58
60
  self.contact_email = contact_email
59
61
  self.debug_mode = debug_mode
60
62
 
63
+ # Initialize ArXiv Citation checker (authoritative source for ArXiv papers)
64
+ self.arxiv_citation = None
65
+ if enable_arxiv_citation:
66
+ try:
67
+ from .arxiv_citation import ArXivCitationChecker
68
+ self.arxiv_citation = ArXivCitationChecker()
69
+ logger.debug("Enhanced Hybrid: ArXiv Citation checker initialized")
70
+ except Exception as e:
71
+ logger.warning(f"Enhanced Hybrid: Failed to initialize ArXiv Citation checker: {e}")
72
+
61
73
  # Initialize local database checker if available
62
74
  self.local_db = None
63
75
  if db_path:
@@ -112,6 +124,7 @@ class EnhancedHybridReferenceChecker:
112
124
 
113
125
  # Track API performance for adaptive selection
114
126
  self.api_stats = {
127
+ 'arxiv_citation': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
115
128
  'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
116
129
  'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
117
130
  'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
@@ -276,6 +289,17 @@ class EnhancedHybridReferenceChecker:
276
289
 
277
290
  # PHASE 1: Try all APIs once in priority order
278
291
 
292
+ # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
293
+ # This fetches the official BibTeX from ArXiv which is the author-submitted metadata
294
+ if self.arxiv_citation and self.arxiv_citation.is_arxiv_reference(reference):
295
+ logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
296
+ verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
297
+ if success:
298
+ logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded as authoritative source")
299
+ return verified_data, errors, url
300
+ if failure_type in ['throttled', 'timeout', 'server_error']:
301
+ failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
302
+
279
303
  # Strategy 1: Always try local database first (fastest)
280
304
  if self.local_db:
281
305
  verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
@@ -22,6 +22,14 @@ DEFAULT_CONFIG = {
22
22
  "timeout": 30,
23
23
  },
24
24
 
25
+ "arxiv_citation": {
26
+ "base_url": "https://arxiv.org/bibtex",
27
+ "rate_limit_delay": 3.0, # Share rate limiting with other ArXiv endpoints
28
+ "timeout": 30,
29
+ "use_as_authoritative": True, # Use ArXiv BibTeX as authoritative source
30
+ "enabled": True, # Enable ArXiv citation checker in hybrid checker
31
+ },
32
+
25
33
  # Processing Settings
26
34
  "processing": {
27
35
  "max_papers": 50,
@@ -110,21 +110,7 @@ class LLMProvider(ABC):
110
110
 
111
111
  logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
112
112
  return chunks
113
-
114
- def _parse_llm_response(self, response_text: str) -> List[str]:
115
- """Parse LLM response and extract individual references"""
116
- if not response_text:
117
- return []
118
-
119
- # Split by newlines and filter out empty lines
120
- references = []
121
- for line in response_text.strip().split('\n'):
122
- line = line.strip()
123
- if line and not line.startswith('#') and len(line) > 10: # Basic filtering
124
- references.append(line)
125
-
126
- return references
127
-
113
+
128
114
  def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
129
115
  """
130
116
  Template method that handles chunking for all providers.