academic-refchecker 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 2.0.11
+ Version: 2.0.13
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
  Requires-Dist: pandas<2.4.0,>=1.3.0
  Requires-Dist: numpy<2.0.0,>=1.22.4
  Requires-Dist: pdfplumber>=0.6.0
+ Requires-Dist: bibtexparser>=1.4.0
  Provides-Extra: dev
  Requires-Dist: pytest>=6.0.0; extra == "dev"
  Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
@@ -1,4 +1,4 @@
- academic_refchecker-2.0.11.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ academic_refchecker-2.0.13.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
  backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
  backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
@@ -16,10 +16,11 @@ backend/static/assets/index-2P6L_39v.css,sha256=KC3Wa6jfD1qwmEoVpqTovlzf8fsn5oHY
  backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
  refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
  refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
- refchecker/__version__.py,sha256=xQXcCOSnpBnaLZygtDKbuiGK368plb0wUEcXNuWi7_s,66
- refchecker/checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
+ refchecker/__version__.py,sha256=4nD_XJ2nhdUPe68-UmSGWSjF8JFBkti-Is16FFYXHAI,66
+ refchecker/checkers/__init__.py,sha256=-dR7HX0bfPq9YMXrnODoYbfNWFLqu706xoVsUdWHYRI,611
+ refchecker/checkers/arxiv_citation.py,sha256=_oQxWt5uUSy-pAGEQjdwBb7dxoFNqWkYgpkV_ZVS-Ho,17332
  refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
- refchecker/checkers/enhanced_hybrid_checker.py,sha256=2jIeUX7hankPok3M4de9o2bsJZ17ZomuLkdfdr9EV0s,28671
+ refchecker/checkers/enhanced_hybrid_checker.py,sha256=HSjxbUo4tr1L1DF8FFG8dfH-Y7mM67sKmqi-KAX_31I,30310
  refchecker/checkers/github_checker.py,sha256=YJ2sLj22qezw3uWjA0jhtDO0fOW4HUwcVbv2DQ4LjR0,14277
  refchecker/checkers/local_semantic_scholar.py,sha256=c-KUTh99s-Di71h-pzdrwlPgoSTwB-tgVAZnCrMFXmw,21011
  refchecker/checkers/openalex.py,sha256=WEjEppQMbutPs8kWOSorCIoXWqpJ9o1CXUicThHSWYU,20120
@@ -29,7 +30,7 @@ refchecker/checkers/semantic_scholar.py,sha256=yvatQM5fXdW0qagqrTUpgotd0RbT7N_pq
  refchecker/checkers/webpage_checker.py,sha256=A_d5kg3OOsyliC00OVq_l0J-RJ4Ln7hUoURk21aO2fs,43653
  refchecker/config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
  refchecker/config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
- refchecker/config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
+ refchecker/config/settings.py,sha256=O8PETl_O7uyUl1r_spWhOMHbIaiBM-golfdIN82eigI,6512
  refchecker/core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  refchecker/core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
  refchecker/core/parallel_processor.py,sha256=HpVFEMwPBiP2FRjvGqlaXpjV5S0qP-hxdB_Wdl_lACo,17704
@@ -37,13 +38,14 @@ refchecker/core/refchecker.py,sha256=nX8guDXFL1ZdT-K6KUJT_3iZjuoYsWj4e0rKrqd5VZA
  refchecker/database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  refchecker/database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  refchecker/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- refchecker/llm/base.py,sha256=uMF-KOqZ9ZQ7rccOQLpKJiW9sEMMxr7ePXBSF0yYDJY,16782
- refchecker/llm/providers.py,sha256=RhsYbUqHV5YznJcJ8vTa6M-nUKltdREeG5mYrLdBS2c,40992
+ refchecker/llm/base.py,sha256=BhpnUn7nrN8LzAnA8rQuG3zBvNovFYxShk1V9oAHlHU,16248
+ refchecker/llm/providers.py,sha256=2pOEre_OH_shgm0b9m3_nVIxyoY-MxhFM5KAP_qKo_Q,39131
  refchecker/scripts/__init__.py,sha256=xJwo6afG8s7S888BK2Bxw2d7FX8aLkbl0l_ZoJOFibE,37
  refchecker/scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,4213
  refchecker/services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
  refchecker/services/pdf_processor.py,sha256=7i5x043qfnyzE5EQmytfy_uPjbeCJp4Ka5OPyH-bwOE,10577
  refchecker/utils/__init__.py,sha256=SKTEQeKpLOFFMIzZiakzctsW9zGe_J7LDNJlygWV6RY,1221
+ refchecker/utils/arxiv_rate_limiter.py,sha256=axOv84Ge6q_mJ69lcyAFsCmHx9qXvV1aX71oSaxhnjE,4119
  refchecker/utils/arxiv_utils.py,sha256=C7wqoCy9FZUQpoF92vLeJyrK1-6XoMmmL6u_hfDV3ro,18031
  refchecker/utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
  refchecker/utils/biblatex_parser.py,sha256=IKRUMtRsjdXIktyk9XGArt_ms0asmqP549uhFvvumuE,25581
@@ -54,11 +56,11 @@ refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,
  refchecker/utils/doi_utils.py,sha256=_7YvQ0DTOQBMIujUE0SdJicjPiAR3VETLU668GIji24,6094
  refchecker/utils/error_utils.py,sha256=8TcfRUD6phZ7viPJrezQ4jKf_vE65lqEXZq5707eU6s,15425
  refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
- refchecker/utils/text_utils.py,sha256=ZIdvP75F_4o_p2lB24CkuX_eEjB9x-BY2FlXsOiYjkQ,234082
+ refchecker/utils/text_utils.py,sha256=Tx1k0SqS1cmw4N9BDJY-Ipep2T-HMmKPqi4SMcq1ZJ8,235751
  refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
  refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
- academic_refchecker-2.0.11.dist-info/METADATA,sha256=oQhQAzud3SET3ya5MUc_z7FCN3FeguPeUYNew2jXSXc,26576
- academic_refchecker-2.0.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-2.0.11.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
- academic_refchecker-2.0.11.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
- academic_refchecker-2.0.11.dist-info/RECORD,,
+ academic_refchecker-2.0.13.dist-info/METADATA,sha256=N6lsqdFWT6K34WNLqA_W0MO3WB2BEFjx_57jEdyHYes,26611
+ academic_refchecker-2.0.13.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ academic_refchecker-2.0.13.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
+ academic_refchecker-2.0.13.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
+ academic_refchecker-2.0.13.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.9.0)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

refchecker/__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""

- __version__ = "2.0.11"
+ __version__ = "2.0.13"
@@ -7,11 +7,13 @@ from .local_semantic_scholar import LocalNonArxivReferenceChecker
  from .enhanced_hybrid_checker import EnhancedHybridReferenceChecker
  from .openalex import OpenAlexReferenceChecker
  from .crossref import CrossRefReferenceChecker
+ from .arxiv_citation import ArXivCitationChecker

  __all__ = [
  "NonArxivReferenceChecker",
  "LocalNonArxivReferenceChecker",
  "EnhancedHybridReferenceChecker",
  "OpenAlexReferenceChecker",
- "CrossRefReferenceChecker"
+ "CrossRefReferenceChecker",
+ "ArXivCitationChecker",
  ]
@@ -0,0 +1,460 @@
+ #!/usr/bin/env python3
+ """
+ ArXiv Citation Checker - Authoritative Source for ArXiv Papers
+
+ This module provides functionality to verify ArXiv papers by fetching the official
+ BibTeX citation directly from ArXiv. This is used as the authoritative metadata source
+ for papers found on ArXiv, as it reflects the author-submitted metadata.
+
+ Key features:
+ - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
+ - Always uses the latest version metadata (strips version suffixes)
+ - Logs warnings when cited version differs from latest version
+ - Parses BibTeX to extract normalized metadata matching refchecker schema
+
+ Usage:
+ from refchecker.checkers.arxiv_citation import ArXivCitationChecker
+
+ checker = ArXivCitationChecker()
+
+ reference = {
+ 'title': 'Attention Is All You Need',
+ 'authors': ['Ashish Vaswani', 'Noam Shazeer'],
+ 'year': 2017,
+ 'url': 'https://arxiv.org/abs/1706.03762v5',
+ }
+
+ verified_data, errors, url = checker.verify_reference(reference)
+ """
+
+ import re
+ import logging
+ import requests
+ from typing import Dict, List, Tuple, Optional, Any
+
+ import bibtexparser
+ from bibtexparser.bparser import BibTexParser
+ from bibtexparser.customization import convert_to_unicode
+
+ from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
+ from refchecker.utils.text_utils import (
+ normalize_text,
+ compare_authors,
+ compare_titles_with_latex_cleaning,
+ strip_latex_commands,
+ )
+ from refchecker.utils.error_utils import format_title_mismatch, validate_year
+ from refchecker.config.settings import get_config
+
+ logger = logging.getLogger(__name__)
+
+ # Get configuration
+ config = get_config()
+ SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
+
+
+ class ArXivCitationChecker:
+ """
+ Reference checker that uses ArXiv's official BibTeX export as the authoritative source.
+
+ This checker fetches the official BibTeX citation from ArXiv for papers identified
+ by their ArXiv ID. It uses the latest version's metadata as the authoritative source
+ and logs warnings when the cited version differs from the latest.
+ """
+
+ def __init__(self, timeout: int = 30):
+ """
+ Initialize the ArXiv Citation Checker.
+
+ Args:
+ timeout: HTTP request timeout in seconds
+ """
+ self.base_url = "https://arxiv.org/bibtex"
+ self.abs_url = "https://arxiv.org/abs"
+ self.timeout = timeout
+ self.rate_limiter = ArXivRateLimiter.get_instance()
+
+ # Pattern to extract arXiv IDs from various URL formats
+ self.arxiv_id_patterns = [
+ # Standard arxiv.org URLs
+ r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+ r'arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+ # Old format with category
+ r'arxiv\.org/abs/([a-z-]+/[0-9]{7})(v\d+)?',
+ r'arxiv\.org/pdf/([a-z-]+/[0-9]{7})(v\d+)?',
+ # arXiv: prefix in text
+ r'arXiv:([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+ r'arXiv:([a-z-]+/[0-9]{7})(v\d+)?',
+ # export.arxiv.org URLs
+ r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+ r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+ ]
+
+ def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
+ """
+ Extract ArXiv ID from a reference, returning both the base ID and version.
+
+ Args:
+ reference: Reference dictionary containing url, raw_text, etc.
+
+ Returns:
+ Tuple of (arxiv_id_without_version, version_string_or_None)
+ For example: ("2301.12345", "v2") or ("2301.12345", None)
+ """
+ # Sources to check for ArXiv ID
+ sources = [
+ reference.get('url', ''),
+ reference.get('cited_url', ''),
+ reference.get('raw_text', ''),
+ reference.get('eprint', ''), # BibTeX field
+ ]
+
+ for source in sources:
+ if not source:
+ continue
+
+ for pattern in self.arxiv_id_patterns:
+ match = re.search(pattern, source, re.IGNORECASE)
+ if match:
+ arxiv_id = match.group(1)
+ version = match.group(2) if len(match.groups()) > 1 else None
+ logger.debug(f"Extracted ArXiv ID: {arxiv_id}, version: {version}")
+ return arxiv_id, version
+
+ return None, None
+
+ def fetch_bibtex(self, arxiv_id: str) -> Optional[str]:
+ """
+ Fetch the official BibTeX citation from ArXiv.
+
+ This always fetches the latest version's BibTeX (ArXiv default behavior).
+
+ Args:
+ arxiv_id: ArXiv ID without version suffix (e.g., "2301.12345")
+
+ Returns:
+ BibTeX string or None if fetch failed
+ """
+ url = f"{self.base_url}/{arxiv_id}"
+
+ # Wait for rate limit
+ self.rate_limiter.wait()
+
+ try:
+ logger.debug(f"Fetching ArXiv BibTeX from: {url}")
+ response = requests.get(url, timeout=self.timeout)
+ response.raise_for_status()
+
+ bibtex_content = response.text.strip()
+
+ # Validate it looks like BibTeX
+ if bibtex_content and bibtex_content.startswith('@'):
+ logger.debug(f"Successfully fetched BibTeX for ArXiv paper {arxiv_id}")
+ return bibtex_content
+ else:
+ logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
+ return None
+
+ except requests.exceptions.Timeout:
+ logger.warning(f"Timeout fetching ArXiv BibTeX for {arxiv_id}")
+ return None
+ except requests.exceptions.RequestException as e:
+ logger.warning(f"Failed to fetch ArXiv BibTeX for {arxiv_id}: {e}")
+ return None
+
+ def parse_bibtex(self, bibtex_str: str) -> Optional[Dict[str, Any]]:
+ """
+ Parse BibTeX string and extract metadata in refchecker schema format.
+
+ Args:
+ bibtex_str: BibTeX content string
+
+ Returns:
+ Dictionary with parsed metadata or None if parsing failed
+ """
+ try:
+ # Configure parser
+ parser = BibTexParser(common_strings=True)
+ parser.customization = convert_to_unicode
+
+ # Parse BibTeX
+ bib_database = bibtexparser.loads(bibtex_str, parser=parser)
+
+ if not bib_database.entries:
+ logger.debug("No entries found in BibTeX")
+ return None
+
+ entry = bib_database.entries[0]
+
+ # Extract and normalize fields
+ title = entry.get('title', '')
+ # Clean title - remove braces used for capitalization protection
+ title = re.sub(r'\{([^}]*)\}', r'\1', title)
+ title = title.strip()
+
+ # Extract authors
+ authors_str = entry.get('author', '')
+ authors = self._parse_authors(authors_str)
+
+ # Extract year - prefer year from eprint ID (original submission) over BibTeX year (latest revision)
+ arxiv_id = entry.get('eprint', '')
+ year = self._extract_year_from_eprint(arxiv_id)
+
+ # Fall back to BibTeX year field if eprint year extraction fails
+ if not year and entry.get('year'):
+ try:
+ year = int(entry['year'])
+ except ValueError:
+ pass
+
+ # Build result in refchecker schema format
+ result = {
+ 'title': title,
+ 'authors': [{'name': author} for author in authors],
+ 'year': year,
+ 'venue': 'arXiv',
+ 'externalIds': {
+ 'ArXiv': arxiv_id,
+ },
+ 'url': f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else None,
+ 'isOpenAccess': True,
+ 'openAccessPdf': {
+ 'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else None
+ },
+ # Store original bibtex for reference
+ '_bibtex_entry': entry,
+ '_source': 'ArXiv BibTeX Reference',
+ '_source_url': f"https://arxiv.org/bibtex/{arxiv_id}" if arxiv_id else None,
+ }
+
+ # Add DOI if present (some ArXiv papers have DOIs)
+ if entry.get('doi'):
+ result['externalIds']['DOI'] = entry['doi']
+
+ logger.debug(f"Parsed ArXiv BibTeX: title='{title[:50]}...', authors={len(authors)}, year={year}")
+ return result
+
+ except Exception as e:
+ logger.warning(f"Failed to parse BibTeX: {e}")
+ return None
+
+ def _parse_authors(self, authors_str: str) -> List[str]:
+ """
+ Parse BibTeX author string into list of author names.
+
+ BibTeX format: "Last1, First1 and Last2, First2 and ..."
+
+ Args:
+ authors_str: BibTeX author field value
+
+ Returns:
+ List of author names in "First Last" format
+ """
+ if not authors_str:
+ return []
+
+ authors = []
+
+ # Split by " and " (BibTeX convention)
+ author_parts = re.split(r'\s+and\s+', authors_str)
+
+ for part in author_parts:
+ part = part.strip()
+ if not part:
+ continue
+
+ # Handle "Last, First" format
+ if ',' in part:
+ parts = part.split(',', 1)
+ if len(parts) == 2:
+ last = parts[0].strip()
+ first = parts[1].strip()
+ # Convert to "First Last" format
+ name = f"{first} {last}".strip()
+ else:
+ name = part
+ else:
+ # Already in "First Last" format
+ name = part
+
+ # Clean up the name
+ name = re.sub(r'\s+', ' ', name) # Normalize whitespace
+ name = re.sub(r'\{([^}]*)\}', r'\1', name) # Remove braces
+
+ if name:
+ authors.append(name)
+
+ return authors
+
+ def _extract_year_from_eprint(self, eprint: str) -> Optional[int]:
+ """
+ Extract year from ArXiv eprint ID.
+
+ New format (YYMM.NNNNN): First two digits are year
+ Old format (cat-name/YYMMNNN): Digits after slash, first two are year
+
+ Args:
+ eprint: ArXiv eprint ID
+
+ Returns:
+ Year as integer or None
+ """
+ if not eprint:
+ return None
+
+ # New format: 2301.12345
+ match = re.match(r'^(\d{2})\d{2}\.\d{4,5}', eprint)
+ if match:
+ yy = int(match.group(1))
+ # ArXiv started in 1991, new format started in 2007
+ if yy >= 7:
+ return 2000 + yy
+ else:
+ # Very early 2000s papers (unlikely in new format)
+ return 2000 + yy
+
+ # Old format: hep-th/9901001
+ match = re.match(r'^[a-z-]+/(\d{2})\d+', eprint)
+ if match:
+ yy = int(match.group(1))
+ if yy >= 91: # ArXiv started in 1991
+ return 1900 + yy
+ else:
+ return 2000 + yy
+
+ return None
+
+ def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
+ """
+ Get the latest version number for an ArXiv paper.
+
+ Note: This requires fetching the abstract page, so it's optional.
+ For now, we rely on the BibTeX always returning latest version metadata.
+
+ Args:
+ arxiv_id: ArXiv ID without version
+
+ Returns:
+ Latest version string (e.g., "v3") or None if couldn't determine
+ """
+ # The BibTeX endpoint always returns the latest version's metadata,
+ # so we don't need to explicitly fetch version info
+ return None
+
+ def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
+ """
+ Check if a reference is an ArXiv paper.
+
+ Args:
+ reference: Reference dictionary
+
+ Returns:
+ True if reference appears to be an ArXiv paper
+ """
+ arxiv_id, _ = self.extract_arxiv_id(reference)
+ return arxiv_id is not None
+
+ def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+ """
+ Verify a reference using ArXiv's official BibTeX as authoritative source.
+
+ This method:
+ 1. Extracts the ArXiv ID from the reference
+ 2. Fetches the official BibTeX from ArXiv (always latest version)
+ 3. Parses the BibTeX to get authoritative metadata
+ 4. Compares cited metadata against authoritative source
+ 5. Logs warnings for version mismatches
+
+ Args:
+ reference: Reference dictionary with title, authors, year, url, etc.
+
+ Returns:
+ Tuple of (verified_data, errors, url)
+ - verified_data: Authoritative paper metadata from ArXiv or None
+ - errors: List of error/warning dictionaries
+ - url: ArXiv URL for the paper
+ """
+ errors = []
+
+ # Extract ArXiv ID
+ arxiv_id, cited_version = self.extract_arxiv_id(reference)
+
+ if not arxiv_id:
+ logger.debug("ArXivCitationChecker: No ArXiv ID found in reference")
+ return None, [], None
+
+ logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
+
+ # Fetch authoritative BibTeX
+ bibtex_content = self.fetch_bibtex(arxiv_id)
+
+ if not bibtex_content:
+ logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
+ return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
+
+ # Parse BibTeX
+ verified_data = self.parse_bibtex(bibtex_content)
+
+ if not verified_data:
+ logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
+ return None, [], None
+
+ # Log version mismatch warning if cited version differs from latest
+ if cited_version:
+ # ArXiv BibTeX always returns latest version metadata
+ # We don't know the actual latest version number without additional API call,
+ # but we can warn that a specific version was cited
+ errors.append({
+ 'warning_type': 'version',
+ 'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
+ })
+ logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
+
+ # Compare title
+ cited_title = reference.get('title', '').strip()
+ authoritative_title = verified_data.get('title', '').strip()
+
+ if cited_title and authoritative_title:
+ title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+
+ if title_similarity < SIMILARITY_THRESHOLD:
+ clean_cited_title = strip_latex_commands(cited_title)
+ errors.append({
+ 'error_type': 'title',
+ 'error_details': format_title_mismatch(clean_cited_title, authoritative_title),
+ 'ref_title_correct': authoritative_title
+ })
+
+ # Compare authors
+ cited_authors = reference.get('authors', [])
+ if cited_authors:
+ authoritative_authors = verified_data.get('authors', [])
+ authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
+
+ if not authors_match:
+ correct_author_names = ', '.join([a.get('name', '') for a in authoritative_authors])
+ errors.append({
+ 'error_type': 'author',
+ 'error_details': author_error,
+ 'ref_authors_correct': correct_author_names
+ })
+
+ # Compare year
+ cited_year = reference.get('year')
+ authoritative_year = verified_data.get('year')
+
+ year_warning = validate_year(
+ cited_year=cited_year,
+ paper_year=authoritative_year,
+ use_flexible_validation=True,
+ context={'arxiv_match': True}
+ )
+ if year_warning:
+ errors.append(year_warning)
+
+ # Build URL
+ paper_url = f"https://arxiv.org/abs/{arxiv_id}"
+
+ logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+
+ return verified_data, errors, paper_url
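
The new checker's behavior is easiest to see end to end. A minimal sketch, assuming refchecker 2.0.13 is installed and arxiv.org is reachable (values are illustrative):

    # Verify a versioned ArXiv citation against the latest metadata.
    from refchecker.checkers.arxiv_citation import ArXivCitationChecker

    checker = ArXivCitationChecker()
    reference = {
        'title': 'Attention Is All You Need',
        'authors': ['Ashish Vaswani', 'Noam Shazeer'],
        'year': 2017,
        'url': 'https://arxiv.org/abs/1706.03762v5',  # cites a specific version
    }

    arxiv_id, version = checker.extract_arxiv_id(reference)
    print(arxiv_id, version)  # expected: 1706.03762 v5

    verified_data, errors, url = checker.verify_reference(reference)
    # errors should include a 'version' warning: v5 was cited, but the BibTeX
    # endpoint always reflects the latest version's metadata.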
@@ -43,6 +43,7 @@ class EnhancedHybridReferenceChecker:
  contact_email: Optional[str] = None,
  enable_openalex: bool = True,
  enable_crossref: bool = True,
+ enable_arxiv_citation: bool = True,
  debug_mode: bool = False):
  """
  Initialize the enhanced hybrid reference checker
@@ -53,11 +54,22 @@ class EnhancedHybridReferenceChecker:
  contact_email: Email for polite pool access to APIs
  enable_openalex: Whether to use OpenAlex API
  enable_crossref: Whether to use CrossRef API
+ enable_arxiv_citation: Whether to use ArXiv Citation checker as authoritative source
  debug_mode: Whether to enable debug logging
  """
  self.contact_email = contact_email
  self.debug_mode = debug_mode

+ # Initialize ArXiv Citation checker (authoritative source for ArXiv papers)
+ self.arxiv_citation = None
+ if enable_arxiv_citation:
+ try:
+ from .arxiv_citation import ArXivCitationChecker
+ self.arxiv_citation = ArXivCitationChecker()
+ logger.debug("Enhanced Hybrid: ArXiv Citation checker initialized")
+ except Exception as e:
+ logger.warning(f"Enhanced Hybrid: Failed to initialize ArXiv Citation checker: {e}")
+
  # Initialize local database checker if available
  self.local_db = None
  if db_path:
@@ -112,6 +124,7 @@ class EnhancedHybridReferenceChecker:

  # Track API performance for adaptive selection
  self.api_stats = {
+ 'arxiv_citation': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
  'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
  'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
  'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
@@ -276,6 +289,17 @@ class EnhancedHybridReferenceChecker:

  # PHASE 1: Try all APIs once in priority order

+ # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
+ # This fetches the official BibTeX from ArXiv which is the author-submitted metadata
+ if self.arxiv_citation and self.arxiv_citation.is_arxiv_reference(reference):
+ logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
+ verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
+ if success:
+ logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded as authoritative source")
+ return verified_data, errors, url
+ if failure_type in ['throttled', 'timeout', 'server_error']:
+ failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
+
  # Strategy 1: Always try local database first (fastest)
  if self.local_db:
  verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
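
A minimal sketch of how the new flag is meant to be used; parameter names other than enable_arxiv_citation come from the hunks above, and anything else about the full signature is an assumption:

    from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker

    checker = EnhancedHybridReferenceChecker(
        contact_email="you@example.com",
        enable_openalex=True,
        enable_crossref=True,
        enable_arxiv_citation=True,  # new in 2.0.13
        debug_mode=False,
    )
    # For a reference whose URL or raw text contains an ArXiv ID,
    # is_arxiv_reference() is true, so 'arxiv_citation' runs as Strategy 0
    # before the local database and the other APIs.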
@@ -22,6 +22,14 @@ DEFAULT_CONFIG = {
  "timeout": 30,
  },

+ "arxiv_citation": {
+ "base_url": "https://arxiv.org/bibtex",
+ "rate_limit_delay": 3.0, # Share rate limiting with other ArXiv endpoints
+ "timeout": 30,
+ "use_as_authoritative": True, # Use ArXiv BibTeX as authoritative source
+ "enabled": True, # Enable ArXiv citation checker in hybrid checker
+ },
+
  # Processing Settings
  "processing": {
  "max_papers": 50,
refchecker/llm/base.py CHANGED
@@ -110,21 +110,7 @@ class LLMProvider(ABC):

  logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
  return chunks
-
- def _parse_llm_response(self, response_text: str) -> List[str]:
- """Parse LLM response and extract individual references"""
- if not response_text:
- return []
-
- # Split by newlines and filter out empty lines
- references = []
- for line in response_text.strip().split('\n'):
- line = line.strip()
- if line and not line.startswith('#') and len(line) > 10: # Basic filtering
- references.append(line)
-
- return references
-
+
  def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
  """
  Template method that handles chunking for all providers.
@@ -62,51 +62,25 @@ class LLMProviderMixin:
  """Create prompt for reference extraction"""
  # Clean BibTeX formatting before sending to LLM
  cleaned_bibliography = self._clean_bibtex_for_llm(bibliography_text)
-
- return f"""
- Please extract individual references from the following bibliography text. Each reference should be a complete bibliographic entry.
-
- Instructions:
- 1. Split the bibliography into individual references based on numbered markers like [1], [2], etc.
- 2. IMPORTANT: References may span multiple lines. A single reference includes everything from one number marker (e.g., [37]) until the next number marker (e.g., [38])
- 3. For each reference, extract: authors, title, publication venue, year, and any URLs/DOIs
- - For BibTeX entries, extract fields correctly:
- * title = the actual paper title from "title" field
- * venue = from "journal", "booktitle", "conference" fields
- * Do NOT confuse journal names like "arXiv preprint arXiv:1234.5678" with paper titles
- 4. Include references that are incomplete, like only author names and titles, but ignore ones that are just a URL without other details
- 5. Place a hashmark (#) rather than period between fields of a reference, but asterisks (*) between individual authors
- e.g. Author1*Author2*Author3#Title#Venue#Year#URL
- 6. CRITICAL: When extracting authors, understand BibTeX author field format correctly
- - In BibTeX, the "author" field contains author names separated by " and " (not commas)
- - Individual author names may be in "Last, First" format (e.g., "Smith, John")
- - Multiple authors are separated by " and " (e.g., "Smith, John and Doe, Jane")
- - SPECIAL CASE for collaborations: Handle "Last, First and others" pattern correctly
- * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
- * author = {"Smith, John and others"} → ONE explicit author plus et al: "John Smith*et al"
- * The "Last, First and others" pattern indicates a collaboration paper where only the first author is listed explicitly
- - EXAMPLES:
- * author = {"Dolan, Brian P."} → ONE author: "Dolan, Brian P."
- * author = {"Smith, John and Doe, Jane"} → TWO authors: "Smith, John*Doe, Jane"
- * author = {"Arnab, Anurag and Dehghani, Mostafa and Heigold, Georg"} → THREE authors: "Arnab, Anurag*Dehghani, Mostafa*Heigold, Georg"
- * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
- - Use asterisks (*) to separate individual authors in your output
- - For "Last, First" format, convert to "First Last" for readability (e.g., "Smith, John" → "John Smith")
- - If a BibTeX entry has NO author field, output an empty author field (nothing before the first #)
- - Do NOT infer or guess authors based on title or context - only use what is explicitly stated
- 7. CRITICAL: When extracting authors, preserve "et al" and similar indicators exactly as they appear
- - If the original says "John Smith, Jane Doe, et al" then output "John Smith, Jane Doe, et al"
- - If the original says "John Smith et al." then output "John Smith et al."
- - Also preserve variations like "and others", "etc.", "..." when used to indicate additional authors
- - Do NOT expand "et al" into individual author names, even if you know them
- 8. Return ONLY the references, one per line
- 9. Do not include reference numbers like [1], [2], etc. in your output
- 10. Do not add any additional text or explanations
- 11. Ensure that URLs and DOIs are from the specific reference only
- - When extracting URLs, preserve the complete URL including protocol
- - For BibTeX howpublished fields, extract the full URL from the field value
- 12. When parsing multi-line references, combine all authors from all lines before the title
- 13. CRITICAL: If the text contains no valid bibliographic references (e.g., only figures, appendix material, or explanatory text), simply return nothing - do NOT explain why you cannot extract references
+
+ return f"""OUTPUT FORMAT (MANDATORY):
+ - Each line must be: Author1*Author2#Title#Venue#Year#URL
+ - Use # between fields, * between authors
+ - One reference per line
+ - NO other text allowed - no explanations, descriptions, or commentary
+ - If no valid references exist, return NOTHING (completely empty response)
+
+ EXTRACTION RULES:
+ 1. Split by numbered markers [1], [2], etc. - references may span multiple lines
+ 2. Extract: authors, title, venue (journal/booktitle), year, URLs/DOIs
+ 3. For BibTeX: "title" field = paper title, "journal"/"booktitle" = venue
+ 4. Handle author formats:
+ - "Last, First and others" → "First Last*et al"
+ - "Last, First" → "First Last"
+ - Separate multiple authors with *
+ - Preserve "et al" exactly as written
+ 5. Skip entries that are only URLs without bibliographic data
+ 6. If no author field exists, start with # (empty author)

  Bibliography text:
  {cleaned_bibliography}
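
The shortened prompt leans entirely on the Author1*Author2#Title#Venue#Year#URL line format. A hypothetical helper (not part of the package) that splits one such line the way the downstream parser expects:

    def split_extracted_reference(line: str) -> dict:
        fields = line.split('#')
        fields += [''] * (5 - len(fields))  # pad so partial references unpack cleanly
        authors, title, venue, year, url = fields[:5]
        return {
            'authors': [a for a in authors.split('*') if a],
            'title': title,
            'venue': venue,
            'year': year,
            'url': url,
        }

    line = "Ashish Vaswani*Noam Shazeer#Attention Is All You Need#NeurIPS#2017#https://arxiv.org/abs/1706.03762"
    print(split_extracted_reference(line)['authors'])  # ['Ashish Vaswani', 'Noam Shazeer']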
@@ -116,67 +90,120 @@ Bibliography text:
  """Parse LLM response into list of references"""
  if not content:
  return []
-
+
  # Ensure content is a string
  if not isinstance(content, str):
  content = str(content)
-
+
  # Clean the content - remove leading/trailing whitespace
  content = content.strip()
-
+
+ # Early check: if no # delimiters at all, likely all prose/explanatory text
+ if '#' not in content:
+ logger.warning("LLM response contains no structured references (no # delimiters found)")
+ return []
+
  # Split by double newlines first to handle paragraph-style formatting
  # then fall back to single newlines
  references = []
-
+
  # Try double newline splitting first (paragraph style)
  if '\n\n' in content:
  potential_refs = content.split('\n\n')
  else:
  # Fall back to single newline splitting
  potential_refs = content.split('\n')
-
+
+ import re
+
+ # Common prose patterns that indicate explanatory text
+ prose_starters = (
+ 'this ', 'the ', 'i ', 'looking ', 'based on', 'it ',
+ 'there ', 'these ', 'here ', 'note', 'please ', 'however',
+ 'unfortunately', 'appears to', 'contains', 'following',
+ 'above', 'below', 'after', 'before', 'when ', 'if ',
+ 'as ', 'for ', 'from ', 'with ', 'without ', 'although'
+ )
+
  for ref in potential_refs:
  ref = ref.strip()
-
- # Skip empty lines, headers, and explanatory text
+
+ # Skip empty lines
  if not ref:
  continue
- if ref.lower().startswith(('reference', 'here are', 'below are', 'extracted', 'bibliography')):
- continue
- if ref.startswith('#'):
- continue
- if 'extracted from the bibliography' in ref.lower():
+
+ # Skip lines starting with # (markdown headers or empty author field without title)
+ if ref.startswith('#') and not re.match(r'^#[^#]', ref):
  continue
- if 'formatted as a complete' in ref.lower():
+
+ # Check for prose/explanatory text patterns
+ ref_lower = ref.lower()
+
+ # Skip common explanatory headers
+ if ref_lower.startswith(('reference', 'here are', 'below are', 'extracted', 'bibliography')):
  continue
+
  # Skip verbose LLM explanatory responses
- if 'cannot extract' in ref.lower() and ('references' in ref.lower() or 'bibliographic' in ref.lower()):
+ skip_patterns = [
+ 'extracted from the bibliography',
+ 'formatted as a complete',
+ 'cannot extract',
+ 'appears to be from',
+ 'no numbered reference markers',
+ 'only figures',
+ 'i cannot',
+ 'i return nothing',
+ 'return nothing',
+ 'no valid bibliographic',
+ 'numbered format specified',
+ 'it contains',
+ 'it does not contain',
+ 'text appears to be',
+ 'does not appear to contain',
+ 'no references found',
+ 'empty response',
+ 'no bibliography',
+ 'no actual bibliographic',
+ 'no academic references',
+ 'contains only numerical',
+ 'data tables',
+ 'evaluation rubric',
+ 'publication metadata',
+ 'citable sources',
+ 'reference list',
+ ]
+ if any(pattern in ref_lower for pattern in skip_patterns):
  continue
- if 'appears to be from' in ref.lower() and 'appendix' in ref.lower():
+
+ # Skip lines starting with common prose patterns
+ if ref_lower.startswith(prose_starters):
  continue
- if 'no numbered reference markers' in ref.lower():
+ if ref_lower.startswith('looking at'):
  continue
- if 'only figures' in ref.lower() and 'learning curves' in ref.lower():
+ if ref_lower.startswith('since there are'):
  continue
- if ref.lower().startswith('i cannot'):
+
+ # Key structural check: valid references MUST have # delimiters
+ if '#' not in ref:
+ # No delimiter = not a valid reference, skip it
+ logger.debug(f"Skipping line without # delimiter: {ref[:80]}...")
  continue
-
+
  # Remove common prefixes (bullets, numbers, etc.)
  ref = ref.lstrip('- *•')
  ref = ref.strip()
-
+
  # Remove reference numbers like "1.", "[1]", "(1)" from the beginning
- import re
  ref = re.sub(r'^(\d+\.|\[\d+\]|\(\d+\))\s*', '', ref)
-
+
  # Filter out very short lines (likely not complete references)
- if len(ref) > 30: # Increased minimum length for academic references
+ if len(ref) > 30: # Minimum length for academic references
  references.append(ref)
-
+
  return references


- class OpenAIProvider(LLMProvider, LLMProviderMixin):
+ class OpenAIProvider(LLMProviderMixin, LLMProvider):
  """OpenAI GPT provider for reference extraction"""

  def __init__(self, config: Dict[str, Any]):
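
The base-class swap matters because Python's MRO is left-to-right: with LLMProviderMixin listed first, its concrete helpers resolve ahead of LLMProvider's, which is why the per-provider _create_extraction_prompt delegations are deleted in the hunks below. A simplified illustration with stand-in classes (not the package's real ones):

    class Base:
        def prompt(self):
            return "base"

    class Mixin:
        def prompt(self):
            return "mixin"

    class OldOrder(Base, Mixin):   # mirrors (LLMProvider, LLMProviderMixin)
        pass

    class NewOrder(Mixin, Base):   # mirrors (LLMProviderMixin, LLMProvider)
        pass

    print(OldOrder().prompt())  # "base"  - the mixin is shadowed
    print(NewOrder().prompt())  # "mixin" - the mixin wins without extra overrides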
@@ -197,10 +224,6 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
  def extract_references(self, bibliography_text: str) -> List[str]:
  return self.extract_references_with_chunking(bibliography_text)

- def _create_extraction_prompt(self, bibliography_text: str) -> str:
- """Create prompt for reference extraction"""
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
  def _call_llm(self, prompt: str) -> str:
  """Make the actual OpenAI API call and return the response text"""
  try:
@@ -220,7 +243,7 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
  raise


- class AnthropicProvider(LLMProvider, LLMProviderMixin):
+ class AnthropicProvider(LLMProviderMixin, LLMProvider):
  """Anthropic Claude provider for reference extraction"""

  def __init__(self, config: Dict[str, Any]):
@@ -241,10 +264,6 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
  def extract_references(self, bibliography_text: str) -> List[str]:
  return self.extract_references_with_chunking(bibliography_text)

- def _create_extraction_prompt(self, bibliography_text: str) -> str:
- """Create prompt for reference extraction"""
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
  def _call_llm(self, prompt: str) -> str:
  """Make the actual Anthropic API call and return the response text"""
  try:
@@ -252,6 +271,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
  model=self.model or "claude-sonnet-4-20250514",
  max_tokens=self.max_tokens,
  temperature=self.temperature,
+ system="You are a bibliographic reference extractor. You output ONLY structured reference data in the exact format specified. Never explain, describe, or comment on the input. Never output prose or sentences. If input contains no extractable references, return a completely empty response with no text.",
  messages=[
  {"role": "user", "content": prompt}
  ]
@@ -281,7 +301,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
  raise


- class GoogleProvider(LLMProvider, LLMProviderMixin):
+ class GoogleProvider(LLMProviderMixin, LLMProvider):
  """Google Gemini provider for reference extraction"""

  def __init__(self, config: Dict[str, Any]):
@@ -303,10 +323,6 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
  def extract_references(self, bibliography_text: str) -> List[str]:
  return self.extract_references_with_chunking(bibliography_text)

- def _create_extraction_prompt(self, bibliography_text: str) -> str:
- """Create prompt for reference extraction"""
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
  def _call_llm(self, prompt: str) -> str:
  """Make the actual Google API call and return the response text"""
  try:
@@ -341,7 +357,7 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
  raise


- class AzureProvider(LLMProvider, LLMProviderMixin):
+ class AzureProvider(LLMProviderMixin, LLMProvider):
  """Azure OpenAI provider for reference extraction"""

  def __init__(self, config: Dict[str, Any]):
@@ -375,10 +391,6 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
  def extract_references(self, bibliography_text: str) -> List[str]:
  return self.extract_references_with_chunking(bibliography_text)

- def _create_extraction_prompt(self, bibliography_text: str) -> str:
- """Create prompt for reference extraction"""
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
  def _call_llm(self, prompt: str) -> str:
  """Make the actual Azure OpenAI API call and return the response text"""
  try:
@@ -397,7 +409,7 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
  logger.error(f"Azure API call failed: {e}")
  raise

- class vLLMProvider(LLMProvider, LLMProviderMixin):
+ class vLLMProvider(LLMProviderMixin, LLMProvider):
  """vLLM provider using OpenAI-compatible server mode for local Hugging Face models"""

  def __init__(self, config: Dict[str, Any]):
@@ -838,10 +850,6 @@ class vLLMProvider(LLMProvider, LLMProviderMixin):
  def extract_references(self, bibliography_text: str) -> List[str]:
  return self.extract_references_with_chunking(bibliography_text)

- def _create_extraction_prompt(self, bibliography_text: str) -> str:
- """Create prompt for reference extraction"""
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
  def _call_llm(self, prompt: str) -> str:
  """Make the actual vLLM API call and return the response text"""
  try:
@@ -0,0 +1,133 @@
+ """
+ Shared ArXiv Rate Limiter utility.
+
+ ArXiv requests a polite delay of 3 seconds between requests.
+ This module provides a centralized rate limiter to coordinate all ArXiv API calls
+ across different checkers and utilities.
+
+ Usage:
+ from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
+
+ # Get the shared limiter instance
+ limiter = ArXivRateLimiter.get_instance()
+
+ # Wait for rate limit before making a request
+ limiter.wait()
+
+ # Then make your request
+ response = requests.get(arxiv_url)
+ """
+
+ import time
+ import threading
+ import logging
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ class ArXivRateLimiter:
+ """
+ Singleton rate limiter for ArXiv API requests.
+
+ ArXiv requests a minimum of 3 seconds between requests for polite access.
+ This class ensures all ArXiv API calls from any part of refchecker
+ are properly rate limited.
+ """
+
+ _instance: Optional['ArXivRateLimiter'] = None
+ _lock = threading.Lock()
+
+ # ArXiv recommends at least 3 seconds between requests
+ DEFAULT_DELAY = 3.0
+
+ def __init__(self):
+ """Initialize the rate limiter (use get_instance() instead of direct construction)."""
+ self._last_request_time: float = 0.0
+ self._request_lock = threading.Lock()
+ self._delay: float = self.DEFAULT_DELAY
+
+ @classmethod
+ def get_instance(cls) -> 'ArXivRateLimiter':
+ """
+ Get the singleton instance of the ArXiv rate limiter.
+
+ Returns:
+ The shared ArXivRateLimiter instance
+ """
+ if cls._instance is None:
+ with cls._lock:
+ # Double-check locking pattern
+ if cls._instance is None:
+ cls._instance = cls()
+ return cls._instance
+
+ @classmethod
+ def reset_instance(cls) -> None:
+ """
+ Reset the singleton instance (primarily for testing).
+ """
+ with cls._lock:
+ cls._instance = None
+
+ @property
+ def delay(self) -> float:
+ """Get the current delay between requests in seconds."""
+ return self._delay
+
+ @delay.setter
+ def delay(self, value: float) -> None:
+ """
+ Set the delay between requests.
+
+ Args:
+ value: Delay in seconds (minimum 0.5 seconds enforced)
+ """
+ self._delay = max(0.5, value)
+
+ def wait(self) -> float:
+ """
+ Wait for the rate limit before making a request.
+
+ This method blocks until the required time has passed since the last request.
+ It is thread-safe and can be called from multiple threads simultaneously.
+
+ Returns:
+ The actual time waited in seconds (0 if no wait was needed)
+ """
+ with self._request_lock:
+ current_time = time.time()
+ time_since_last = current_time - self._last_request_time
+
+ if time_since_last < self._delay:
+ wait_time = self._delay - time_since_last
+ logger.debug(f"ArXiv rate limiter: waiting {wait_time:.2f}s")
+ time.sleep(wait_time)
+ else:
+ wait_time = 0.0
+
+ self._last_request_time = time.time()
+ return wait_time
+
+ def mark_request(self) -> None:
+ """
+ Mark that a request was just made (without waiting).
+
+ Use this if you're managing timing externally but still want to
+ update the rate limiter's state.
+ """
+ with self._request_lock:
+ self._last_request_time = time.time()
+
+ def time_until_next(self) -> float:
+ """
+ Get the time remaining until the next request is allowed.
+
+ Returns:
+ Time in seconds until next request (0 if allowed now)
+ """
+ with self._request_lock:
+ current_time = time.time()
+ time_since_last = current_time - self._last_request_time
+ remaining = self._delay - time_since_last
+ return max(0.0, remaining)
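
Because the limiter is a process-wide singleton, every ArXiv call shares one clock. A minimal sketch; tying the delay to the new config value is an assumption about how a caller might wire the two together (the default is already 3.0 s):

    import requests
    from refchecker.config.settings import get_config
    from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter

    limiter = ArXivRateLimiter.get_instance()
    limiter.delay = get_config()["arxiv_citation"]["rate_limit_delay"]  # assumed wiring

    for arxiv_id in ["1706.03762", "2301.12345"]:
        limiter.wait()  # blocks until at least `delay` seconds since the last request
        requests.get(f"https://arxiv.org/bibtex/{arxiv_id}", timeout=30)

    print(ArXivRateLimiter.get_instance() is limiter)  # True: same shared instance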
@@ -1319,6 +1319,38 @@ def is_name_match(name1: str, name2: str) -> bool:
  # This handles both surname particle normalization effects and standard 3-part names
  def match_initials_with_names(init_parts, name_parts):
  """Helper function to match initials against full names"""
+ # Handle 4-part initials vs 2-part compound surname
+ # e.g., ['M.', 'V.', 'D.', 'Briel'] vs ['Menkes', 'van den Briel']
+ # where "van den" particles are treated as initials "V. D."
+ if len(init_parts) == 4 and len(name_parts) == 2:
+ # Check if first 3 parts are initials and last is surname
+ if (len(init_parts[0].rstrip('.')) == 1 and
+ len(init_parts[1].rstrip('.')) == 1 and
+ len(init_parts[2].rstrip('.')) == 1 and
+ len(init_parts[3]) > 1 and
+ len(name_parts[0]) > 1 and len(name_parts[1]) > 1):
+
+ first_initial = init_parts[0].rstrip('.')
+ second_initial = init_parts[1].rstrip('.')
+ third_initial = init_parts[2].rstrip('.')
+ last_name = init_parts[3]
+ first_name = name_parts[0]
+ compound_last = name_parts[1]
+
+ # Extract parts from compound lastname (e.g., "van den Briel" -> ["van", "den", "Briel"])
+ compound_parts = compound_last.split()
+ if len(compound_parts) >= 3:
+ # compound_parts = ["van", "den", "Briel"]
+ particle1 = compound_parts[0]
+ particle2 = compound_parts[1]
+ actual_last = compound_parts[-1]
+
+ if (last_name == actual_last and
+ first_initial == first_name[0] and
+ second_initial == particle1[0] and
+ third_initial == particle2[0]):
+ return True
+
  if len(init_parts) == 3 and len(name_parts) == 2:
  # After surname particle normalization: ['g.', 'v.', 'horn'] vs ['grant', 'van horn']
  if (len(init_parts[0].rstrip('.')) == 1 and len(init_parts[1].rstrip('.')) == 1 and len(init_parts[2]) > 1 and
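
A hypothetical check of the new branch through the enclosing is_name_match() (the exact tokenization is internal to text_utils, so treat the expected output as an illustration rather than a guaranteed result):

    from refchecker.utils.text_utils import is_name_match

    # Initials "V. D." stand in for the surname particles "van den".
    print(is_name_match("M. V. D. Briel", "Menkes van den Briel"))  # expected: True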