academic-refchecker 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/METADATA +2 -1
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/RECORD +15 -13
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/WHEEL +1 -1
- refchecker/__version__.py +1 -1
- refchecker/checkers/__init__.py +3 -1
- refchecker/checkers/arxiv_citation.py +460 -0
- refchecker/checkers/enhanced_hybrid_checker.py +24 -0
- refchecker/config/settings.py +8 -0
- refchecker/llm/base.py +1 -15
- refchecker/llm/providers.py +102 -94
- refchecker/utils/arxiv_rate_limiter.py +133 -0
- refchecker/utils/text_utils.py +32 -0
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/top_level.txt +0 -0
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 2.0.11
+Version: 2.0.13
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
 Requires-Dist: pandas<2.4.0,>=1.3.0
 Requires-Dist: numpy<2.0.0,>=1.22.4
 Requires-Dist: pdfplumber>=0.6.0
+Requires-Dist: bibtexparser>=1.4.0
 Provides-Extra: dev
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
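
The only new runtime dependency, bibtexparser, backs the BibTeX parsing in the new refchecker/checkers/arxiv_citation.py module shown further down. A minimal sketch of the bibtexparser 1.x calls that module relies on (the entry text here is illustrative, not taken from the package):

import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode

entry_text = """@misc{vaswani2017attention,
  title = {Attention Is All You Need},
  author = {Vaswani, Ashish and Shazeer, Noam},
  year = {2017},
  eprint = {1706.03762},
}"""

parser = BibTexParser(common_strings=True)
parser.customization = convert_to_unicode  # resolve LaTeX escapes to unicode
db = bibtexparser.loads(entry_text, parser=parser)
print(db.entries[0]['title'])   # Attention Is All You Need
print(db.entries[0]['eprint'])  # 1706.03762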
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-academic_refchecker-2.0.11.dist-info/licenses/LICENSE,sha256=…
+academic_refchecker-2.0.13.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
 backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
 backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
@@ -16,10 +16,11 @@ backend/static/assets/index-2P6L_39v.css,sha256=KC3Wa6jfD1qwmEoVpqTovlzf8fsn5oHY
 backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
 refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
 refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
-refchecker/__version__.py,sha256=…
-refchecker/checkers/__init__.py,sha256=…
+refchecker/__version__.py,sha256=4nD_XJ2nhdUPe68-UmSGWSjF8JFBkti-Is16FFYXHAI,66
+refchecker/checkers/__init__.py,sha256=-dR7HX0bfPq9YMXrnODoYbfNWFLqu706xoVsUdWHYRI,611
+refchecker/checkers/arxiv_citation.py,sha256=_oQxWt5uUSy-pAGEQjdwBb7dxoFNqWkYgpkV_ZVS-Ho,17332
 refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
-refchecker/checkers/enhanced_hybrid_checker.py,sha256=…
+refchecker/checkers/enhanced_hybrid_checker.py,sha256=HSjxbUo4tr1L1DF8FFG8dfH-Y7mM67sKmqi-KAX_31I,30310
 refchecker/checkers/github_checker.py,sha256=YJ2sLj22qezw3uWjA0jhtDO0fOW4HUwcVbv2DQ4LjR0,14277
 refchecker/checkers/local_semantic_scholar.py,sha256=c-KUTh99s-Di71h-pzdrwlPgoSTwB-tgVAZnCrMFXmw,21011
 refchecker/checkers/openalex.py,sha256=WEjEppQMbutPs8kWOSorCIoXWqpJ9o1CXUicThHSWYU,20120
@@ -29,7 +30,7 @@ refchecker/checkers/semantic_scholar.py,sha256=yvatQM5fXdW0qagqrTUpgotd0RbT7N_pq
 refchecker/checkers/webpage_checker.py,sha256=A_d5kg3OOsyliC00OVq_l0J-RJ4Ln7hUoURk21aO2fs,43653
 refchecker/config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
 refchecker/config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
-refchecker/config/settings.py,sha256=…
+refchecker/config/settings.py,sha256=O8PETl_O7uyUl1r_spWhOMHbIaiBM-golfdIN82eigI,6512
 refchecker/core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 refchecker/core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
 refchecker/core/parallel_processor.py,sha256=HpVFEMwPBiP2FRjvGqlaXpjV5S0qP-hxdB_Wdl_lACo,17704
@@ -37,13 +38,14 @@ refchecker/core/refchecker.py,sha256=nX8guDXFL1ZdT-K6KUJT_3iZjuoYsWj4e0rKrqd5VZA
 refchecker/database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 refchecker/database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 refchecker/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-refchecker/llm/base.py,sha256=…
-refchecker/llm/providers.py,sha256=…
+refchecker/llm/base.py,sha256=BhpnUn7nrN8LzAnA8rQuG3zBvNovFYxShk1V9oAHlHU,16248
+refchecker/llm/providers.py,sha256=2pOEre_OH_shgm0b9m3_nVIxyoY-MxhFM5KAP_qKo_Q,39131
 refchecker/scripts/__init__.py,sha256=xJwo6afG8s7S888BK2Bxw2d7FX8aLkbl0l_ZoJOFibE,37
 refchecker/scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,4213
 refchecker/services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
 refchecker/services/pdf_processor.py,sha256=7i5x043qfnyzE5EQmytfy_uPjbeCJp4Ka5OPyH-bwOE,10577
 refchecker/utils/__init__.py,sha256=SKTEQeKpLOFFMIzZiakzctsW9zGe_J7LDNJlygWV6RY,1221
+refchecker/utils/arxiv_rate_limiter.py,sha256=axOv84Ge6q_mJ69lcyAFsCmHx9qXvV1aX71oSaxhnjE,4119
 refchecker/utils/arxiv_utils.py,sha256=C7wqoCy9FZUQpoF92vLeJyrK1-6XoMmmL6u_hfDV3ro,18031
 refchecker/utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
 refchecker/utils/biblatex_parser.py,sha256=IKRUMtRsjdXIktyk9XGArt_ms0asmqP549uhFvvumuE,25581
@@ -54,11 +56,11 @@ refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,
 refchecker/utils/doi_utils.py,sha256=_7YvQ0DTOQBMIujUE0SdJicjPiAR3VETLU668GIji24,6094
 refchecker/utils/error_utils.py,sha256=8TcfRUD6phZ7viPJrezQ4jKf_vE65lqEXZq5707eU6s,15425
 refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-refchecker/utils/text_utils.py,sha256=…
+refchecker/utils/text_utils.py,sha256=Tx1k0SqS1cmw4N9BDJY-Ipep2T-HMmKPqi4SMcq1ZJ8,235751
 refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
-academic_refchecker-2.0.11.dist-info/METADATA,sha256=…
-academic_refchecker-2.0.11.dist-info/WHEEL,sha256=…
-academic_refchecker-2.0.11.dist-info/entry_points.txt,sha256=…
-academic_refchecker-2.0.11.dist-info/top_level.txt,sha256=…
-academic_refchecker-2.0.11.dist-info/RECORD,,
+academic_refchecker-2.0.13.dist-info/METADATA,sha256=N6lsqdFWT6K34WNLqA_W0MO3WB2BEFjx_57jEdyHYes,26611
+academic_refchecker-2.0.13.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+academic_refchecker-2.0.13.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
+academic_refchecker-2.0.13.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
+academic_refchecker-2.0.13.dist-info/RECORD,,
refchecker/__version__.py
CHANGED
refchecker/checkers/__init__.py
CHANGED
@@ -7,11 +7,13 @@ from .local_semantic_scholar import LocalNonArxivReferenceChecker
 from .enhanced_hybrid_checker import EnhancedHybridReferenceChecker
 from .openalex import OpenAlexReferenceChecker
 from .crossref import CrossRefReferenceChecker
+from .arxiv_citation import ArXivCitationChecker
 
 __all__ = [
     "NonArxivReferenceChecker",
     "LocalNonArxivReferenceChecker",
     "EnhancedHybridReferenceChecker",
     "OpenAlexReferenceChecker",
-    "CrossRefReferenceChecker"
+    "CrossRefReferenceChecker",
+    "ArXivCitationChecker",
 ]
refchecker/checkers/arxiv_citation.py
ADDED
@@ -0,0 +1,460 @@
+#!/usr/bin/env python3
+"""
+ArXiv Citation Checker - Authoritative Source for ArXiv Papers
+
+This module provides functionality to verify ArXiv papers by fetching the official
+BibTeX citation directly from ArXiv. This is used as the authoritative metadata source
+for papers found on ArXiv, as it reflects the author-submitted metadata.
+
+Key features:
+- Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
+- Always uses the latest version metadata (strips version suffixes)
+- Logs warnings when cited version differs from latest version
+- Parses BibTeX to extract normalized metadata matching refchecker schema
+
+Usage:
+    from refchecker.checkers.arxiv_citation import ArXivCitationChecker
+
+    checker = ArXivCitationChecker()
+
+    reference = {
+        'title': 'Attention Is All You Need',
+        'authors': ['Ashish Vaswani', 'Noam Shazeer'],
+        'year': 2017,
+        'url': 'https://arxiv.org/abs/1706.03762v5',
+    }
+
+    verified_data, errors, url = checker.verify_reference(reference)
+"""
+
+import re
+import logging
+import requests
+from typing import Dict, List, Tuple, Optional, Any
+
+import bibtexparser
+from bibtexparser.bparser import BibTexParser
+from bibtexparser.customization import convert_to_unicode
+
+from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
+from refchecker.utils.text_utils import (
+    normalize_text,
+    compare_authors,
+    compare_titles_with_latex_cleaning,
+    strip_latex_commands,
+)
+from refchecker.utils.error_utils import format_title_mismatch, validate_year
+from refchecker.config.settings import get_config
+
+logger = logging.getLogger(__name__)
+
+# Get configuration
+config = get_config()
+SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
+
+
+class ArXivCitationChecker:
+    """
+    Reference checker that uses ArXiv's official BibTeX export as the authoritative source.
+
+    This checker fetches the official BibTeX citation from ArXiv for papers identified
+    by their ArXiv ID. It uses the latest version's metadata as the authoritative source
+    and logs warnings when the cited version differs from the latest.
+    """
+
+    def __init__(self, timeout: int = 30):
+        """
+        Initialize the ArXiv Citation Checker.
+
+        Args:
+            timeout: HTTP request timeout in seconds
+        """
+        self.base_url = "https://arxiv.org/bibtex"
+        self.abs_url = "https://arxiv.org/abs"
+        self.timeout = timeout
+        self.rate_limiter = ArXivRateLimiter.get_instance()
+
+        # Pattern to extract arXiv IDs from various URL formats
+        self.arxiv_id_patterns = [
+            # Standard arxiv.org URLs
+            r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            # Old format with category
+            r'arxiv\.org/abs/([a-z-]+/[0-9]{7})(v\d+)?',
+            r'arxiv\.org/pdf/([a-z-]+/[0-9]{7})(v\d+)?',
+            # arXiv: prefix in text
+            r'arXiv:([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arXiv:([a-z-]+/[0-9]{7})(v\d+)?',
+            # export.arxiv.org URLs
+            r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        ]
+
+    def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Extract ArXiv ID from a reference, returning both the base ID and version.
+
+        Args:
+            reference: Reference dictionary containing url, raw_text, etc.
+
+        Returns:
+            Tuple of (arxiv_id_without_version, version_string_or_None)
+            For example: ("2301.12345", "v2") or ("2301.12345", None)
+        """
+        # Sources to check for ArXiv ID
+        sources = [
+            reference.get('url', ''),
+            reference.get('cited_url', ''),
+            reference.get('raw_text', ''),
+            reference.get('eprint', ''),  # BibTeX field
+        ]
+
+        for source in sources:
+            if not source:
+                continue
+
+            for pattern in self.arxiv_id_patterns:
+                match = re.search(pattern, source, re.IGNORECASE)
+                if match:
+                    arxiv_id = match.group(1)
+                    version = match.group(2) if len(match.groups()) > 1 else None
+                    logger.debug(f"Extracted ArXiv ID: {arxiv_id}, version: {version}")
+                    return arxiv_id, version
+
+        return None, None
+
+    def fetch_bibtex(self, arxiv_id: str) -> Optional[str]:
+        """
+        Fetch the official BibTeX citation from ArXiv.
+
+        This always fetches the latest version's BibTeX (ArXiv default behavior).
+
+        Args:
+            arxiv_id: ArXiv ID without version suffix (e.g., "2301.12345")
+
+        Returns:
+            BibTeX string or None if fetch failed
+        """
+        url = f"{self.base_url}/{arxiv_id}"
+
+        # Wait for rate limit
+        self.rate_limiter.wait()
+
+        try:
+            logger.debug(f"Fetching ArXiv BibTeX from: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            bibtex_content = response.text.strip()
+
+            # Validate it looks like BibTeX
+            if bibtex_content and bibtex_content.startswith('@'):
+                logger.debug(f"Successfully fetched BibTeX for ArXiv paper {arxiv_id}")
+                return bibtex_content
+            else:
+                logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
+                return None
+
+        except requests.exceptions.Timeout:
+            logger.warning(f"Timeout fetching ArXiv BibTeX for {arxiv_id}")
+            return None
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Failed to fetch ArXiv BibTeX for {arxiv_id}: {e}")
+            return None
+
+    def parse_bibtex(self, bibtex_str: str) -> Optional[Dict[str, Any]]:
+        """
+        Parse BibTeX string and extract metadata in refchecker schema format.
+
+        Args:
+            bibtex_str: BibTeX content string
+
+        Returns:
+            Dictionary with parsed metadata or None if parsing failed
+        """
+        try:
+            # Configure parser
+            parser = BibTexParser(common_strings=True)
+            parser.customization = convert_to_unicode
+
+            # Parse BibTeX
+            bib_database = bibtexparser.loads(bibtex_str, parser=parser)
+
+            if not bib_database.entries:
+                logger.debug("No entries found in BibTeX")
+                return None
+
+            entry = bib_database.entries[0]
+
+            # Extract and normalize fields
+            title = entry.get('title', '')
+            # Clean title - remove braces used for capitalization protection
+            title = re.sub(r'\{([^}]*)\}', r'\1', title)
+            title = title.strip()
+
+            # Extract authors
+            authors_str = entry.get('author', '')
+            authors = self._parse_authors(authors_str)
+
+            # Extract year - prefer year from eprint ID (original submission) over BibTeX year (latest revision)
+            arxiv_id = entry.get('eprint', '')
+            year = self._extract_year_from_eprint(arxiv_id)
+
+            # Fall back to BibTeX year field if eprint year extraction fails
+            if not year and entry.get('year'):
+                try:
+                    year = int(entry['year'])
+                except ValueError:
+                    pass
+
+            # Build result in refchecker schema format
+            result = {
+                'title': title,
+                'authors': [{'name': author} for author in authors],
+                'year': year,
+                'venue': 'arXiv',
+                'externalIds': {
+                    'ArXiv': arxiv_id,
+                },
+                'url': f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else None,
+                'isOpenAccess': True,
+                'openAccessPdf': {
+                    'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else None
+                },
+                # Store original bibtex for reference
+                '_bibtex_entry': entry,
+                '_source': 'ArXiv BibTeX Reference',
+                '_source_url': f"https://arxiv.org/bibtex/{arxiv_id}" if arxiv_id else None,
+            }
+
+            # Add DOI if present (some ArXiv papers have DOIs)
+            if entry.get('doi'):
+                result['externalIds']['DOI'] = entry['doi']
+
+            logger.debug(f"Parsed ArXiv BibTeX: title='{title[:50]}...', authors={len(authors)}, year={year}")
+            return result
+
+        except Exception as e:
+            logger.warning(f"Failed to parse BibTeX: {e}")
+            return None
+
+    def _parse_authors(self, authors_str: str) -> List[str]:
+        """
+        Parse BibTeX author string into list of author names.
+
+        BibTeX format: "Last1, First1 and Last2, First2 and ..."
+
+        Args:
+            authors_str: BibTeX author field value
+
+        Returns:
+            List of author names in "First Last" format
+        """
+        if not authors_str:
+            return []
+
+        authors = []
+
+        # Split by " and " (BibTeX convention)
+        author_parts = re.split(r'\s+and\s+', authors_str)
+
+        for part in author_parts:
+            part = part.strip()
+            if not part:
+                continue
+
+            # Handle "Last, First" format
+            if ',' in part:
+                parts = part.split(',', 1)
+                if len(parts) == 2:
+                    last = parts[0].strip()
+                    first = parts[1].strip()
+                    # Convert to "First Last" format
+                    name = f"{first} {last}".strip()
+                else:
+                    name = part
+            else:
+                # Already in "First Last" format
+                name = part
+
+            # Clean up the name
+            name = re.sub(r'\s+', ' ', name)  # Normalize whitespace
+            name = re.sub(r'\{([^}]*)\}', r'\1', name)  # Remove braces
+
+            if name:
+                authors.append(name)
+
+        return authors
+
+    def _extract_year_from_eprint(self, eprint: str) -> Optional[int]:
+        """
+        Extract year from ArXiv eprint ID.
+
+        New format (YYMM.NNNNN): First two digits are year
+        Old format (cat-name/YYMMNNN): Digits after slash, first two are year
+
+        Args:
+            eprint: ArXiv eprint ID
+
+        Returns:
+            Year as integer or None
+        """
+        if not eprint:
+            return None
+
+        # New format: 2301.12345
+        match = re.match(r'^(\d{2})\d{2}\.\d{4,5}', eprint)
+        if match:
+            yy = int(match.group(1))
+            # ArXiv started in 1991, new format started in 2007
+            if yy >= 7:
+                return 2000 + yy
+            else:
+                # Very early 2000s papers (unlikely in new format)
+                return 2000 + yy
+
+        # Old format: hep-th/9901001
+        match = re.match(r'^[a-z-]+/(\d{2})\d+', eprint)
+        if match:
+            yy = int(match.group(1))
+            if yy >= 91:  # ArXiv started in 1991
+                return 1900 + yy
+            else:
+                return 2000 + yy
+
+        return None
+
+    def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
+        """
+        Get the latest version number for an ArXiv paper.
+
+        Note: This requires fetching the abstract page, so it's optional.
+        For now, we rely on the BibTeX always returning latest version metadata.
+
+        Args:
+            arxiv_id: ArXiv ID without version
+
+        Returns:
+            Latest version string (e.g., "v3") or None if couldn't determine
+        """
+        # The BibTeX endpoint always returns the latest version's metadata,
+        # so we don't need to explicitly fetch version info
+        return None
+
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
+        """
+        Check if a reference is an ArXiv paper.
+
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference using ArXiv's official BibTeX as authoritative source.
+
+        This method:
+        1. Extracts the ArXiv ID from the reference
+        2. Fetches the official BibTeX from ArXiv (always latest version)
+        3. Parses the BibTeX to get authoritative metadata
+        4. Compares cited metadata against authoritative source
+        5. Logs warnings for version mismatches
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, url)
+            - verified_data: Authoritative paper metadata from ArXiv or None
+            - errors: List of error/warning dictionaries
+            - url: ArXiv URL for the paper
+        """
+        errors = []
+
+        # Extract ArXiv ID
+        arxiv_id, cited_version = self.extract_arxiv_id(reference)
+
+        if not arxiv_id:
+            logger.debug("ArXivCitationChecker: No ArXiv ID found in reference")
+            return None, [], None
+
+        logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
+
+        # Fetch authoritative BibTeX
+        bibtex_content = self.fetch_bibtex(arxiv_id)
+
+        if not bibtex_content:
+            logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
+            return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
+
+        # Parse BibTeX
+        verified_data = self.parse_bibtex(bibtex_content)
+
+        if not verified_data:
+            logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
+            return None, [], None
+
+        # Log version mismatch warning if cited version differs from latest
+        if cited_version:
+            # ArXiv BibTeX always returns latest version metadata
+            # We don't know the actual latest version number without additional API call,
+            # but we can warn that a specific version was cited
+            errors.append({
+                'warning_type': 'version',
+                'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
+            })
+            logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
+
+        # Compare title
+        cited_title = reference.get('title', '').strip()
+        authoritative_title = verified_data.get('title', '').strip()
+
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+
+            if title_similarity < SIMILARITY_THRESHOLD:
+                clean_cited_title = strip_latex_commands(cited_title)
+                errors.append({
+                    'error_type': 'title',
+                    'error_details': format_title_mismatch(clean_cited_title, authoritative_title),
+                    'ref_title_correct': authoritative_title
+                })
+
+        # Compare authors
+        cited_authors = reference.get('authors', [])
+        if cited_authors:
+            authoritative_authors = verified_data.get('authors', [])
+            authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
+
+            if not authors_match:
+                correct_author_names = ', '.join([a.get('name', '') for a in authoritative_authors])
+                errors.append({
+                    'error_type': 'author',
+                    'error_details': author_error,
+                    'ref_authors_correct': correct_author_names
+                })
+
+        # Compare year
+        cited_year = reference.get('year')
+        authoritative_year = verified_data.get('year')
+
+        year_warning = validate_year(
+            cited_year=cited_year,
+            paper_year=authoritative_year,
+            use_flexible_validation=True,
+            context={'arxiv_match': True}
+        )
+        if year_warning:
+            errors.append(year_warning)
+
+        # Build URL
+        paper_url = f"https://arxiv.org/abs/{arxiv_id}"
+
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+
+        return verified_data, errors, paper_url
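
As a quick illustration of the ID extraction and year heuristics above, behavior like the following is implied (a sketch, assuming the package's default config loads at import; _extract_year_from_eprint is the private helper from the diff):

from refchecker.checkers.arxiv_citation import ArXivCitationChecker

checker = ArXivCitationChecker()

# Versioned URL: the base ID and the cited version come back separately.
print(checker.extract_arxiv_id({'url': 'https://arxiv.org/abs/1706.03762v5'}))
# -> ('1706.03762', 'v5')

# The year comes from the eprint prefix, not the latest-revision BibTeX year.
print(checker._extract_year_from_eprint('2301.12345'))      # -> 2023
print(checker._extract_year_from_eprint('hep-th/9901001'))  # -> 1999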
refchecker/checkers/enhanced_hybrid_checker.py
CHANGED
@@ -43,6 +43,7 @@ class EnhancedHybridReferenceChecker:
                  contact_email: Optional[str] = None,
                  enable_openalex: bool = True,
                  enable_crossref: bool = True,
+                 enable_arxiv_citation: bool = True,
                  debug_mode: bool = False):
         """
         Initialize the enhanced hybrid reference checker
@@ -53,11 +54,22 @@ class EnhancedHybridReferenceChecker:
             contact_email: Email for polite pool access to APIs
             enable_openalex: Whether to use OpenAlex API
             enable_crossref: Whether to use CrossRef API
+            enable_arxiv_citation: Whether to use ArXiv Citation checker as authoritative source
             debug_mode: Whether to enable debug logging
         """
         self.contact_email = contact_email
         self.debug_mode = debug_mode
 
+        # Initialize ArXiv Citation checker (authoritative source for ArXiv papers)
+        self.arxiv_citation = None
+        if enable_arxiv_citation:
+            try:
+                from .arxiv_citation import ArXivCitationChecker
+                self.arxiv_citation = ArXivCitationChecker()
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker initialized")
+            except Exception as e:
+                logger.warning(f"Enhanced Hybrid: Failed to initialize ArXiv Citation checker: {e}")
+
         # Initialize local database checker if available
         self.local_db = None
         if db_path:
@@ -112,6 +124,7 @@ class EnhancedHybridReferenceChecker:
 
         # Track API performance for adaptive selection
         self.api_stats = {
+            'arxiv_citation': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
@@ -276,6 +289,17 @@ class EnhancedHybridReferenceChecker:
 
         # PHASE 1: Try all APIs once in priority order
 
+        # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
+        # This fetches the official BibTeX from ArXiv which is the author-submitted metadata
+        if self.arxiv_citation and self.arxiv_citation.is_arxiv_reference(reference):
+            logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
+            verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
+            if success:
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded as authoritative source")
+                return verified_data, errors, url
+            if failure_type in ['throttled', 'timeout', 'server_error']:
+                failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
+
         # Strategy 1: Always try local database first (fastest)
         if self.local_db:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
refchecker/config/settings.py
CHANGED
@@ -22,6 +22,14 @@ DEFAULT_CONFIG = {
         "timeout": 30,
     },
 
+    "arxiv_citation": {
+        "base_url": "https://arxiv.org/bibtex",
+        "rate_limit_delay": 3.0,  # Share rate limiting with other ArXiv endpoints
+        "timeout": 30,
+        "use_as_authoritative": True,  # Use ArXiv BibTeX as authoritative source
+        "enabled": True,  # Enable ArXiv citation checker in hybrid checker
+    },
+
     # Processing Settings
     "processing": {
         "max_papers": 50,
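
The new block is reachable through the existing accessor; a sketch, assuming get_config() returns the merged DEFAULT_CONFIG dict as the import in arxiv_citation.py implies:

from refchecker.config.settings import get_config

arxiv_cfg = get_config()["arxiv_citation"]
print(arxiv_cfg["base_url"])          # https://arxiv.org/bibtex
print(arxiv_cfg["rate_limit_delay"])  # 3.0 (the shared ArXiv politeness delay)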
refchecker/llm/base.py
CHANGED
@@ -110,21 +110,7 @@ class LLMProvider(ABC):
 
         logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
         return chunks
-
-    def _parse_llm_response(self, response_text: str) -> List[str]:
-        """Parse LLM response and extract individual references"""
-        if not response_text:
-            return []
-
-        # Split by newlines and filter out empty lines
-        references = []
-        for line in response_text.strip().split('\n'):
-            line = line.strip()
-            if line and not line.startswith('#') and len(line) > 10:  # Basic filtering
-                references.append(line)
-
-        return references
-
+
     def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
         """
         Template method that handles chunking for all providers.
refchecker/llm/providers.py
CHANGED
@@ -62,51 +62,25 @@ class LLMProviderMixin:
         """Create prompt for reference extraction"""
         # Clean BibTeX formatting before sending to LLM
         cleaned_bibliography = self._clean_bibtex_for_llm(bibliography_text)
-
-        return f"""
-…
-4. …
-…
-   - SPECIAL CASE for collaborations: Handle "Last, First and others" pattern correctly
-     * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
-     * author = {"Smith, John and others"} → ONE explicit author plus et al: "John Smith*et al"
-     * The "Last, First and others" pattern indicates a collaboration paper where only the first author is listed explicitly
-   - EXAMPLES:
-     * author = {"Dolan, Brian P."} → ONE author: "Dolan, Brian P."
-     * author = {"Smith, John and Doe, Jane"} → TWO authors: "Smith, John*Doe, Jane"
-     * author = {"Arnab, Anurag and Dehghani, Mostafa and Heigold, Georg"} → THREE authors: "Arnab, Anurag*Dehghani, Mostafa*Heigold, Georg"
-     * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
-   - Use asterisks (*) to separate individual authors in your output
-   - For "Last, First" format, convert to "First Last" for readability (e.g., "Smith, John" → "John Smith")
-   - If a BibTeX entry has NO author field, output an empty author field (nothing before the first #)
-   - Do NOT infer or guess authors based on title or context - only use what is explicitly stated
-7. CRITICAL: When extracting authors, preserve "et al" and similar indicators exactly as they appear
-   - If the original says "John Smith, Jane Doe, et al" then output "John Smith, Jane Doe, et al"
-   - If the original says "John Smith et al." then output "John Smith et al."
-   - Also preserve variations like "and others", "etc.", "..." when used to indicate additional authors
-   - Do NOT expand "et al" into individual author names, even if you know them
-8. Return ONLY the references, one per line
-9. Do not include reference numbers like [1], [2], etc. in your output
-10. Do not add any additional text or explanations
-11. Ensure that URLs and DOIs are from the specific reference only
-    - When extracting URLs, preserve the complete URL including protocol
-    - For BibTeX howpublished fields, extract the full URL from the field value
-12. When parsing multi-line references, combine all authors from all lines before the title
-13. CRITICAL: If the text contains no valid bibliographic references (e.g., only figures, appendix material, or explanatory text), simply return nothing - do NOT explain why you cannot extract references
+
+        return f"""OUTPUT FORMAT (MANDATORY):
+- Each line must be: Author1*Author2#Title#Venue#Year#URL
+- Use # between fields, * between authors
+- One reference per line
+- NO other text allowed - no explanations, descriptions, or commentary
+- If no valid references exist, return NOTHING (completely empty response)
+
+EXTRACTION RULES:
+1. Split by numbered markers [1], [2], etc. - references may span multiple lines
+2. Extract: authors, title, venue (journal/booktitle), year, URLs/DOIs
+3. For BibTeX: "title" field = paper title, "journal"/"booktitle" = venue
+4. Handle author formats:
+   - "Last, First and others" → "First Last*et al"
+   - "Last, First" → "First Last"
+   - Separate multiple authors with *
+   - Preserve "et al" exactly as written
+5. Skip entries that are only URLs without bibliographic data
+6. If no author field exists, start with # (empty author)
 
 Bibliography text:
 {cleaned_bibliography}
@@ -116,67 +90,120 @@ Bibliography text:
         """Parse LLM response into list of references"""
         if not content:
             return []
-
+
         # Ensure content is a string
         if not isinstance(content, str):
             content = str(content)
-
+
         # Clean the content - remove leading/trailing whitespace
         content = content.strip()
-
+
+        # Early check: if no # delimiters at all, likely all prose/explanatory text
+        if '#' not in content:
+            logger.warning("LLM response contains no structured references (no # delimiters found)")
+            return []
+
         # Split by double newlines first to handle paragraph-style formatting
         # then fall back to single newlines
         references = []
-
+
        # Try double newline splitting first (paragraph style)
         if '\n\n' in content:
             potential_refs = content.split('\n\n')
         else:
             # Fall back to single newline splitting
             potential_refs = content.split('\n')
-
+
+        import re
+
+        # Common prose patterns that indicate explanatory text
+        prose_starters = (
+            'this ', 'the ', 'i ', 'looking ', 'based on', 'it ',
+            'there ', 'these ', 'here ', 'note', 'please ', 'however',
+            'unfortunately', 'appears to', 'contains', 'following',
+            'above', 'below', 'after', 'before', 'when ', 'if ',
+            'as ', 'for ', 'from ', 'with ', 'without ', 'although'
+        )
+
         for ref in potential_refs:
             ref = ref.strip()
-
-            # Skip empty lines
+
+            # Skip empty lines
             if not ref:
                 continue
-
-
-            if ref.startswith('#'):
-                continue
-            if 'extracted from the bibliography' in ref.lower():
+
+            # Skip lines starting with # (markdown headers or empty author field without title)
+            if ref.startswith('#') and not re.match(r'^#[^#]', ref):
                 continue
-
+
+            # Check for prose/explanatory text patterns
+            ref_lower = ref.lower()
+
+            # Skip common explanatory headers
+            if ref_lower.startswith(('reference', 'here are', 'below are', 'extracted', 'bibliography')):
                 continue
+
             # Skip verbose LLM explanatory responses
-
+            skip_patterns = [
+                'extracted from the bibliography',
+                'formatted as a complete',
+                'cannot extract',
+                'appears to be from',
+                'no numbered reference markers',
+                'only figures',
+                'i cannot',
+                'i return nothing',
+                'return nothing',
+                'no valid bibliographic',
+                'numbered format specified',
+                'it contains',
+                'it does not contain',
+                'text appears to be',
+                'does not appear to contain',
+                'no references found',
+                'empty response',
+                'no bibliography',
+                'no actual bibliographic',
+                'no academic references',
+                'contains only numerical',
+                'data tables',
+                'evaluation rubric',
+                'publication metadata',
+                'citable sources',
+                'reference list',
+            ]
+            if any(pattern in ref_lower for pattern in skip_patterns):
                 continue
-
+
+            # Skip lines starting with common prose patterns
+            if ref_lower.startswith(prose_starters):
                 continue
-            if '…
+            if ref_lower.startswith('looking at'):
                 continue
-            if …
+            if ref_lower.startswith('since there are'):
                 continue
-
+
+            # Key structural check: valid references MUST have # delimiters
+            if '#' not in ref:
+                # No delimiter = not a valid reference, skip it
+                logger.debug(f"Skipping line without # delimiter: {ref[:80]}...")
                 continue
-
+
             # Remove common prefixes (bullets, numbers, etc.)
             ref = ref.lstrip('- *•')
             ref = ref.strip()
-
+
             # Remove reference numbers like "1.", "[1]", "(1)" from the beginning
-            import re
             ref = re.sub(r'^(\d+\.|\[\d+\]|\(\d+\))\s*', '', ref)
-
+
             # Filter out very short lines (likely not complete references)
-            if len(ref) > 30:  # …
+            if len(ref) > 30:  # Minimum length for academic references
                 references.append(ref)
-
+
         return references
 
 
-class OpenAIProvider(LLMProvider, LLMProviderMixin):
+class OpenAIProvider(LLMProviderMixin, LLMProvider):
     """OpenAI GPT provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -197,10 +224,6 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual OpenAI API call and return the response text"""
         try:
@@ -220,7 +243,7 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
             raise
 
 
-class AnthropicProvider(LLMProvider, LLMProviderMixin):
+class AnthropicProvider(LLMProviderMixin, LLMProvider):
     """Anthropic Claude provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -241,10 +264,6 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual Anthropic API call and return the response text"""
         try:
@@ -252,6 +271,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
                 model=self.model or "claude-sonnet-4-20250514",
                 max_tokens=self.max_tokens,
                 temperature=self.temperature,
+                system="You are a bibliographic reference extractor. You output ONLY structured reference data in the exact format specified. Never explain, describe, or comment on the input. Never output prose or sentences. If input contains no extractable references, return a completely empty response with no text.",
                 messages=[
                     {"role": "user", "content": prompt}
                 ]
@@ -281,7 +301,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
             raise
 
 
-class GoogleProvider(LLMProvider, LLMProviderMixin):
+class GoogleProvider(LLMProviderMixin, LLMProvider):
     """Google Gemini provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -303,10 +323,6 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual Google API call and return the response text"""
         try:
@@ -341,7 +357,7 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
             raise
 
 
-class AzureProvider(LLMProvider, LLMProviderMixin):
+class AzureProvider(LLMProviderMixin, LLMProvider):
     """Azure OpenAI provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -375,10 +391,6 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual Azure OpenAI API call and return the response text"""
         try:
@@ -397,7 +409,7 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
             logger.error(f"Azure API call failed: {e}")
             raise
 
-class vLLMProvider(LLMProvider, LLMProviderMixin):
+class vLLMProvider(LLMProviderMixin, LLMProvider):
     """vLLM provider using OpenAI-compatible server mode for local Hugging Face models"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -838,10 +850,6 @@ class vLLMProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
        return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
    def _call_llm(self, prompt: str) -> str:
         """Make the actual vLLM API call and return the response text"""
         try:
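
The rewritten prompt pins output to one Author1*Author2#Title#Venue#Year#URL line per reference, which is exactly what the stricter parser above keys on. A hypothetical splitter (not part of the package) shows how such a line decomposes:

def split_reference_line(line: str) -> dict:
    # Fields are #-separated; authors are *-separated within the first field.
    fields = (line.split('#') + [''] * 5)[:5]
    return {
        'authors': fields[0].split('*') if fields[0] else [],
        'title': fields[1],
        'venue': fields[2],
        'year': fields[3],
        'url': fields[4],
    }

line = "Ashish Vaswani*Noam Shazeer#Attention Is All You Need#arXiv#2017#https://arxiv.org/abs/1706.03762"
print(split_reference_line(line)['authors'])  # ['Ashish Vaswani', 'Noam Shazeer']

Lines without any # delimiter are rejected outright, which is why prose replies from the model never survive parsing.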
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared ArXiv Rate Limiter utility.
|
|
3
|
+
|
|
4
|
+
ArXiv requests a polite delay of 3 seconds between requests.
|
|
5
|
+
This module provides a centralized rate limiter to coordinate all ArXiv API calls
|
|
6
|
+
across different checkers and utilities.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
|
|
10
|
+
|
|
11
|
+
# Get the shared limiter instance
|
|
12
|
+
limiter = ArXivRateLimiter.get_instance()
|
|
13
|
+
|
|
14
|
+
# Wait for rate limit before making a request
|
|
15
|
+
limiter.wait()
|
|
16
|
+
|
|
17
|
+
# Then make your request
|
|
18
|
+
response = requests.get(arxiv_url)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import time
|
|
22
|
+
import threading
|
|
23
|
+
import logging
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ArXivRateLimiter:
|
|
30
|
+
"""
|
|
31
|
+
Singleton rate limiter for ArXiv API requests.
|
|
32
|
+
|
|
33
|
+
ArXiv requests a minimum of 3 seconds between requests for polite access.
|
|
34
|
+
This class ensures all ArXiv API calls from any part of refchecker
|
|
35
|
+
are properly rate limited.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
_instance: Optional['ArXivRateLimiter'] = None
|
|
39
|
+
_lock = threading.Lock()
|
|
40
|
+
|
|
41
|
+
# ArXiv recommends at least 3 seconds between requests
|
|
42
|
+
DEFAULT_DELAY = 3.0
|
|
43
|
+
|
|
44
|
+
def __init__(self):
|
|
45
|
+
"""Initialize the rate limiter (use get_instance() instead of direct construction)."""
|
|
46
|
+
self._last_request_time: float = 0.0
|
|
47
|
+
self._request_lock = threading.Lock()
|
|
48
|
+
self._delay: float = self.DEFAULT_DELAY
|
|
49
|
+
|
|
50
|
+
@classmethod
|
|
51
|
+
def get_instance(cls) -> 'ArXivRateLimiter':
|
|
52
|
+
"""
|
|
53
|
+
Get the singleton instance of the ArXiv rate limiter.
|
|
54
|
+
|
|
55
|
+
Returns:
|
|
56
|
+
The shared ArXivRateLimiter instance
|
|
57
|
+
"""
|
|
58
|
+
if cls._instance is None:
|
|
59
|
+
with cls._lock:
|
|
60
|
+
# Double-check locking pattern
|
|
61
|
+
if cls._instance is None:
|
|
62
|
+
cls._instance = cls()
|
|
63
|
+
return cls._instance
|
|
64
|
+
|
|
65
|
+
@classmethod
|
|
66
|
+
def reset_instance(cls) -> None:
|
|
67
|
+
"""
|
|
68
|
+
Reset the singleton instance (primarily for testing).
|
|
69
|
+
"""
|
|
70
|
+
with cls._lock:
|
|
71
|
+
cls._instance = None
|
|
72
|
+
|
|
73
|
+
@property
|
|
74
|
+
def delay(self) -> float:
|
|
75
|
+
"""Get the current delay between requests in seconds."""
|
|
76
|
+
return self._delay
|
|
77
|
+
|
|
78
|
+
@delay.setter
|
|
79
|
+
def delay(self, value: float) -> None:
|
|
80
|
+
"""
|
|
81
|
+
Set the delay between requests.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
value: Delay in seconds (minimum 0.5 seconds enforced)
|
|
85
|
+
"""
|
|
86
|
+
self._delay = max(0.5, value)
|
|
87
|
+
|
|
88
|
+
def wait(self) -> float:
|
|
89
|
+
"""
|
|
90
|
+
Wait for the rate limit before making a request.
|
|
91
|
+
|
|
92
|
+
This method blocks until the required time has passed since the last request.
|
|
93
|
+
It is thread-safe and can be called from multiple threads simultaneously.
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
The actual time waited in seconds (0 if no wait was needed)
|
|
97
|
+
"""
|
|
98
|
+
with self._request_lock:
|
|
99
|
+
current_time = time.time()
|
|
100
|
+
time_since_last = current_time - self._last_request_time
|
|
101
|
+
|
|
102
|
+
if time_since_last < self._delay:
|
|
103
|
+
wait_time = self._delay - time_since_last
|
|
104
|
+
logger.debug(f"ArXiv rate limiter: waiting {wait_time:.2f}s")
|
|
105
|
+
time.sleep(wait_time)
|
|
106
|
+
else:
|
|
107
|
+
wait_time = 0.0
|
|
108
|
+
|
|
109
|
+
self._last_request_time = time.time()
|
|
110
|
+
return wait_time
|
|
111
|
+
|
|
112
|
+
def mark_request(self) -> None:
|
|
113
|
+
"""
|
|
114
|
+
Mark that a request was just made (without waiting).
|
|
115
|
+
|
|
116
|
+
Use this if you're managing timing externally but still want to
|
|
117
|
+
update the rate limiter's state.
|
|
118
|
+
"""
|
|
119
|
+
with self._request_lock:
|
|
120
|
+
self._last_request_time = time.time()
|
|
121
|
+
|
|
122
|
+
def time_until_next(self) -> float:
|
|
123
|
+
"""
|
|
124
|
+
Get the time remaining until the next request is allowed.
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
Time in seconds until next request (0 if allowed now)
|
|
128
|
+
"""
|
|
129
|
+
with self._request_lock:
|
|
130
|
+
current_time = time.time()
|
|
131
|
+
time_since_last = current_time - self._last_request_time
|
|
132
|
+
remaining = self._delay - time_since_last
|
|
133
|
+
return max(0.0, remaining)
|
refchecker/utils/text_utils.py
CHANGED
@@ -1319,6 +1319,38 @@ def is_name_match(name1: str, name2: str) -> bool:
     # This handles both surname particle normalization effects and standard 3-part names
     def match_initials_with_names(init_parts, name_parts):
         """Helper function to match initials against full names"""
+        # Handle 4-part initials vs 2-part compound surname
+        # e.g., ['M.', 'V.', 'D.', 'Briel'] vs ['Menkes', 'van den Briel']
+        # where "van den" particles are treated as initials "V. D."
+        if len(init_parts) == 4 and len(name_parts) == 2:
+            # Check if first 3 parts are initials and last is surname
+            if (len(init_parts[0].rstrip('.')) == 1 and
+                    len(init_parts[1].rstrip('.')) == 1 and
+                    len(init_parts[2].rstrip('.')) == 1 and
+                    len(init_parts[3]) > 1 and
+                    len(name_parts[0]) > 1 and len(name_parts[1]) > 1):
+
+                first_initial = init_parts[0].rstrip('.')
+                second_initial = init_parts[1].rstrip('.')
+                third_initial = init_parts[2].rstrip('.')
+                last_name = init_parts[3]
+                first_name = name_parts[0]
+                compound_last = name_parts[1]
+
+                # Extract parts from compound lastname (e.g., "van den Briel" -> ["van", "den", "Briel"])
+                compound_parts = compound_last.split()
+                if len(compound_parts) >= 3:
+                    # compound_parts = ["van", "den", "Briel"]
+                    particle1 = compound_parts[0]
+                    particle2 = compound_parts[1]
+                    actual_last = compound_parts[-1]
+
+                    if (last_name == actual_last and
+                            first_initial == first_name[0] and
+                            second_initial == particle1[0] and
+                            third_initial == particle2[0]):
+                        return True
+
         if len(init_parts) == 3 and len(name_parts) == 2:
             # After surname particle normalization: ['g.', 'v.', 'horn'] vs ['grant', 'van horn']
             if (len(init_parts[0].rstrip('.')) == 1 and len(init_parts[1].rstrip('.')) == 1 and len(init_parts[2]) > 1 and
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/top_level.txt
RENAMED
File without changes