academic-refchecker 2.0.12__py3-none-any.whl → 2.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/METADATA +1 -1
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/RECORD +17 -17
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/WHEEL +1 -1
- backend/main.py +33 -5
- backend/refchecker_wrapper.py +42 -1
- backend/thumbnail.py +117 -0
- refchecker/__version__.py +1 -1
- refchecker/checkers/arxiv_citation.py +181 -49
- refchecker/checkers/enhanced_hybrid_checker.py +117 -4
- refchecker/checkers/semantic_scholar.py +43 -1
- refchecker/llm/base.py +1 -15
- refchecker/llm/providers.py +102 -113
- refchecker/utils/author_utils.py +15 -2
- refchecker/utils/bibliography_utils.py +2 -2
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/top_level.txt +0 -0
refchecker/llm/providers.py
CHANGED
|
@@ -62,52 +62,25 @@ class LLMProviderMixin:
|
|
|
62
62
|
"""Create prompt for reference extraction"""
|
|
63
63
|
# Clean BibTeX formatting before sending to LLM
|
|
64
64
|
cleaned_bibliography = self._clean_bibtex_for_llm(bibliography_text)
|
|
65
|
-
|
|
66
|
-
return f"""
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
4.
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
- SPECIAL CASE for collaborations: Handle "Last, First and others" pattern correctly
|
|
85
|
-
* author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
|
|
86
|
-
* author = {"Smith, John and others"} → ONE explicit author plus et al: "John Smith*et al"
|
|
87
|
-
* The "Last, First and others" pattern indicates a collaboration paper where only the first author is listed explicitly
|
|
88
|
-
- EXAMPLES:
|
|
89
|
-
* author = {"Dolan, Brian P."} → ONE author: "Dolan, Brian P."
|
|
90
|
-
* author = {"Smith, John and Doe, Jane"} → TWO authors: "Smith, John*Doe, Jane"
|
|
91
|
-
* author = {"Arnab, Anurag and Dehghani, Mostafa and Heigold, Georg"} → THREE authors: "Arnab, Anurag*Dehghani, Mostafa*Heigold, Georg"
|
|
92
|
-
* author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
|
|
93
|
-
- Use asterisks (*) to separate individual authors in your output
|
|
94
|
-
- For "Last, First" format, convert to "First Last" for readability (e.g., "Smith, John" → "John Smith")
|
|
95
|
-
- If a BibTeX entry has NO author field, output an empty author field (nothing before the first #)
|
|
96
|
-
- Do NOT infer or guess authors based on title or context - only use what is explicitly stated
|
|
97
|
-
7. CRITICAL: When extracting authors, preserve "et al" and similar indicators exactly as they appear
|
|
98
|
-
- If the original says "John Smith, Jane Doe, et al" then output "John Smith, Jane Doe, et al"
|
|
99
|
-
- If the original says "John Smith et al." then output "John Smith et al."
|
|
100
|
-
- Also preserve variations like "and others", "etc.", "..." when used to indicate additional authors
|
|
101
|
-
- Do NOT expand "et al" into individual author names, even if you know them
|
|
102
|
-
8. Return ONLY the references, one per line
|
|
103
|
-
9. Do not include reference numbers like [1], [2], etc. in your output
|
|
104
|
-
10. Do not add any additional text or explanations
|
|
105
|
-
11. Ensure that URLs and DOIs are from the specific reference only
|
|
106
|
-
- When extracting URLs, preserve the complete URL including protocol
|
|
107
|
-
- For BibTeX howpublished fields, extract the full URL from the field value
|
|
108
|
-
12. When parsing multi-line references, combine all authors from all lines before the title
|
|
109
|
-
13. CRITICAL: If the text contains no valid bibliographic references (e.g., only figures, appendix material, or explanatory text), return ONLY an empty response with no text at all - do NOT explain why, do NOT describe what you see, do NOT say "I return nothing" or similar phrases
|
|
110
|
-
14. OUTPUT FORMAT: Your response must contain ONLY extracted references in the format specified above (Author1*Author2#Title#Venue#Year#URL), one per line. No introductory text, no explanations, no commentary, no "Looking at this text..." statements. If there are no references to extract, output absolutely nothing.
|
|
65
|
+
|
|
66
|
+
return f"""OUTPUT FORMAT (MANDATORY):
|
|
67
|
+
- Each line must be: Author1*Author2#Title#Venue#Year#URL
|
|
68
|
+
- Use # between fields, * between authors
|
|
69
|
+
- One reference per line
|
|
70
|
+
- NO other text allowed - no explanations, descriptions, or commentary
|
|
71
|
+
- If no valid references exist, return NOTHING (completely empty response)
|
|
72
|
+
|
|
73
|
+
EXTRACTION RULES:
|
|
74
|
+
1. Split by numbered markers [1], [2], etc. - references may span multiple lines
|
|
75
|
+
2. Extract: authors, title, venue (journal/booktitle), year, URLs/DOIs
|
|
76
|
+
3. For BibTeX: "title" field = paper title, "journal"/"booktitle" = venue
|
|
77
|
+
4. Handle author formats:
|
|
78
|
+
- "Last, First and others" → "First Last*et al"
|
|
79
|
+
- "Last, First" → "First Last"
|
|
80
|
+
- Separate multiple authors with *
|
|
81
|
+
- Preserve "et al" exactly as written
|
|
82
|
+
5. Skip entries that are only URLs without bibliographic data
|
|
83
|
+
6. If no author field exists, start with # (empty author)
|
|
111
84
|
|
|
112
85
|
Bibliography text:
|
|
113
86
|
{cleaned_bibliography}
|
|
@@ -117,85 +90,120 @@ Bibliography text:
|
|
|
117
90
|
"""Parse LLM response into list of references"""
|
|
118
91
|
if not content:
|
|
119
92
|
return []
|
|
120
|
-
|
|
93
|
+
|
|
121
94
|
# Ensure content is a string
|
|
122
95
|
if not isinstance(content, str):
|
|
123
96
|
content = str(content)
|
|
124
|
-
|
|
97
|
+
|
|
125
98
|
# Clean the content - remove leading/trailing whitespace
|
|
126
99
|
content = content.strip()
|
|
127
|
-
|
|
100
|
+
|
|
101
|
+
# Early check: if no # delimiters at all, likely all prose/explanatory text
|
|
102
|
+
if '#' not in content:
|
|
103
|
+
logger.warning("LLM response contains no structured references (no # delimiters found)")
|
|
104
|
+
return []
|
|
105
|
+
|
|
128
106
|
# Split by double newlines first to handle paragraph-style formatting
|
|
129
107
|
# then fall back to single newlines
|
|
130
108
|
references = []
|
|
131
|
-
|
|
109
|
+
|
|
132
110
|
# Try double newline splitting first (paragraph style)
|
|
133
111
|
if '\n\n' in content:
|
|
134
112
|
potential_refs = content.split('\n\n')
|
|
135
113
|
else:
|
|
136
114
|
# Fall back to single newline splitting
|
|
137
115
|
potential_refs = content.split('\n')
|
|
138
|
-
|
|
116
|
+
|
|
117
|
+
import re
|
|
118
|
+
|
|
119
|
+
# Common prose patterns that indicate explanatory text
|
|
120
|
+
prose_starters = (
|
|
121
|
+
'this ', 'the ', 'i ', 'looking ', 'based on', 'it ',
|
|
122
|
+
'there ', 'these ', 'here ', 'note', 'please ', 'however',
|
|
123
|
+
'unfortunately', 'appears to', 'contains', 'following',
|
|
124
|
+
'above', 'below', 'after', 'before', 'when ', 'if ',
|
|
125
|
+
'as ', 'for ', 'from ', 'with ', 'without ', 'although'
|
|
126
|
+
)
|
|
127
|
+
|
|
139
128
|
for ref in potential_refs:
|
|
140
129
|
ref = ref.strip()
|
|
141
|
-
|
|
142
|
-
# Skip empty lines
|
|
130
|
+
|
|
131
|
+
# Skip empty lines
|
|
143
132
|
if not ref:
|
|
144
133
|
continue
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
if ref.startswith('#'):
|
|
148
|
-
continue
|
|
149
|
-
if 'extracted from the bibliography' in ref.lower():
|
|
134
|
+
|
|
135
|
+
# Skip lines starting with # (markdown headers or empty author field without title)
|
|
136
|
+
if ref.startswith('#') and not re.match(r'^#[^#]', ref):
|
|
150
137
|
continue
|
|
151
|
-
|
|
138
|
+
|
|
139
|
+
# Check for prose/explanatory text patterns
|
|
140
|
+
ref_lower = ref.lower()
|
|
141
|
+
|
|
142
|
+
# Skip common explanatory headers
|
|
143
|
+
if ref_lower.startswith(('reference', 'here are', 'below are', 'extracted', 'bibliography')):
|
|
152
144
|
continue
|
|
145
|
+
|
|
153
146
|
# Skip verbose LLM explanatory responses
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
147
|
+
skip_patterns = [
|
|
148
|
+
'extracted from the bibliography',
|
|
149
|
+
'formatted as a complete',
|
|
150
|
+
'cannot extract',
|
|
151
|
+
'appears to be from',
|
|
152
|
+
'no numbered reference markers',
|
|
153
|
+
'only figures',
|
|
154
|
+
'i cannot',
|
|
155
|
+
'i return nothing',
|
|
156
|
+
'return nothing',
|
|
157
|
+
'no valid bibliographic',
|
|
158
|
+
'numbered format specified',
|
|
159
|
+
'it contains',
|
|
160
|
+
'it does not contain',
|
|
161
|
+
'text appears to be',
|
|
162
|
+
'does not appear to contain',
|
|
163
|
+
'no references found',
|
|
164
|
+
'empty response',
|
|
165
|
+
'no bibliography',
|
|
166
|
+
'no actual bibliographic',
|
|
167
|
+
'no academic references',
|
|
168
|
+
'contains only numerical',
|
|
169
|
+
'data tables',
|
|
170
|
+
'evaluation rubric',
|
|
171
|
+
'publication metadata',
|
|
172
|
+
'citable sources',
|
|
173
|
+
'reference list',
|
|
174
|
+
]
|
|
175
|
+
if any(pattern in ref_lower for pattern in skip_patterns):
|
|
161
176
|
continue
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
if ref.lower().startswith('looking at'):
|
|
166
|
-
continue
|
|
167
|
-
# Skip responses that say "I return nothing" or similar
|
|
168
|
-
if 'i return nothing' in ref.lower() or 'return nothing' in ref.lower():
|
|
169
|
-
continue
|
|
170
|
-
# Skip responses that mention "no valid bibliographic references"
|
|
171
|
-
if 'no valid bibliographic' in ref.lower():
|
|
177
|
+
|
|
178
|
+
# Skip lines starting with common prose patterns
|
|
179
|
+
if ref_lower.startswith(prose_starters):
|
|
172
180
|
continue
|
|
173
|
-
|
|
174
|
-
if ref.lower().startswith('since there are no'):
|
|
181
|
+
if ref_lower.startswith('looking at'):
|
|
175
182
|
continue
|
|
176
|
-
|
|
177
|
-
if 'numbered format specified' in ref.lower():
|
|
183
|
+
if ref_lower.startswith('since there are'):
|
|
178
184
|
continue
|
|
179
|
-
|
|
180
|
-
|
|
185
|
+
|
|
186
|
+
# Key structural check: valid references MUST have # delimiters
|
|
187
|
+
if '#' not in ref:
|
|
188
|
+
# No delimiter = not a valid reference, skip it
|
|
189
|
+
logger.debug(f"Skipping line without # delimiter: {ref[:80]}...")
|
|
181
190
|
continue
|
|
182
|
-
|
|
191
|
+
|
|
183
192
|
# Remove common prefixes (bullets, numbers, etc.)
|
|
184
193
|
ref = ref.lstrip('- *•')
|
|
185
194
|
ref = ref.strip()
|
|
186
|
-
|
|
195
|
+
|
|
187
196
|
# Remove reference numbers like "1.", "[1]", "(1)" from the beginning
|
|
188
|
-
import re
|
|
189
197
|
ref = re.sub(r'^(\d+\.|\[\d+\]|\(\d+\))\s*', '', ref)
|
|
190
|
-
|
|
198
|
+
|
|
191
199
|
# Filter out very short lines (likely not complete references)
|
|
192
|
-
if len(ref) > 30: #
|
|
200
|
+
if len(ref) > 30: # Minimum length for academic references
|
|
193
201
|
references.append(ref)
|
|
194
|
-
|
|
202
|
+
|
|
195
203
|
return references
|
|
196
204
|
|
|
197
205
|
|
|
198
|
-
class OpenAIProvider(LLMProvider, LLMProviderMixin):
|
|
206
|
+
class OpenAIProvider(LLMProviderMixin, LLMProvider):
|
|
199
207
|
"""OpenAI GPT provider for reference extraction"""
|
|
200
208
|
|
|
201
209
|
def __init__(self, config: Dict[str, Any]):
|
|
@@ -216,10 +224,6 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
|
|
|
216
224
|
def extract_references(self, bibliography_text: str) -> List[str]:
|
|
217
225
|
return self.extract_references_with_chunking(bibliography_text)
|
|
218
226
|
|
|
219
|
-
def _create_extraction_prompt(self, bibliography_text: str) -> str:
|
|
220
|
-
"""Create prompt for reference extraction"""
|
|
221
|
-
return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
|
|
222
|
-
|
|
223
227
|
def _call_llm(self, prompt: str) -> str:
|
|
224
228
|
"""Make the actual OpenAI API call and return the response text"""
|
|
225
229
|
try:
|
|
@@ -239,7 +243,7 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
|
|
|
239
243
|
raise
|
|
240
244
|
|
|
241
245
|
|
|
242
|
-
class AnthropicProvider(LLMProvider, LLMProviderMixin):
|
|
246
|
+
class AnthropicProvider(LLMProviderMixin, LLMProvider):
|
|
243
247
|
"""Anthropic Claude provider for reference extraction"""
|
|
244
248
|
|
|
245
249
|
def __init__(self, config: Dict[str, Any]):
|
|
@@ -260,10 +264,6 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
|
|
|
260
264
|
def extract_references(self, bibliography_text: str) -> List[str]:
|
|
261
265
|
return self.extract_references_with_chunking(bibliography_text)
|
|
262
266
|
|
|
263
|
-
def _create_extraction_prompt(self, bibliography_text: str) -> str:
|
|
264
|
-
"""Create prompt for reference extraction"""
|
|
265
|
-
return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
|
|
266
|
-
|
|
267
267
|
def _call_llm(self, prompt: str) -> str:
|
|
268
268
|
"""Make the actual Anthropic API call and return the response text"""
|
|
269
269
|
try:
|
|
@@ -271,6 +271,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
|
|
|
271
271
|
model=self.model or "claude-sonnet-4-20250514",
|
|
272
272
|
max_tokens=self.max_tokens,
|
|
273
273
|
temperature=self.temperature,
|
|
274
|
+
system="You are a bibliographic reference extractor. You output ONLY structured reference data in the exact format specified. Never explain, describe, or comment on the input. Never output prose or sentences. If input contains no extractable references, return a completely empty response with no text.",
|
|
274
275
|
messages=[
|
|
275
276
|
{"role": "user", "content": prompt}
|
|
276
277
|
]
|
|
@@ -300,7 +301,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
|
|
|
300
301
|
raise
|
|
301
302
|
|
|
302
303
|
|
|
303
|
-
class GoogleProvider(LLMProvider, LLMProviderMixin):
|
|
304
|
+
class GoogleProvider(LLMProviderMixin, LLMProvider):
|
|
304
305
|
"""Google Gemini provider for reference extraction"""
|
|
305
306
|
|
|
306
307
|
def __init__(self, config: Dict[str, Any]):
|
|
@@ -322,10 +323,6 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
|
|
|
322
323
|
def extract_references(self, bibliography_text: str) -> List[str]:
|
|
323
324
|
return self.extract_references_with_chunking(bibliography_text)
|
|
324
325
|
|
|
325
|
-
def _create_extraction_prompt(self, bibliography_text: str) -> str:
|
|
326
|
-
"""Create prompt for reference extraction"""
|
|
327
|
-
return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
|
|
328
|
-
|
|
329
326
|
def _call_llm(self, prompt: str) -> str:
|
|
330
327
|
"""Make the actual Google API call and return the response text"""
|
|
331
328
|
try:
|
|
@@ -360,7 +357,7 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
|
|
|
360
357
|
raise
|
|
361
358
|
|
|
362
359
|
|
|
363
|
-
class AzureProvider(LLMProvider, LLMProviderMixin):
|
|
360
|
+
class AzureProvider(LLMProviderMixin, LLMProvider):
|
|
364
361
|
"""Azure OpenAI provider for reference extraction"""
|
|
365
362
|
|
|
366
363
|
def __init__(self, config: Dict[str, Any]):
|
|
@@ -394,10 +391,6 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
|
|
|
394
391
|
def extract_references(self, bibliography_text: str) -> List[str]:
|
|
395
392
|
return self.extract_references_with_chunking(bibliography_text)
|
|
396
393
|
|
|
397
|
-
def _create_extraction_prompt(self, bibliography_text: str) -> str:
|
|
398
|
-
"""Create prompt for reference extraction"""
|
|
399
|
-
return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
|
|
400
|
-
|
|
401
394
|
def _call_llm(self, prompt: str) -> str:
|
|
402
395
|
"""Make the actual Azure OpenAI API call and return the response text"""
|
|
403
396
|
try:
|
|
@@ -416,7 +409,7 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
|
|
|
416
409
|
logger.error(f"Azure API call failed: {e}")
|
|
417
410
|
raise
|
|
418
411
|
|
|
419
|
-
class vLLMProvider(LLMProvider, LLMProviderMixin):
|
|
412
|
+
class vLLMProvider(LLMProviderMixin, LLMProvider):
|
|
420
413
|
"""vLLM provider using OpenAI-compatible server mode for local Hugging Face models"""
|
|
421
414
|
|
|
422
415
|
def __init__(self, config: Dict[str, Any]):
|
|
@@ -857,10 +850,6 @@ class vLLMProvider(LLMProvider, LLMProviderMixin):
|
|
|
857
850
|
def extract_references(self, bibliography_text: str) -> List[str]:
|
|
858
851
|
return self.extract_references_with_chunking(bibliography_text)
|
|
859
852
|
|
|
860
|
-
def _create_extraction_prompt(self, bibliography_text: str) -> str:
|
|
861
|
-
"""Create prompt for reference extraction"""
|
|
862
|
-
return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
|
|
863
|
-
|
|
864
853
|
def _call_llm(self, prompt: str) -> str:
|
|
865
854
|
"""Make the actual vLLM API call and return the response text"""
|
|
866
855
|
try:
|
refchecker/utils/author_utils.py
CHANGED
|
@@ -42,13 +42,26 @@ def compare_authors(cited_authors, correct_authors, threshold=0.8):
|
|
|
42
42
|
Compare two author lists and return similarity metrics
|
|
43
43
|
|
|
44
44
|
Args:
|
|
45
|
-
cited_authors: List of authors as cited
|
|
46
|
-
correct_authors: List of correct authors
|
|
45
|
+
cited_authors: List of authors as cited (can be strings or dicts with 'name' key)
|
|
46
|
+
correct_authors: List of correct authors (can be strings or dicts with 'name' key)
|
|
47
47
|
threshold: Similarity threshold (0-1)
|
|
48
48
|
|
|
49
49
|
Returns:
|
|
50
50
|
Dictionary with comparison results
|
|
51
51
|
"""
|
|
52
|
+
# Normalize author lists to strings (handle dict format from APIs)
|
|
53
|
+
def normalize_author_list(authors):
|
|
54
|
+
result = []
|
|
55
|
+
for a in authors:
|
|
56
|
+
if isinstance(a, dict):
|
|
57
|
+
result.append(a.get('name', str(a)))
|
|
58
|
+
else:
|
|
59
|
+
result.append(str(a))
|
|
60
|
+
return result
|
|
61
|
+
|
|
62
|
+
cited_authors = normalize_author_list(cited_authors) if cited_authors else []
|
|
63
|
+
correct_authors = normalize_author_list(correct_authors) if correct_authors else []
|
|
64
|
+
|
|
52
65
|
if not cited_authors or not correct_authors:
|
|
53
66
|
return {
|
|
54
67
|
'match': False,
|
|
refchecker/utils/bibliography_utils.py
CHANGED
|
@@ -164,8 +164,8 @@ def _parse_bibtex_references(bibliography_text):
|
|
|
164
164
|
Returns:
|
|
165
165
|
List of reference dictionaries
|
|
166
166
|
"""
|
|
167
|
-
from refchecker.utils.bibtex_parser import
|
|
168
|
-
return
|
|
167
|
+
from refchecker.utils.bibtex_parser import parse_bibtex_references
|
|
168
|
+
return parse_bibtex_references(bibliography_text)
|
|
169
169
|
|
|
170
170
|
|
|
171
171
|
def _parse_biblatex_references(bibliography_text):
|
{academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
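{academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/top_level.txt
RENAMED
|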
File without changes
|