academic-refchecker 2.0.12__tar.gz → 2.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {academic_refchecker-2.0.12/academic_refchecker.egg-info → academic_refchecker-2.0.13}/PKG-INFO +1 -1
  2. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13/academic_refchecker.egg-info}/PKG-INFO +1 -1
  3. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/__version__.py +1 -1
  4. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/llm/base.py +1 -15
  5. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/llm/providers.py +102 -113
  6. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/LICENSE +0 -0
  7. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/MANIFEST.in +0 -0
  8. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/README.md +0 -0
  9. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/SOURCES.txt +0 -0
  10. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/dependency_links.txt +0 -0
  11. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/entry_points.txt +0 -0
  12. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/requires.txt +0 -0
  13. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/top_level.txt +0 -0
  14. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/__init__.py +0 -0
  15. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/__main__.py +0 -0
  16. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/cli.py +0 -0
  17. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/concurrency.py +0 -0
  18. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/database.py +0 -0
  19. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/main.py +0 -0
  20. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/models.py +0 -0
  21. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/refchecker_wrapper.py +0 -0
  22. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/static/assets/index-2P6L_39v.css +0 -0
  23. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/static/assets/index-hk21nqxR.js +0 -0
  24. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/static/favicon.svg +0 -0
  25. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/static/index.html +0 -0
  26. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/static/vite.svg +0 -0
  27. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/thumbnail.py +0 -0
  28. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/backend/websocket_manager.py +0 -0
  29. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/pyproject.toml +0 -0
  30. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/requirements.txt +0 -0
  31. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/scripts/download_db.py +0 -0
  32. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/scripts/run_tests.py +0 -0
  33. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/scripts/start_vllm_server.py +0 -0
  34. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/setup.cfg +0 -0
  35. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/__init__.py +0 -0
  36. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/__main__.py +0 -0
  37. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/__init__.py +0 -0
  38. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/arxiv_citation.py +0 -0
  39. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/crossref.py +0 -0
  40. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/enhanced_hybrid_checker.py +0 -0
  41. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/github_checker.py +0 -0
  42. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
  43. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/openalex.py +0 -0
  44. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/openreview_checker.py +0 -0
  45. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
  46. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/semantic_scholar.py +0 -0
  47. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/checkers/webpage_checker.py +0 -0
  48. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/config/__init__.py +0 -0
  49. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/config/logging.conf +0 -0
  50. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/config/settings.py +0 -0
  51. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/core/__init__.py +0 -0
  52. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/core/db_connection_pool.py +0 -0
  53. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/core/parallel_processor.py +0 -0
  54. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/core/refchecker.py +0 -0
  55. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/database/__init__.py +0 -0
  56. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
  57. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/llm/__init__.py +0 -0
  58. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/scripts/__init__.py +0 -0
  59. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/scripts/start_vllm_server.py +0 -0
  60. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/services/__init__.py +0 -0
  61. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/services/pdf_processor.py +0 -0
  62. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/__init__.py +0 -0
  63. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/arxiv_rate_limiter.py +0 -0
  64. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/arxiv_utils.py +0 -0
  65. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/author_utils.py +0 -0
  66. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/biblatex_parser.py +0 -0
  67. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/bibliography_utils.py +0 -0
  68. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/bibtex_parser.py +0 -0
  69. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/config_validator.py +0 -0
  70. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/db_utils.py +0 -0
  71. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/doi_utils.py +0 -0
  72. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/error_utils.py +0 -0
  73. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/mock_objects.py +0 -0
  74. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/text_utils.py +0 -0
  75. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/unicode_utils.py +0 -0
  76. {academic_refchecker-2.0.12 → academic_refchecker-2.0.13}/src/refchecker/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 2.0.12
3
+ Version: 2.0.13
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 2.0.12
3
+ Version: 2.0.13
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "2.0.12"
3
+ __version__ = "2.0.13"
@@ -110,21 +110,7 @@ class LLMProvider(ABC):
110
110
 
111
111
  logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
112
112
  return chunks
113
-
114
- def _parse_llm_response(self, response_text: str) -> List[str]:
115
- """Parse LLM response and extract individual references"""
116
- if not response_text:
117
- return []
118
-
119
- # Split by newlines and filter out empty lines
120
- references = []
121
- for line in response_text.strip().split('\n'):
122
- line = line.strip()
123
- if line and not line.startswith('#') and len(line) > 10: # Basic filtering
124
- references.append(line)
125
-
126
- return references
127
-
113
+
128
114
  def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
129
115
  """
130
116
  Template method that handles chunking for all providers.
@@ -62,52 +62,25 @@ class LLMProviderMixin:
62
62
  """Create prompt for reference extraction"""
63
63
  # Clean BibTeX formatting before sending to LLM
64
64
  cleaned_bibliography = self._clean_bibtex_for_llm(bibliography_text)
65
-
66
- return f"""
67
- Please extract individual references from the following bibliography text. Each reference should be a complete bibliographic entry.
68
-
69
- Instructions:
70
- 1. Split the bibliography into individual references based on numbered markers like [1], [2], etc.
71
- 2. IMPORTANT: References may span multiple lines. A single reference includes everything from one number marker (e.g., [37]) until the next number marker (e.g., [38])
72
- 3. For each reference, extract: authors, title, publication venue, year, and any URLs/DOIs
73
- - For BibTeX entries, extract fields correctly:
74
- * title = the actual paper title from "title" field
75
- * venue = from "journal", "booktitle", "conference" fields
76
- * Do NOT confuse journal names like "arXiv preprint arXiv:1234.5678" with paper titles
77
- 4. Include references that are incomplete, like only author names and titles, but ignore ones that are just a URL without other details
78
- 5. Place a hashmark (#) rather than period between fields of a reference, but asterisks (*) between individual authors
79
- e.g. Author1*Author2*Author3#Title#Venue#Year#URL
80
- 6. CRITICAL: When extracting authors, understand BibTeX author field format correctly
81
- - In BibTeX, the "author" field contains author names separated by " and " (not commas)
82
- - Individual author names may be in "Last, First" format (e.g., "Smith, John")
83
- - Multiple authors are separated by " and " (e.g., "Smith, John and Doe, Jane")
84
- - SPECIAL CASE for collaborations: Handle "Last, First and others" pattern correctly
85
- * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
86
- * author = {"Smith, John and others"} → ONE explicit author plus et al: "John Smith*et al"
87
- * The "Last, First and others" pattern indicates a collaboration paper where only the first author is listed explicitly
88
- - EXAMPLES:
89
- * author = {"Dolan, Brian P."} → ONE author: "Dolan, Brian P."
90
- * author = {"Smith, John and Doe, Jane"} → TWO authors: "Smith, John*Doe, Jane"
91
- * author = {"Arnab, Anurag and Dehghani, Mostafa and Heigold, Georg"} → THREE authors: "Arnab, Anurag*Dehghani, Mostafa*Heigold, Georg"
92
- * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
93
- - Use asterisks (*) to separate individual authors in your output
94
- - For "Last, First" format, convert to "First Last" for readability (e.g., "Smith, John" → "John Smith")
95
- - If a BibTeX entry has NO author field, output an empty author field (nothing before the first #)
96
- - Do NOT infer or guess authors based on title or context - only use what is explicitly stated
97
- 7. CRITICAL: When extracting authors, preserve "et al" and similar indicators exactly as they appear
98
- - If the original says "John Smith, Jane Doe, et al" then output "John Smith, Jane Doe, et al"
99
- - If the original says "John Smith et al." then output "John Smith et al."
100
- - Also preserve variations like "and others", "etc.", "..." when used to indicate additional authors
101
- - Do NOT expand "et al" into individual author names, even if you know them
102
- 8. Return ONLY the references, one per line
103
- 9. Do not include reference numbers like [1], [2], etc. in your output
104
- 10. Do not add any additional text or explanations
105
- 11. Ensure that URLs and DOIs are from the specific reference only
106
- - When extracting URLs, preserve the complete URL including protocol
107
- - For BibTeX howpublished fields, extract the full URL from the field value
108
- 12. When parsing multi-line references, combine all authors from all lines before the title
109
- 13. CRITICAL: If the text contains no valid bibliographic references (e.g., only figures, appendix material, or explanatory text), return ONLY an empty response with no text at all - do NOT explain why, do NOT describe what you see, do NOT say "I return nothing" or similar phrases
110
- 14. OUTPUT FORMAT: Your response must contain ONLY extracted references in the format specified above (Author1*Author2#Title#Venue#Year#URL), one per line. No introductory text, no explanations, no commentary, no "Looking at this text..." statements. If there are no references to extract, output absolutely nothing.
65
+
66
+ return f"""OUTPUT FORMAT (MANDATORY):
67
+ - Each line must be: Author1*Author2#Title#Venue#Year#URL
68
+ - Use # between fields, * between authors
69
+ - One reference per line
70
+ - NO other text allowed - no explanations, descriptions, or commentary
71
+ - If no valid references exist, return NOTHING (completely empty response)
72
+
73
+ EXTRACTION RULES:
74
+ 1. Split by numbered markers [1], [2], etc. - references may span multiple lines
75
+ 2. Extract: authors, title, venue (journal/booktitle), year, URLs/DOIs
76
+ 3. For BibTeX: "title" field = paper title, "journal"/"booktitle" = venue
77
+ 4. Handle author formats:
78
+ - "Last, First and others" → "First Last*et al"
79
+ - "Last, First" → "First Last"
80
+ - Separate multiple authors with *
81
+ - Preserve "et al" exactly as written
82
+ 5. Skip entries that are only URLs without bibliographic data
83
+ 6. If no author field exists, start with # (empty author)
111
84
 
112
85
  Bibliography text:
113
86
  {cleaned_bibliography}
@@ -117,85 +90,120 @@ Bibliography text:
117
90
  """Parse LLM response into list of references"""
118
91
  if not content:
119
92
  return []
120
-
93
+
121
94
  # Ensure content is a string
122
95
  if not isinstance(content, str):
123
96
  content = str(content)
124
-
97
+
125
98
  # Clean the content - remove leading/trailing whitespace
126
99
  content = content.strip()
127
-
100
+
101
+ # Early check: if no # delimiters at all, likely all prose/explanatory text
102
+ if '#' not in content:
103
+ logger.warning("LLM response contains no structured references (no # delimiters found)")
104
+ return []
105
+
128
106
  # Split by double newlines first to handle paragraph-style formatting
129
107
  # then fall back to single newlines
130
108
  references = []
131
-
109
+
132
110
  # Try double newline splitting first (paragraph style)
133
111
  if '\n\n' in content:
134
112
  potential_refs = content.split('\n\n')
135
113
  else:
136
114
  # Fall back to single newline splitting
137
115
  potential_refs = content.split('\n')
138
-
116
+
117
+ import re
118
+
119
+ # Common prose patterns that indicate explanatory text
120
+ prose_starters = (
121
+ 'this ', 'the ', 'i ', 'looking ', 'based on', 'it ',
122
+ 'there ', 'these ', 'here ', 'note', 'please ', 'however',
123
+ 'unfortunately', 'appears to', 'contains', 'following',
124
+ 'above', 'below', 'after', 'before', 'when ', 'if ',
125
+ 'as ', 'for ', 'from ', 'with ', 'without ', 'although'
126
+ )
127
+
139
128
  for ref in potential_refs:
140
129
  ref = ref.strip()
141
-
142
- # Skip empty lines, headers, and explanatory text
130
+
131
+ # Skip empty lines
143
132
  if not ref:
144
133
  continue
145
- if ref.lower().startswith(('reference', 'here are', 'below are', 'extracted', 'bibliography')):
146
- continue
147
- if ref.startswith('#'):
148
- continue
149
- if 'extracted from the bibliography' in ref.lower():
134
+
135
+ # Skip lines starting with # (markdown headers or empty author field without title)
136
+ if ref.startswith('#') and not re.match(r'^#[^#]', ref):
150
137
  continue
151
- if 'formatted as a complete' in ref.lower():
138
+
139
+ # Check for prose/explanatory text patterns
140
+ ref_lower = ref.lower()
141
+
142
+ # Skip common explanatory headers
143
+ if ref_lower.startswith(('reference', 'here are', 'below are', 'extracted', 'bibliography')):
152
144
  continue
145
+
153
146
  # Skip verbose LLM explanatory responses
154
- if 'cannot extract' in ref.lower() and ('references' in ref.lower() or 'bibliographic' in ref.lower()):
155
- continue
156
- if 'appears to be from' in ref.lower() and 'appendix' in ref.lower():
157
- continue
158
- if 'no numbered reference markers' in ref.lower():
159
- continue
160
- if 'only figures' in ref.lower() and 'learning curves' in ref.lower():
147
+ skip_patterns = [
148
+ 'extracted from the bibliography',
149
+ 'formatted as a complete',
150
+ 'cannot extract',
151
+ 'appears to be from',
152
+ 'no numbered reference markers',
153
+ 'only figures',
154
+ 'i cannot',
155
+ 'i return nothing',
156
+ 'return nothing',
157
+ 'no valid bibliographic',
158
+ 'numbered format specified',
159
+ 'it contains',
160
+ 'it does not contain',
161
+ 'text appears to be',
162
+ 'does not appear to contain',
163
+ 'no references found',
164
+ 'empty response',
165
+ 'no bibliography',
166
+ 'no actual bibliographic',
167
+ 'no academic references',
168
+ 'contains only numerical',
169
+ 'data tables',
170
+ 'evaluation rubric',
171
+ 'publication metadata',
172
+ 'citable sources',
173
+ 'reference list',
174
+ ]
175
+ if any(pattern in ref_lower for pattern in skip_patterns):
161
176
  continue
162
- if ref.lower().startswith('i cannot'):
163
- continue
164
- # Skip "Looking at this text..." explanatory responses
165
- if ref.lower().startswith('looking at'):
166
- continue
167
- # Skip responses that say "I return nothing" or similar
168
- if 'i return nothing' in ref.lower() or 'return nothing' in ref.lower():
169
- continue
170
- # Skip responses that mention "no valid bibliographic references"
171
- if 'no valid bibliographic' in ref.lower():
177
+
178
+ # Skip lines starting with common prose patterns
179
+ if ref_lower.startswith(prose_starters):
172
180
  continue
173
- # Skip responses that say "Since there are no"
174
- if ref.lower().startswith('since there are no'):
181
+ if ref_lower.startswith('looking at'):
175
182
  continue
176
- # Skip responses that mention "numbered format specified"
177
- if 'numbered format specified' in ref.lower():
183
+ if ref_lower.startswith('since there are'):
178
184
  continue
179
- # Skip responses that describe what the text contains instead of extracting
180
- if ('it contains' in ref.lower() or 'it does not contain' in ref.lower()) and 'bibliography' in ref.lower():
185
+
186
+ # Key structural check: valid references MUST have # delimiters
187
+ if '#' not in ref:
188
+ # No delimiter = not a valid reference, skip it
189
+ logger.debug(f"Skipping line without # delimiter: {ref[:80]}...")
181
190
  continue
182
-
191
+
183
192
  # Remove common prefixes (bullets, numbers, etc.)
184
193
  ref = ref.lstrip('- *•')
185
194
  ref = ref.strip()
186
-
195
+
187
196
  # Remove reference numbers like "1.", "[1]", "(1)" from the beginning
188
- import re
189
197
  ref = re.sub(r'^(\d+\.|\[\d+\]|\(\d+\))\s*', '', ref)
190
-
198
+
191
199
  # Filter out very short lines (likely not complete references)
192
- if len(ref) > 30: # Increased minimum length for academic references
200
+ if len(ref) > 30: # Minimum length for academic references
193
201
  references.append(ref)
194
-
202
+
195
203
  return references
196
204
 
197
205
 
198
- class OpenAIProvider(LLMProvider, LLMProviderMixin):
206
+ class OpenAIProvider(LLMProviderMixin, LLMProvider):
199
207
  """OpenAI GPT provider for reference extraction"""
200
208
 
201
209
  def __init__(self, config: Dict[str, Any]):
@@ -216,10 +224,6 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
216
224
  def extract_references(self, bibliography_text: str) -> List[str]:
217
225
  return self.extract_references_with_chunking(bibliography_text)
218
226
 
219
- def _create_extraction_prompt(self, bibliography_text: str) -> str:
220
- """Create prompt for reference extraction"""
221
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
222
-
223
227
  def _call_llm(self, prompt: str) -> str:
224
228
  """Make the actual OpenAI API call and return the response text"""
225
229
  try:
@@ -239,7 +243,7 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
239
243
  raise
240
244
 
241
245
 
242
- class AnthropicProvider(LLMProvider, LLMProviderMixin):
246
+ class AnthropicProvider(LLMProviderMixin, LLMProvider):
243
247
  """Anthropic Claude provider for reference extraction"""
244
248
 
245
249
  def __init__(self, config: Dict[str, Any]):
@@ -260,10 +264,6 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
260
264
  def extract_references(self, bibliography_text: str) -> List[str]:
261
265
  return self.extract_references_with_chunking(bibliography_text)
262
266
 
263
- def _create_extraction_prompt(self, bibliography_text: str) -> str:
264
- """Create prompt for reference extraction"""
265
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
266
-
267
267
  def _call_llm(self, prompt: str) -> str:
268
268
  """Make the actual Anthropic API call and return the response text"""
269
269
  try:
@@ -271,6 +271,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
271
271
  model=self.model or "claude-sonnet-4-20250514",
272
272
  max_tokens=self.max_tokens,
273
273
  temperature=self.temperature,
274
+ system="You are a bibliographic reference extractor. You output ONLY structured reference data in the exact format specified. Never explain, describe, or comment on the input. Never output prose or sentences. If input contains no extractable references, return a completely empty response with no text.",
274
275
  messages=[
275
276
  {"role": "user", "content": prompt}
276
277
  ]
@@ -300,7 +301,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
300
301
  raise
301
302
 
302
303
 
303
- class GoogleProvider(LLMProvider, LLMProviderMixin):
304
+ class GoogleProvider(LLMProviderMixin, LLMProvider):
304
305
  """Google Gemini provider for reference extraction"""
305
306
 
306
307
  def __init__(self, config: Dict[str, Any]):
@@ -322,10 +323,6 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
322
323
  def extract_references(self, bibliography_text: str) -> List[str]:
323
324
  return self.extract_references_with_chunking(bibliography_text)
324
325
 
325
- def _create_extraction_prompt(self, bibliography_text: str) -> str:
326
- """Create prompt for reference extraction"""
327
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
328
-
329
326
  def _call_llm(self, prompt: str) -> str:
330
327
  """Make the actual Google API call and return the response text"""
331
328
  try:
@@ -360,7 +357,7 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
360
357
  raise
361
358
 
362
359
 
363
- class AzureProvider(LLMProvider, LLMProviderMixin):
360
+ class AzureProvider(LLMProviderMixin, LLMProvider):
364
361
  """Azure OpenAI provider for reference extraction"""
365
362
 
366
363
  def __init__(self, config: Dict[str, Any]):
@@ -394,10 +391,6 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
394
391
  def extract_references(self, bibliography_text: str) -> List[str]:
395
392
  return self.extract_references_with_chunking(bibliography_text)
396
393
 
397
- def _create_extraction_prompt(self, bibliography_text: str) -> str:
398
- """Create prompt for reference extraction"""
399
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
400
-
401
394
  def _call_llm(self, prompt: str) -> str:
402
395
  """Make the actual Azure OpenAI API call and return the response text"""
403
396
  try:
@@ -416,7 +409,7 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
416
409
  logger.error(f"Azure API call failed: {e}")
417
410
  raise
418
411
 
419
- class vLLMProvider(LLMProvider, LLMProviderMixin):
412
+ class vLLMProvider(LLMProviderMixin, LLMProvider):
420
413
  """vLLM provider using OpenAI-compatible server mode for local Hugging Face models"""
421
414
 
422
415
  def __init__(self, config: Dict[str, Any]):
@@ -857,10 +850,6 @@ class vLLMProvider(LLMProvider, LLMProviderMixin):
857
850
  def extract_references(self, bibliography_text: str) -> List[str]:
858
851
  return self.extract_references_with_chunking(bibliography_text)
859
852
 
860
- def _create_extraction_prompt(self, bibliography_text: str) -> str:
861
- """Create prompt for reference extraction"""
862
- return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
863
-
864
853
  def _call_llm(self, prompt: str) -> str:
865
854
  """Make the actual vLLM API call and return the response text"""
866
855
  try: