academic_refchecker-2.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/utils/bibtex_parser.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+"""
+BibTeX format parser utility
+
+Handles parsing of standard BibTeX format references like:
+@article{key,
+    title={Title},
+    author={Author Name and Other Author},
+    year={2023}
+}
+"""
+
+import re
+import logging
+from typing import List, Dict, Any
+
+logger = logging.getLogger(__name__)
+
+
+def detect_bibtex_format(text: str) -> bool:
+    """
+    Detect if text contains BibTeX format references
+
+    Args:
+        text: Text to analyze
+
+    Returns:
+        True if BibTeX format detected, False otherwise
+    """
+    # Look for BibTeX entry patterns
+    return bool(re.search(r'@\w+\s*\{', text))
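
A quick illustrative check (editorial, not part of the package): because the pattern only looks for an `@word{` opener, it also fires on directives such as `@string{...}`; those are filtered out later by `parse_bibtex_entries`, whose entry-type pattern excludes them.

>>> detect_bibtex_format("@article{kim2023, title={X}}")
True
>>> detect_bibtex_format("[1] A. Author, 'Title', 2023.")
False
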
+
+
+def parse_bibtex_entries(bib_content: str) -> List[Dict[str, Any]]:
+    """
+    Parse BibTeX entries from text content
+
+    Args:
+        bib_content: String containing BibTeX entries
+
+    Returns:
+        List of dictionaries, each containing a parsed BibTeX entry
+    """
+    if not bib_content:
+        return []
+
+    entries = []
+
+    # Pattern to match BibTeX entries (excluding @string, @comment, @preamble)
+    # First find entry starts, then use brace counting for proper boundaries
+    entry_start_pattern = r'@(article|inproceedings|incproceedings|book|incollection|inbook|proceedings|techreport|mastersthesis|masterthesis|phdthesis|misc|unpublished|conference|manual|booklet|collection)\s*\{\s*([^,]+)\s*,'
+
+    # Find entry starts and extract complete entries using brace counting
+    start_matches = list(re.finditer(entry_start_pattern, bib_content, re.DOTALL | re.IGNORECASE))
+
+    for start_match in start_matches:
+        entry_type = start_match.group(1).lower()
+        entry_key = start_match.group(2).strip()
+
+        # Find the complete entry by counting braces
+        start_pos = start_match.start()
+        brace_start = bib_content.find('{', start_pos)
+        if brace_start == -1:
+            continue
+
+        # Count braces to find the end of this entry
+        brace_count = 0
+        end_pos = brace_start
+
+        for i, char in enumerate(bib_content[brace_start:], brace_start):
+            if char == '{':
+                brace_count += 1
+            elif char == '}':
+                brace_count -= 1
+                if brace_count == 0:
+                    end_pos = i + 1
+                    break
+
+        if brace_count != 0:
+            logger.warning(f"Unbalanced braces in BibTeX entry starting at position {start_pos}")
+            continue
+
+        # Extract the entry content (inside the outermost braces)
+        entry_content = bib_content[brace_start+1:end_pos-1]
+
+        # Parse the entry content
+        parsed_entry = parse_bibtex_entry_content(entry_type, entry_key, entry_content)
+        if parsed_entry:
+            entries.append(parsed_entry)
+
+    return entries
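
A usage sketch (editorial, with a made-up entry): the brace counter finds the true end of each entry even when field values contain nested braces, and the extractor returns one type/key/fields record per match.

entries = parse_bibtex_entries('@article{kim2023, title={A {Nested} Title}, year={2023}}')
# -> [{'type': 'article', 'key': 'kim2023',
#      'fields': {'title': 'A {Nested} Title', 'year': '2023'}}]
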
+
+
+def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) -> Dict[str, Any]:
+    """
+    Parse the content of a single BibTeX entry
+
+    Args:
+        entry_type: Type of BibTeX entry (article, inproceedings, etc.)
+        entry_key: The citation key
+        content: Content inside the braces
+
+    Returns:
+        Dictionary with parsed entry data
+    """
+    fields = {}
+
+    # Use a more robust approach with manual parsing
+    i = 0
+    while i < len(content):
+        # Skip whitespace
+        while i < len(content) and content[i].isspace():
+            i += 1
+
+        if i >= len(content):
+            break
+
+        # Look for field name
+        field_start = i
+        while i < len(content) and (content[i].isalnum() or content[i] == '_'):
+            i += 1
+
+        if i == field_start:
+            i += 1  # Skip non-alphanumeric character
+            continue
+
+        field_name = content[field_start:i].lower()
+
+        # Skip whitespace
+        while i < len(content) and content[i].isspace():
+            i += 1
+
+        # Look for equals sign
+        if i >= len(content) or content[i] != '=':
+            continue
+        i += 1  # Skip '='
+
+        # Skip whitespace
+        while i < len(content) and content[i].isspace():
+            i += 1
+
+        if i >= len(content):
+            break
+
+        # Parse field value
+        field_value = ""
+        if content[i] == '"':
+            # Handle quoted strings
+            i += 1  # Skip opening quote
+            value_start = i
+            while i < len(content) and content[i] != '"':
+                i += 1
+            if i < len(content):
+                field_value = content[value_start:i]
+                i += 1  # Skip closing quote
+        elif content[i] == '{':
+            # Handle braced strings with proper nesting
+            brace_count = 0
+            value_start = i + 1  # Skip opening brace
+            i += 1
+            while i < len(content):
+                if content[i] == '{':
+                    brace_count += 1
+                elif content[i] == '}':
+                    if brace_count == 0:
+                        break
+                    brace_count -= 1
+                i += 1
+
+            if i < len(content):
+                field_value = content[value_start:i]
+                i += 1  # Skip closing brace
+
+        if field_value:
+            field_value = field_value.strip()
+            # Strip outer quotes if present (handles cases like title = {"Some Title"})
+            if field_value.startswith('"') and field_value.endswith('"'):
+                field_value = field_value[1:-1]
+            fields[field_name] = field_value
+
+        # Skip to next field (look for comma)
+        while i < len(content) and content[i] not in ',}':
+            i += 1
+        if i < len(content) and content[i] == ',':
+            i += 1
+
+    # Fallback to regex if manual parsing failed
+    if not fields:
+        logger.debug("Manual parsing failed, trying regex approach")
+        field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
+
+        for match in re.finditer(field_pattern, content, re.DOTALL):
+            field_name = match.group(1).lower()
+            field_value = match.group(2) or match.group(3) or ""
+            field_value = field_value.strip()
+            if field_value.startswith('"') and field_value.endswith('"'):
+                field_value = field_value[1:-1]
+            fields[field_name] = field_value
+
+    return {
+        'type': entry_type,
+        'key': entry_key,
+        'fields': fields
+    }
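
One editorial observation worth a sketch: the character-by-character scanner runs first because the fallback regex only tolerates a single level of nested braces, while the manual brace counter handles arbitrary depth.

fields = parse_bibtex_entry_content('article', 'k', 'title={A {Deeply {Nested}} Title}')['fields']
# fields['title'] -> 'A {Deeply {Nested}} Title'
# On its own, the fallback pattern would find no match here, since its
# inner alternative \{[^{}]*\} only allows one nesting level.
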
+
+
+def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
+    """
+    Parse BibTeX formatted references into structured format
+
+    Args:
+        bibliography_text: String containing BibTeX entries
+
+    Returns:
+        List of structured reference dictionaries
+    """
+    from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
+    from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format
+
+    entries = parse_bibtex_entries(bibliography_text)
+    references = []
+
+    for entry in entries:
+        entry_type = entry['type']
+        fields = entry['fields']
+
+        # Extract required information
+        title = fields.get('title', '')
+        # Remove braces from BibTeX titles before cleaning
+        if title.startswith('{') and title.endswith('}'):
+            title = title[1:-1]
+        title = clean_title(title)
+
+        # Parse authors
+        authors_raw = fields.get('author', '')
+        authors = []
+        if authors_raw:
+            try:
+                authors = parse_authors_with_initials(authors_raw)
+            except Exception as e:
+                logger.debug(f"Author parsing failed for '{authors_raw}': {e}")
+                # Fallback: split by 'and' and clean up
+                author_parts = authors_raw.split(' and ')
+                for part in author_parts:
+                    # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
+                    part = re.sub(r'^and\s+', '', part.strip())
+                    if part:
+                        authors.append(part)
+
+        # Extract year
+        year_str = fields.get('year', '')
+        year = None
+        if year_str:
+            try:
+                year = int(year_str)
+            except (ValueError, TypeError):
+                # Try to extract year from string like "2023-04"
+                year_match = re.search(r'(\d{4})', year_str)
+                if year_match:
+                    try:
+                        year = int(year_match.group(1))
+                    except ValueError:
+                        pass
+
+        # If no year found but we have a valid title/authors, try extracting from eprint field
+        if year is None and (title or authors):
+            eprint = fields.get('eprint', '')
+            if eprint:
+                # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
+                eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
+                if eprint_year_match:
+                    yy = int(eprint_year_match.group(1))
+                    # Convert to 4-digit year (e.g., 23 -> 2023; 91-99 map to the 1990s)
+                    if yy >= 91:  # ArXiv started in 1991
+                        year = 1900 + yy
+                    else:
+                        year = 2000 + yy
+
+        # Extract journal/venue
+        journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
+        # Remove braces from journal/venue names
+        if journal and journal.startswith('{') and journal.endswith('}'):
+            journal = journal[1:-1]
+
+        # Extract DOI and construct URL
+        doi = fields.get('doi', '')
+        doi_url = None
+        if doi and is_valid_doi_format(doi):
+            doi_url = construct_doi_url(doi)
+
+        # Extract other URLs
+        url = fields.get('url', '')
+        if url:
+            from refchecker.utils.url_utils import clean_url
+            url = clean_url(url)
+
+        # Handle special @misc entries with only howpublished field
+        if not title and not authors and entry_type == 'misc':
+            howpublished = fields.get('howpublished', '')
+            if howpublished:
+                # Try to extract a URL from howpublished
+                url_patterns = [
+                    r'://([^/]+)',  # Missing protocol case: "://example.com/path"
+                    r'https?://([^/\s]+)',  # Standard URL
+                    r'www\.([^/\s]+)',  # www without protocol
+                ]
+
+                for pattern in url_patterns:
+                    match = re.search(pattern, howpublished)
+                    if match:
+                        domain = match.group(1)
+                        # Reconstruct URL with https if protocol was missing
+                        if howpublished.startswith('://'):
+                            url = 'https' + howpublished
+                        elif not howpublished.startswith(('http://', 'https://')):
+                            url = 'https://' + howpublished
+                        else:
+                            url = howpublished
+
+                        # Clean the reconstructed URL
+                        from refchecker.utils.url_utils import clean_url
+                        url = clean_url(url)
+
+                        # Generate title from domain/path
+                        if 'jailbreakchat.com' in domain:
+                            title = 'JailbreakChat Website'
+                        elif 'lesswrong.com' in domain:
+                            title = 'LessWrong Post: Jailbreaking ChatGPT'
+                        elif 'chat.openai.com' in domain:
+                            title = 'ChatGPT Conversation Share'
+                        elif 'gemini.google.com' in domain:
+                            title = 'Gemini Conversation Share'
+                        elif 'microsoft.com' in domain:
+                            title = 'Microsoft Azure Content Safety API'
+                        elif 'perspectiveapi.com' in domain:
+                            title = 'Perspective API'
+                        else:
+                            # Generic title based on domain
+                            title = f"Web Resource: {domain}"
+
+                        authors = ["Web Resource"]
+                        break
+
+        # Handle regular URL field
+        if not url:
+            url = fields.get('url', fields.get('howpublished', ''))
+
+        if url.startswith('\\url{') and url.endswith('}'):
+            url = url[5:-1]  # Remove \url{...}
+
+        # Clean any URL we extracted
+        if url:
+            from refchecker.utils.url_utils import clean_url
+            url = clean_url(url)
+
+        # Construct ArXiv URL from eprint field if no URL present
+        if not url and not doi_url:
+            eprint = fields.get('eprint', '')
+            if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
+                # Remove version number if present and construct ArXiv URL
+                clean_eprint = re.sub(r'v\d+$', '', eprint)
+                url = f"https://arxiv.org/abs/{clean_eprint}"
+
+        # Determine publication URL (prefer DOI, then URL field)
+        publication_url = doi_url if doi_url else url
+
+        # Apply defaults only if we still don't have values
+        if not authors:
+            authors = ["Unknown Author"]
+
+        # Clean title
+        if not title:
+            title = "Unknown Title"
+
+        # Determine reference type (for compatibility)
+        ref_type = 'other'
+        if (publication_url and 'arxiv' in publication_url.lower()) or 'arxiv' in title.lower():
+            ref_type = 'arxiv'
+        elif publication_url or doi:
+            ref_type = 'non-arxiv'
+
+        # Create structured reference (matching old format)
+        reference = {
+            'title': title,
+            'authors': authors,
+            'year': year,
+            'journal': journal,
+            'doi': doi,
+            'url': publication_url if publication_url else '',
+            'type': ref_type,
+            'bibtex_key': entry['key'],
+            'bibtex_type': entry_type,
+            'raw_text': f"@{entry_type}{{{entry['key']}, ...}}"  # Simplified raw text
+        }
+
+        # Add additional fields based on entry type
+        if entry_type == 'inproceedings' or entry_type == 'incproceedings':
+            reference['pages'] = fields.get('pages', '')
+            reference['organization'] = fields.get('organization', '')
+        elif entry_type == 'article':
+            reference['volume'] = fields.get('volume', '')
+            reference['number'] = fields.get('number', '')
+            reference['pages'] = fields.get('pages', '')
+        elif entry_type == 'book':
+            reference['publisher'] = fields.get('publisher', '')
+            reference['isbn'] = fields.get('isbn', '')
+
+        references.append(reference)
+
+    logger.debug(f"Extracted {len(references)} BibTeX references")
+    return references
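
An end-to-end sketch (editorial; the entry values are made up, and we assume clean_title, is_valid_doi_format, and construct_doi_url behave as their names suggest):

refs = parse_bibtex_references("""
@article{smith2023,
  title={An Example},
  author={Smith, John and Doe, Jane},
  journal={Example Journal},
  year={2023},
  doi={10.1000/xyz123}
}
""")
# refs[0] carries title/authors/year/journal/doi, type 'non-arxiv', and the
# DOI-derived URL (presumably https://doi.org/10.1000/xyz123). An entry with
# only eprint={2311.09096} would instead get url
# https://arxiv.org/abs/2311.09096, year 2023 from the YYMM prefix, and
# type 'arxiv'.
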
refchecker/utils/config_validator.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+"""
+Configuration validation utilities for ArXiv Reference Checker
+
+Provides validation for configuration files and settings
+"""
+
+import logging
+from typing import Dict, Any, List, Optional, Union
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ValidationResult:
+    """Result of configuration validation"""
+    is_valid: bool
+    errors: List[str]
+    warnings: List[str]
+
+    def __post_init__(self):
+        if self.errors is None:
+            self.errors = []
+        if self.warnings is None:
+            self.warnings = []
+
+
+class ConfigValidator:
+    """Validates configuration dictionaries"""
+
+    def __init__(self):
+        self.required_sections = ['llm', 'processing', 'apis']
+        self.llm_providers = ['openai', 'anthropic', 'google', 'azure', 'vllm']
+
+    def validate_config(self, config: Dict[str, Any]) -> ValidationResult:
+        """
+        Validate a complete configuration dictionary
+
+        Args:
+            config: Configuration dictionary to validate
+
+        Returns:
+            ValidationResult with validation status and messages
+        """
+        errors = []
+        warnings = []
+
+        # Check required sections
+        for section in self.required_sections:
+            if section not in config:
+                errors.append(f"Missing required section: {section}")
+            else:
+                # Validate individual sections
+                section_result = self._validate_section(section, config[section])
+                errors.extend(section_result.errors)
+                warnings.extend(section_result.warnings)
+
+        return ValidationResult(
+            is_valid=len(errors) == 0,
+            errors=errors,
+            warnings=warnings
+        )
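
A minimal usage sketch (editorial): missing sections surface as errors and flip is_valid.

validator = ConfigValidator()
result = validator.validate_config({'llm': {}, 'processing': {}})
# result.is_valid -> False
# result.errors   -> ['Missing required section: apis']
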
+
+    def _validate_section(self, section_name: str, section_config: Dict[str, Any]) -> ValidationResult:
+        """Validate a specific configuration section"""
+        if section_name == 'llm':
+            return self._validate_llm_config(section_config)
+        elif section_name == 'processing':
+            return self._validate_processing_config(section_config)
+        elif section_name == 'apis':
+            return self._validate_apis_config(section_config)
+        else:
+            return ValidationResult(True, [], [])
+
+    def _validate_llm_config(self, llm_config: Dict[str, Any]) -> ValidationResult:
+        """Validate LLM configuration"""
+        errors = []
+        warnings = []
+
+        # Check provider configurations
+        for provider in self.llm_providers:
+            if provider in llm_config:
+                provider_config = llm_config[provider]
+                if not isinstance(provider_config, dict):
+                    errors.append(f"LLM provider {provider} config must be a dictionary")
+                    continue
+
+                # Validate provider-specific settings
+                provider_result = self._validate_llm_provider_config(provider, provider_config)
+                errors.extend(provider_result.errors)
+                warnings.extend(provider_result.warnings)
+
+        return ValidationResult(len(errors) == 0, errors, warnings)
+
+    def _validate_llm_provider_config(self, provider: str, config: Dict[str, Any]) -> ValidationResult:
+        """Validate configuration for a specific LLM provider"""
+        errors = []
+        warnings = []
+
+        # Common validations
+        if 'model' in config and not isinstance(config['model'], str):
+            errors.append(f"{provider} model must be a string")
+
+        if 'max_tokens' in config:
+            if not isinstance(config['max_tokens'], int) or config['max_tokens'] <= 0:
+                errors.append(f"{provider} max_tokens must be a positive integer")
+
+        if 'temperature' in config:
+            if not isinstance(config['temperature'], (int, float)) or config['temperature'] < 0 or config['temperature'] > 2:
+                errors.append(f"{provider} temperature must be a number between 0 and 2")
+
+        if 'timeout' in config:
+            if not isinstance(config['timeout'], (int, float)) or config['timeout'] <= 0:
+                errors.append(f"{provider} timeout must be a positive number")
+
+        # Provider-specific validations
+        if provider == 'azure':
+            if 'endpoint' in config and not isinstance(config['endpoint'], str):
+                errors.append("Azure endpoint must be a string")
+            if 'api_version' in config and not isinstance(config['api_version'], str):
+                errors.append("Azure api_version must be a string")
+        elif provider == 'vllm':
+            if 'server_url' in config and not isinstance(config['server_url'], str):
+                errors.append("vLLM server_url must be a string")
+            elif 'server_url' in config and not config['server_url'].startswith(('http://', 'https://')):
+                errors.append("vLLM server_url must be a valid URL")
+            if 'download_path' in config and not isinstance(config['download_path'], str):
+                errors.append("vLLM download_path must be a string")
+            if 'auto_download' in config and not isinstance(config['auto_download'], bool):
+                errors.append("vLLM auto_download must be a boolean")
+
+        return ValidationResult(len(errors) == 0, errors, warnings)
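
For illustration (hypothetical values): per-provider checks accumulate, so one bad provider config can report several problems at once.

result = ConfigValidator().validate_config({
    'llm': {'vllm': {'max_tokens': 0, 'server_url': 'localhost:8000'}},
    'processing': {},
    'apis': {},
})
# result.errors -> ['vllm max_tokens must be a positive integer',
#                   'vLLM server_url must be a valid URL']
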
+
+    def _validate_processing_config(self, processing_config: Dict[str, Any]) -> ValidationResult:
+        """Validate processing configuration"""
+        errors = []
+        warnings = []
+
+        # Validate concurrent requests
+        if 'max_concurrent_requests' in processing_config:
+            max_concurrent = processing_config['max_concurrent_requests']
+            if not isinstance(max_concurrent, int) or max_concurrent <= 0:
+                errors.append("max_concurrent_requests must be a positive integer")
+            elif max_concurrent > 20:
+                warnings.append("max_concurrent_requests > 20 may cause rate limiting")
+
+        # Validate request delay
+        if 'request_delay' in processing_config:
+            delay = processing_config['request_delay']
+            if not isinstance(delay, (int, float)) or delay < 0:
+                errors.append("request_delay must be a non-negative number")
+
+        # Validate retry attempts
+        if 'retry_attempts' in processing_config:
+            retry = processing_config['retry_attempts']
+            if not isinstance(retry, int) or retry < 0:
+                errors.append("retry_attempts must be a non-negative integer")
+            elif retry > 10:
+                warnings.append("retry_attempts > 10 may cause long delays")
+
+        return ValidationResult(len(errors) == 0, errors, warnings)
+
+    def _validate_apis_config(self, apis_config: Dict[str, Any]) -> ValidationResult:
+        """Validate APIs configuration"""
+        errors = []
+        warnings = []
+
+        # Validate known API configurations
+        known_apis = ['semantic_scholar', 'arxiv', 'google_scholar']
+
+        for api_name in known_apis:
+            if api_name in apis_config:
+                api_config = apis_config[api_name]
+                if not isinstance(api_config, dict):
+                    errors.append(f"{api_name} API config must be a dictionary")
+                    continue
+
+                # Validate common API settings
+                if 'base_url' in api_config:
+                    if not isinstance(api_config['base_url'], str):
+                        errors.append(f"{api_name} base_url must be a string")
+                    elif not api_config['base_url'].startswith(('http://', 'https://')):
+                        errors.append(f"{api_name} base_url must be a valid URL")
+
+                if 'timeout' in api_config:
+                    timeout = api_config['timeout']
+                    if not isinstance(timeout, (int, float)) or timeout <= 0:
+                        errors.append(f"{api_name} timeout must be a positive number")
+
+                if 'api_key' in api_config:
+                    if not isinstance(api_config['api_key'], str):
+                        errors.append(f"{api_name} api_key must be a string")
+
+        return ValidationResult(len(errors) == 0, errors, warnings)
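
A sketch of the warning path (hypothetical config): values that are legal but risky produce warnings without invalidating the config, while a malformed base_url is a hard error.

result = ConfigValidator().validate_config({
    'llm': {},
    'processing': {'max_concurrent_requests': 50},
    'apis': {'semantic_scholar': {'base_url': 'ftp://example.org'}},
})
# result.errors   -> ['semantic_scholar base_url must be a valid URL']
# result.warnings -> ['max_concurrent_requests > 20 may cause rate limiting']
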
+
+    def validate_llm_command_args(self, args: Dict[str, Any]) -> ValidationResult:
+        """
+        Validate LLM command line arguments
+
+        Args:
+            args: Dictionary of command line arguments
+
+        Returns:
+            ValidationResult with validation status and messages
+        """
+        errors = []
+        warnings = []
+
+        # Validate provider
+        if 'llm_provider' in args and args['llm_provider']:
+            provider = args['llm_provider']
+            if provider not in self.llm_providers:
+                errors.append(f"Unknown LLM provider: {provider}. Valid providers: {', '.join(self.llm_providers)}")
+
+        # Validate model
+        if 'llm_model' in args and args['llm_model']:
+            model = args['llm_model']
+            if not isinstance(model, str):
+                errors.append("LLM model must be a string")
+
+        # Validate endpoint
+        if 'llm_endpoint' in args and args['llm_endpoint']:
+            endpoint = args['llm_endpoint']
+            if not isinstance(endpoint, str):
+                errors.append("LLM endpoint must be a string")
+            elif not endpoint.startswith(('http://', 'https://')):
+                errors.append("LLM endpoint must be a valid URL")
+
+        # Validate API key
+        if 'llm_key' in args and args['llm_key']:
+            key = args['llm_key']
+            if not isinstance(key, str):
+                errors.append("LLM API key must be a string")
+            elif len(key) < 10:
+                warnings.append("LLM API key seems too short")
+
+        return ValidationResult(len(errors) == 0, errors, warnings)
+
+    def suggest_fixes(self, validation_result: ValidationResult) -> List[str]:
+        """
+        Suggest fixes for validation errors
+
+        Args:
+            validation_result: Result from validate_config
+
+        Returns:
+            List of suggested fixes
+        """
+        suggestions = []
+
+        for error in validation_result.errors:
+            if "Missing required section" in error:
+                section = error.split(": ")[1]
+                suggestions.append(f"Add {section} section to your configuration")
+            elif "must be a positive integer" in error:
+                suggestions.append(f"Ensure {error.split()[0]} is set to a positive integer value")
+            elif "must be a string" in error:
+                suggestions.append(f"Ensure {error.split()[0]} is set to a string value")
+            elif "must be a valid URL" in error:
+                suggestions.append("Ensure URL starts with http:// or https://")
+
+        return suggestions
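
A final sketch tying the two entry points together (editorial): suggest_fixes pattern-matches the error strings produced above.

validator = ConfigValidator()
result = validator.validate_config({'llm': {}, 'processing': {}})
validator.suggest_fixes(result)
# -> ['Add apis section to your configuration']
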