academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/utils/bibtex_parser.py
@@ -0,0 +1,411 @@
+ #!/usr/bin/env python3
+ """
+ BibTeX format parser utility
+
+ Handles parsing of standard BibTeX format references like:
+ @article{key,
+   title={Title},
+   author={Author Name and Other Author},
+   year={2023}
+ }
+ """
+
+ import re
+ import logging
+ from typing import List, Dict, Any
+
+ logger = logging.getLogger(__name__)
+
+
+ def detect_bibtex_format(text: str) -> bool:
+     """
+     Detect if text contains BibTeX format references
+
+     Args:
+         text: Text to analyze
+
+     Returns:
+         True if BibTeX format detected, False otherwise
+     """
+     # Look for BibTeX entry patterns
+     return bool(re.search(r'@\w+\s*\{', text))
+
+
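A quick sketch of the detection heuristic in use (the sample strings are made up; only detect_bibtex_format comes from this file). The pattern @\w+\s*\{ is deliberately loose: it also fires on @string{...} definitions, so it signals "probably BibTeX" rather than validity:

    detect_bibtex_format("@article{smith2023, title={A Study}}")   # True
    detect_bibtex_format("@string{jan = {January}}")               # also True (loose match)
    detect_bibtex_format("No citations here")                      # False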
+ def parse_bibtex_entries(bib_content: str) -> List[Dict[str, Any]]:
+     """
+     Parse BibTeX entries from text content
+
+     Args:
+         bib_content: String containing BibTeX entries
+
+     Returns:
+         List of dictionaries, each containing a parsed BibTeX entry
+     """
+     if not bib_content:
+         return []
+
+     entries = []
+
+     # Pattern to match BibTeX entries (excluding @string, @comment, @preamble)
+     # First find entry starts, then use brace counting for proper boundaries
+     entry_start_pattern = r'@(article|inproceedings|incproceedings|book|incollection|inbook|proceedings|techreport|mastersthesis|masterthesis|phdthesis|misc|unpublished|conference|manual|booklet|collection)\s*\{\s*([^,]+)\s*,'
+
+     # Find entry starts and extract complete entries using brace counting
+     start_matches = list(re.finditer(entry_start_pattern, bib_content, re.DOTALL | re.IGNORECASE))
+
+     for start_match in start_matches:
+         entry_type = start_match.group(1).lower()
+         entry_key = start_match.group(2).strip()
+
+         # Find the complete entry by counting braces
+         start_pos = start_match.start()
+         brace_start = bib_content.find('{', start_pos)
+         if brace_start == -1:
+             continue
+
+         # Count braces to find the end of this entry
+         brace_count = 0
+         end_pos = brace_start
+
+         for i, char in enumerate(bib_content[brace_start:], brace_start):
+             if char == '{':
+                 brace_count += 1
+             elif char == '}':
+                 brace_count -= 1
+                 if brace_count == 0:
+                     end_pos = i + 1
+                     break
+
+         if brace_count != 0:
+             logger.warning(f"Unbalanced braces in BibTeX entry starting at position {start_pos}")
+             continue
+
+         # Extract the entry content (inside the outermost braces)
+         entry_content = bib_content[brace_start+1:end_pos-1]
+
+         # Parse the entry content
+         parsed_entry = parse_bibtex_entry_content(entry_type, entry_key, entry_content)
+         if parsed_entry:
+             entries.append(parsed_entry)
+
+     return entries
+
+
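A minimal sketch of the brace-counting boundary logic on nested braces (hypothetical input; parse_bibtex_entries as defined above):

    bib = '''
    @article{doe2021,
      title={On {Nested} Braces},
      author={Doe, Jane},
      year={2021}
    }
    '''
    entries = parse_bibtex_entries(bib)
    entries[0]['type'], entries[0]['key']   # ('article', 'doe2021')
    entries[0]['fields']['title']           # 'On {Nested} Braces'
    # The inner {Nested} braces are balanced away by the counter, so the
    # entry boundary is the final closing brace, not the first '}'.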
+ def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) -> Dict[str, Any]:
+     """
+     Parse the content of a single BibTeX entry
+
+     Args:
+         entry_type: Type of BibTeX entry (article, inproceedings, etc.)
+         entry_key: The citation key
+         content: Content inside the braces
+
+     Returns:
+         Dictionary with parsed entry data
+     """
+     fields = {}
+
+     # Use a more robust approach with manual parsing
+     i = 0
+     while i < len(content):
+         # Skip whitespace
+         while i < len(content) and content[i].isspace():
+             i += 1
+
+         if i >= len(content):
+             break
+
+         # Look for field name
+         field_start = i
+         while i < len(content) and (content[i].isalnum() or content[i] == '_'):
+             i += 1
+
+         if i == field_start:
+             i += 1  # Skip non-alphanumeric character
+             continue
+
+         field_name = content[field_start:i].lower()
+
+         # Skip whitespace
+         while i < len(content) and content[i].isspace():
+             i += 1
+
+         # Look for equals sign
+         if i >= len(content) or content[i] != '=':
+             continue
+         i += 1  # Skip '='
+
+         # Skip whitespace
+         while i < len(content) and content[i].isspace():
+             i += 1
+
+         if i >= len(content):
+             break
+
+         # Parse field value
+         field_value = ""
+         if content[i] == '"':
+             # Handle quoted strings
+             i += 1  # Skip opening quote
+             value_start = i
+             while i < len(content) and content[i] != '"':
+                 i += 1
+             if i < len(content):
+                 field_value = content[value_start:i]
+                 i += 1  # Skip closing quote
+         elif content[i] == '{':
+             # Handle braced strings with proper nesting
+             brace_count = 0
+             value_start = i + 1  # Skip opening brace
+             i += 1
+             while i < len(content):
+                 if content[i] == '{':
+                     brace_count += 1
+                 elif content[i] == '}':
+                     if brace_count == 0:
+                         break
+                     brace_count -= 1
+                 i += 1
+
+             if i < len(content):
+                 field_value = content[value_start:i]
+                 i += 1  # Skip closing brace
+
+         if field_value:
+             field_value = field_value.strip()
+             # Strip outer quotes if present (handles cases like title = {"Some Title"})
+             if field_value.startswith('"') and field_value.endswith('"'):
+                 field_value = field_value[1:-1]
+             fields[field_name] = field_value
+
+         # Skip to next field (look for comma)
+         while i < len(content) and content[i] not in ',}':
+             i += 1
+         if i < len(content) and content[i] == ',':
+             i += 1
+
+     # Fallback to regex if manual parsing failed
+     if not fields:
+         logger.debug("Manual parsing failed, trying regex approach")
+         field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
+
+         for match in re.finditer(field_pattern, content, re.DOTALL):
+             field_name = match.group(1).lower()
+             field_value = match.group(2) or match.group(3) or ""
+             field_value = field_value.strip()
+             if field_value.startswith('"') and field_value.endswith('"'):
+                 field_value = field_value[1:-1]
+             fields[field_name] = field_value
+
+     return {
+         'type': entry_type,
+         'key': entry_key,
+         'fields': fields
+     }
+
+
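A small sketch of the two value forms the scanner accepts (hypothetical field string; the function is as defined above):

    entry = parse_bibtex_entry_content(
        'misc', 'example2020',
        'title = {Braced {Value}}, note = "Quoted value", year = {2020}'
    )
    entry['fields']
    # {'title': 'Braced {Value}', 'note': 'Quoted value', 'year': '2020'}
    # Field names are lower-cased; braced values keep their inner braces,
    # while a quoted value ends at the next double quote.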
+ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
+     """
+     Parse BibTeX formatted references into structured format
+
+     Args:
+         bibliography_text: String containing BibTeX entries
+
+     Returns:
+         List of structured reference dictionaries
+     """
+     from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
+     from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format
+
+     entries = parse_bibtex_entries(bibliography_text)
+     references = []
+
+     for entry in entries:
+         entry_type = entry['type']
+         fields = entry['fields']
+
+         # Extract required information
+         title = fields.get('title', '')
+         # Remove braces from BibTeX titles before cleaning
+         if title.startswith('{') and title.endswith('}'):
+             title = title[1:-1]
+         title = clean_title(title)
+
+         # Parse authors
+         authors_raw = fields.get('author', '')
+         authors = []
+         if authors_raw:
+             try:
+                 authors = parse_authors_with_initials(authors_raw)
+             except Exception as e:
+                 logger.debug(f"Author parsing failed for '{authors_raw}': {e}")
+                 # Fallback: split by 'and' and clean up
+                 author_parts = authors_raw.split(' and ')
+                 for part in author_parts:
+                     # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
+                     part = re.sub(r'^and\s+', '', part.strip())
+                     if part:
+                         authors.append(part)
+
+         # Extract year
+         year_str = fields.get('year', '')
+         year = None
+         if year_str:
+             try:
+                 year = int(year_str)
+             except (ValueError, TypeError):
+                 # Try to extract year from string like "2023-04"
+                 year_match = re.search(r'(\d{4})', year_str)
+                 if year_match:
+                     try:
+                         year = int(year_match.group(1))
+                     except ValueError:
+                         pass
+
+         # If no year found but we have a valid title/authors, try extracting from eprint field
+         if year is None and (title or authors):
+             eprint = fields.get('eprint', '')
+             if eprint:
+                 # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
+                 eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
+                 if eprint_year_match:
+                     yy = int(eprint_year_match.group(1))
+                     # Convert to 4-digit year (e.g., 23 -> 2023)
+                     if yy >= 91:  # ArXiv started in 1991
+                         year = 1900 + yy
+                     else:
+                         year = 2000 + yy
+
+         # Extract journal/venue
+         journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
+         # Remove braces from journal/venue names
+         if journal and journal.startswith('{') and journal.endswith('}'):
+             journal = journal[1:-1]
+
+         # Extract DOI and construct URL
+         doi = fields.get('doi', '')
+         doi_url = None
+         if doi and is_valid_doi_format(doi):
+             doi_url = construct_doi_url(doi)
+
+         # Extract other URLs
+         url = fields.get('url', '')
+         if url:
+             from refchecker.utils.url_utils import clean_url
+             url = clean_url(url)
+
+         # Handle special @misc entries with only howpublished field
+         if not title and not authors and entry_type == 'misc':
+             howpublished = fields.get('howpublished', '')
+             if howpublished:
+                 # Try to extract a URL from howpublished
+                 url_patterns = [
+                     r'://([^/]+)',          # Missing protocol case: "://example.com/path"
+                     r'https?://([^/\s]+)',  # Standard URL
+                     r'www\.([^/\s]+)',      # www without protocol
+                 ]
+
+                 for pattern in url_patterns:
+                     match = re.search(pattern, howpublished)
+                     if match:
+                         domain = match.group(1)
+                         # Reconstruct URL with https if protocol was missing
+                         if howpublished.startswith('://'):
+                             url = 'https' + howpublished
+                         elif not howpublished.startswith(('http://', 'https://')):
+                             url = 'https://' + howpublished
+                         else:
+                             url = howpublished
+
+                         # Clean the reconstructed URL
+                         from refchecker.utils.url_utils import clean_url
+                         url = clean_url(url)
+
+                         # Generate title from domain/path
+                         if 'jailbreakchat.com' in domain:
+                             title = 'JailbreakChat Website'
+                         elif 'lesswrong.com' in domain:
+                             title = 'LessWrong Post: Jailbreaking ChatGPT'
+                         elif 'chat.openai.com' in domain:
+                             title = 'ChatGPT Conversation Share'
+                         elif 'gemini.google.com' in domain:
+                             title = 'Gemini Conversation Share'
+                         elif 'microsoft.com' in domain:
+                             title = 'Microsoft Azure Content Safety API'
+                         elif 'perspectiveapi.com' in domain:
+                             title = 'Perspective API'
+                         else:
+                             # Generic title based on domain
+                             title = f"Web Resource: {domain}"
+
+                         authors = ["Web Resource"]
+                         break
+
+         # Handle regular URL field
+         if not url:
+             url = fields.get('url', fields.get('howpublished', ''))
+
+             if url.startswith('\\url{') and url.endswith('}'):
+                 url = url[5:-1]  # Remove \url{...}
+
+         # Clean any URL we extracted
+         if url:
+             from refchecker.utils.url_utils import clean_url
+             url = clean_url(url)
+
+         # Construct ArXiv URL from eprint field if no URL present
+         if not url and not doi_url:
+             eprint = fields.get('eprint', '')
+             if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
+                 # Remove version number if present and construct ArXiv URL
+                 clean_eprint = re.sub(r'v\d+$', '', eprint)
+                 url = f"https://arxiv.org/abs/{clean_eprint}"
+
+         # Determine publication URL (prefer DOI, then URL field)
+         publication_url = doi_url if doi_url else url
+
+         # Apply defaults only if we still don't have values
+         if not authors:
+             authors = ["Unknown Author"]
+
+         if not title:
+             title = "Unknown Title"
+
+         # Determine reference type (for compatibility)
+         ref_type = 'other'
+         if (publication_url and 'arxiv' in publication_url.lower()) or 'arxiv' in title.lower():
+             ref_type = 'arxiv'
+         elif publication_url or doi:
+             ref_type = 'non-arxiv'
+
+         # Create structured reference (matching old format)
+         reference = {
+             'title': title,
+             'authors': authors,
+             'year': year,
+             'journal': journal,
+             'doi': doi,
+             'url': publication_url if publication_url else '',
+             'type': ref_type,
+             'bibtex_key': entry['key'],
+             'bibtex_type': entry_type,
+             'raw_text': f"@{entry_type}{{{entry['key']}, ...}}"  # Simplified raw text
+         }
+
+         # Add additional fields based on entry type
+         if entry_type == 'inproceedings' or entry_type == 'incproceedings':
+             reference['pages'] = fields.get('pages', '')
+             reference['organization'] = fields.get('organization', '')
+         elif entry_type == 'article':
+             reference['volume'] = fields.get('volume', '')
+             reference['number'] = fields.get('number', '')
+             reference['pages'] = fields.get('pages', '')
+         elif entry_type == 'book':
+             reference['publisher'] = fields.get('publisher', '')
+             reference['isbn'] = fields.get('isbn', '')
+
+         references.append(reference)
+
+     logger.debug(f"Extracted {len(references)} BibTeX references")
+     return references
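Putting the pieces together, a hedged end-to-end sketch (the BibTeX snippet is made up, and parse_bibtex_references pulls in clean_title, parse_authors_with_initials, and the DOI/URL helpers from the sibling utils modules, so the full package must be importable; the expected values assume the text-utils cleaners leave this simple title and author list unchanged):

    from refchecker.utils.bibtex_parser import parse_bibtex_references

    refs = parse_bibtex_references('''
    @article{roe2023,
      title={An Example Paper},
      author={Roe, Richard and Doe, Jane},
      journal={Journal of Examples},
      year={2023},
      eprint={2301.00001}
    }
    ''')
    r = refs[0]
    # With no DOI and no url field, the eprint is promoted to an
    # arxiv.org/abs URL, which in turn makes the reference type 'arxiv'.
    r['url']    # 'https://arxiv.org/abs/2301.00001'
    r['type']   # 'arxiv'
    r['year']   # 2023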
refchecker/utils/config_validator.py
@@ -0,0 +1,262 @@
+ #!/usr/bin/env python3
+ """
+ Configuration validation utilities for ArXiv Reference Checker
+ Provides validation for configuration files and settings
+ """
+
+ import logging
+ from typing import Dict, Any, List, Optional, Union
+ from dataclasses import dataclass
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ValidationResult:
+     """Result of configuration validation"""
+     is_valid: bool
+     errors: List[str]
+     warnings: List[str]
+
+     def __post_init__(self):
+         if self.errors is None:
+             self.errors = []
+         if self.warnings is None:
+             self.warnings = []
+
+
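For illustration, how the result dataclass is meant to be consumed (a minimal sketch; the None normalization in __post_init__ is what lets callers skip building message lists):

    result = ValidationResult(is_valid=True, errors=None, warnings=None)
    result.errors, result.warnings   # ([], []) -- always safe to iterate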
+ class ConfigValidator:
+     """Validates configuration dictionaries"""
+
+     def __init__(self):
+         self.required_sections = ['llm', 'processing', 'apis']
+         self.llm_providers = ['openai', 'anthropic', 'google', 'azure', 'vllm']
+
+     def validate_config(self, config: Dict[str, Any]) -> ValidationResult:
+         """
+         Validate a complete configuration dictionary
+
+         Args:
+             config: Configuration dictionary to validate
+
+         Returns:
+             ValidationResult with validation status and messages
+         """
+         errors = []
+         warnings = []
+
+         # Check required sections
+         for section in self.required_sections:
+             if section not in config:
+                 errors.append(f"Missing required section: {section}")
+             else:
+                 # Validate individual sections
+                 section_result = self._validate_section(section, config[section])
+                 errors.extend(section_result.errors)
+                 warnings.extend(section_result.warnings)
+
+         return ValidationResult(
+             is_valid=len(errors) == 0,
+             errors=errors,
+             warnings=warnings
+         )
+
+     def _validate_section(self, section_name: str, section_config: Dict[str, Any]) -> ValidationResult:
+         """Validate a specific configuration section"""
+         if section_name == 'llm':
+             return self._validate_llm_config(section_config)
+         elif section_name == 'processing':
+             return self._validate_processing_config(section_config)
+         elif section_name == 'apis':
+             return self._validate_apis_config(section_config)
+         else:
+             return ValidationResult(True, [], [])
+
+     def _validate_llm_config(self, llm_config: Dict[str, Any]) -> ValidationResult:
+         """Validate LLM configuration"""
+         errors = []
+         warnings = []
+
+         # Check provider configurations
+         for provider in self.llm_providers:
+             if provider in llm_config:
+                 provider_config = llm_config[provider]
+                 if not isinstance(provider_config, dict):
+                     errors.append(f"LLM provider {provider} config must be a dictionary")
+                     continue
+
+                 # Validate provider-specific settings
+                 provider_result = self._validate_llm_provider_config(provider, provider_config)
+                 errors.extend(provider_result.errors)
+                 warnings.extend(provider_result.warnings)
+
+         return ValidationResult(len(errors) == 0, errors, warnings)
+
+     def _validate_llm_provider_config(self, provider: str, config: Dict[str, Any]) -> ValidationResult:
+         """Validate configuration for a specific LLM provider"""
+         errors = []
+         warnings = []
+
+         # Common validations
+         if 'model' in config and not isinstance(config['model'], str):
+             errors.append(f"{provider} model must be a string")
+
+         if 'max_tokens' in config:
+             if not isinstance(config['max_tokens'], int) or config['max_tokens'] <= 0:
+                 errors.append(f"{provider} max_tokens must be a positive integer")
+
+         if 'temperature' in config:
+             if not isinstance(config['temperature'], (int, float)) or config['temperature'] < 0 or config['temperature'] > 2:
+                 errors.append(f"{provider} temperature must be a number between 0 and 2")
+
+         if 'timeout' in config:
+             if not isinstance(config['timeout'], (int, float)) or config['timeout'] <= 0:
+                 errors.append(f"{provider} timeout must be a positive number")
+
+         # Provider-specific validations
+         if provider == 'azure':
+             if 'endpoint' in config and not isinstance(config['endpoint'], str):
+                 errors.append("Azure endpoint must be a string")
+             if 'api_version' in config and not isinstance(config['api_version'], str):
+                 errors.append("Azure api_version must be a string")
+         elif provider == 'vllm':
+             if 'server_url' in config:
+                 if not isinstance(config['server_url'], str):
+                     errors.append("vLLM server_url must be a string")
+                 elif not config['server_url'].startswith(('http://', 'https://')):
+                     errors.append("vLLM server_url must be a valid URL")
+             if 'download_path' in config and not isinstance(config['download_path'], str):
+                 errors.append("vLLM download_path must be a string")
+             if 'auto_download' in config and not isinstance(config['auto_download'], bool):
+                 errors.append("vLLM auto_download must be a boolean")
+
+         return ValidationResult(len(errors) == 0, errors, warnings)
+
+     def _validate_processing_config(self, processing_config: Dict[str, Any]) -> ValidationResult:
+         """Validate processing configuration"""
+         errors = []
+         warnings = []
+
+         # Validate concurrent requests
+         if 'max_concurrent_requests' in processing_config:
+             max_concurrent = processing_config['max_concurrent_requests']
+             if not isinstance(max_concurrent, int) or max_concurrent <= 0:
+                 errors.append("max_concurrent_requests must be a positive integer")
+             elif max_concurrent > 20:
+                 warnings.append("max_concurrent_requests > 20 may cause rate limiting")
+
+         # Validate request delay
+         if 'request_delay' in processing_config:
+             delay = processing_config['request_delay']
+             if not isinstance(delay, (int, float)) or delay < 0:
+                 errors.append("request_delay must be a non-negative number")
+
+         # Validate retry attempts
+         if 'retry_attempts' in processing_config:
+             retry = processing_config['retry_attempts']
+             if not isinstance(retry, int) or retry < 0:
+                 errors.append("retry_attempts must be a non-negative integer")
+             elif retry > 10:
+                 warnings.append("retry_attempts > 10 may cause long delays")
+
+         return ValidationResult(len(errors) == 0, errors, warnings)
+
+     def _validate_apis_config(self, apis_config: Dict[str, Any]) -> ValidationResult:
+         """Validate APIs configuration"""
+         errors = []
+         warnings = []
+
+         # Validate known API configurations
+         known_apis = ['semantic_scholar', 'arxiv', 'google_scholar']
+
+         for api_name in known_apis:
+             if api_name in apis_config:
+                 api_config = apis_config[api_name]
+                 if not isinstance(api_config, dict):
+                     errors.append(f"{api_name} API config must be a dictionary")
+                     continue
+
+                 # Validate common API settings
+                 if 'base_url' in api_config:
+                     if not isinstance(api_config['base_url'], str):
+                         errors.append(f"{api_name} base_url must be a string")
+                     elif not api_config['base_url'].startswith(('http://', 'https://')):
+                         errors.append(f"{api_name} base_url must be a valid URL")
+
+                 if 'timeout' in api_config:
+                     timeout = api_config['timeout']
+                     if not isinstance(timeout, (int, float)) or timeout <= 0:
+                         errors.append(f"{api_name} timeout must be a positive number")
+
+                 if 'api_key' in api_config:
+                     if not isinstance(api_config['api_key'], str):
+                         errors.append(f"{api_name} api_key must be a string")
+
+         return ValidationResult(len(errors) == 0, errors, warnings)
+
+     def validate_llm_command_args(self, args: Dict[str, Any]) -> ValidationResult:
+         """
+         Validate LLM command line arguments
+
+         Args:
+             args: Dictionary of command line arguments
+
+         Returns:
+             ValidationResult with validation status and messages
+         """
+         errors = []
+         warnings = []
+
+         # Validate provider
+         if 'llm_provider' in args and args['llm_provider']:
+             provider = args['llm_provider']
+             if provider not in self.llm_providers:
+                 errors.append(f"Unknown LLM provider: {provider}. Valid providers: {', '.join(self.llm_providers)}")
+
+         # Validate model
+         if 'llm_model' in args and args['llm_model']:
+             model = args['llm_model']
+             if not isinstance(model, str):
+                 errors.append("LLM model must be a string")
+
+         # Validate endpoint
+         if 'llm_endpoint' in args and args['llm_endpoint']:
+             endpoint = args['llm_endpoint']
+             if not isinstance(endpoint, str):
+                 errors.append("LLM endpoint must be a string")
+             elif not endpoint.startswith(('http://', 'https://')):
+                 errors.append("LLM endpoint must be a valid URL")
+
+         # Validate API key
+         if 'llm_key' in args and args['llm_key']:
+             key = args['llm_key']
+             if not isinstance(key, str):
+                 errors.append("LLM API key must be a string")
+             elif len(key) < 10:
+                 warnings.append("LLM API key seems too short")
+
+         return ValidationResult(len(errors) == 0, errors, warnings)
+
+     def suggest_fixes(self, validation_result: ValidationResult) -> List[str]:
+         """
+         Suggest fixes for validation errors
+
+         Args:
+             validation_result: Result from validate_config
+
+         Returns:
+             List of suggested fixes
+         """
+         suggestions = []
+
+         for error in validation_result.errors:
+             if "Missing required section" in error:
+                 section = error.split(": ")[1]
+                 suggestions.append(f"Add {section} section to your configuration")
+             elif "must be a positive integer" in error:
+                 # Everything before "must be" names the offending setting
+                 setting = error.split(" must be")[0]
+                 suggestions.append(f"Ensure {setting} is set to a positive integer value")
+             elif "must be a string" in error:
+                 setting = error.split(" must be")[0]
+                 suggestions.append(f"Ensure {setting} is set to a string value")
+             elif "must be a valid URL" in error:
+                 suggestions.append("Ensure URL starts with http:// or https://")
+
+         return suggestions
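Finally, a hedged usage sketch of the validator on a small config dict (the dict shape mirrors the checks above; any key names beyond those are hypothetical):

    validator = ConfigValidator()
    result = validator.validate_config({
        'llm': {'openai': {'model': 'gpt-4', 'max_tokens': 0}},
        'processing': {'max_concurrent_requests': 50},
        'apis': {'semantic_scholar': {'base_url': 'ftp://bad'}},
    })
    result.is_valid    # False
    result.errors      # ["openai max_tokens must be a positive integer",
                       #  "semantic_scholar base_url must be a valid URL"]
    result.warnings    # ["max_concurrent_requests > 20 may cause rate limiting"]
    validator.suggest_fixes(result)   # human-readable remediation hints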