academic-refchecker 1.2.35__tar.gz → 1.2.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.35/src/academic_refchecker.egg-info → academic_refchecker-1.2.37}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/SOURCES.txt +2 -0
  5. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/refchecker.py +87 -264
  6. academic_refchecker-1.2.37/src/utils/arxiv_utils.py +376 -0
  7. academic_refchecker-1.2.37/src/utils/biblatex_parser.py +474 -0
  8. academic_refchecker-1.2.37/src/utils/bibtex_parser.py +334 -0
  9. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/text_utils.py +288 -83
  10. academic_refchecker-1.2.35/src/utils/arxiv_utils.py +0 -176
  11. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/LICENSE +0 -0
  12. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/MANIFEST.in +0 -0
  13. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/README.md +0 -0
  14. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/pyproject.toml +0 -0
  15. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/requirements.txt +0 -0
  16. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/download_db.py +0 -0
  17. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/run_tests.py +0 -0
  18. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/start_vllm_server.py +0 -0
  19. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/setup.cfg +0 -0
  20. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/__init__.py +0 -0
  21. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  22. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  23. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/requires.txt +0 -0
  24. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  25. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/__init__.py +0 -0
  26. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/crossref.py +0 -0
  27. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/enhanced_hybrid_checker.py +0 -0
  28. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/github_checker.py +0 -0
  29. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/local_semantic_scholar.py +0 -0
  30. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/openalex.py +0 -0
  31. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/openreview_checker.py +0 -0
  32. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/semantic_scholar.py +0 -0
  33. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/webpage_checker.py +0 -0
  34. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/__init__.py +0 -0
  35. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/logging.conf +0 -0
  36. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/settings.py +0 -0
  37. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/__init__.py +0 -0
  38. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/db_connection_pool.py +0 -0
  39. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/parallel_processor.py +0 -0
  40. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/database/__init__.py +0 -0
  41. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/database/download_semantic_scholar_db.py +0 -0
  42. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/__init__.py +0 -0
  43. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/base.py +0 -0
  44. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/providers.py +0 -0
  45. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/scripts/__init__.py +0 -0
  46. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/scripts/start_vllm_server.py +0 -0
  47. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/services/__init__.py +0 -0
  48. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/services/pdf_processor.py +0 -0
  49. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/__init__.py +0 -0
  50. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/author_utils.py +0 -0
  51. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/config_validator.py +0 -0
  52. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/db_utils.py +0 -0
  53. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/doi_utils.py +0 -0
  54. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/error_utils.py +0 -0
  55. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/mock_objects.py +0 -0
  56. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.35
3
+ Version: 1.2.37
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.35"
3
+ __version__ = "1.2.37"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.35
3
+ Version: 1.2.37
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -42,6 +42,8 @@ src/services/pdf_processor.py
42
42
  src/utils/__init__.py
43
43
  src/utils/arxiv_utils.py
44
44
  src/utils/author_utils.py
45
+ src/utils/biblatex_parser.py
46
+ src/utils/bibtex_parser.py
45
47
  src/utils/config_validator.py
46
48
  src/utils/db_utils.py
47
49
  src/utils/doi_utils.py
@@ -202,6 +202,10 @@ class ArxivReferenceChecker:
202
202
  # debug mode
203
203
  self.debug_mode = debug_mode
204
204
 
205
+ # Initialize extraction flags
206
+ self.used_regex_extraction = False
207
+ self.used_unreliable_extraction = False
208
+
205
209
  # Parallel processing configuration
206
210
  self.enable_parallel = enable_parallel
207
211
  self.max_workers = max_workers
@@ -2887,6 +2891,7 @@ class ArxivReferenceChecker:
2887
2891
  self.total_other_refs = 0
2888
2892
  self.total_unverified_refs = 0
2889
2893
  self.used_regex_extraction = False
2894
+ self.used_unreliable_extraction = False # Only set for fallback regex parsing, not BibTeX
2890
2895
 
2891
2896
  try:
2892
2897
  # Get papers to process
@@ -3105,8 +3110,8 @@ class ArxivReferenceChecker:
3105
3110
  if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_unverified_refs == 0:
3106
3111
  print(f"✅ All references verified successfully!")
3107
3112
 
3108
- # Show warning if regex extraction was used and there are many errors
3109
- if self.used_regex_extraction and self.total_errors_found > 5:
3113
+ # Show warning if unreliable extraction was used and there are many errors
3114
+ if self.used_unreliable_extraction and self.total_errors_found > 5:
3110
3115
  print(f"\n⚠️ Results might be affected by incorrect reference extraction. Consider using LLM extraction, which is more robust.")
3111
3116
 
3112
3117
  if self.verification_output_file:
@@ -3124,8 +3129,8 @@ class ArxivReferenceChecker:
3124
3129
  print(f" Total warnings: {self.total_warnings_found}")
3125
3130
  print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
3126
3131
 
3127
- # Show warning if regex extraction was used and there are many errors
3128
- if self.used_regex_extraction and self.total_errors_found > 5:
3132
+ # Show warning if unreliable extraction was used and there are many errors
3133
+ if self.used_unreliable_extraction and self.total_errors_found > 5:
3129
3134
  print(f"\n⚠️ Results might be affected by incorrect reference extraction. Consider using LLM extraction, which is more robust.")
3130
3135
 
3131
3136
  if self.verification_output_file:
@@ -3401,15 +3406,25 @@ class ArxivReferenceChecker:
3401
3406
  if detect_standard_acm_natbib_format(bibliography_text):
3402
3407
  logger.info("Detected standard ACM/natbib format, using regex-based parsing")
3403
3408
  self.used_regex_extraction = True
3409
+ # Note: ACM/natbib parsing is also quite robust for standard formats
3404
3410
  return self._parse_standard_acm_natbib_references(bibliography_text)
3405
3411
 
3406
3412
  # Check if this is BibTeX format
3407
- from utils.text_utils import detect_bibtex_format
3413
+ from utils.bibtex_parser import detect_bibtex_format
3408
3414
  if detect_bibtex_format(bibliography_text):
3409
3415
  logger.info("Detected BibTeX format, using BibTeX parser")
3410
3416
  self.used_regex_extraction = True
3417
+ # Note: BibTeX parsing is robust, so we don't set used_unreliable_extraction
3411
3418
  return self._parse_bibtex_references(bibliography_text)
3412
3419
 
3420
+ # Check if this is biblatex format
3421
+ from utils.biblatex_parser import detect_biblatex_format
3422
+ if detect_biblatex_format(bibliography_text):
3423
+ logger.info("Detected biblatex format, using biblatex parser")
3424
+ self.used_regex_extraction = True
3425
+ # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
3426
+ return self._parse_biblatex_references(bibliography_text)
3427
+
3413
3428
  # For non-standard formats, try LLM-based extraction if available
3414
3429
  if self.llm_extractor:
3415
3430
  try:
@@ -3431,6 +3446,7 @@ class ArxivReferenceChecker:
3431
3446
  # Fallback to regex-based parsing only if LLM was not specified
3432
3447
  logger.info("No LLM available, falling back to regex-based parsing")
3433
3448
  self.used_regex_extraction = True
3449
+ self.used_unreliable_extraction = True # This is the unreliable fallback parsing
3434
3450
  return self._parse_references_regex(bibliography_text)
3435
3451
 
3436
3452
  def _parse_standard_acm_natbib_references(self, bibliography_text):
@@ -3622,10 +3638,22 @@ class ArxivReferenceChecker:
3622
3638
  self.used_regex_extraction = True
3623
3639
 
3624
3640
  # Check if this is BibTeX format first
3625
- if re.search(r'@\w+\s*\{', bibliography_text):
3641
+ from utils.bibtex_parser import detect_bibtex_format
3642
+ if detect_bibtex_format(bibliography_text):
3626
3643
  logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
3644
+ # BibTeX parsing is robust, so we don't set used_unreliable_extraction
3627
3645
  return self._parse_bibtex_references(bibliography_text)
3628
3646
 
3647
+ # Check if this is biblatex format
3648
+ from utils.biblatex_parser import detect_biblatex_format
3649
+ if detect_biblatex_format(bibliography_text):
3650
+ logger.debug("Detected biblatex format, using biblatex-specific parsing")
3651
+ # biblatex parsing is also robust, so we don't set used_unreliable_extraction
3652
+ return self._parse_biblatex_references(bibliography_text)
3653
+
3654
+ # If we reach here, we're using the unreliable fallback regex parsing
3655
+ self.used_unreliable_extraction = True
3656
+
3629
3657
  # --- IMPROVED SPLITTING: handle concatenated references like [3]... [4]... ---
3630
3658
  # First, normalize the bibliography text to handle multi-line references
3631
3659
  # This fixes the issue where years appear as separate lines
@@ -4054,270 +4082,33 @@ class ArxivReferenceChecker:
4054
4082
  Returns:
4055
4083
  List of structured reference dictionaries
4056
4084
  """
4057
- import re
4085
+ # Use the dedicated BibTeX parser
4086
+ from utils.bibtex_parser import parse_bibtex_references
4058
4087
 
4059
- # Pattern to match BibTeX entries: @type{key, ...}
4060
- # This handles nested braces properly
4061
- bibtex_pattern = r'@(\w+)\s*\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}'
4062
-
4063
- entries = []
4064
- for match in re.finditer(bibtex_pattern, bibliography_text, re.DOTALL | re.IGNORECASE):
4065
- entry_type = match.group(1).lower()
4066
- entry_content = match.group(2)
4067
-
4068
- # Extract fields from the BibTeX entry
4069
- entry_data = self._parse_bibtex_entry(entry_type, entry_content)
4070
- if entry_data:
4071
- entries.append(entry_data)
4072
-
4073
- if not entries:
4074
- # Fallback: try simpler pattern if the above doesn't work
4075
- logger.debug("Complex BibTeX pattern failed, trying simpler approach")
4076
- # Split on @word{ patterns to find entry boundaries
4077
- parts = re.split(r'(?=@\w+\s*\{)', bibliography_text)
4078
-
4079
- for part in parts:
4080
- part = part.strip()
4081
- if not part or not part.startswith('@'):
4082
- continue
4083
-
4084
- # Find the entry type
4085
- type_match = re.match(r'@(\w+)\s*\{', part)
4086
- if not type_match:
4087
- continue
4088
-
4089
- entry_type = type_match.group(1).lower()
4090
-
4091
- # Extract the content between the first { and the last }
4092
- # This is a simplified approach but should work for most cases
4093
- brace_start = part.find('{')
4094
- if brace_start == -1:
4095
- continue
4096
-
4097
- # Find the matching closing brace
4098
- brace_count = 0
4099
- content_end = -1
4100
- for i, char in enumerate(part[brace_start:], brace_start):
4101
- if char == '{':
4102
- brace_count += 1
4103
- elif char == '}':
4104
- brace_count -= 1
4105
- if brace_count == 0:
4106
- content_end = i
4107
- break
4108
-
4109
- if content_end == -1:
4110
- # No matching brace found, take everything after first {
4111
- entry_content = part[brace_start + 1:]
4112
- else:
4113
- entry_content = part[brace_start + 1:content_end]
4114
-
4115
- entry_data = self._parse_bibtex_entry(entry_type, entry_content)
4116
- if entry_data:
4117
- entries.append(entry_data)
4088
+ # Extract references using the BibTeX parser
4089
+ references = parse_bibtex_references(bibliography_text)
4118
4090
 
4119
- logger.debug(f"Extracted {len(entries)} BibTeX entries")
4120
- return entries
4091
+ logger.debug(f"Extracted {len(references)} BibTeX references using dedicated parser")
4092
+ return references
4121
4093
 
4122
- def _parse_bibtex_entry(self, entry_type, content):
4094
+ def _parse_biblatex_references(self, bibliography_text):
4123
4095
  """
4124
- Parse a single BibTeX entry content to extract fields
4096
+ Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
4125
4097
 
4126
4098
  Args:
4127
- entry_type: Type of entry (inproceedings, article, etc.)
4128
- content: Content inside the braces
4099
+ bibliography_text: String containing biblatex .bbl entries
4129
4100
 
4130
4101
  Returns:
4131
- Dictionary with structured reference data
4102
+ List of structured reference dictionaries
4132
4103
  """
4133
- import re
4134
- from utils.text_utils import parse_authors_with_initials, clean_title
4135
- from utils.doi_utils import construct_doi_url, is_valid_doi_format
4136
-
4137
- # Extract key (first part before comma)
4138
- key_match = re.match(r'([^,]+),', content)
4139
- key = key_match.group(1).strip() if key_match else ""
4140
-
4141
- # Extract fields using regex
4142
- fields = {}
4143
-
4144
- # Pattern to match field = {value} or field = "value"
4145
- field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
4146
-
4147
- for match in re.finditer(field_pattern, content, re.DOTALL):
4148
- field_name = match.group(1).lower()
4149
- field_value = match.group(2) or match.group(3) or ""
4150
- # Strip outer quotes if present (handles cases like title = {"Some Title"})
4151
- field_value = field_value.strip()
4152
- if field_value.startswith('"') and field_value.endswith('"'):
4153
- field_value = field_value[1:-1]
4154
- fields[field_name] = field_value
4155
-
4156
- # If field extraction failed, try a simpler approach
4157
- if not fields:
4158
- logger.debug("Field extraction failed, trying line-by-line approach")
4159
- lines = content.split('\n')
4160
- for line in lines:
4161
- line = line.strip()
4162
- if '=' in line:
4163
- field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
4164
- if field_match:
4165
- field_name = field_match.group(1).lower()
4166
- field_value = field_match.group(2).strip()
4167
- # Strip outer quotes if present
4168
- if field_value.startswith('"') and field_value.endswith('"'):
4169
- field_value = field_value[1:-1]
4170
- fields[field_name] = field_value
4171
-
4172
- # Extract required information
4173
- title = fields.get('title', '')
4174
- author_string = fields.get('author', '')
4175
- year = 0
4176
-
4177
- # Parse year
4178
- year_str = fields.get('year', '')
4179
- if year_str:
4180
- year_match = re.search(r'\d{4}', year_str)
4181
- if year_match:
4182
- year = int(year_match.group())
4183
-
4184
- # If no year found but we have a valid title/authors, try extracting from eprint or other fields
4185
- if year == 0 and (title or author_string):
4186
- # Check eprint field for arXiv entries like "2024" prefix
4187
- eprint = fields.get('eprint', '')
4188
- if eprint:
4189
- # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
4190
- eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
4191
- if eprint_year_match:
4192
- yy = int(eprint_year_match.group(1))
4193
- # Convert to 4-digit year (23 -> 2023, assumes 21st century)
4194
- if yy >= 91: # ArXiv started in 1991
4195
- year = 1900 + yy
4196
- else:
4197
- year = 2000 + yy
4198
-
4199
- # For entries without year, set None instead of 0
4200
- if year == 0:
4201
- year = None
4202
-
4203
- # Parse authors using the enhanced function
4204
- authors = []
4205
- if author_string:
4206
- try:
4207
- authors = parse_authors_with_initials(author_string)
4208
- except Exception as e:
4209
- logger.debug(f"Author parsing failed for '{author_string}': {e}")
4210
- # Fallback: split by 'and' and clean up
4211
- author_parts = author_string.split(' and ')
4212
- authors = []
4213
- for part in author_parts:
4214
- # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
4215
- part = re.sub(r'^and\s+', '', part.strip())
4216
- if part:
4217
- authors.append(part)
4218
-
4219
- # Special handling for @misc entries with only howpublished field
4220
- if not title and not authors and entry_type == 'misc':
4221
- howpublished = fields.get('howpublished', '')
4222
- if howpublished:
4223
- # Try to extract a URL from howpublished
4224
- url_patterns = [
4225
- r'://([^/]+)', # Missing protocol case: "://example.com/path"
4226
- r'https?://([^/\s]+)', # Standard URL
4227
- r'www\.([^/\s]+)', # www without protocol
4228
- ]
4229
-
4230
- extracted_url = ''
4231
- for pattern in url_patterns:
4232
- match = re.search(pattern, howpublished)
4233
- if match:
4234
- domain = match.group(1)
4235
- # Reconstruct URL with https if protocol was missing
4236
- if howpublished.startswith('://'):
4237
- extracted_url = 'https' + howpublished
4238
- elif not howpublished.startswith(('http://', 'https://')):
4239
- extracted_url = 'https://' + howpublished
4240
- else:
4241
- extracted_url = howpublished
4242
-
4243
- # Generate title from domain/path
4244
- if 'jailbreakchat.com' in domain:
4245
- title = 'JailbreakChat Website'
4246
- elif 'lesswrong.com' in domain:
4247
- title = 'LessWrong Post: Jailbreaking ChatGPT'
4248
- elif 'chat.openai.com' in domain:
4249
- title = 'ChatGPT Conversation Share'
4250
- elif 'gemini.google.com' in domain:
4251
- title = 'Gemini Conversation Share'
4252
- elif 'microsoft.com' in domain:
4253
- title = 'Microsoft Azure Content Safety API'
4254
- elif 'perspectiveapi.com' in domain:
4255
- title = 'Perspective API'
4256
- else:
4257
- # Generic title based on domain
4258
- title = f"Web Resource: {domain}"
4259
-
4260
- authors = ["Web Resource"]
4261
- # Store the extracted URL
4262
- fields['url'] = extracted_url
4263
- break
4264
-
4265
- # Apply defaults only if we still don't have values
4266
- if not authors:
4267
- authors = ["Unknown Author"]
4268
-
4269
- # Clean title
4270
- title = clean_title(title) if title else "Unknown Title"
4271
-
4272
- # Extract URL/DOI
4273
- url = fields.get('url', '')
4274
- doi = fields.get('doi', '')
4275
-
4276
- # Construct DOI URL if we have a DOI
4277
- if doi and is_valid_doi_format(doi):
4278
- url = construct_doi_url(doi)
4279
-
4280
- # Construct ArXiv URL from eprint field if no URL present
4281
- if not url:
4282
- eprint = fields.get('eprint', '')
4283
- if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
4284
- # Remove version number if present and construct ArXiv URL
4285
- clean_eprint = re.sub(r'v\d+$', '', eprint)
4286
- url = f"https://arxiv.org/abs/{clean_eprint}"
4287
-
4288
- # Handle special URL fields
4289
- if not url:
4290
- howpublished = fields.get('howpublished', '')
4291
- if 'url{' in howpublished or 'href{' in howpublished:
4292
- url_match = re.search(r'url\{([^}]+)\}', howpublished)
4293
- if not url_match:
4294
- url_match = re.search(r'href\{([^}]+)\}', howpublished)
4295
- if url_match:
4296
- from utils.url_utils import clean_url_punctuation
4297
- url = clean_url_punctuation(url_match.group(1))
4104
+ # Use the dedicated biblatex parser
4105
+ from utils.biblatex_parser import parse_biblatex_references
4298
4106
 
4299
- # Determine reference type
4300
- ref_type = 'other'
4301
- if 'arxiv' in url.lower() or 'arxiv' in title.lower():
4302
- ref_type = 'arxiv'
4303
- elif url or doi:
4304
- ref_type = 'non-arxiv'
4305
-
4306
- # Create structured reference
4307
- structured_ref = {
4308
- 'url': url,
4309
- 'doi': doi,
4310
- 'year': year,
4311
- 'authors': authors,
4312
- 'title': title,
4313
- 'raw_text': f"@{entry_type}{{{key}, {content}}}",
4314
- 'type': ref_type,
4315
- 'bibtex_key': key,
4316
- 'bibtex_type': entry_type
4317
- }
4107
+ # Extract references using the biblatex parser
4108
+ references = parse_biblatex_references(bibliography_text)
4318
4109
 
4319
- logger.debug(f"Parsed BibTeX entry: {title} by {authors} ({year})")
4320
- return structured_ref
4110
+ logger.debug(f"Extracted {len(references)} biblatex references using dedicated parser")
4111
+ return references
4321
4112
 
4322
4113
  def _process_llm_extracted_references(self, references):
4323
4114
  """
@@ -4327,7 +4118,6 @@ class ArxivReferenceChecker:
4327
4118
  unique_references = self._deduplicate_references_with_segment_matching(references)
4328
4119
 
4329
4120
  logger.debug(f"Deduplicated {len(references)} references to {len(unique_references)} unique references")
4330
- logger.info(f"Extracted {len(unique_references)} references using LLM")
4331
4121
 
4332
4122
  processed_refs = []
4333
4123
 
@@ -5032,8 +4822,7 @@ class ArxivReferenceChecker:
5032
4822
  from utils.text_utils import detect_latex_bibliography_format
5033
4823
  latex_format = detect_latex_bibliography_format(tex_content)
5034
4824
  if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
5035
- logger.info(f"Found embedded bibliography in ArXiv LaTeX source for {arxiv_id}, but skipping due to formatting issues")
5036
- logger.info(f"Embedded bibliographies often have inconsistent formatting - falling back to alternative extraction methods")
4825
+ logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
5037
4826
  # Skip embedded bibliography and return None to trigger fallback methods
5038
4827
  return None
5039
4828
 
@@ -5075,6 +4864,40 @@ class ArxivReferenceChecker:
5075
4864
  logger.info(f"Detected LaTeX thebibliography format, using extract_latex_references")
5076
4865
  # Use None for file_path since this is content from .bbl files
5077
4866
  references = extract_latex_references(bibtex_content, None)
4867
+
4868
+ # Validate the parsed references and fallback to LLM if needed
4869
+ from utils.text_utils import validate_parsed_references
4870
+ validation = validate_parsed_references(references)
4871
+
4872
+ if not validation['is_valid']:
4873
+ logger.debug(f"LaTeX parsing validation failed (quality: {validation['quality_score']:.2f})")
4874
+ logger.debug(f"Issues detected: {len(validation['issues'])} problems")
4875
+ for issue in validation['issues'][:5]: # Log first 5 issues
4876
+ logger.debug(f" - {issue}")
4877
+
4878
+ # Try LLM fallback if available
4879
+ if self.llm_extractor:
4880
+ logger.info("Falling back to LLM-based extraction due to unsupported LaTeX format")
4881
+ try:
4882
+ llm_references = self.llm_extractor.extract_references(bibtex_content)
4883
+ if llm_references:
4884
+ # Process LLM results first to get structured references
4885
+ processed_llm_refs = self._process_llm_extracted_references(llm_references)
4886
+ # Then validate the processed results
4887
+ llm_validation = validate_parsed_references(processed_llm_refs)
4888
+ if llm_validation['quality_score'] > validation['quality_score']:
4889
+ logger.debug(f"LLM extraction successful (quality: {llm_validation['quality_score']:.2f})")
4890
+ references = processed_llm_refs
4891
+ else:
4892
+ logger.debug("LLM extraction didn't improve quality, keeping original results")
4893
+ else:
4894
+ logger.warning("LLM extraction returned no results")
4895
+ except Exception as e:
4896
+ logger.error(f"LLM fallback failed: {e}")
4897
+ else:
4898
+ logger.warning("No LLM available for fallback, using original parsing results")
4899
+ else:
4900
+ logger.info(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
5078
4901
  else:
5079
4902
  # Parse BibTeX using the standard flow (LLM or regex based on config)
5080
4903
  references = self.parse_references(bibtex_content)
@@ -5088,7 +4911,7 @@ class ArxivReferenceChecker:
5088
4911
  logger.warning(f"Could not save debug references file for {paper_id}: {e}")
5089
4912
 
5090
4913
  if references:
5091
- logger.info(f"Successfully extracted {len(references)} references from BibTeX for {paper_id}")
4914
+ logger.debug(f"Extracted {len(references)} references")
5092
4915
  return references
5093
4916
 
5094
4917
  # Check if this is a text file containing references
@@ -5158,7 +4981,7 @@ class ArxivReferenceChecker:
5158
4981
  bibtex_references = extract_latex_references(bib_content, paper.file_path)
5159
4982
 
5160
4983
  if bibtex_references:
5161
- logger.info(f"Extracted {len(bibtex_references)} references from BibTeX file")
4984
+ logger.debug(f"Extracted {len(bibtex_references)} references from BibTeX file")
5162
4985
  return bibtex_references
5163
4986
  else:
5164
4987
  logger.warning(f"No references found in BibTeX file: {paper.file_path}")