academic-refchecker 1.2.50__tar.gz → 1.2.51__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.50/src/academic_refchecker.egg-info → academic_refchecker-1.2.51}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/github_checker.py +4 -1
  5. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/openreview_checker.py +10 -5
  6. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/semantic_scholar.py +8 -6
  7. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/webpage_checker.py +4 -1
  8. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/core/parallel_processor.py +4 -1
  9. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/core/refchecker.py +78 -67
  10. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/text_utils.py +134 -13
  11. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/LICENSE +0 -0
  12. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/MANIFEST.in +0 -0
  13. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/README.md +0 -0
  14. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/pyproject.toml +0 -0
  15. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/requirements.txt +0 -0
  16. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/scripts/download_db.py +0 -0
  17. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/scripts/run_tests.py +0 -0
  18. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/scripts/start_vllm_server.py +0 -0
  19. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/setup.cfg +0 -0
  20. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/__init__.py +0 -0
  21. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  22. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  23. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  24. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/requires.txt +0 -0
  25. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  26. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/__init__.py +0 -0
  27. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/crossref.py +0 -0
  28. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/enhanced_hybrid_checker.py +0 -0
  29. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/local_semantic_scholar.py +0 -0
  30. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/checkers/openalex.py +0 -0
  31. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/config/__init__.py +0 -0
  32. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/config/logging.conf +0 -0
  33. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/config/settings.py +0 -0
  34. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/core/__init__.py +0 -0
  35. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/core/db_connection_pool.py +0 -0
  36. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/database/__init__.py +0 -0
  37. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/database/download_semantic_scholar_db.py +0 -0
  38. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/llm/__init__.py +0 -0
  39. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/llm/base.py +0 -0
  40. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/llm/providers.py +0 -0
  41. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/scripts/__init__.py +0 -0
  42. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/scripts/start_vllm_server.py +0 -0
  43. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/services/__init__.py +0 -0
  44. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/services/pdf_processor.py +0 -0
  45. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/__init__.py +0 -0
  46. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/arxiv_utils.py +0 -0
  47. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/author_utils.py +0 -0
  48. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/biblatex_parser.py +0 -0
  49. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/bibliography_utils.py +0 -0
  50. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/bibtex_parser.py +0 -0
  51. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/config_validator.py +0 -0
  52. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/db_utils.py +0 -0
  53. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/doi_utils.py +0 -0
  54. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/error_utils.py +0 -0
  55. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/mock_objects.py +0 -0
  56. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.50 → academic_refchecker-1.2.51}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.50
3
+ Version: 1.2.51
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.50"
3
+ __version__ = "1.2.51"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.50
3
+ Version: 1.2.51
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -5,6 +5,7 @@ import re
5
5
  import logging
6
6
  from urllib.parse import urlparse
7
7
  from typing import Dict, Optional, Tuple, List, Any
8
+ from utils.text_utils import strip_latex_commands
8
9
 
9
10
  logger = logging.getLogger(__name__)
10
11
 
@@ -170,7 +171,9 @@ class GitHubChecker:
170
171
  title_match = self._check_title_match(cited_title, actual_name, actual_description)
171
172
  if not title_match:
172
173
  from utils.error_utils import format_title_mismatch
173
- details = format_title_mismatch(cited_title, actual_name)
174
+ # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
175
+ clean_cited_title = strip_latex_commands(cited_title)
176
+ details = format_title_mismatch(clean_cited_title, actual_name)
174
177
  if actual_description:
175
178
  snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
176
179
  details += f" ({snippet})"
@@ -36,7 +36,8 @@ from utils.text_utils import (
36
36
  normalize_text, clean_title_basic, is_name_match,
37
37
  calculate_title_similarity, compare_authors,
38
38
  clean_title_for_search, are_venues_substantially_different,
39
- is_year_substantially_different
39
+ is_year_substantially_different, strip_latex_commands,
40
+ compare_titles_with_latex_cleaning
40
41
  )
41
42
 
42
43
  # Set up logging
@@ -423,10 +424,12 @@ class OpenReviewReferenceChecker:
423
424
  paper_title = paper_data.get('title', '').strip()
424
425
 
425
426
  if cited_title and paper_title:
426
- similarity = calculate_title_similarity(cited_title, paper_title)
427
+ similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
427
428
  if similarity < 0.7: # Using a reasonable threshold
428
429
  from utils.error_utils import format_title_mismatch
429
- details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
430
+ # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
431
+ clean_cited_title = strip_latex_commands(cited_title)
432
+ details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
430
433
  errors.append({
431
434
  "warning_type": "title",
432
435
  "warning_details": details
@@ -547,10 +550,12 @@ class OpenReviewReferenceChecker:
547
550
  paper_title = best_match.get('title', '').strip()
548
551
 
549
552
  if cited_title and paper_title:
550
- similarity = calculate_title_similarity(cited_title, paper_title)
553
+ similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
551
554
  if similarity < 0.8: # Slightly higher threshold for search results
552
555
  from utils.error_utils import format_title_mismatch
553
- details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
556
+ # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
557
+ clean_cited_title = strip_latex_commands(cited_title)
558
+ details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
554
559
  errors.append({
555
560
  "warning_type": "title",
556
561
  "warning_details": details
@@ -28,7 +28,7 @@ import time
28
28
  import logging
29
29
  import re
30
30
  from typing import Dict, List, Tuple, Optional, Any, Union
31
- from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
31
+ from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
32
32
  from utils.error_utils import format_title_mismatch
33
33
  from config.settings import get_config
34
34
 
@@ -353,7 +353,7 @@ class NonArxivReferenceChecker:
353
353
  cited_title = title.strip()
354
354
 
355
355
  if cited_title and result_title:
356
- title_similarity = calculate_title_similarity(cited_title.lower(), result_title.lower())
356
+ title_similarity = compare_titles_with_latex_cleaning(cited_title, result_title)
357
357
  logger.debug(f"Semantic Scholar ArXiv search title similarity: {title_similarity:.3f}")
358
358
  logger.debug(f"Cited title: '{cited_title}'")
359
359
  logger.debug(f"Found title: '{result_title}'")
@@ -385,7 +385,7 @@ class NonArxivReferenceChecker:
385
385
  logger.debug(f"DEBUG: cited_title='{cited_title}', arxiv_title='{arxiv_title}'")
386
386
 
387
387
  if cited_title and arxiv_title:
388
- title_similarity = calculate_title_similarity(cited_title.lower(), arxiv_title.lower())
388
+ title_similarity = compare_titles_with_latex_cleaning(cited_title, arxiv_title)
389
389
  logger.debug(f"ArXiv API title similarity: {title_similarity:.3f}")
390
390
  logger.debug(f"Cited title: '{cited_title}'")
391
391
  logger.debug(f"ArXiv title: '{arxiv_title}'")
@@ -419,7 +419,7 @@ class NonArxivReferenceChecker:
419
419
  arxiv_title_check = arxiv_paper_check.get('title', '').strip()
420
420
  cited_title_check = title.strip()
421
421
  if cited_title_check and arxiv_title_check:
422
- title_similarity_check = calculate_title_similarity(cited_title_check.lower(), arxiv_title_check.lower())
422
+ title_similarity_check = compare_titles_with_latex_cleaning(cited_title_check, arxiv_title_check)
423
423
  if title_similarity_check < SIMILARITY_THRESHOLD:
424
424
  logger.debug(f"Detected ArXiv ID mismatch before raw text search - skipping unnecessary searches")
425
425
  arxiv_id_mismatch_detected = True
@@ -468,11 +468,13 @@ class NonArxivReferenceChecker:
468
468
  return None, [], None
469
469
 
470
470
  # Check title using similarity function to handle formatting differences
471
- title_similarity = calculate_title_similarity(title, found_title) if found_title else 0.0
471
+ title_similarity = compare_titles_with_latex_cleaning(title, found_title) if found_title else 0.0
472
472
  if found_title and title_similarity < SIMILARITY_THRESHOLD:
473
+ # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
474
+ clean_cited_title = strip_latex_commands(title)
473
475
  errors.append({
474
476
  'error_type': 'title',
475
- 'error_details': format_title_mismatch(title, found_title),
477
+ 'error_details': format_title_mismatch(clean_cited_title, found_title),
476
478
  'ref_title_correct': paper_data.get('title', '')
477
479
  })
478
480
 
@@ -7,6 +7,7 @@ from urllib.parse import urlparse, urljoin
7
7
  from typing import Dict, Optional, Tuple, List, Any
8
8
  from bs4 import BeautifulSoup
9
9
  import time
10
+ from utils.text_utils import strip_latex_commands
10
11
 
11
12
  logger = logging.getLogger(__name__)
12
13
 
@@ -185,9 +186,11 @@ class WebPageChecker:
185
186
  if cited_title and page_title:
186
187
  if not self._check_title_match(cited_title, page_title, page_description):
187
188
  from utils.error_utils import format_title_mismatch
189
+ # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
190
+ clean_cited_title = strip_latex_commands(cited_title)
188
191
  errors.append({
189
192
  "warning_type": "title",
190
- "warning_details": format_title_mismatch(cited_title, page_title)
193
+ "warning_details": format_title_mismatch(clean_cited_title, page_title)
191
194
  })
192
195
 
193
196
  # Check if this is a documentation page for the cited topic
@@ -275,7 +275,10 @@ class ParallelReferenceProcessor:
275
275
  reference = result.reference
276
276
 
277
277
  # Print reference info in the same format as sequential mode
278
- title = reference.get('title', 'Untitled')
278
+ raw_title = reference.get('title', 'Untitled')
279
+ # Clean LaTeX commands from title for display
280
+ from utils.text_utils import strip_latex_commands
281
+ title = strip_latex_commands(raw_title)
279
282
  from utils.text_utils import format_authors_for_display
280
283
  authors = format_authors_for_display(reference.get('authors', []))
281
284
  year = reference.get('year', '')
@@ -50,7 +50,8 @@ from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
50
50
  detect_latex_bibliography_format, extract_latex_references,
51
51
  detect_standard_acm_natbib_format, strip_latex_commands,
52
52
  format_corrected_reference, is_name_match, enhanced_name_match,
53
- calculate_title_similarity, normalize_arxiv_url, deduplicate_urls)
53
+ calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
54
+ compare_authors)
54
55
  from utils.config_validator import ConfigValidator
55
56
  from services.pdf_processor import PDFProcessor
56
57
  from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
@@ -1789,7 +1790,7 @@ class ArxivReferenceChecker:
1789
1790
  if authors:
1790
1791
  db_authors = [author.get('name', '') for author in check_paper_data['authors']]
1791
1792
 
1792
- authors_match, author_error = self.compare_authors(authors, db_authors)
1793
+ authors_match, author_error = compare_authors(authors, db_authors)
1793
1794
  if authors_match:
1794
1795
  paper_data = check_paper_data
1795
1796
  search_strategy = "Normalized title with author match"
@@ -1901,10 +1902,12 @@ class ArxivReferenceChecker:
1901
1902
 
1902
1903
  if normalized_title != db_title:
1903
1904
  from utils.error_utils import format_title_mismatch
1905
+ # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
1906
+ clean_cited_title = strip_latex_commands(title)
1904
1907
  logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
1905
1908
  errors.append({
1906
1909
  'error_type': 'title',
1907
- 'error_details': format_title_mismatch(title, paper_data.get('title')),
1910
+ 'error_details': format_title_mismatch(clean_cited_title, paper_data.get('title')),
1908
1911
  'ref_title_correct': paper_data.get('title')
1909
1912
  })
1910
1913
 
@@ -1912,7 +1915,7 @@ class ArxivReferenceChecker:
1912
1915
  if authors and paper_data.get('authors'):
1913
1916
  # Extract author names from database data
1914
1917
  correct_names = [author.get('name', '') for author in paper_data['authors']]
1915
- authors_match, author_error = self.compare_authors(authors, correct_names)
1918
+ authors_match, author_error = compare_authors(authors, correct_names)
1916
1919
 
1917
1920
  if not authors_match:
1918
1921
  logger.debug(f"DB Verification: Author mismatch - {author_error}")
@@ -3054,6 +3057,13 @@ class ArxivReferenceChecker:
3054
3057
  try:
3055
3058
  # Extract bibliography
3056
3059
  bibliography = self.extract_bibliography(paper, debug_mode)
3060
+
3061
+ # Apply deduplication to all bibliography sources (not just LLM-extracted)
3062
+ if len(bibliography) > 1: # Only deduplicate if we have multiple references
3063
+ original_count = len(bibliography)
3064
+ bibliography = self._deduplicate_bibliography_entries(bibliography)
3065
+ if len(bibliography) < original_count:
3066
+ logger.debug(f"Deduplicated {original_count} references to {len(bibliography)} unique references")
3057
3067
 
3058
3068
  # Update statistics
3059
3069
  self.total_papers_processed += 1
@@ -3493,8 +3503,9 @@ class ArxivReferenceChecker:
3493
3503
  except Exception as e:
3494
3504
  logger.error(f"LLM fallback failed: {e}")
3495
3505
  return []
3496
- logger.debug("Using biblatex file")
3497
- return biblatex_refs
3506
+ if len(biblatex_refs) > 0:
3507
+ logger.debug("Using biblatex file")
3508
+ return biblatex_refs
3498
3509
 
3499
3510
  # For non-standard formats, try LLM-based extraction if available
3500
3511
  if self.llm_extractor:
@@ -4284,9 +4295,9 @@ class ArxivReferenceChecker:
4284
4295
  # If either has no title, can't reliably determine if duplicate
4285
4296
  return False
4286
4297
 
4287
- # If titles match exactly, consider them duplicates
4288
- # This handles the case where the same paper appears multiple times
4289
- if seg1['title'] == seg2['title']:
4298
+ # If titles match exactly (case-insensitive), consider them duplicates
4299
+ # This handles the case where the same paper appears multiple times with different capitalization
4300
+ if seg1['title'].lower() == seg2['title'].lower():
4290
4301
  return True
4291
4302
 
4292
4303
  # Special case: Check if one title is an arXiv identifier and the other is a real title
@@ -4299,16 +4310,54 @@ class ArxivReferenceChecker:
4299
4310
  author1 = seg1['author']
4300
4311
  author2 = seg2['author']
4301
4312
 
4302
- if author1 and author2 and author1 == author2:
4313
+ if author1 and author2 and author1.lower() == author2.lower():
4303
4314
  # Same authors - check if one title is substring of other or significant similarity
4304
- title1 = seg1['title']
4305
- title2 = seg2['title']
4315
+ title1 = seg1['title'].lower()
4316
+ title2 = seg2['title'].lower()
4306
4317
 
4307
4318
  if (title1 in title2 or title2 in title1):
4308
4319
  return True
4309
4320
 
4310
4321
  return False
4311
4322
 
4323
+ def _deduplicate_bibliography_entries(self, bibliography):
4324
+ """
4325
+ Deduplicate bibliography entries using title and author comparison.
4326
+
4327
+ This works with structured reference dictionaries from BibTeX/LaTeX parsing,
4328
+ as opposed to _deduplicate_references_with_segment_matching which works with raw text.
4329
+
4330
+ Args:
4331
+ bibliography: List of reference dictionaries with 'title', 'authors', etc.
4332
+
4333
+ Returns:
4334
+ List of unique reference dictionaries
4335
+ """
4336
+ if len(bibliography) <= 1:
4337
+ return bibliography
4338
+
4339
+ unique_refs = []
4340
+ seen_titles = set()
4341
+
4342
+ for ref in bibliography:
4343
+ title = ref.get('title', '').strip()
4344
+ if not title:
4345
+ # Keep references without titles (they can't be deduplicated)
4346
+ unique_refs.append(ref)
4347
+ continue
4348
+
4349
+ # Normalize title for comparison (case-insensitive, basic cleanup)
4350
+ normalized_title = title.lower().strip()
4351
+
4352
+ # Check if we've seen this title before (case-insensitive)
4353
+ if normalized_title in seen_titles:
4354
+ logger.debug(f"Skipping duplicate reference: '{title}'")
4355
+ else:
4356
+ unique_refs.append(ref)
4357
+ seen_titles.add(normalized_title)
4358
+
4359
+ return unique_refs
4360
+
4312
4361
  def _is_arxiv_identifier_title_mismatch(self, seg1, seg2):
4313
4362
  """
4314
4363
  Check if one reference has an arXiv identifier as title while the other has a real title,
@@ -5087,60 +5136,6 @@ class ArxivReferenceChecker:
5087
5136
 
5088
5137
  return references
5089
5138
 
5090
- def compare_authors(self, cited_authors, correct_authors):
5091
- """
5092
- Compare author lists to check if they match using improved name matching.
5093
- Uses the utility function is_name_match for robust author name comparison.
5094
- """
5095
- # Clean up author names
5096
- cleaned_cited = []
5097
- for author in cited_authors:
5098
- # Remove reference numbers (e.g., "[1]")
5099
- author = re.sub(r'^\[\d+\]', '', author)
5100
- # Remove line breaks
5101
- author = author.replace('\n', ' ')
5102
-
5103
- # Handle "et al" cases properly
5104
- author_clean = author.strip()
5105
- if author_clean.lower() == 'et al':
5106
- # Skip pure "et al" entries
5107
- continue
5108
- elif 'et al' in author_clean.lower():
5109
- # Remove "et al" from the author name (e.g., "S. M. Lundberg et al" -> "S. M. Lundberg")
5110
- author_clean = re.sub(r'\s+et\s+al\.?', '', author_clean, flags=re.IGNORECASE).strip()
5111
- if author_clean: # Only add if something remains
5112
- cleaned_cited.append(author_clean)
5113
- else:
5114
- cleaned_cited.append(author_clean)
5115
-
5116
- if not cleaned_cited:
5117
- return True, "No authors to compare"
5118
-
5119
- # Handle "et al" cases and length mismatches
5120
- has_et_al = any('et al' in a.lower() for a in cited_authors)
5121
-
5122
- if len(cleaned_cited) < len(correct_authors) and (has_et_al or len(cleaned_cited) <= 3):
5123
- # Only compare the authors that are listed
5124
- correct_authors = correct_authors[:len(cleaned_cited)]
5125
- elif len(cleaned_cited) > len(correct_authors) and len(correct_authors) >= 3:
5126
- # Use available correct authors
5127
- cleaned_cited = cleaned_cited[:len(correct_authors)]
5128
-
5129
- # If there's a big count mismatch and no "et al", it's likely an error
5130
- if abs(len(cleaned_cited) - len(correct_authors)) > 3 and not has_et_al:
5131
- return False, "Author count mismatch"
5132
-
5133
- # Compare first author (most important) using the improved utility function
5134
- if cleaned_cited and correct_authors:
5135
- # Use raw names for comparison (is_name_match handles normalization internally)
5136
- cited_first = cleaned_cited[0]
5137
- correct_first = correct_authors[0]
5138
-
5139
- if not enhanced_name_match(cited_first, correct_first):
5140
- from utils.error_utils import format_first_author_mismatch
5141
- return False, format_first_author_mismatch(cited_first, correct_first)
5142
-
5143
- return True, "Authors match"
5144
5139
 
5145
5140
  def normalize_text(self, text):
5146
5141
  """
@@ -5251,6 +5246,19 @@ class ArxivReferenceChecker:
5251
5246
  return False
5252
5247
  return True
5253
5248
 
5249
+ def compare_authors(self, authors1, authors2):
5250
+ """
5251
+ Compare authors using the text_utils compare_authors function.
5252
+
5253
+ Args:
5254
+ authors1: First list of authors
5255
+ authors2: Second list of authors
5256
+
5257
+ Returns:
5258
+ Tuple of (match_result, error_message)
5259
+ """
5260
+ return compare_authors(authors1, authors2)
5261
+
5254
5262
  def _verify_references_sequential(self, paper, bibliography, paper_errors, error_types, unverified_count, debug_mode):
5255
5263
  """
5256
5264
  Sequential reference verification (original implementation)
@@ -5267,7 +5275,10 @@ class ArxivReferenceChecker:
5267
5275
  ref_id = self.extract_arxiv_id_from_url(reference['url'])
5268
5276
 
5269
5277
  # Print reference info in non-debug mode (improved formatting)
5270
- title = reference.get('title', 'Untitled')
5278
+ raw_title = reference.get('title', 'Untitled')
5279
+ # Clean LaTeX commands from title for display
5280
+ from utils.text_utils import strip_latex_commands
5281
+ title = strip_latex_commands(raw_title)
5271
5282
  from utils.text_utils import format_authors_for_display
5272
5283
  authors = format_authors_for_display(reference.get('authors', []))
5273
5284
  year = reference.get('year', '')
@@ -580,6 +580,9 @@ def clean_title_for_search(title):
580
580
  if not isinstance(title, str):
581
581
  return str(title) if title is not None else ''
582
582
 
583
+ # Strip LaTeX commands to handle math formatting and other LaTeX markup
584
+ title = strip_latex_commands(title)
585
+
583
586
  # Clean up newlines and normalize whitespace (but preserve other structure)
584
587
  title = title.replace('\n', ' ').strip()
585
588
  title = re.sub(r'\s+', ' ', title) # Normalize whitespace only
@@ -753,8 +756,11 @@ def normalize_paper_title(title: str) -> str:
753
756
  if not title:
754
757
  return ""
755
758
 
759
+ # Strip LaTeX commands first to handle math formatting consistently
760
+ normalized = strip_latex_commands(title)
761
+
756
762
  # Convert to lowercase
757
- normalized = title.lower()
763
+ normalized = normalized.lower()
758
764
 
759
765
  # Remove common prefixes that don't affect the actual title content
760
766
  prefixes_to_remove = [
@@ -2107,21 +2113,37 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2107
2113
 
2108
2114
  return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
2109
2115
 
2116
+ # Detect if cited authors look like parsing fragments
2117
+ # (many short single-word entries that might be first/last name fragments)
2118
+ def looks_like_fragments(authors_list):
2119
+ if len(authors_list) < 4: # Need at least 4 to detect fragment pattern
2120
+ return False
2121
+ single_word_count = sum(1 for author in authors_list if len(author.strip().split()) == 1)
2122
+ return single_word_count >= len(authors_list) * 0.7 # 70% or more are single words
2123
+
2110
2124
  # Normal case without "et al" - compare all authors
2111
2125
  if len(cleaned_cited) != len(correct_names):
2112
- # For non-et-al cases, be more strict about count mismatches
2113
- # Allow minor flexibility (1 author difference) but not more
2114
- if abs(len(cleaned_cited) - len(correct_names)) > 1:
2126
+
2127
+ # Check if cited authors look like parsing fragments
2128
+ if looks_like_fragments(cleaned_cited):
2115
2129
  from utils.error_utils import format_author_count_mismatch
2116
- # Convert cited names to display format (First Last) before showing in error
2117
2130
  display_cited = [format_author_for_display(author) for author in cleaned_cited]
2118
2131
  error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2119
2132
  return False, error_msg
2120
2133
 
2121
- # Use the shorter list for comparison
2122
- min_len = min(len(cleaned_cited), len(correct_names))
2123
- comparison_cited = cleaned_cited[:min_len]
2124
- comparison_correct = correct_names[:min_len]
2134
+ # For all count mismatches, show the count mismatch error
2135
+ if len(cleaned_cited) < len(correct_names):
2136
+ from utils.error_utils import format_author_count_mismatch
2137
+ display_cited = [format_author_for_display(author) for author in cleaned_cited]
2138
+ error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2139
+ return False, error_msg
2140
+
2141
+ # For cases where cited > correct, also show count mismatch
2142
+ elif len(cleaned_cited) > len(correct_names):
2143
+ from utils.error_utils import format_author_count_mismatch
2144
+ display_cited = [format_author_for_display(author) for author in cleaned_cited]
2145
+ error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2146
+ return False, error_msg
2125
2147
  else:
2126
2148
  comparison_cited = cleaned_cited
2127
2149
  comparison_correct = correct_names
@@ -2484,8 +2506,64 @@ def strip_latex_commands(text):
2484
2506
  # Remove font size commands
2485
2507
  text = re.sub(r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\b', '', text)
2486
2508
 
2487
- # Remove math mode delimiters
2488
- text = re.sub(r'\$([^$]*)\$', r'\1', text)
2509
+ # Handle complex math mode patterns first
2510
+ # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
2511
+ def process_complex_math(match):
2512
+ content = match.group(1)
2513
+ # Handle common Greek letters
2514
+ content = re.sub(r'\\mu\b', 'μ', content) # \mu -> μ
2515
+ content = re.sub(r'\\alpha\b', 'α', content) # \alpha -> α
2516
+ content = re.sub(r'\\beta\b', 'β', content) # \beta -> β
2517
+ content = re.sub(r'\\gamma\b', 'γ', content) # \gamma -> γ
2518
+ content = re.sub(r'\\delta\b', 'δ', content) # \delta -> δ
2519
+ content = re.sub(r'\\epsilon\b', 'ε', content) # \epsilon -> ε
2520
+ content = re.sub(r'\\lambda\b', 'λ', content) # \lambda -> λ
2521
+ content = re.sub(r'\\pi\b', 'π', content) # \pi -> π
2522
+ content = re.sub(r'\\sigma\b', 'σ', content) # \sigma -> σ
2523
+ content = re.sub(r'\\theta\b', 'θ', content) # \theta -> θ
2524
+ # Remove any remaining LaTeX commands and braces from inside math
2525
+ content = re.sub(r'\\[a-zA-Z]+\b', '', content)
2526
+ content = re.sub(r'[{}]', '', content)
2527
+ # Clean up any remaining $ signs
2528
+ content = re.sub(r'\$+', '', content)
2529
+ return content
2530
+
2531
+ # Handle complex nested math patterns first
2532
+ # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
2533
+ def process_nested_math_specifically(match):
2534
+ content = match.group(0)
2535
+ # Handle the specific pattern: $\{$$\mu$second-scale$\}$
2536
+ # Extract the meaningful parts
2537
+ if r'\mu' in content:
2538
+ # Replace \mu with μ and extract the surrounding text
2539
+ content = re.sub(r'\\mu\b', 'μ', content)
2540
+ # Remove all LaTeX math markup
2541
+ content = re.sub(r'[\$\{\}\\]+', '', content)
2542
+ return content
2543
+
2544
+ # Handle the specific problematic pattern
2545
+ text = re.sub(r'\$\\\{[^}]*\\\}\$', process_nested_math_specifically, text)
2546
+
2547
+ # Handle Greek letters in math mode before removing delimiters
2548
+ def process_standard_math(match):
2549
+ content = match.group(1)
2550
+ # Handle common Greek letters - content has single backslashes
2551
+ content = re.sub(r'\\mu\b', 'μ', content)
2552
+ content = re.sub(r'\\alpha\b', 'α', content)
2553
+ content = re.sub(r'\\beta\b', 'β', content)
2554
+ content = re.sub(r'\\gamma\b', 'γ', content)
2555
+ content = re.sub(r'\\delta\b', 'δ', content)
2556
+ content = re.sub(r'\\epsilon\b', 'ε', content)
2557
+ content = re.sub(r'\\lambda\b', 'λ', content)
2558
+ content = re.sub(r'\\pi\b', 'π', content)
2559
+ content = re.sub(r'\\sigma\b', 'σ', content)
2560
+ content = re.sub(r'\\theta\b', 'θ', content)
2561
+ # Remove any remaining LaTeX commands
2562
+ content = re.sub(r'\\[a-zA-Z]+\b', '', content)
2563
+ return content
2564
+
2565
+ # Remove standard math mode delimiters with Greek letter processing
2566
+ text = re.sub(r'\$([^$]*)\$', process_standard_math, text)
2489
2567
  text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.DOTALL)
2490
2568
  text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.DOTALL)
2491
2569
 
@@ -3369,7 +3447,18 @@ def _extract_corrected_reference_data(error_entry: dict, corrected_data: dict) -
3369
3447
  """
3370
3448
  # Get the corrected information
3371
3449
  correct_title = error_entry.get('ref_title_correct') or corrected_data.get('title', '')
3372
- correct_authors = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
3450
+
3451
+ # Handle authors - can be string or list of dicts from API
3452
+ authors_raw = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
3453
+ if isinstance(authors_raw, list):
3454
+ # Convert list of author dicts to comma-separated string
3455
+ if authors_raw and isinstance(authors_raw[0], dict):
3456
+ correct_authors = ', '.join([author.get('name', '') for author in authors_raw])
3457
+ else:
3458
+ correct_authors = ', '.join(authors_raw)
3459
+ else:
3460
+ correct_authors = str(authors_raw) if authors_raw else ''
3461
+
3373
3462
  correct_year = error_entry.get('ref_year_correct') or corrected_data.get('year', '')
3374
3463
 
3375
3464
  # Prioritize the verified URL that was actually used for verification
@@ -3573,7 +3662,39 @@ def format_corrected_plaintext(original_reference, corrected_data, error_entry):
3573
3662
  if correct_url:
3574
3663
  citation_parts.append(f"{correct_url}")
3575
3664
 
3576
- return '. '.join(citation_parts) + '.'
3665
+ citation_text = '. '.join(citation_parts) + '.'
3666
+
3667
+ # Add citation key information if available (for easy copying)
3668
+ citation_key = original_reference.get('bibtex_key') or original_reference.get('bibitem_key')
3669
+ if citation_key and citation_key != 'unknown':
3670
+ bibtex_type = original_reference.get('bibtex_type', 'misc')
3671
+ citation_text += f"\n\n% Citation key for BibTeX: @{bibtex_type}{{{citation_key}, ...}}"
3672
+
3673
+ return citation_text
3674
+
3675
+
3676
def compare_titles_with_latex_cleaning(cited_title: str, database_title: str) -> float:
    """
    Compare two titles after stripping LaTeX markup from both of them.

    Both titles are passed through strip_latex_commands before scoring so that
    formatting differences such as {LLM}s vs LLMs, or math delimiters present
    on only one side, do not lower the similarity score.

    Args:
        cited_title: Title from cited reference (may contain LaTeX)
        database_title: Title from database (usually already clean, but
            cleaned the same way so the comparison is symmetric)

    Returns:
        The score from calculate_title_similarity for the cleaned titles, or
        0.0 when either title is missing or is empty after cleaning.
    """
    if not cited_title or not database_title:
        return 0.0

    # Clean both sides with the same function so any transformation it applies
    # (brace removal, math-delimiter stripping) affects both titles equally.
    clean_cited = strip_latex_commands(cited_title)
    clean_database = strip_latex_commands(database_title)

    # A title consisting only of markup has nothing left to compare.
    if not clean_cited or not clean_database:
        return 0.0

    # Calculate similarity using cleaned titles
    return calculate_title_similarity(clean_cited, clean_database)
3577
3698
 
3578
3699
 
3579
3700
  def calculate_title_similarity(title1: str, title2: str) -> float: