academic-refchecker 1.2.45__tar.gz → 1.2.47__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.45/src/academic_refchecker.egg-info → academic_refchecker-1.2.47}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/local_semantic_scholar.py +15 -4
  5. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/semantic_scholar.py +35 -34
  6. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/webpage_checker.py +12 -2
  7. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/refchecker.py +20 -9
  8. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/arxiv_utils.py +16 -19
  9. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/doi_utils.py +23 -5
  10. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/error_utils.py +17 -1
  11. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/text_utils.py +27 -4
  12. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/LICENSE +0 -0
  13. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/MANIFEST.in +0 -0
  14. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/README.md +0 -0
  15. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/pyproject.toml +0 -0
  16. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/requirements.txt +0 -0
  17. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/scripts/download_db.py +0 -0
  18. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/scripts/run_tests.py +0 -0
  19. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/scripts/start_vllm_server.py +0 -0
  20. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/setup.cfg +0 -0
  21. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/__init__.py +0 -0
  22. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  23. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  24. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  25. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/requires.txt +0 -0
  26. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  27. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/__init__.py +0 -0
  28. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/crossref.py +0 -0
  29. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/enhanced_hybrid_checker.py +0 -0
  30. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/github_checker.py +0 -0
  31. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/openalex.py +0 -0
  32. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/openreview_checker.py +0 -0
  33. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/config/__init__.py +0 -0
  34. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/config/logging.conf +0 -0
  35. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/config/settings.py +0 -0
  36. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/__init__.py +0 -0
  37. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/db_connection_pool.py +0 -0
  38. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/parallel_processor.py +0 -0
  39. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/database/__init__.py +0 -0
  40. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/database/download_semantic_scholar_db.py +0 -0
  41. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/llm/__init__.py +0 -0
  42. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/llm/base.py +0 -0
  43. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/llm/providers.py +0 -0
  44. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/scripts/__init__.py +0 -0
  45. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/scripts/start_vllm_server.py +0 -0
  46. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/services/__init__.py +0 -0
  47. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/services/pdf_processor.py +0 -0
  48. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/__init__.py +0 -0
  49. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/author_utils.py +0 -0
  50. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/biblatex_parser.py +0 -0
  51. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/bibliography_utils.py +0 -0
  52. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/bibtex_parser.py +0 -0
  53. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/config_validator.py +0 -0
  54. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/db_utils.py +0 -0
  55. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/mock_objects.py +0 -0
  56. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.45
3
+ Version: 1.2.47
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.45"
3
+ __version__ = "1.2.47"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.45
3
+ Version: 1.2.47
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -430,11 +430,22 @@ class LocalNonArxivReferenceChecker:
430
430
  logger.debug(f"Local DB: Author mismatch - {author_error}")
431
431
  errors.append(create_author_error(author_error, paper_data.get('authors', [])))
432
432
 
433
- # Verify year
433
+ # Verify year (with tolerance)
434
434
  paper_year = paper_data.get('year')
435
- if year and paper_year and year != paper_year:
436
- logger.debug(f"Local DB: Year mismatch - cited: {year}, actual: {paper_year}")
437
- errors.append(create_year_warning(year, paper_year))
435
+ if year and paper_year:
436
+ # Get year tolerance from config (default to 1 if not available)
437
+ year_tolerance = 1 # Default tolerance
438
+ try:
439
+ from config.settings import get_config
440
+ config = get_config()
441
+ year_tolerance = config.get('text_processing', {}).get('year_tolerance', 1)
442
+ except (ImportError, Exception):
443
+ pass # Use default if config not available
444
+
445
+ # Only flag as mismatch if the difference is greater than tolerance
446
+ if abs(year - paper_year) > year_tolerance:
447
+ logger.debug(f"Local DB: Year mismatch - cited: {year}, actual: {paper_year}")
448
+ errors.append(create_year_warning(year, paper_year))
438
449
 
439
450
  # Verify DOI
440
451
  paper_doi = None
@@ -543,49 +543,50 @@ class NonArxivReferenceChecker:
543
543
  elif paper_venue and not isinstance(paper_venue, str):
544
544
  paper_venue = str(paper_venue)
545
545
 
546
+ # Check venue mismatches
546
547
  if cited_venue and paper_venue:
547
548
  # Use the utility function to check if venues are substantially different
548
549
  if are_venues_substantially_different(cited_venue, paper_venue):
549
550
  from utils.error_utils import create_venue_warning
550
551
  errors.append(create_venue_warning(cited_venue, paper_venue))
551
552
  elif not cited_venue and paper_venue:
552
- # Check if this is an arXiv paper first
553
- external_ids = paper_data.get('externalIds', {})
554
- arxiv_id = external_ids.get('ArXiv') if external_ids else None
555
-
556
- if arxiv_id:
557
- # For arXiv papers, suggest including the arXiv URL instead of venue
558
- arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
559
-
560
- # Check if the reference already includes this ArXiv URL or equivalent DOI
561
- reference_url = reference.get('url', '')
562
-
563
- # Check for direct arXiv URL match
564
- has_arxiv_url = arxiv_url in reference_url
565
-
566
- # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
567
- arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
568
- has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
569
-
570
- if not (has_arxiv_url or has_arxiv_doi):
553
+ # Original reference has the venue in raw text but not parsed correctly
554
+ raw_text = reference.get('raw_text', '')
555
+ if raw_text and '#' in raw_text:
556
+ # Check if venue might be in the raw text format (author#title#venue#year#url)
557
+ parts = raw_text.split('#')
558
+ if len(parts) >= 3 and parts[2].strip():
559
+ # Venue is present in raw text but missing from parsed reference
571
560
  errors.append({
572
561
  'warning_type': 'venue',
573
- 'warning_details': f"Reference should include arXiv URL: {arxiv_url}",
574
- 'ref_url_correct': arxiv_url
562
+ 'warning_details': f"Venue missing: should include '{paper_venue}'",
563
+ 'ref_venue_correct': paper_venue
575
564
  })
576
- else:
577
- # Original reference has the venue in raw text but not parsed correctly
578
- raw_text = reference.get('raw_text', '')
579
- if raw_text and '#' in raw_text:
580
- # Check if venue might be in the raw text format (author#title#venue#year#url)
581
- parts = raw_text.split('#')
582
- if len(parts) >= 3 and parts[2].strip():
583
- # Venue is present in raw text but missing from parsed reference
584
- errors.append({
585
- 'warning_type': 'venue',
586
- 'warning_details': f"Venue missing: should include '{paper_venue}'",
587
- 'ref_venue_correct': paper_venue
588
- })
565
+
566
+ # Always check for missing arXiv URLs when paper has arXiv ID
567
+ external_ids = paper_data.get('externalIds', {})
568
+ arxiv_id = external_ids.get('ArXiv') if external_ids else None
569
+
570
+ if arxiv_id:
571
+ # For arXiv papers, check if reference includes the arXiv URL
572
+ arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
573
+
574
+ # Check if the reference already includes this ArXiv URL or equivalent DOI
575
+ reference_url = reference.get('url', '')
576
+
577
+ # Check for direct arXiv URL match
578
+ has_arxiv_url = arxiv_url in reference_url
579
+
580
+ # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
581
+ arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
582
+ has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
583
+
584
+ if not (has_arxiv_url or has_arxiv_doi):
585
+ errors.append({
586
+ 'warning_type': 'url',
587
+ 'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
588
+ 'ref_url_correct': arxiv_url
589
+ })
589
590
 
590
591
  # Verify DOI
591
592
  paper_doi = None
@@ -71,7 +71,8 @@ class WebPageChecker:
71
71
  doc_indicators = [
72
72
  'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
73
73
  'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
74
- 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
74
+ 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
75
+ 'posts' # For blog posts and forum posts like LessWrong
75
76
  ]
76
77
 
77
78
  return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
@@ -84,7 +85,8 @@ class WebPageChecker:
84
85
  doc_domains = [
85
86
  'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
86
87
  'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
87
- 'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
88
+ 'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
89
+ 'lesswrong.com' # LessWrong rationality and AI safety blog platform
88
90
  ]
89
91
 
90
92
  return any(domain in parsed.netloc for domain in doc_domains)
@@ -395,6 +397,14 @@ class WebPageChecker:
395
397
  organization = site_info.get('organization', '').lower()
396
398
  domain = site_info.get('domain', '').lower()
397
399
 
400
+ # Accept generic web resource terms - these are valid for any web URL
401
+ generic_web_terms = [
402
+ 'web resource', 'web site', 'website', 'online resource',
403
+ 'online', 'web', 'internet resource', 'web page', 'webpage'
404
+ ]
405
+ if cited_lower in generic_web_terms:
406
+ return True
407
+
398
408
  # Direct matches
399
409
  if cited_lower in organization or organization in cited_lower:
400
410
  return True
@@ -1922,16 +1922,27 @@ class ArxivReferenceChecker:
1922
1922
  'ref_authors_correct': ', '.join(correct_names)
1923
1923
  })
1924
1924
 
1925
- # Verify year
1925
+ # Verify year (with tolerance)
1926
1926
  paper_year = paper_data.get('year')
1927
- if year and paper_year and year != paper_year:
1928
- logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
1929
- from utils.error_utils import format_year_mismatch
1930
- errors.append({
1931
- 'warning_type': 'year',
1932
- 'warning_details': format_year_mismatch(year, paper_year),
1933
- 'ref_year_correct': paper_year
1934
- })
1927
+ if year and paper_year:
1928
+ # Get year tolerance from config (default to 1 if not available)
1929
+ year_tolerance = 1 # Default tolerance
1930
+ try:
1931
+ from config.settings import get_config
1932
+ config = get_config()
1933
+ year_tolerance = config.get('text_processing', {}).get('year_tolerance', 1)
1934
+ except (ImportError, Exception):
1935
+ pass # Use default if config not available
1936
+
1937
+ # Only flag as mismatch if the difference is greater than tolerance
1938
+ if abs(year - paper_year) > year_tolerance:
1939
+ logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
1940
+ from utils.error_utils import format_year_mismatch
1941
+ errors.append({
1942
+ 'warning_type': 'year',
1943
+ 'warning_details': format_year_mismatch(year, paper_year),
1944
+ 'ref_year_correct': paper_year
1945
+ })
1935
1946
 
1936
1947
  # Verify DOI
1937
1948
  if doi and external_ids.get('DOI'):
@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
392
392
  logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
393
393
  tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
394
394
 
395
- # Choose between .bib and .bbl files based on content richness
396
- # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
395
+ # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
396
+ # .bbl files are processed biblatex output that reflects exactly what was cited
397
397
  if bib_content and bbl_content:
398
- # Count entries in both
398
+ # Count entries in both for logging
399
399
  bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
400
400
  bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
401
401
 
402
- # If we have LaTeX content, get filtered BibTeX count
403
- filtered_bib_count = bib_entry_count
404
- filtered_content = bib_content
405
- if tex_content:
406
- cited_keys = extract_cited_keys_from_tex({}, tex_content)
407
- if cited_keys:
408
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
409
- filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
410
- filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
411
-
412
- logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
402
+ logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
413
403
 
414
- # Prioritize .bbl if it has significantly more entries
415
- if bbl_entry_count > filtered_bib_count * 1.5: # 50% more entries threshold
416
- logger.info(f"Using .bbl files from ArXiv source")
404
+ # Only use .bbl if it actually contains bibliography entries
405
+ if bbl_entry_count > 0:
406
+ logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
417
407
  return bbl_content
418
408
  else:
419
- logger.info(f"Using filtered .bib files")
420
- return filtered_content
409
+ logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
410
+ # If we have LaTeX content, filter BibTeX by cited keys
411
+ if tex_content:
412
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
413
+ if cited_keys:
414
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
415
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
416
+ return filtered_content
417
+ return bib_content
421
418
 
422
419
  elif bib_content:
423
420
  logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
@@ -99,9 +99,8 @@ def compare_dois(doi1: str, doi2: str) -> bool:
99
99
  """
100
100
  Compare two DOIs for equality, handling different formats and prefixes.
101
101
 
102
- This function performs exact matching after normalization, which means
103
- DOIs are only considered equal if they are identical after removing
104
- prefixes, case differences, and punctuation.
102
+ This function performs exact matching after normalization, with support
103
+ for partial DOI citations where a shorter DOI is a valid prefix of a longer one.
105
104
 
106
105
  Args:
107
106
  doi1: First DOI to compare
@@ -117,8 +116,27 @@ def compare_dois(doi1: str, doi2: str) -> bool:
117
116
  norm_doi1 = normalize_doi(doi1)
118
117
  norm_doi2 = normalize_doi(doi2)
119
118
 
120
- # DOIs must be exactly identical after normalization
121
- return norm_doi1 == norm_doi2
119
+ # First try exact match
120
+ if norm_doi1 == norm_doi2:
121
+ return True
122
+
123
+ # Handle partial DOI citations - if one DOI is a prefix of the other, consider it a match
124
+ # This handles cases like "10.1007" being cited instead of the full "10.1007/s10458-025-09691-y"
125
+ if len(norm_doi1) != len(norm_doi2):
126
+ shorter_doi = norm_doi1 if len(norm_doi1) < len(norm_doi2) else norm_doi2
127
+ longer_doi = norm_doi2 if len(norm_doi1) < len(norm_doi2) else norm_doi1
128
+
129
+ # Only consider it a valid partial match if:
130
+ # 1. The shorter DOI is at least 7 characters (e.g., "10.1007")
131
+ # 2. The longer DOI starts with the shorter DOI
132
+ # 3. The next character in the longer DOI is '/' or '.' (valid DOI separators)
133
+ if (len(shorter_doi) >= 7 and
134
+ longer_doi.startswith(shorter_doi) and
135
+ len(longer_doi) > len(shorter_doi) and
136
+ longer_doi[len(shorter_doi)] in ['/', '.']):
137
+ return True
138
+
139
+ return False
122
140
 
123
141
 
124
142
  def construct_doi_url(doi: str) -> str:
@@ -183,6 +183,14 @@ def clean_venue_for_comparison(venue: str) -> str:
183
183
  return normalize_venue_for_display(venue)
184
184
 
185
185
 
186
+ def format_missing_venue(correct_venue: str) -> str:
187
+ """
188
+ Format a missing venue message with only the actual value.
189
+ """
190
+ # Only show the actual venue; omit the empty cited line
191
+ return f"Missing venue: '{correct_venue}'"
192
+
193
+
186
194
  def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
187
195
  """
188
196
  Create a standardized venue warning dictionary.
@@ -197,7 +205,15 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
197
205
  # Clean both venues for display in the warning
198
206
  clean_cited = clean_venue_for_comparison(cited_venue)
199
207
  clean_correct = clean_venue_for_comparison(correct_venue)
200
-
208
+
209
+ # If cited venue cleans to empty, treat as missing venue instead of mismatch
210
+ if not clean_cited and clean_correct:
211
+ return {
212
+ 'warning_type': 'venue',
213
+ 'warning_details': format_missing_venue(clean_correct),
214
+ 'ref_venue_correct': correct_venue
215
+ }
216
+
201
217
  return {
202
218
  'warning_type': 'venue',
203
219
  'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),
@@ -506,8 +506,10 @@ def clean_author_name(author):
506
506
  # Fix spacing around periods in initials (e.g., "Y . Li" -> "Y. Li")
507
507
  author = re.sub(r'(\w)\s+\.', r'\1.', author)
508
508
 
509
- # Remove common prefixes/suffixes
510
- author = re.sub(r'\b(Dr\.?|Prof\.?|Professor|Mr\.?|Ms\.?|Mrs\.?)\s*', '', author, flags=re.IGNORECASE)
509
+ # Remove common honorific prefixes only when they are standalone at the start (require trailing whitespace)
510
+ # Previous pattern falsely removed the leading "Mr" from names like "Mrinmaya" due to optional whitespace.
511
+ # Anchor to start and require at least one space after the title to avoid stripping inside longer names.
512
+ author = re.sub(r'^(?:Dr|Prof|Professor|Mr|Ms|Mrs)\.?\s+', '', author, flags=re.IGNORECASE)
511
513
 
512
514
  # Remove email addresses
513
515
  author = re.sub(r'\S+@\S+\.\S+', '', author)
@@ -2111,7 +2113,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2111
2113
  # Allow minor flexibility (1 author difference) but not more
2112
2114
  if abs(len(cleaned_cited) - len(correct_names)) > 1:
2113
2115
  from utils.error_utils import format_author_count_mismatch
2114
- error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), cleaned_cited, correct_names)
2116
+ # Convert cited names to display format (First Last) before showing in error
2117
+ display_cited = [format_author_for_display(author) for author in cleaned_cited]
2118
+ error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2115
2119
  return False, error_msg
2116
2120
 
2117
2121
  # Use the shorter list for comparison
@@ -3586,6 +3590,12 @@ def calculate_title_similarity(title1: str, title2: str) -> float:
3586
3590
  # Normalize titles for comparison
3587
3591
  t1 = title1.lower().strip()
3588
3592
  t2 = title2.lower().strip()
3593
+
3594
+ # Remove trailing year suffixes like ", 2024" or " 2024" for robust matching
3595
+ def strip_trailing_year(s: str) -> str:
3596
+ return re.sub(r"[,\s]*\b(19|20)\d{2}\b\s*$", "", s).strip()
3597
+ t1 = strip_trailing_year(t1)
3598
+ t2 = strip_trailing_year(t2)
3589
3599
 
3590
3600
  # Exact match
3591
3601
  if t1 == t2:
@@ -4674,6 +4684,13 @@ def normalize_venue_for_display(venue: str) -> str:
4674
4684
 
4675
4685
  venue_text = venue.strip()
4676
4686
 
4687
+ # Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
4688
+ # This prevents author/editor lists from being treated as venue
4689
+ # Match 'editors,' 'editor,' or 'eds.,' possibly after a comma; capture the remainder as venue
4690
+ editors_match = re.search(r"(?:^|,)\s*(?:editors?|eds?\.?|editor)\s*,\s*(.+)$", venue_text, re.IGNORECASE)
4691
+ if editors_match:
4692
+ venue_text = editors_match.group(1).strip()
4693
+
4677
4694
  # Extract venue from complex editor strings (e.g. "In Smith, J.; and Doe, K., eds., Conference Name, volume 1")
4678
4695
  # This handles patterns like "In [authors], eds., [venue], [optional metadata]" (case-insensitive)
4679
4696
  editor_match = re.search(r'in\s+[^,]+(?:,\s*[^,]*)*,\s*eds?\.,\s*(.+?)(?:,\s*volume\s*\d+|,\s*pp?\.|$)', venue_text, re.IGNORECASE)
@@ -4700,7 +4717,9 @@ def normalize_venue_for_display(venue: str) -> str:
4700
4717
  prefixes_to_remove = [
4701
4718
  r'^\d{4}\s+\d+(st|nd|rd|th)\s+', # "2012 IEEE/RSJ"
4702
4719
  r'^\d{4}\s+', # "2024 "
4703
- r'^proceedings\s+(of\s+)?(the\s+)?((acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(\d+(st|nd|rd|th)\s+)?', # "Proceedings of the [ORG] [ORG] 29th"
4720
+ # Remove 'Proceedings of [the] [ORG]* [ordinal]*' only when followed by at least one word
4721
+ # This avoids cutting a venue down to just 'Proceedings of the'
4722
+ r'^proceedings\s+of\s+(?!the\s*$)(?:the\s+)?(?:(?:acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(?:\d+(?:st|nd|rd|th)\s+)?',
4704
4723
  r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proc. of the IEEE" (require "of")
4705
4724
  r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Procs. of the IEEE" (require "of")
4706
4725
  r'^in\s+',
@@ -4739,4 +4758,8 @@ def normalize_venue_for_display(venue: str) -> str:
4739
4758
  venue_text = re.sub(r'\s+', ' ', venue_text) # Normalize whitespace
4740
4759
  venue_text = venue_text.strip()
4741
4760
 
4761
+ # If what's left is too generic (e.g., just 'Proceedings of the'), treat as no venue
4762
+ if venue_text.lower() in {"proceedings of the", "proceedings of"}:
4763
+ return ""
4764
+
4742
4765
  return venue_text