academic-refchecker 1.2.41__tar.gz → 1.2.42__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.41/src/academic_refchecker.egg-info → academic_refchecker-1.2.42}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/openreview_checker.py +2 -1
  5. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/semantic_scholar.py +2 -5
  6. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/core/parallel_processor.py +4 -1
  7. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/error_utils.py +19 -1
  8. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/text_utils.py +223 -9
  9. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/LICENSE +0 -0
  10. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/MANIFEST.in +0 -0
  11. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/README.md +0 -0
  12. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/pyproject.toml +0 -0
  13. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/requirements.txt +0 -0
  14. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/scripts/download_db.py +0 -0
  15. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/scripts/run_tests.py +0 -0
  16. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/scripts/start_vllm_server.py +0 -0
  17. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/setup.cfg +0 -0
  18. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/__init__.py +0 -0
  19. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  20. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  21. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  22. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/academic_refchecker.egg-info/requires.txt +0 -0
  23. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  24. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/__init__.py +0 -0
  25. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/crossref.py +0 -0
  26. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/enhanced_hybrid_checker.py +0 -0
  27. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/github_checker.py +0 -0
  28. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/local_semantic_scholar.py +0 -0
  29. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/openalex.py +0 -0
  30. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/checkers/webpage_checker.py +0 -0
  31. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/config/__init__.py +0 -0
  32. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/config/logging.conf +0 -0
  33. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/config/settings.py +0 -0
  34. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/core/__init__.py +0 -0
  35. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/core/db_connection_pool.py +0 -0
  36. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/core/refchecker.py +0 -0
  37. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/database/__init__.py +0 -0
  38. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/database/download_semantic_scholar_db.py +0 -0
  39. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/llm/__init__.py +0 -0
  40. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/llm/base.py +0 -0
  41. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/llm/providers.py +0 -0
  42. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/scripts/__init__.py +0 -0
  43. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/scripts/start_vllm_server.py +0 -0
  44. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/services/__init__.py +0 -0
  45. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/services/pdf_processor.py +0 -0
  46. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/__init__.py +0 -0
  47. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/arxiv_utils.py +0 -0
  48. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/author_utils.py +0 -0
  49. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/biblatex_parser.py +0 -0
  50. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/bibliography_utils.py +0 -0
  51. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/bibtex_parser.py +0 -0
  52. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/config_validator.py +0 -0
  53. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/db_utils.py +0 -0
  54. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/doi_utils.py +0 -0
  55. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/mock_objects.py +0 -0
  56. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.41 → academic_refchecker-1.2.42}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.41
3
+ Version: 1.2.42
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.41"
3
+ __version__ = "1.2.42"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.41
3
+ Version: 1.2.42
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -473,9 +473,10 @@ class OpenReviewReferenceChecker:
473
473
 
474
474
  if cited_venue and paper_venue:
475
475
  if are_venues_substantially_different(cited_venue, paper_venue):
476
+ from utils.error_utils import clean_venue_for_comparison
476
477
  errors.append({
477
478
  "warning_type": "venue",
478
- "warning_details": f"Venue mismatch: cited as '{cited_venue}' but OpenReview shows '{paper_venue}'"
479
+ "warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
479
480
  })
480
481
 
481
482
  # Create verified data structure
@@ -544,11 +544,8 @@ class NonArxivReferenceChecker:
544
544
  if cited_venue and paper_venue:
545
545
  # Use the utility function to check if venues are substantially different
546
546
  if are_venues_substantially_different(cited_venue, paper_venue):
547
- errors.append({
548
- 'warning_type': 'venue',
549
- 'warning_details': f"Venue mismatch: cited as '{cited_venue}' but actually '{paper_venue}'",
550
- 'ref_venue_correct': paper_venue
551
- })
547
+ from utils.error_utils import create_venue_warning
548
+ errors.append(create_venue_warning(cited_venue, paper_venue))
552
549
  elif not cited_venue and paper_venue:
553
550
  # Check if this is an arXiv paper first
554
551
  external_ids = paper_data.get('externalIds', {})
@@ -279,8 +279,11 @@ class ParallelReferenceProcessor:
279
279
  from utils.text_utils import format_authors_for_display
280
280
  authors = format_authors_for_display(reference.get('authors', []))
281
281
  year = reference.get('year', '')
282
- # Get venue from either 'venue' or 'journal' field
282
+ # Get venue from either 'venue' or 'journal' field and clean it up
283
283
  venue = reference.get('venue', '') or reference.get('journal', '')
284
+ if venue:
285
+ from utils.error_utils import clean_venue_for_comparison
286
+ venue = clean_venue_for_comparison(venue)
284
287
  url = reference.get('url', '')
285
288
  doi = reference.get('doi', '')
286
289
 
@@ -89,6 +89,20 @@ def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]
89
89
  }
90
90
 
91
91
 
92
def clean_venue_for_comparison(venue: str) -> str:
    """
    Return ``venue`` cleaned up by the shared venue display normalizer.

    This is a thin wrapper: all of the work is delegated to
    ``utils.text_utils.normalize_venue_for_display``.

    Args:
        venue: Raw venue string

    Returns:
        The value produced by ``normalize_venue_for_display`` for ``venue``
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import with utils.text_utils -- TODO confirm).
    from utils.text_utils import normalize_venue_for_display as _normalize_display

    cleaned_venue = _normalize_display(venue)
    return cleaned_venue
104
+
105
+
92
106
  def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
93
107
  """
94
108
  Create a standardized venue warning dictionary.
@@ -100,9 +114,13 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
100
114
  Returns:
101
115
  Standardized warning dictionary
102
116
  """
117
+ # Clean both venues for display in the warning
118
+ clean_cited = clean_venue_for_comparison(cited_venue)
119
+ clean_correct = clean_venue_for_comparison(correct_venue)
120
+
103
121
  return {
104
122
  'warning_type': 'venue',
105
- 'warning_details': f"Venue mismatch: cited as '{cited_venue}' but actually '{correct_venue}'",
123
+ 'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
106
124
  'ref_venue_correct': correct_venue
107
125
  }
108
126
 
@@ -2255,8 +2255,13 @@ def format_author_for_display(author_name):
2255
2255
  if not author_name:
2256
2256
  return author_name
2257
2257
 
2258
+ # Clean up any stray punctuation that might have been attached during parsing
2259
+ author_name = author_name.strip()
2260
+ # Remove a trailing semicolon or comma that sometimes gets attached during bibliographic parsing
2261
+ author_name = re.sub(r'[;,]\s*$', '', author_name)
2262
+
2258
2263
  # Normalize apostrophes for consistent display
2259
- author_name = normalize_apostrophes(author_name.strip())
2264
+ author_name = normalize_apostrophes(author_name)
2260
2265
 
2261
2266
  # Check if it's in "Lastname, Firstname" format
2262
2267
  if ',' in author_name:
@@ -3667,8 +3672,77 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
3667
3672
  return bool(venue1 != venue2)
3668
3673
 
3669
3674
  # Clean LaTeX commands from both venues first
3670
- venue1 = strip_latex_commands(venue1)
3671
- venue2 = strip_latex_commands(venue2)
3675
+ venue1_latex_cleaned = strip_latex_commands(venue1)
3676
+ venue2_latex_cleaned = strip_latex_commands(venue2)
3677
+
3678
+ # For comparison, we need lowercase normalized versions
3679
def normalize_for_comparison(venue_text):
    """
    Build a lowercase comparison key for a venue name.

    Steps, in order:
      1. clean the text with ``normalize_venue_for_display``;
      2. lowercase it and drop LaTeX ``\\penaltyN`` commands;
      3. expand common academic abbreviations ("proc." -> "proceedings", ...);
      4. remove ``. , ; :`` punctuation, the connecting words "on" and "for",
         and collapse runs of whitespace.

    Args:
        venue_text: Venue string (LaTeX commands already stripped by the caller)

    Returns:
        Normalized lowercase string intended only for equality/similarity checks
    """
    # Get the cleaned display version first
    cleaned = normalize_venue_for_display(venue_text)
    # Then normalize for comparison: lowercase, expand abbreviations, remove punctuation
    venue_lower = cleaned.lower()

    # Handle LaTeX penalty commands before abbreviation expansion
    venue_lower = re.sub(r'\\penalty\d+\s*', ' ', venue_lower)  # Remove \penalty0 etc
    venue_lower = re.sub(r'\s+', ' ', venue_lower).strip()  # Clean up extra spaces

    # Expand abbreviations for comparison
    def expand_abbreviations(text):
        """Replace known abbreviations in already-lowercased ``text``."""
        common_abbrevs = {
            # IEEE specific abbreviations (only expand with periods, not full words)
            'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
            'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
            'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
            'mechatron.': 'mechatronics', 'intell.': 'intelligence',
            'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
            # General academic abbreviations (only expand with periods)
            'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
            'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
            'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
            'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
            'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
            'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
            'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
            'worksh.': 'workshop',
            'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
            # Physics journal abbreviations
            'phys. rev.': 'physical review',
            'phys. rev. lett.': 'physical review letters',
            'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
            'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
            'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
            'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
            'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
            'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
            'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
            # Nature journals
            'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
            # Handle specific multi-word patterns and well-known acronyms
            'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
            'pnas': 'proceedings of the national academy of sciences',
        }
        # Sort by length (longest first) to ensure longer matches take precedence
        for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
            # A trailing "\b" can never match between "." and a following space,
            # so abbreviations ending in a period are anchored at the start only.
            if abbrev.endswith('.'):
                pattern = r'\b' + re.escape(abbrev)
            else:
                pattern = r'\b' + re.escape(abbrev) + r'\b'
            text = re.sub(pattern, expansion, text)
        return text

    venue_lower = expand_abbreviations(venue_lower)

    # Remove punctuation and normalize spacing for comparison.
    # NOTE: these are raw strings, so whitespace is a single-backslash "\s";
    # a doubled backslash would match a literal backslash followed by "s".
    venue_lower = re.sub(r'[.,;:]', '', venue_lower)  # Remove punctuation
    venue_lower = re.sub(r'\s+on\s+', ' ', venue_lower)  # Remove "on" preposition
    venue_lower = re.sub(r'\s+for\s+', ' ', venue_lower)  # Remove "for" preposition
    venue_lower = re.sub(r'\s+', ' ', venue_lower).strip()  # Normalize whitespace

    return venue_lower
3743
+
3744
+ normalized_venue1 = normalize_for_comparison(venue1_latex_cleaned)
3745
+ normalized_venue2 = normalize_for_comparison(venue2_latex_cleaned)
3672
3746
 
3673
3747
  def expand_abbreviations(text):
3674
3748
  """Generic abbreviation expansion using common academic patterns"""
@@ -3985,8 +4059,8 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
3985
4059
  if not acronym or not full_text:
3986
4060
  return False
3987
4061
 
3988
- # Normalize the full text
3989
- normalized_full = normalize_venue(full_text)
4062
+ # Use the internal comparison normalization function
4063
+ normalized_full = normalize_for_comparison(full_text)
3990
4064
 
3991
4065
  # Generate all possible acronyms from the full text
3992
4066
  possible_acronyms = []
@@ -4100,9 +4174,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
4100
4174
  if (arxiv1 == 'arxiv' and arxiv2.startswith('https://arxiv.org')) or (arxiv2 == 'arxiv' and arxiv1.startswith('https://arxiv.org')):
4101
4175
  return False
4102
4176
 
4103
- # Normalize both venues first
4104
- norm1 = normalize_venue(venue1)
4105
- norm2 = normalize_venue(venue2)
4177
+ # Use normalized venues from shared function
4178
+ norm1 = normalized_venue1
4179
+ norm2 = normalized_venue2
4106
4180
 
4107
4181
  # Direct match after normalization (highest priority)
4108
4182
  if norm1 == norm2:
@@ -4356,4 +4430,144 @@ def is_year_substantially_different(cited_year: int, correct_year: int, context:
4356
4430
 
4357
4431
  # Any year difference should be flagged as a warning for manual review
4358
4432
  warning_msg = f"Year mismatch: cited as {cited_year} but actually {correct_year}"
4359
- return True, warning_msg
4433
+ return True, warning_msg
4434
+
4435
+
4436
def normalize_venue_for_display(venue: str) -> str:
    """
    Strip citation boilerplate from a venue name while keeping it readable.

    Case, punctuation and abbreviations of the venue itself are preserved.
    What gets removed is the surrounding noise: editor lists, years,
    volume/issue/page information, procedural prefixes ("In",
    "Proceedings of the 41st", "The Twelfth", ...) and organisation
    prefixes/suffixes ("IEEE", "ACM", ...).

    Args:
        venue: Raw venue string

    Returns:
        Cleaned venue string, or "" when ``venue`` is empty or None
    """
    if not venue:
        return ""

    venue_text = venue.strip()

    # Extract venue from complex editor strings
    # (e.g. "In Smith, J.; and Doe, K., eds., Conference Name, volume 1").
    # The comma-separated groups are written as ",[^,]*" rather than
    # ",\s*[^,]*": both accept the same text, but the latter lets "\s*" and
    # "[^,]*" compete for the same whitespace and backtracks exponentially on
    # comma-heavy strings that contain no "eds.".
    editor_match = re.search(
        r'in\s+[^,]+(?:,[^,]*)*,\s*eds?\.,\s*(.+?)(?:,\s*volume\s*\d+|,\s*pp?\.|$)',
        venue_text,
        re.IGNORECASE,
    )
    if editor_match:
        # Extract the venue part from editor string (preserve original case)
        venue_text = editor_match.group(1).strip()
        # Clean up any remaining metadata like "volume X of Proceedings..."
        venue_text = re.sub(r',\s*volume\s+\d+.*$', '', venue_text, flags=re.IGNORECASE)
        venue_text = re.sub(r'\s+of\s+proceedings.*$', '', venue_text, flags=re.IGNORECASE)

    # Remove years, volumes, pages, and other citation metadata,
    # but preserve arXiv IDs (don't remove digits after "arXiv:")
    if not re.match(r'arxiv:', venue_text, re.IGNORECASE):
        venue_text = re.sub(r',?\s*\d{4}[a-z]?\s*$', '', venue_text)  # Years like "2024" or "2024b"
        venue_text = re.sub(r',?\s*\(\d{4}\)$', '', venue_text)  # Years in parentheses
        venue_text = re.sub(r"'\d{2}$", '', venue_text)  # Year suffixes like "CVPR'16"
        venue_text = re.sub(r',?\s*(vol\.?\s*|volume\s*)\d+.*$', '', venue_text, flags=re.IGNORECASE)  # Volume info
        venue_text = re.sub(r',?\s*\d+\s*\([^)]*\).*$', '', venue_text)  # Issue info with optional spaces
        venue_text = re.sub(r',?\s*pp?\.\s*\d+.*$', '', venue_text, flags=re.IGNORECASE)  # Page info
        venue_text = re.sub(r'\s*\(print\).*$', '', venue_text, flags=re.IGNORECASE)  # Print designation
        venue_text = re.sub(r'\s*\(\d{4}\.\s*print\).*$', '', venue_text, flags=re.IGNORECASE)  # Year.Print

    # Procedural prefixes (matched case-insensitively)
    prefixes_to_remove = (
        r'^\d{4}\s+\d+(st|nd|rd|th)\s+',  # "2012 25th "
        r'^\d{4}\s+',  # "2024 "
        r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Proceedings of the IEEE"
        r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Proc. of the IEEE" (require "of")
        r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Procs. of the IEEE" (require "of")
        r'^in\s+',
        r'^advances\s+in\s+',  # "Advances in Neural Information Processing Systems"
        r'^adv\.\s+',  # "Adv. Neural Information Processing Systems"
        # Handle ordinal prefixes: "The Twelfth", "The Ninth", etc.
        r'^the\s+(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|thirty-second|thirty-third|thirty-fourth|thirty-fifth|thirty-sixth|thirty-seventh|thirty-eighth|thirty-ninth|fortieth|forty-first|forty-second|forty-third|forty-fourth|forty-fifth|forty-sixth|forty-seventh|forty-eighth|forty-ninth|fiftieth)\s+',
        # Handle numeric ordinals: "The 41st", "The 12th", etc.
        r'^the\s+\d+(st|nd|rd|th)\s+',
        # Handle standalone "The" prefix
        r'^the\s+',
    )

    # Repeat the pass until nothing changes so the result does not depend on
    # the order of the list: a single pass turns "In Proceedings of the 41st X"
    # into "Proceedings of the 41st X" because "In" is listed after
    # "Proceedings". Every substitution only shortens the string, so the loop
    # terminates.
    previous_text = None
    while previous_text != venue_text:
        previous_text = venue_text
        for prefix_pattern in prefixes_to_remove:
            venue_text = re.sub(prefix_pattern, '', venue_text, flags=re.IGNORECASE)

    # Note: for display purposes, case is preserved and abbreviations are not
    # expanded; only minimal cleaning is done below.

    # Remove organization prefixes/suffixes that don't affect identity,
    # but preserve IEEE when it's part of a journal name like "IEEE Transactions"
    if not re.match(r'ieee\s+transactions', venue_text, re.IGNORECASE):
        venue_text = re.sub(r'^(ieee|acm|aaai|usenix|sigcomm|sigkdd|sigmod|vldb|osdi|sosp|eurosys)\s+', '', venue_text, flags=re.IGNORECASE)  # Remove org prefixes
        venue_text = re.sub(r'^ieee/\w+\s+', '', venue_text, flags=re.IGNORECASE)  # Remove "IEEE/RSJ " etc
        venue_text = re.sub(r'\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE)  # Remove org suffixes
        venue_text = re.sub(r'/\w+\s+', ' ', venue_text)  # Remove "/ACM " style org separators

    # IMPORTANT: "Conference on" and "International" are kept - they are part
    # of the displayed name. Only org-prefixed "X Conference on" is removed.
    venue_text = re.sub(r'^(ieee|acm|aaai|nips)(/\w+)?\s+conference\s+on\s+', '', venue_text, flags=re.IGNORECASE)

    # Clean up spacing (preserve punctuation and case for display)
    venue_text = re.sub(r'\s+', ' ', venue_text)  # Normalize whitespace
    venue_text = venue_text.strip()

    return venue_text