academic-refchecker 1.2.65__py3-none-any.whl → 1.2.67__py3-none-any.whl

This diff shows the contents of publicly released package versions from a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (33)
  1. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/METADATA +72 -7
  2. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/RECORD +33 -18
  3. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/entry_points.txt +1 -0
  4. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/top_level.txt +1 -0
  5. backend/__init__.py +21 -0
  6. backend/__main__.py +11 -0
  7. backend/cli.py +64 -0
  8. backend/concurrency.py +100 -0
  9. backend/database.py +711 -0
  10. backend/main.py +1367 -0
  11. backend/models.py +99 -0
  12. backend/refchecker_wrapper.py +1126 -0
  13. backend/static/assets/index-2P6L_39v.css +1 -0
  14. backend/static/assets/index-hk21nqxR.js +25 -0
  15. backend/static/favicon.svg +6 -0
  16. backend/static/index.html +15 -0
  17. backend/static/vite.svg +1 -0
  18. backend/thumbnail.py +517 -0
  19. backend/websocket_manager.py +104 -0
  20. refchecker/__version__.py +2 -2
  21. refchecker/checkers/crossref.py +15 -6
  22. refchecker/checkers/enhanced_hybrid_checker.py +18 -4
  23. refchecker/checkers/local_semantic_scholar.py +2 -2
  24. refchecker/checkers/openalex.py +15 -6
  25. refchecker/checkers/semantic_scholar.py +15 -6
  26. refchecker/core/refchecker.py +17 -6
  27. refchecker/utils/__init__.py +2 -1
  28. refchecker/utils/arxiv_utils.py +18 -60
  29. refchecker/utils/doi_utils.py +32 -1
  30. refchecker/utils/error_utils.py +20 -9
  31. refchecker/utils/text_utils.py +143 -27
  32. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/WHEEL +0 -0
  33. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/licenses/LICENSE +0 -0
@@ -256,6 +256,21 @@ class EnhancedHybridReferenceChecker:
         Returns:
             Tuple of (verified_data, errors, url)
         """
+        # Check if this is a URL-only reference (should skip verification)
+        authors = reference.get('authors', [])
+        if authors and "URL Reference" in authors:
+            # Skip verification for URL references - they're just links, not papers
+            logger.debug("Enhanced Hybrid: Skipping verification for URL reference")
+            return None, [], reference.get('cited_url') or reference.get('url')
+
+        # Also check if it looks like a URL-only reference (no title, just URL)
+        title = reference.get('title', '').strip()
+        cited_url = reference.get('cited_url') or reference.get('url')
+        if not title and cited_url:
+            # This is a URL-only reference without a title
+            logger.debug(f"Enhanced Hybrid: Skipping verification for URL-only reference: {cited_url}")
+            return None, [], cited_url
+
         # Track all APIs that failed and could be retried
         failed_apis = []
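The new guard short-circuits verification when the parser has tagged a reference with the sentinel author "URL Reference", or when a reference carries a URL but no title. A minimal standalone sketch of the same logic (the `reference` dict shape is inferred from the `.get()` calls in the hunk):

```python
def skip_if_url_only(reference: dict):
    """Mirror of the new guard: returns (verified_data, errors, url) for URL-only refs."""
    cited_url = reference.get('cited_url') or reference.get('url')
    authors = reference.get('authors', [])
    if authors and "URL Reference" in authors:
        return None, [], cited_url          # sentinel author list -> skip
    if not reference.get('title', '').strip() and cited_url:
        return None, [], cited_url          # no title, just a link -> skip
    return None                             # fall through to normal verification

print(skip_if_url_only({'authors': ['URL Reference'], 'url': 'https://example.com'}))
# -> (None, [], 'https://example.com')
```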
@@ -533,10 +548,9 @@ class EnhancedHybridReferenceChecker:
         if self.semantic_scholar:
             return self.semantic_scholar.normalize_paper_title(title)
         else:
-            # Basic normalization if Semantic Scholar is not available
-            import re
-            title = re.sub(r'\s+', ' ', title.strip().lower())
-            return re.sub(r'[^\w\s]', '', title)
+            # Use the centralized normalization function from text_utils
+            from refchecker.utils.text_utils import normalize_paper_title as normalize_title
+            return normalize_title(title)

     def compare_authors(self, cited_authors: List[str], correct_authors: List[Any]) -> Tuple[bool, str]:
         """
@@ -37,9 +37,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from refchecker.utils.doi_utils import extract_doi_from_url, compare_dois, construct_doi_url
 from refchecker.utils.error_utils import create_author_error, create_year_warning, create_doi_error
-from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity, extract_arxiv_id_from_url
+from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity
+from refchecker.utils.url_utils import extract_arxiv_id_from_url, get_best_available_url
 from refchecker.utils.db_utils import process_semantic_scholar_result, process_semantic_scholar_results
-from refchecker.utils.url_utils import get_best_available_url
 from refchecker.config.settings import get_config

 # Set up logging
@@ -460,13 +460,22 @@ class OpenAlexReferenceChecker:

         if doi and work_doi:
             # Compare DOIs using the proper comparison function
-            from refchecker.utils.doi_utils import compare_dois
+            from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
             if not compare_dois(doi, work_doi):
-                errors.append({
-                    'error_type': 'doi',
-                    'error_details': format_doi_mismatch(doi, work_doi),
-                    'ref_doi_correct': work_doi
-                })
+                # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                # Treat as warning instead of error
+                if validate_doi_resolves(doi):
+                    errors.append({
+                        'warning_type': 'doi',
+                        'warning_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
+                else:
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })

         # Extract URL from work data
         work_url = self.extract_url_from_work(work_data)
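This resolve-then-classify branch is duplicated almost verbatim in semantic_scholar.py and refchecker/core/refchecker.py below. Condensed, the shared decision looks like this (sketch only; `format_doi_mismatch`'s exact wording is not shown in this diff, so a plain f-string stands in for it):

```python
def classify_doi_mismatch(cited_doi: str, correct_doi: str, resolves: bool) -> dict:
    """Warning when the cited DOI resolves (likely a valid alternate), error otherwise."""
    kind = 'warning' if resolves else 'error'
    return {
        f'{kind}_type': 'doi',
        f'{kind}_details': f"DOI mismatch: cited {cited_doi}, expected {correct_doi}",
        'ref_doi_correct': correct_doi,
    }
```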
@@ -612,14 +612,23 @@ class NonArxivReferenceChecker:
             paper_doi = external_ids['DOI']

             # Compare DOIs using the proper comparison function
-            from refchecker.utils.doi_utils import compare_dois
+            from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
             if doi and paper_doi and not compare_dois(doi, paper_doi):
                 from refchecker.utils.error_utils import format_doi_mismatch
-                errors.append({
-                    'error_type': 'doi',
-                    'error_details': format_doi_mismatch(doi, paper_doi),
-                    'ref_doi_correct': paper_doi
-                })
+                # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                # Treat as warning instead of error
+                if validate_doi_resolves(doi):
+                    errors.append({
+                        'warning_type': 'doi',
+                        'warning_details': format_doi_mismatch(doi, paper_doi),
+                        'ref_doi_correct': paper_doi
+                    })
+                else:
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, paper_doi),
+                        'ref_doi_correct': paper_doi
+                    })

             # Extract URL from paper data - prioritize arXiv URLs when available
             paper_url = None
@@ -46,12 +46,13 @@ import json
 import random
 from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
 from refchecker.utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
-                                         extract_arxiv_id_from_url, normalize_text as common_normalize_text,
+                                         normalize_text as common_normalize_text,
                                          detect_latex_bibliography_format, extract_latex_references,
                                          detect_standard_acm_natbib_format, strip_latex_commands,
                                          format_corrected_reference, is_name_match, enhanced_name_match,
                                          calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
                                          compare_authors)
+from refchecker.utils.url_utils import extract_arxiv_id_from_url
 from refchecker.utils.config_validator import ConfigValidator
 from refchecker.services.pdf_processor import PDFProcessor
 from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
@@ -1963,11 +1964,21 @@ class ArxivReferenceChecker:
             if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
                 logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
                 from refchecker.utils.error_utils import format_doi_mismatch
-                errors.append({
-                    'error_type': 'doi',
-                    'error_details': format_doi_mismatch(doi, external_ids['DOI']),
-                    'ref_doi_correct': external_ids['DOI']
-                })
+                from refchecker.utils.doi_utils import validate_doi_resolves
+                # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                # Treat as warning instead of error
+                if validate_doi_resolves(doi):
+                    errors.append({
+                        'warning_type': 'doi',
+                        'warning_details': format_doi_mismatch(doi, external_ids['DOI']),
+                        'ref_doi_correct': external_ids['DOI']
+                    })
+                else:
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, external_ids['DOI']),
+                        'ref_doi_correct': external_ids['DOI']
+                    })
             else:
                 logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
@@ -4,9 +4,10 @@ Utility functions for text processing, author comparison, mocking, and configura

 from .text_utils import (
     clean_author_name, clean_title, normalize_text,
-    extract_arxiv_id_from_url, clean_conference_markers_from_title,
+    clean_conference_markers_from_title,
     remove_year_from_title
 )
+from .url_utils import extract_arxiv_id_from_url
 from .author_utils import compare_authors, levenshtein_distance, extract_authors_list
 from .mock_objects import (
     MockPaper, MockReference, MockLLMProvider, MockSemanticScholarAPI, MockArxivAPI,
@@ -422,6 +422,11 @@ def get_bibtex_content(paper):
     """
     Try to get BibTeX content for a paper from various sources.

+    For ArXiv papers, only use .bbl files (compiled bibliography).
+    The .bbl file contains only the actually-cited references, while .bib files
+    are unreliable - they may contain entire bibliography databases (e.g., full
+    ACL Anthology with 80k+ entries) or unfiltered reference collections.
+
     Args:
         paper: Paper object
@@ -433,71 +438,24 @@ def get_bibtex_content(paper):
     # Try ArXiv source if it's an ArXiv paper
     arxiv_id = extract_arxiv_id_from_paper(paper)
     if arxiv_id:
-        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
+        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for .bbl bibliography")
         tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)

-        # Choose between .bib and .bbl files based on what the main TeX file actually uses
-        # Check the main TeX file to see if it uses \bibliography{...} (BibTeX) or not (BBL)
-        uses_bibtex = False
-        if tex_content:
-            # Look for \bibliography{...} commands in the main TeX file
-            bib_pattern = r'\\bibliography\{([^}]+)\}'
-            bib_matches = re.findall(bib_pattern, tex_content)
-            if bib_matches:
-                uses_bibtex = True
-                referenced_bibs = []
-                for match in bib_matches:
-                    bib_names = [name.strip() for name in match.split(',')]
-                    referenced_bibs.extend(bib_names)
-                logger.debug(f"Main TeX file references BibTeX files: {referenced_bibs}")
-
-        if bib_content and bbl_content:
-            # Count entries in both for logging
-            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+        # Only use .bbl files for ArXiv papers (.bib files are unreliable)
+        if bbl_content:
             bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
-
-            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
-
-            # IMPORTANT: Prefer .bbl when .bib is excessively large (e.g., includes full ACL Anthology)
-            # The .bbl file contains only the actually-cited references, while .bib may contain
-            # entire bibliography databases. Parsing 80k+ entries would cause the tool to hang.
-            # Use .bbl if: (1) .bbl has entries AND (2) .bib has >10x more entries than .bbl OR >1000 entries
-            excessive_bib = bib_entry_count > 1000 or (bbl_entry_count > 0 and bib_entry_count > bbl_entry_count * 10)
-
-            if bbl_entry_count > 0 and excessive_bib:
-                logger.info(f"Using .bbl files from ArXiv source (.bib has {bib_entry_count} entries which is excessive, .bbl has {bbl_entry_count})")
-                return bbl_content
-            elif uses_bibtex and bib_entry_count > 0 and not excessive_bib:
-                logger.info(f"Using .bib files from ArXiv source (main TeX uses \\bibliography{{...}})")
-                return bib_content
-            elif bbl_entry_count > 0:
-                logger.info(f"Using .bbl files from ArXiv source (main TeX doesn't use \\bibliography or .bib is empty)")
+            if bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source ({bbl_entry_count} entries)")
                 return bbl_content
-            elif bib_entry_count > 0:
-                logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
-                return bib_content
             else:
-                logger.warning(f"Both .bib and .bbl files appear to be empty")
-                return bib_content  # Default to bib_content as fallback
-
-        elif bib_content:
-            logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-            return bib_content
-
-        elif bbl_content:
-            logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
-            return bbl_content
-
-        elif tex_content:
-            # Check for embedded bibliography in LaTeX
-            from refchecker.utils.text_utils import detect_latex_bibliography_format
-            latex_format = detect_latex_bibliography_format(tex_content)
-            if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-                logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
-                # Skip embedded bibliography and return None to trigger fallback methods
-                return None
-
-        # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+                logger.debug(f"Found .bbl file but it appears empty")
+
+        # No .bbl available - return None to trigger PDF fallback
+        if bib_content:
+            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+            logger.debug(f"Skipping .bib file ({bib_entry_count} entries) - unreliable, falling back to PDF extraction")
+
+        logger.debug(f"No usable .bbl file found for ArXiv paper {arxiv_id}")

     return None
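The selection policy is now a single question: does the .bbl contain any `\bibitem` entries? A quick illustration of the counting regex from the hunk on a toy .bbl:

```python
import re

bbl = r"""
\begin{thebibliography}{2}
\bibitem[Smith(2020)]{smith2020} J. Smith. \newblock A paper. 2020.
\bibitem{doe2019} J. Doe. \newblock Another paper. 2019.
\end{thebibliography}
"""

count = len(re.findall(r'\\bibitem[\[\{]', bbl))  # matches both \bibitem{ and \bibitem[
print(count)  # 2 -> use the .bbl; a count of 0 falls through to PDF extraction
```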
@@ -156,4 +156,35 @@ def construct_doi_url(doi: str) -> str:
     normalized_doi = normalize_doi(doi)

     # Construct URL
-    return f"https://doi.org/{normalized_doi}"
+    return f"https://doi.org/{normalized_doi}"
+
+
+def validate_doi_resolves(doi: str, timeout: float = 5.0) -> bool:
+    """
+    Validate that a DOI resolves by checking if doi.org returns a redirect.
+
+    This is useful for determining if a DOI is valid, even if it's different
+    from what a verification source has stored (e.g., arXiv DOI vs conference DOI).
+
+    Args:
+        doi: DOI string to validate
+        timeout: Request timeout in seconds
+
+    Returns:
+        True if DOI resolves (returns 302/301/200), False otherwise
+    """
+    if not doi or not is_valid_doi_format(normalize_doi(doi)):
+        return False
+
+    try:
+        import requests
+        url = construct_doi_url(doi)
+        # Use HEAD request first (faster), fall back to GET if needed
+        response = requests.head(url, allow_redirects=False, timeout=timeout)
+        # DOI.org returns 302 for valid DOIs that redirect to the paper
+        # Some may return 301 (permanent redirect) or 200 (direct response)
+        return response.status_code in (200, 301, 302, 303, 307, 308)
+    except Exception:
+        # On any error (timeout, connection error, etc.), assume DOI might be valid
+        # to avoid false negatives due to network issues
+        return True
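A quick usage check of the new helper (requires network access; the DOIs are illustrative):

```python
from refchecker.utils.doi_utils import validate_doi_resolves

# A resolvable arXiv DOI: doi.org answers with a redirect, so this returns True
print(validate_doi_resolves("10.48550/arXiv.1706.03762"))

# A malformed string should fail is_valid_doi_format() and short-circuit to False
print(validate_doi_resolves("not-a-doi"))
```

Note the asymmetric failure mode: on a timeout or connection error the function returns True, so flaky networks bias mismatches toward warnings rather than errors.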
@@ -126,28 +126,39 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:

 def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str]]:
     """
-    Create a standardized DOI error dictionary.
+    Create a standardized DOI error or warning dictionary.
+
+    If the cited DOI resolves (is valid), this returns a warning instead of an error,
+    since papers can have multiple valid DOIs (e.g., arXiv DOI vs conference DOI).

     Args:
         cited_doi: DOI as cited in the reference
         correct_doi: Correct DOI from database

     Returns:
-        Standardized error dictionary if DOIs differ, None if they match after cleaning
+        Standardized error/warning dictionary if DOIs differ, None if they match after cleaning
     """
-    # Strip trailing periods before comparison to avoid false mismatches
-    cited_doi_clean = cited_doi.rstrip('.')
-    correct_doi_clean = correct_doi.rstrip('.')
+    from refchecker.utils.doi_utils import validate_doi_resolves, compare_dois
+
+    # Use compare_dois which handles normalization (case, prefixes, trailing punctuation)
+    if compare_dois(cited_doi, correct_doi):
+        return None

-    # Only create error if DOIs are actually different after cleaning
-    if cited_doi_clean != correct_doi_clean:
+    # DOIs are different - determine if this should be error or warning
+    # If cited DOI resolves, it's likely a valid alternate DOI
+    # Treat as warning instead of error
+    if validate_doi_resolves(cited_doi):
+        return {
+            'warning_type': 'doi',
+            'warning_details': format_doi_mismatch(cited_doi, correct_doi),
+            'ref_doi_correct': correct_doi
+        }
+    else:
        return {
            'error_type': 'doi',
            'error_details': format_doi_mismatch(cited_doi, correct_doi),
            'ref_doi_correct': correct_doi
        }
-
-    return None


 def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]:
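Callers of `create_doi_error` now see three outcomes instead of two. A hedged usage sketch (the DOIs are illustrative, and the resolve check performs a live HEAD request):

```python
from refchecker.utils.error_utils import create_doi_error

result = create_doi_error("10.48550/arXiv.1706.03762", "10.5555/3295222.3295349")
if result is None:
    print("DOIs match after normalization")
elif 'warning_type' in result:
    print("alternate DOI:", result['warning_details'])  # cited DOI resolves
else:
    print("wrong DOI:", result['error_details'])        # cited DOI does not resolve
```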
@@ -676,22 +676,6 @@ def clean_title(title):
     return title


-def extract_arxiv_id_from_url(url):
-    """
-    Extract ArXiv ID from URL or text containing ArXiv reference.
-
-    This function is deprecated. Use utils.url_utils.extract_arxiv_id_from_url instead.
-    Kept for backwards compatibility.
-
-    Args:
-        url: URL string or text containing arXiv reference
-
-    Returns:
-        ArXiv ID or None if not found
-    """
-    from refchecker.utils.url_utils import extract_arxiv_id_from_url as common_extract
-    return common_extract(url)
-
 def extract_year_from_text(text):
     """
     Extract a 4-digit year from text
@@ -808,6 +792,9 @@ def normalize_paper_title(title: str) -> str:
     # Strip LaTeX commands first to handle math formatting consistently
     normalized = strip_latex_commands(title)

+    # Normalize diacritics (ü -> u, é -> e, etc.) for consistent comparison
+    normalized = normalize_diacritics(normalized)
+
     # Convert to lowercase
     normalized = normalized.lower()
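`normalize_diacritics` itself is not shown in this diff; the conventional implementation, and presumably what is being called here, is NFKD decomposition with combining marks dropped:

```python
import unicodedata

def normalize_diacritics(text: str) -> str:
    """Fold accented characters to ASCII base letters (assumed implementation)."""
    decomposed = unicodedata.normalize('NFKD', text)  # ü -> u + combining diaeresis
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

print(normalize_diacritics("Müller Réseaux"))  # 'Muller Reseaux'
```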
@@ -2343,6 +2330,17 @@ def detect_latex_bibliography_format(text):
             'details': details
         }

+    # Check for standalone \bibitem entries (common in .bbl files without full environment wrapper)
+    # This handles cases where the \begin{thebibliography} wrapper is missing
+    bibitem_matches = re.findall(r'\\bibitem(?:\[[^\]]*\])?\{[^}]+\}', text)
+    if bibitem_matches:
+        details['bibitem_count'] = len(bibitem_matches)
+        return {
+            'is_latex': True,
+            'format_type': 'thebibliography',
+            'details': details
+        }
+
     # Check for \bibliography{} command
     bibcommand_pattern = r'\\bibliography\{([^}]+)\}'
     bibcommand_match = re.search(bibcommand_pattern, text, re.IGNORECASE)
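With this check, a bare .bbl fragment is recognized even without the `\begin{thebibliography}` wrapper:

```python
import re

fragment = r"\bibitem[Smith(2020)]{smith2020} J. Smith. A paper. 2020."

matches = re.findall(r'\\bibitem(?:\[[^\]]*\])?\{[^}]+\}', fragment)
print(matches)  # ['\\bibitem[Smith(2020)]{smith2020}'] -> classified as thebibliography
```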
@@ -3125,7 +3123,8 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
     # Parse \bibitem entries (improved for .bbl files with ACM-Reference-Format)
     # Handle both simple \bibitem{key} and complex \bibitem[label]{key} formats
     # Also handle line continuation with % and various spacing patterns
-    bibitem_pattern = r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}\s*(.*?)(?=\\bibitem|\\end\{thebibliography\})'
+    # Updated to also match end-of-string ($) for standalone bibitem entries
+    bibitem_pattern = r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}\s*(.*?)(?=\\bibitem|\\end\{thebibliography\}|$)'

     matches = re.finditer(bibitem_pattern, text, re.DOTALL | re.IGNORECASE)
@@ -3196,10 +3195,21 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
         if label_year_match:
             ref['year'] = int(label_year_match.group(1))
         else:
-            # Try to extract from content
-            content_year_match = re.search(r'\b(19|20)\d{2}\b', content)
-            if content_year_match:
-                ref['year'] = int(content_year_match.group())
+            # Try to extract from content - be careful to avoid ArXiv IDs like 1907.10641
+            # Look for year at end of content or after a comma (typical citation format)
+            # Pattern: standalone year after comma/space, not followed by a dot and more digits (ArXiv ID)
+            year_patterns = [
+                r',\s*((?:19|20)\d{2})\s*\.$',   # Year at end after comma: ", 2019."
+                r',\s*((?:19|20)\d{2})\s*$',     # Year at end after comma: ", 2019"
+                r'\s+((?:19|20)\d{2})\s*\.$',    # Year at end after space: " 2019."
+                r'\s+((?:19|20)\d{2})\s*$',      # Year at end after space: " 2019"
+                r'\b((?:19|20)\d{2})(?!\.\d)',   # Year not followed by decimal (avoid ArXiv IDs)
+            ]
+            for pattern in year_patterns:
+                content_year_match = re.search(pattern, content)
+                if content_year_match:
+                    ref['year'] = int(content_year_match.group(1))
+                    break

         # Parse natbib format: usually has author line, then \newblock title, then \newblock venue
         parts = re.split(r'\\newblock', content, flags=re.IGNORECASE)
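The ordering matters: the end-anchored patterns win before the loose `\b` fallback can touch an arXiv ID. For example:

```python
import re

year_patterns = [
    r',\s*((?:19|20)\d{2})\s*\.$',
    r',\s*((?:19|20)\d{2})\s*$',
    r'\s+((?:19|20)\d{2})\s*\.$',
    r'\s+((?:19|20)\d{2})\s*$',
    r'\b((?:19|20)\d{2})(?!\.\d)',
]

content = "An adversarial challenge. arXiv preprint arXiv:1907.10641, 2019."
for pattern in year_patterns:
    m = re.search(pattern, content)
    if m:
        print(m.group(1))  # '2019' - the old \b(19|20)\d{2}\b pattern grabbed '1907'
        break
```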
@@ -3300,7 +3310,80 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
        # Second part is usually title
        if len(parts) >= 2:
            title_part = parts[1].strip()
-            title_clean = strip_latex_commands(title_part).strip()
+
+            # Handle \href{URL}{text} or \href {URL} {text} format
+            # Extract URL before stripping LaTeX commands
+            # We need to use balanced brace matching because titles can contain
+            # nested braces like {LLM} for capitalization protection
+            href_url = None
+            title_text = None
+
+            href_start = title_part.find('\\href')
+            if href_start != -1:
+                # Find first opening brace (URL)
+                pos = href_start + 5  # Skip \href
+                while pos < len(title_part) and title_part[pos] in ' \t\n':
+                    pos += 1
+
+                if pos < len(title_part) and title_part[pos] == '{':
+                    # Extract URL using balanced braces
+                    brace_count = 0
+                    url_start = pos + 1
+                    url_end = pos
+                    for i in range(pos, len(title_part)):
+                        if title_part[i] == '{':
+                            brace_count += 1
+                        elif title_part[i] == '}':
+                            brace_count -= 1
+                            if brace_count == 0:
+                                url_end = i
+                                break
+
+                    if url_end > url_start:
+                        href_url = title_part[url_start:url_end].strip()
+
+                        # Now find the second brace group (title text)
+                        pos = url_end + 1
+                        while pos < len(title_part) and title_part[pos] in ' \t\n':
+                            pos += 1
+
+                        if pos < len(title_part) and title_part[pos] == '{':
+                            # Extract title text using balanced braces
+                            brace_count = 0
+                            text_start = pos + 1
+                            text_end = pos
+                            for i in range(pos, len(title_part)):
+                                if title_part[i] == '{':
+                                    brace_count += 1
+                                elif title_part[i] == '}':
+                                    brace_count -= 1
+                                    if brace_count == 0:
+                                        text_end = i
+                                        break
+
+                            if text_end > text_start:
+                                title_text = title_part[text_start:text_end].strip()
+
+            if href_url and title_text:
+
+                # Extract DOI if it's a doi.org URL
+                if 'doi.org/' in href_url and not ref.get('doi'):
+                    doi_match = re.search(r'doi\.org/(.+)$', href_url)
+                    if doi_match:
+                        ref['doi'] = doi_match.group(1)
+                    ref['url'] = href_url
+                # Extract arXiv ID if it's an arxiv URL
+                elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
+                    ref['url'] = href_url
+                # Generic URL
+                elif not ref.get('url'):
+                    ref['url'] = href_url
+
+                # Use the title text (second part of href), not the URL
+                title_clean = strip_latex_commands(title_text).strip()
+            else:
+                title_clean = strip_latex_commands(title_part).strip()
+
            # Remove trailing dots and clean up
            title_clean = title_clean.rstrip('.')
            if title_clean and len(title_clean) > 5:  # Reasonable title length
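A simple regex such as `\href\{([^}]+)\}\{([^}]+)\}` would stop at the first `}` inside `{LLM}`-style capitalization braces; the balanced scan above avoids that. A compact standalone sketch of the same idea (the helper name is illustrative and well-formed input is assumed):

```python
def split_href(s: str):
    """Return (url, text) from \\href{URL}{TEXT}, tolerating nested braces in TEXT."""
    groups, i = [], s.find('\\href') + len('\\href')
    for _ in range(2):                          # two brace groups: URL, then text
        while i < len(s) and s[i] in ' \t\n':   # \href {URL} {text} spacing is legal
            i += 1
        depth, start = 0, i + 1
        for j in range(i, len(s)):
            depth += {'{': 1, '}': -1}.get(s[j], 0)
            if depth == 0:
                groups.append(s[start:j])
                i = j + 1
                break
    return tuple(groups)

part = r'\href{https://arxiv.org/abs/1907.10641}{{W}ino{G}rande: a challenge}'
print(split_href(part))
# ('https://arxiv.org/abs/1907.10641', '{W}ino{G}rande: a challenge')
```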
@@ -3310,9 +3393,13 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
        if len(parts) >= 3:
            venue_part = parts[2].strip()
            venue_clean = strip_latex_commands(venue_part).strip()
-            # Remove year and clean up
+            # Remove "In " prefix if present (common in bbl format)
+            venue_clean = re.sub(r'^In\s+', '', venue_clean)
+            # Remove trailing year only (at end of string), not year in the middle of venue name
+            # e.g., "2020 Conference on..." should keep the conference name
            if ref['year']:
-                venue_clean = re.sub(rf'\b{ref["year"]}\b.*', '', venue_clean)
+                # Only remove year if it appears at the very end (possibly with punctuation)
+                venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
            venue_clean = venue_clean.rstrip(',. ')
            # Filter out common non-venue patterns that shouldn't be treated as venues
            non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
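Effect of the venue cleanup on a typical .bbl venue string:

```python
import re

year = 2020
venue = "In 2020 Conference on Empirical Methods in Natural Language Processing, 2020."

venue = re.sub(r'^In\s+', '', venue)
# Old: re.sub(rf'\b{year}\b.*', '', venue) deleted everything from the first "2020",
# leaving an empty venue. New: only the trailing ", 2020." is trimmed.
venue = re.sub(rf',?\s*{year}\s*\.?\s*$', '', venue).rstrip(',. ')
print(venue)  # '2020 Conference on Empirical Methods in Natural Language Processing'
```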
@@ -3387,11 +3474,24 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
            from refchecker.utils.url_utils import clean_url_punctuation
            ref['url'] = clean_url_punctuation(url_match.group(1))

-        # Extract DOI from \href{https://doi.org/...}
+        # Extract DOI from \href{https://doi.org/...} or \href {URL} {text} with spaces
        if not ref.get('doi'):
-            doi_match = re.search(r'\\href\{https?://doi\.org/([^}]+)\}', content)
+            # Handle both \href{URL}{text} and \href {URL} {text} formats
+            doi_match = re.search(r'\\href\s*\{(https?://doi\.org/[^}]+)\}', content)
            if doi_match:
-                ref['doi'] = doi_match.group(1)
+                doi_url = doi_match.group(1)
+                # Extract DOI from the URL
+                doi_id_match = re.search(r'doi\.org/(.+)$', doi_url)
+                if doi_id_match:
+                    ref['doi'] = doi_id_match.group(1)
+                    if not ref.get('url'):
+                        ref['url'] = doi_url
+
+        # Extract URL from \href{URL}{text} if not already set (for non-DOI URLs like arXiv)
+        if not ref.get('url'):
+            href_url_match = re.search(r'\\href\s*\{([^}]+)\}\s*\{[^}]*\}', content)
+            if href_url_match:
+                ref['url'] = href_url_match.group(1).strip()

        # Extract arXiv ID from \showeprint[arxiv]{...} (ACM format) or from content (natbib format)
        arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
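The `\s*` addition is what makes the spaced form match; a one-line check (the DOI is illustrative):

```python
import re

content = r'\newblock \href {https://doi.org/10.18653/v1/D19-1250} {An EMNLP paper}.'

m = re.search(r'\\href\s*\{(https?://doi\.org/[^}]+)\}', content)
print(m.group(1))  # 'https://doi.org/10.18653/v1/D19-1250' - old pattern returned None here
```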
@@ -4020,6 +4120,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
    Returns:
        True if venues are substantially different, False if they match/overlap
    """
+    # Import here to avoid circular dependency
+    from refchecker.utils.url_utils import extract_arxiv_id_from_url
+
    if not venue1 or not venue2:
        return bool(venue1 != venue2)
@@ -4088,6 +4191,19 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:

    venue_lower = expand_abbreviations(venue_lower)

+    # Strip page numbers (e.g., "pages 38--55", "pp. 123-456", "page 42")
+    venue_lower = re.sub(r',?\s*pages?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pp\.?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pages?\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pp\.?\s*\d+', '', venue_lower)
+
+    # Strip publisher names that are commonly appended
+    publishers = ['springer', 'elsevier', 'wiley', 'acm', 'ieee', 'mit press',
+                  'cambridge university press', 'oxford university press',
+                  'morgan kaufmann', 'addison-wesley', 'prentice hall']
+    for publisher in publishers:
+        venue_lower = re.sub(rf',?\s*{re.escape(publisher)}\s*$', '', venue_lower, flags=re.IGNORECASE)
+
    # Remove punctuation and normalize spacing for comparison
    venue_lower = re.sub(r'[.,;:]', '', venue_lower)  # Remove punctuation
    venue_lower = re.sub(r'\s+on\s+', ' ', venue_lower)  # Remove "on" preposition
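And the page-number stripping in isolation (the venue string is made up):

```python
import re

venue = "proceedings of the 38th international conference on machine learning, pages 8748--8763"
venue = re.sub(r',?\s*pages?\s*\d+\s*[-–—]+\s*\d+', '', venue)
print(venue)  # 'proceedings of the 38th international conference on machine learning'
```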