academic-refchecker 1.2.50-py3-none-any.whl → 1.2.52-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
core/refchecker.py CHANGED
@@ -50,7 +50,8 @@ from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
                               detect_latex_bibliography_format, extract_latex_references,
                               detect_standard_acm_natbib_format, strip_latex_commands,
                               format_corrected_reference, is_name_match, enhanced_name_match,
-                              calculate_title_similarity, normalize_arxiv_url, deduplicate_urls)
+                              calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
+                              compare_authors)
 from utils.config_validator import ConfigValidator
 from services.pdf_processor import PDFProcessor
 from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
@@ -1789,7 +1790,7 @@ class ArxivReferenceChecker:
             if authors:
                 db_authors = [author.get('name', '') for author in check_paper_data['authors']]

-                authors_match, author_error = self.compare_authors(authors, db_authors)
+                authors_match, author_error = compare_authors(authors, db_authors)
                 if authors_match:
                     paper_data = check_paper_data
                     search_strategy = "Normalized title with author match"
@@ -1901,10 +1902,12 @@ class ArxivReferenceChecker:

         if normalized_title != db_title:
             from utils.error_utils import format_title_mismatch
+            # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
+            clean_cited_title = strip_latex_commands(title)
             logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
             errors.append({
                 'error_type': 'title',
-                'error_details': format_title_mismatch(title, paper_data.get('title')),
+                'error_details': format_title_mismatch(clean_cited_title, paper_data.get('title')),
                 'ref_title_correct': paper_data.get('title')
             })

@@ -1912,7 +1915,7 @@ class ArxivReferenceChecker:
         if authors and paper_data.get('authors'):
             # Extract author names from database data
             correct_names = [author.get('name', '') for author in paper_data['authors']]
-            authors_match, author_error = self.compare_authors(authors, correct_names)
+            authors_match, author_error = compare_authors(authors, correct_names)

             if not authors_match:
                 logger.debug(f"DB Verification: Author mismatch - {author_error}")
@@ -2018,8 +2021,20 @@ class ArxivReferenceChecker:
         logger.debug(f"Database mode: Initial paper_url from database checker: {paper_url}")

         if not verified_data:
-            # Mark as unverified but keep the URL if found
-            return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None
+            # Mark as unverified but check URL for more specific reason or verification
+            if reference.get('url', '').strip():
+                # Use raw URL verifier to check if it can be verified or get specific reason
+                url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                if url_verified_data:
+                    # URL verification succeeded - return as verified
+                    logger.debug(f"Database mode: URL verification succeeded for unverified reference")
+                    return None, url_checked, url_verified_data
+                else:
+                    # URL verification failed - use specific error reason
+                    url_error_details = url_errors[0].get('error_details', 'Reference could not be verified in database') if url_errors else 'Reference could not be verified in database'
+                    return [{"error_type": "unverified", "error_details": url_error_details}], paper_url, None
+            else:
+                return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None

         # Convert database errors to our format
         formatted_errors = []
@@ -2115,7 +2130,29 @@ class ArxivReferenceChecker:
             return [{"error_type": "unverified", "error_details": "Database connection not available"}], None, None

         # For non-database mode, use the standard reference verification
-        return self.verify_reference_standard(source_paper, reference)
+        errors, paper_url, verified_data = self.verify_reference_standard(source_paper, reference)
+
+        # If standard verification failed and the reference has a URL, try raw URL verification
+        if errors and verified_data is None:
+            # Check if there's an unverified error
+            unverified_errors = [e for e in errors if e.get('error_type') == 'unverified']
+            if unverified_errors and reference.get('url', '').strip():
+                # Use raw URL verifier to check if it can be verified or get specific reason
+                url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                if url_verified_data:
+                    # URL verification succeeded - return as verified
+                    logger.debug(f"Non-database mode: URL verification succeeded for unverified reference")
+                    return None, url_checked, url_verified_data
+                else:
+                    # URL verification failed - use specific error reason
+                    url_error_details = url_errors[0].get('error_details', 'Reference could not be verified') if url_errors else 'Reference could not be verified'
+                    # Update the unverified error with the specific reason
+                    for error in errors:
+                        if error.get('error_type') == 'unverified':
+                            error['error_details'] = url_error_details
+                            break
+
+        return errors, paper_url, verified_data


     def verify_github_reference(self, reference):
@@ -2250,6 +2287,55 @@ class ArxivReferenceChecker:
             formatted_errors.append(formatted_error)
         return formatted_errors if formatted_errors else [{"error_type": "unverified", "error_details": "Web page could not be verified"}], page_url, None

+    def verify_raw_url_reference(self, reference):
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: The reference to verify (already determined to be unverified by paper validators)
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries
+            - url: The URL that was checked
+        """
+        logger.debug(f"Checking raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None
+
+        # First try PDF paper checker if URL appears to be a PDF
+        from checkers.pdf_paper_checker import PDFPaperChecker
+        pdf_checker = PDFPaperChecker()
+
+        if pdf_checker.can_check_reference(reference):
+            logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
+            try:
+                verified_data, errors, url = pdf_checker.verify_reference(reference)
+                if verified_data:
+                    logger.debug(f"PDF verification successful for: {reference.get('title', 'Untitled')}")
+                    return verified_data, errors, url
+                else:
+                    logger.debug(f"PDF verification failed, falling back to web page verification")
+            except Exception as e:
+                logger.error(f"Error in PDF verification: {e}")
+                logger.debug(f"PDF verification error, falling back to web page verification")
+
+        # Fall back to web page checker
+        from checkers.webpage_checker import WebPageChecker
+        webpage_checker = WebPageChecker()
+
+        try:
+            verified_data, errors, url = webpage_checker.verify_raw_url_for_unverified_reference(reference)
+            logger.debug(f"Raw URL verification result: verified_data={verified_data is not None}, errors={len(errors)}, url={url}")
+            return verified_data, errors, url
+        except Exception as e:
+            logger.error(f"Error checking raw URL: {e}")
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], web_url
+
     def verify_reference_standard(self, source_paper, reference):
         """
         Verify if a reference is accurate using GitHub, Semantic Scholar, or other checkers
@@ -2271,11 +2357,6 @@ class ArxivReferenceChecker:
         if github_result:
             return github_result

-        # Next, check if this is a web page reference
-        webpage_result = self.verify_webpage_reference(reference)
-        if webpage_result:
-            return webpage_result
-
         # Use the Semantic Scholar client to verify the reference
         verified_data, errors, paper_url = self.non_arxiv_checker.verify_reference(reference)

@@ -3054,6 +3135,13 @@ class ArxivReferenceChecker:
         try:
             # Extract bibliography
             bibliography = self.extract_bibliography(paper, debug_mode)
+
+            # Apply deduplication to all bibliography sources (not just LLM-extracted)
+            if len(bibliography) > 1:  # Only deduplicate if we have multiple references
+                original_count = len(bibliography)
+                bibliography = self._deduplicate_bibliography_entries(bibliography)
+                if len(bibliography) < original_count:
+                    logger.debug(f"Deduplicated {original_count} references to {len(bibliography)} unique references")

             # Update statistics
             self.total_papers_processed += 1
@@ -3493,8 +3581,9 @@ class ArxivReferenceChecker:
             except Exception as e:
                 logger.error(f"LLM fallback failed: {e}")
                 return []
-        logger.debug("Using biblatex file")
-        return biblatex_refs
+        if len(biblatex_refs) > 0:
+            logger.debug("Using biblatex file")
+            return biblatex_refs

         # For non-standard formats, try LLM-based extraction if available
         if self.llm_extractor:
@@ -4284,9 +4373,9 @@ class ArxivReferenceChecker:
             # If either has no title, can't reliably determine if duplicate
             return False

-        # If titles match exactly, consider them duplicates
-        # This handles the case where the same paper appears multiple times
-        if seg1['title'] == seg2['title']:
+        # If titles match exactly (case-insensitive), consider them duplicates
+        # This handles the case where the same paper appears multiple times with different capitalization
+        if seg1['title'].lower() == seg2['title'].lower():
             return True

         # Special case: Check if one title is an arXiv identifier and the other is a real title
@@ -4299,16 +4388,54 @@ class ArxivReferenceChecker:
         author1 = seg1['author']
         author2 = seg2['author']

-        if author1 and author2 and author1 == author2:
+        if author1 and author2 and author1.lower() == author2.lower():
             # Same authors - check if one title is substring of other or significant similarity
-            title1 = seg1['title']
-            title2 = seg2['title']
+            title1 = seg1['title'].lower()
+            title2 = seg2['title'].lower()

             if (title1 in title2 or title2 in title1):
                 return True

         return False

+    def _deduplicate_bibliography_entries(self, bibliography):
+        """
+        Deduplicate bibliography entries using title and author comparison.
+
+        This works with structured reference dictionaries from BibTeX/LaTeX parsing,
+        as opposed to _deduplicate_references_with_segment_matching which works with raw text.
+
+        Args:
+            bibliography: List of reference dictionaries with 'title', 'authors', etc.
+
+        Returns:
+            List of unique reference dictionaries
+        """
+        if len(bibliography) <= 1:
+            return bibliography
+
+        unique_refs = []
+        seen_titles = set()
+
+        for ref in bibliography:
+            title = ref.get('title', '').strip()
+            if not title:
+                # Keep references without titles (they can't be deduplicated)
+                unique_refs.append(ref)
+                continue
+
+            # Normalize title for comparison (case-insensitive, basic cleanup)
+            normalized_title = title.lower().strip()
+
+            # Check if we've seen this title before (case-insensitive)
+            if normalized_title in seen_titles:
+                logger.debug(f"Skipping duplicate reference: '{title}'")
+            else:
+                unique_refs.append(ref)
+                seen_titles.add(normalized_title)
+
+        return unique_refs
+
     def _is_arxiv_identifier_title_mismatch(self, seg1, seg2):
         """
         Check if one reference has an arXiv identifier as title while the other has a real title,
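Note: despite the docstring's mention of author comparison, the helper added above deduplicates on normalized titles only. A minimal sketch of the resulting behavior (the reference dicts and the `checker` instance are illustrative assumptions, not part of the release):

    # Sketch only: `checker` is assumed to be an ArxivReferenceChecker instance.
    refs = [
        {"title": "Attention Is All You Need"},
        {"title": "attention is all you need"},  # case-insensitive duplicate
        {"title": ""},                            # untitled entries are always kept
    ]
    unique = checker._deduplicate_bibliography_entries(refs)
    assert len(unique) == 2  # expected: the lowercase duplicate is dropped
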
@@ -5087,60 +5214,6 @@ class ArxivReferenceChecker:

         return references

-    def compare_authors(self, cited_authors, correct_authors):
-        """
-        Compare author lists to check if they match using improved name matching.
-        Uses the utility function is_name_match for robust author name comparison.
-        """
-        # Clean up author names
-        cleaned_cited = []
-        for author in cited_authors:
-            # Remove reference numbers (e.g., "[1]")
-            author = re.sub(r'^\[\d+\]', '', author)
-            # Remove line breaks
-            author = author.replace('\n', ' ')
-
-            # Handle "et al" cases properly
-            author_clean = author.strip()
-            if author_clean.lower() == 'et al':
-                # Skip pure "et al" entries
-                continue
-            elif 'et al' in author_clean.lower():
-                # Remove "et al" from the author name (e.g., "S. M. Lundberg et al" -> "S. M. Lundberg")
-                author_clean = re.sub(r'\s+et\s+al\.?', '', author_clean, flags=re.IGNORECASE).strip()
-                if author_clean:  # Only add if something remains
-                    cleaned_cited.append(author_clean)
-            else:
-                cleaned_cited.append(author_clean)
-
-        if not cleaned_cited:
-            return True, "No authors to compare"
-
-        # Handle "et al" cases and length mismatches
-        has_et_al = any('et al' in a.lower() for a in cited_authors)
-
-        if len(cleaned_cited) < len(correct_authors) and (has_et_al or len(cleaned_cited) <= 3):
-            # Only compare the authors that are listed
-            correct_authors = correct_authors[:len(cleaned_cited)]
-        elif len(cleaned_cited) > len(correct_authors) and len(correct_authors) >= 3:
-            # Use available correct authors
-            cleaned_cited = cleaned_cited[:len(correct_authors)]
-
-        # If there's a big count mismatch and no "et al", it's likely an error
-        if abs(len(cleaned_cited) - len(correct_authors)) > 3 and not has_et_al:
-            return False, "Author count mismatch"
-
-        # Compare first author (most important) using the improved utility function
-        if cleaned_cited and correct_authors:
-            # Use raw names for comparison (is_name_match handles normalization internally)
-            cited_first = cleaned_cited[0]
-            correct_first = correct_authors[0]
-
-            if not enhanced_name_match(cited_first, correct_first):
-                from utils.error_utils import format_first_author_mismatch
-                return False, format_first_author_mismatch(cited_first, correct_first)
-
-        return True, "Authors match"

     def normalize_text(self, text):
         """
@@ -5251,6 +5324,19 @@ class ArxivReferenceChecker:
                 return False
         return True

+    def compare_authors(self, authors1, authors2):
+        """
+        Compare authors using the text_utils compare_authors function.
+
+        Args:
+            authors1: First list of authors
+            authors2: Second list of authors
+
+        Returns:
+            Tuple of (match_result, error_message)
+        """
+        return compare_authors(authors1, authors2)
+
     def _verify_references_sequential(self, paper, bibliography, paper_errors, error_types, unverified_count, debug_mode):
         """
         Sequential reference verification (original implementation)
@@ -5267,7 +5353,10 @@ class ArxivReferenceChecker:
             ref_id = self.extract_arxiv_id_from_url(reference['url'])

             # Print reference info in non-debug mode (improved formatting)
-            title = reference.get('title', 'Untitled')
+            raw_title = reference.get('title', 'Untitled')
+            # Clean LaTeX commands from title for display
+            from utils.text_utils import strip_latex_commands
+            title = strip_latex_commands(raw_title)
             from utils.text_utils import format_authors_for_display
             authors = format_authors_for_display(reference.get('authors', []))
             year = reference.get('year', '')
@@ -5504,6 +5593,14 @@ class ArxivReferenceChecker:
         """Categorize the unverified error into checker error or not found"""
         error_details_lower = error_details.lower()

+        # New specific URL-based unverified reasons
+        if error_details_lower == "non-existent web page":
+            return "Non-existent web page"
+        elif error_details_lower == "paper not found and url doesn't reference it":
+            return "Paper not found and URL doesn't reference it"
+        elif error_details_lower == "paper not verified but url references paper":
+            return "Paper not verified but URL references paper"
+
         # Checker/API errors
         api_error_patterns = [
             'api error', 'rate limit', 'http error', 'network error',
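Taken together, the refchecker.py changes give both the database and standard verification paths a second chance through the new verify_raw_url_reference method before a reference is reported as unverified. A minimal sketch of consuming that contract (the import path, constructor call, and reference dict are assumptions for illustration; the method name, tuple ordering, and error strings come from the diff above):

    # Sketch only: exercises the (verified_data, errors, url) return contract.
    from core.refchecker import ArxivReferenceChecker  # assumed import path

    checker = ArxivReferenceChecker()  # constructor arguments omitted/assumed
    reference = {
        "title": "Some cited paper",  # hypothetical reference
        "authors": ["A. Author"],
        "url": "https://example.com/paper.pdf",
    }

    verified_data, errors, url = checker.verify_raw_url_reference(reference)
    if verified_data:
        print(f"Verified via raw URL: {url}")
    else:
        # error_details now carries one of the specific reasons the updated
        # categorizer recognizes, e.g. "Non-existent web page" or
        # "Paper not found and URL doesn't reference it".
        print(errors[0]["error_details"])
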
utils/text_utils.py CHANGED
@@ -580,6 +580,9 @@ def clean_title_for_search(title):
     if not isinstance(title, str):
         return str(title) if title is not None else ''

+    # Strip LaTeX commands to handle math formatting and other LaTeX markup
+    title = strip_latex_commands(title)
+
     # Clean up newlines and normalize whitespace (but preserve other structure)
     title = title.replace('\n', ' ').strip()
     title = re.sub(r'\s+', ' ', title)  # Normalize whitespace only
@@ -753,8 +756,11 @@ def normalize_paper_title(title: str) -> str:
     if not title:
         return ""

+    # Strip LaTeX commands first to handle math formatting consistently
+    normalized = strip_latex_commands(title)
+
     # Convert to lowercase
-    normalized = title.lower()
+    normalized = normalized.lower()

     # Remove common prefixes that don't affect the actual title content
     prefixes_to_remove = [
@@ -2107,21 +2113,37 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N

         return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"

+    # Detect if cited authors look like parsing fragments
+    # (many short single-word entries that might be first/last name fragments)
+    def looks_like_fragments(authors_list):
+        if len(authors_list) < 4:  # Need at least 4 to detect fragment pattern
+            return False
+        single_word_count = sum(1 for author in authors_list if len(author.strip().split()) == 1)
+        return single_word_count >= len(authors_list) * 0.7  # 70% or more are single words
+
     # Normal case without "et al" - compare all authors
     if len(cleaned_cited) != len(correct_names):
-        # For non-et-al cases, be more strict about count mismatches
-        # Allow minor flexibility (1 author difference) but not more
-        if abs(len(cleaned_cited) - len(correct_names)) > 1:
+
+        # Check if cited authors look like parsing fragments
+        if looks_like_fragments(cleaned_cited):
             from utils.error_utils import format_author_count_mismatch
-            # Convert cited names to display format (First Last) before showing in error
             display_cited = [format_author_for_display(author) for author in cleaned_cited]
             error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
             return False, error_msg

-        # Use the shorter list for comparison
-        min_len = min(len(cleaned_cited), len(correct_names))
-        comparison_cited = cleaned_cited[:min_len]
-        comparison_correct = correct_names[:min_len]
+        # For all count mismatches, show the count mismatch error
+        if len(cleaned_cited) < len(correct_names):
+            from utils.error_utils import format_author_count_mismatch
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
+            return False, error_msg
+
+        # For cases where cited > correct, also show count mismatch
+        elif len(cleaned_cited) > len(correct_names):
+            from utils.error_utils import format_author_count_mismatch
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
+            return False, error_msg
     else:
         comparison_cited = cleaned_cited
         comparison_correct = correct_names
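The looks_like_fragments heuristic added above guards against parser output in which one person's name was split across several entries. A small illustration of the intended effect (the author lists are hypothetical):

    # Sketch only: six single-word entries -> 100% single words (>= 70%),
    # so compare_authors should report an author count mismatch instead of
    # matching the fragments pairwise.
    from utils.text_utils import compare_authors

    cited = ["Yann", "LeCun", "Yoshua", "Bengio", "Geoffrey", "Hinton"]
    correct = ["Yann LeCun", "Yoshua Bengio", "Geoffrey Hinton"]
    match, message = compare_authors(cited, correct)
    assert match is False  # expected: a count-mismatch error message
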
@@ -2484,8 +2506,64 @@ def strip_latex_commands(text):
     # Remove font size commands
     text = re.sub(r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\b', '', text)

-    # Remove math mode delimiters
-    text = re.sub(r'\$([^$]*)\$', r'\1', text)
+    # Handle complex math mode patterns first
+    # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
+    def process_complex_math(match):
+        content = match.group(1)
+        # Handle common Greek letters
+        content = re.sub(r'\\mu\b', 'μ', content)  # \mu -> μ
+        content = re.sub(r'\\alpha\b', 'α', content)  # \alpha -> α
+        content = re.sub(r'\\beta\b', 'β', content)  # \beta -> β
+        content = re.sub(r'\\gamma\b', 'γ', content)  # \gamma -> γ
+        content = re.sub(r'\\delta\b', 'δ', content)  # \delta -> δ
+        content = re.sub(r'\\epsilon\b', 'ε', content)  # \epsilon -> ε
+        content = re.sub(r'\\lambda\b', 'λ', content)  # \lambda -> λ
+        content = re.sub(r'\\pi\b', 'π', content)  # \pi -> π
+        content = re.sub(r'\\sigma\b', 'σ', content)  # \sigma -> σ
+        content = re.sub(r'\\theta\b', 'θ', content)  # \theta -> θ
+        # Remove any remaining LaTeX commands and braces from inside math
+        content = re.sub(r'\\[a-zA-Z]+\b', '', content)
+        content = re.sub(r'[{}]', '', content)
+        # Clean up any remaining $ signs
+        content = re.sub(r'\$+', '', content)
+        return content
+
+    # Handle complex nested math patterns first
+    # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
+    def process_nested_math_specifically(match):
+        content = match.group(0)
+        # Handle the specific pattern: $\{$$\mu$second-scale$\}$
+        # Extract the meaningful parts
+        if r'\mu' in content:
+            # Replace \mu with μ and extract the surrounding text
+            content = re.sub(r'\\mu\b', 'μ', content)
+            # Remove all LaTeX math markup
+            content = re.sub(r'[\$\{\}\\]+', '', content)
+        return content
+
+    # Handle the specific problematic pattern
+    text = re.sub(r'\$\\\{[^}]*\\\}\$', process_nested_math_specifically, text)
+
+    # Handle Greek letters in math mode before removing delimiters
+    def process_standard_math(match):
+        content = match.group(1)
+        # Handle common Greek letters - content has single backslashes
+        content = re.sub(r'\\mu\b', 'μ', content)
+        content = re.sub(r'\\alpha\b', 'α', content)
+        content = re.sub(r'\\beta\b', 'β', content)
+        content = re.sub(r'\\gamma\b', 'γ', content)
+        content = re.sub(r'\\delta\b', 'δ', content)
+        content = re.sub(r'\\epsilon\b', 'ε', content)
+        content = re.sub(r'\\lambda\b', 'λ', content)
+        content = re.sub(r'\\pi\b', 'π', content)
+        content = re.sub(r'\\sigma\b', 'σ', content)
+        content = re.sub(r'\\theta\b', 'θ', content)
+        # Remove any remaining LaTeX commands
+        content = re.sub(r'\\[a-zA-Z]+\b', '', content)
+        return content
+
+    # Remove standard math mode delimiters with Greek letter processing
+    text = re.sub(r'\$([^$]*)\$', process_standard_math, text)
     text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.DOTALL)
     text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.DOTALL)

@@ -3369,7 +3447,18 @@ def _extract_corrected_reference_data(error_entry: dict, corrected_data: dict) -
     """
     # Get the corrected information
     correct_title = error_entry.get('ref_title_correct') or corrected_data.get('title', '')
-    correct_authors = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
+
+    # Handle authors - can be string or list of dicts from API
+    authors_raw = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
+    if isinstance(authors_raw, list):
+        # Convert list of author dicts to comma-separated string
+        if authors_raw and isinstance(authors_raw[0], dict):
+            correct_authors = ', '.join([author.get('name', '') for author in authors_raw])
+        else:
+            correct_authors = ', '.join(authors_raw)
+    else:
+        correct_authors = str(authors_raw) if authors_raw else ''
+
     correct_year = error_entry.get('ref_year_correct') or corrected_data.get('year', '')

     # Prioritize the verified URL that was actually used for verification
@@ -3573,7 +3662,39 @@ def format_corrected_plaintext(original_reference, corrected_data, error_entry):
     if correct_url:
         citation_parts.append(f"{correct_url}")

-    return '. '.join(citation_parts) + '.'
+    citation_text = '. '.join(citation_parts) + '.'
+
+    # Add citation key information if available (for easy copying)
+    citation_key = original_reference.get('bibtex_key') or original_reference.get('bibitem_key')
+    if citation_key and citation_key != 'unknown':
+        bibtex_type = original_reference.get('bibtex_type', 'misc')
+        citation_text += f"\n\n% Citation key for BibTeX: @{bibtex_type}{{{citation_key}, ...}}"
+
+    return citation_text
+
+
+def compare_titles_with_latex_cleaning(cited_title: str, database_title: str) -> float:
+    """
+    Compare two titles with proper LaTeX cleaning for accurate similarity scoring.
+
+    This function ensures both titles are cleaned of LaTeX commands before comparison
+    to avoid false mismatches due to formatting differences like {LLM}s vs LLMs.
+
+    Args:
+        cited_title: Title from cited reference (may contain LaTeX)
+        database_title: Title from database (usually already clean)
+
+    Returns:
+        Similarity score between 0 and 1
+    """
+    if not cited_title or not database_title:
+        return 0.0
+
+    # Clean LaTeX commands from cited title to match database format
+    clean_cited = strip_latex_commands(cited_title)
+
+    # Calculate similarity using cleaned titles
+    return calculate_title_similarity(clean_cited, database_title)


 def calculate_title_similarity(title1: str, title2: str) -> float:
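On the text_utils side, titles are now LaTeX-cleaned before normalization and similarity scoring. A short illustration of the intended behavior (assuming the wheel's utils.text_utils module is importable; the example strings are hypothetical):

    # Sketch only: Greek letters inside $...$ are mapped to Unicode before
    # the delimiters are stripped (per process_standard_math above).
    from utils.text_utils import strip_latex_commands, normalize_paper_title

    print(strip_latex_commands(r"$\mu$second-scale scheduling"))
    # expected: 'μsecond-scale scheduling'

    # "{LLM}s" and "LLMs" now normalize identically, avoiding the spurious
    # title-mismatch errors addressed in core/refchecker.py.
    assert normalize_paper_title("Evaluating {LLM}s") == normalize_paper_title("Evaluating LLMs")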