academic-refchecker 1.2.47__tar.gz → 1.2.49__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.47/src/academic_refchecker.egg-info → academic_refchecker-1.2.49}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/enhanced_hybrid_checker.py +84 -0
  5. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/openreview_checker.py +467 -4
  6. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/semantic_scholar.py +2 -2
  7. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/core/parallel_processor.py +7 -5
  8. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/core/refchecker.py +100 -24
  9. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/arxiv_utils.py +25 -23
  10. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/error_utils.py +33 -0
  11. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/text_utils.py +4 -1
  12. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/LICENSE +0 -0
  13. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/MANIFEST.in +0 -0
  14. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/README.md +0 -0
  15. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/pyproject.toml +0 -0
  16. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/requirements.txt +0 -0
  17. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/scripts/download_db.py +0 -0
  18. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/scripts/run_tests.py +0 -0
  19. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/scripts/start_vllm_server.py +0 -0
  20. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/setup.cfg +0 -0
  21. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/__init__.py +0 -0
  22. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  23. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  24. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  25. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/academic_refchecker.egg-info/requires.txt +0 -0
  26. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  27. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/__init__.py +0 -0
  28. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/crossref.py +0 -0
  29. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/github_checker.py +0 -0
  30. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/local_semantic_scholar.py +0 -0
  31. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/openalex.py +0 -0
  32. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/webpage_checker.py +0 -0
  33. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/config/__init__.py +0 -0
  34. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/config/logging.conf +0 -0
  35. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/config/settings.py +0 -0
  36. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/core/__init__.py +0 -0
  37. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/core/db_connection_pool.py +0 -0
  38. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/database/__init__.py +0 -0
  39. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/database/download_semantic_scholar_db.py +0 -0
  40. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/llm/__init__.py +0 -0
  41. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/llm/base.py +0 -0
  42. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/llm/providers.py +0 -0
  43. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/scripts/__init__.py +0 -0
  44. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/scripts/start_vllm_server.py +0 -0
  45. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/services/__init__.py +0 -0
  46. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/services/pdf_processor.py +0 -0
  47. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/__init__.py +0 -0
  48. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/author_utils.py +0 -0
  49. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/biblatex_parser.py +0 -0
  50. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/bibliography_utils.py +0 -0
  51. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/bibtex_parser.py +0 -0
  52. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/config_validator.py +0 -0
  53. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/db_utils.py +0 -0
  54. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/doi_utils.py +0 -0
  55. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/mock_objects.py +0 -0
  56. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.47
3
+ Version: 1.2.49
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.47"
3
+ __version__ = "1.2.49"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.47
3
+ Version: 1.2.49
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -312,12 +312,36 @@ class EnhancedHybridReferenceChecker:
312
312
  if (self.openreview and
313
313
  hasattr(self.openreview, 'is_openreview_reference') and
314
314
  self.openreview.is_openreview_reference(reference)):
315
+ logger.debug("Enhanced Hybrid: Trying OpenReview URL-based verification")
315
316
  verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
316
317
  if success:
317
318
  return verified_data, errors, url
318
319
  if failure_type in ['throttled', 'timeout', 'server_error']:
319
320
  failed_apis.append(('openreview', self.openreview, failure_type))
320
321
 
322
+ # Strategy 5b: Try OpenReview by search if venue suggests it might be there
323
+ elif (self.openreview and
324
+ hasattr(self.openreview, 'verify_reference_by_search')):
325
+ # Check if venue suggests this might be on OpenReview
326
+ venue = reference.get('venue', reference.get('journal', '')).lower()
327
+ openreview_venues = [
328
+ 'iclr', 'icml', 'neurips', 'nips', 'aaai', 'ijcai',
329
+ 'international conference on learning representations',
330
+ 'international conference on machine learning',
331
+ 'neural information processing systems'
332
+ ]
333
+
334
+ venue_suggests_openreview = any(or_venue in venue for or_venue in openreview_venues)
335
+ logger.debug(f"Enhanced Hybrid: OpenReview venue check - venue: '{venue}', suggests: {venue_suggests_openreview}")
336
+
337
+ if venue_suggests_openreview:
338
+ logger.debug("Enhanced Hybrid: Trying OpenReview search-based verification")
339
+ verified_data, errors, url, success, failure_type = self._try_openreview_search(reference)
340
+ if success:
341
+ return verified_data, errors, url
342
+ if failure_type in ['throttled', 'timeout', 'server_error']:
343
+ failed_apis.append(('openreview_search', self.openreview, failure_type))
344
+
321
345
  # Strategy 6: Try CrossRef if we haven't already (for non-DOI references)
322
346
  if not self._should_try_doi_apis_first(reference) and self.crossref:
323
347
  verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
@@ -399,6 +423,66 @@ class EnhancedHybridReferenceChecker:
399
423
  'error_details': 'Could not verify reference using any available API'
400
424
  }], None
401
425
 
426
+ def _try_openreview_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str], bool, str]:
427
+ """
428
+ Try to verify reference using OpenReview search
429
+
430
+ Returns:
431
+ Tuple of (verified_data, errors, url, success, failure_type)
432
+ """
433
+ if not self.openreview:
434
+ return None, [], None, False, 'none'
435
+
436
+ start_time = time.time()
437
+ failure_type = 'none'
438
+
439
+ try:
440
+ verified_data, errors, url = self.openreview.verify_reference_by_search(reference)
441
+ duration = time.time() - start_time
442
+
443
+ # Consider it successful if we found data or verification errors
444
+ success = verified_data is not None or len(errors) > 0
445
+ self._update_api_stats('openreview', success, duration)
446
+
447
+ if success:
448
+ logger.debug(f"Enhanced Hybrid: OpenReview search successful in {duration:.2f}s, URL: {url}")
449
+ return verified_data, errors, url, True, 'none'
450
+ else:
451
+ logger.debug(f"Enhanced Hybrid: OpenReview search found no results in {duration:.2f}s")
452
+ return None, [], None, False, 'not_found'
453
+
454
+ except requests.exceptions.Timeout as e:
455
+ duration = time.time() - start_time
456
+ self._update_api_stats('openreview', False, duration)
457
+ failure_type = 'timeout'
458
+ logger.debug(f"Enhanced Hybrid: OpenReview search timed out in {duration:.2f}s: {e}")
459
+ return None, [], None, False, failure_type
460
+
461
+ except requests.exceptions.RequestException as e:
462
+ duration = time.time() - start_time
463
+ self._update_api_stats('openreview', False, duration)
464
+
465
+ # Check if it's a rate limiting error
466
+ if hasattr(e, 'response') and e.response is not None:
467
+ if e.response.status_code in [429, 503]:
468
+ failure_type = 'throttled'
469
+ elif e.response.status_code >= 500:
470
+ failure_type = 'server_error'
471
+ else:
472
+ failure_type = 'other'
473
+ else:
474
+ failure_type = 'other'
475
+
476
+ logger.debug(f"Enhanced Hybrid: OpenReview search failed in {duration:.2f}s: {type(e).__name__}: {e}")
477
+ return None, [], None, False, failure_type
478
+
479
+ except Exception as e:
480
+ duration = time.time() - start_time
481
+ self._update_api_stats('openreview', False, duration)
482
+ failure_type = 'other'
483
+ logger.debug(f"Enhanced Hybrid: OpenReview search error in {duration:.2f}s: {type(e).__name__}: {e}")
484
+ return None, [], None, False, failure_type
485
+
402
486
  def get_performance_stats(self) -> Dict[str, Any]:
403
487
  """
404
488
  Get performance statistics for all APIs
@@ -498,6 +498,160 @@ class OpenReviewReferenceChecker:
498
498
  logger.debug(f"OpenReview verification completed for: {openreview_url}")
499
499
  return verified_data, errors, openreview_url
500
500
 
501
+ def verify_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
502
+ """
503
+ Verify a reference by searching OpenReview (when no URL is provided)
504
+
505
+ Args:
506
+ reference: Reference dictionary with title, authors, year, etc.
507
+
508
+ Returns:
509
+ Tuple of (verified_data, errors, paper_url) where:
510
+ - verified_data: Dict with verified OpenReview paper data or None
511
+ - errors: List of error/warning dictionaries
512
+ - paper_url: The OpenReview URL if found
513
+ """
514
+ logger.debug(f"Searching OpenReview for reference: {reference.get('title', 'Untitled')}")
515
+
516
+ title = reference.get('title', '').strip()
517
+ authors = reference.get('authors', [])
518
+ year = reference.get('year')
519
+ venue = reference.get('venue', '').strip()
520
+
521
+ if not title:
522
+ return None, [], None
523
+
524
+ # Check if venue suggests this might be on OpenReview
525
+ if not self._is_likely_openreview_venue(venue):
526
+ logger.debug(f"Venue '{venue}' doesn't suggest OpenReview, skipping search")
527
+ return None, [], None
528
+
529
+ # Search for matching papers
530
+ search_results = self.search_paper(title, authors, year)
531
+
532
+ if not search_results:
533
+ logger.debug("No matching papers found on OpenReview")
534
+ return None, [], None
535
+
536
+ # Use the best match (first result, as they're sorted by relevance)
537
+ best_match = search_results[0]
538
+ paper_url = best_match.get('forum_url')
539
+
540
+ logger.debug(f"Found OpenReview match: {best_match.get('title', 'Untitled')}")
541
+
542
+ # Verify the reference against the found paper
543
+ errors = []
544
+
545
+ # Check title match
546
+ cited_title = reference.get('title', '').strip()
547
+ paper_title = best_match.get('title', '').strip()
548
+
549
+ if cited_title and paper_title:
550
+ similarity = calculate_title_similarity(cited_title, paper_title)
551
+ if similarity < 0.8: # Slightly higher threshold for search results
552
+ from utils.error_utils import format_title_mismatch
553
+ details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
554
+ errors.append({
555
+ "warning_type": "title",
556
+ "warning_details": details
557
+ })
558
+
559
+ # Check authors
560
+ cited_authors = reference.get('authors', [])
561
+ paper_authors = best_match.get('authors', [])
562
+
563
+ if cited_authors and paper_authors:
564
+ # Convert to list format if needed
565
+ if isinstance(cited_authors, str):
566
+ cited_authors = [author.strip() for author in cited_authors.split(',')]
567
+ if isinstance(paper_authors, str):
568
+ paper_authors = [author.strip() for author in paper_authors.split(',')]
569
+
570
+ # Use the existing author comparison function
571
+ match, error_msg = compare_authors(cited_authors, paper_authors)
572
+ if not match and error_msg:
573
+ errors.append({
574
+ "warning_type": "author",
575
+ "warning_details": error_msg
576
+ })
577
+
578
+ # Check year
579
+ cited_year = reference.get('year')
580
+ paper_year = best_match.get('year')
581
+
582
+ if cited_year and paper_year:
583
+ try:
584
+ cited_year_int = int(cited_year)
585
+ paper_year_int = int(paper_year)
586
+
587
+ is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
588
+ if is_different and year_message:
589
+ from utils.error_utils import format_year_mismatch
590
+ errors.append({
591
+ "warning_type": "year",
592
+ "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
593
+ })
594
+ except (ValueError, TypeError):
595
+ pass # Skip year validation if conversion fails
596
+
597
+ # Check venue if provided in reference
598
+ cited_venue = reference.get('venue', '').strip()
599
+ paper_venue = best_match.get('venue', '').strip()
600
+
601
+ if cited_venue and paper_venue:
602
+ if are_venues_substantially_different(cited_venue, paper_venue):
603
+ from utils.error_utils import format_venue_mismatch
604
+ errors.append({
605
+ "warning_type": "venue",
606
+ "warning_details": format_venue_mismatch(cited_venue, paper_venue)
607
+ })
608
+
609
+ # Create verified data structure
610
+ verified_data = {
611
+ 'title': best_match.get('title', cited_title),
612
+ 'authors': best_match.get('authors', cited_authors),
613
+ 'year': best_match.get('year', cited_year),
614
+ 'venue': best_match.get('venue', cited_venue),
615
+ 'url': paper_url,
616
+ 'abstract': best_match.get('abstract', ''),
617
+ 'keywords': best_match.get('keywords', []),
618
+ 'openreview_metadata': best_match,
619
+ 'verification_source': 'OpenReview (search)'
620
+ }
621
+
622
+ logger.debug(f"OpenReview search verification completed for: {paper_url}")
623
+ return verified_data, errors, paper_url
624
+
625
+ def _is_likely_openreview_venue(self, venue: str) -> bool:
626
+ """
627
+ Check if a venue suggests the paper might be on OpenReview
628
+
629
+ Args:
630
+ venue: Venue string from reference
631
+
632
+ Returns:
633
+ True if venue suggests OpenReview
634
+ """
635
+ if not venue:
636
+ return False
637
+
638
+ venue_lower = venue.lower()
639
+
640
+ # Common venues that use OpenReview
641
+ openreview_venues = [
642
+ 'iclr', 'international conference on learning representations',
643
+ 'neurips', 'neural information processing systems', 'nips',
644
+ 'icml', 'international conference on machine learning',
645
+ 'iclr workshop', 'neurips workshop', 'icml workshop',
646
+ 'aaai', 'ijcai', 'aistats'
647
+ ]
648
+
649
+ for or_venue in openreview_venues:
650
+ if or_venue in venue_lower:
651
+ return True
652
+
653
+ return False
654
+
501
655
  def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
502
656
  """
503
657
  Search for papers on OpenReview by title, authors, and/or year
@@ -510,7 +664,316 @@ class OpenReviewReferenceChecker:
510
664
  Returns:
511
665
  List of matching paper metadata dictionaries
512
666
  """
513
- # This would implement search functionality if needed
514
- # For now, OpenReview verification is primarily URL-based
515
- logger.debug(f"Search functionality not yet implemented for OpenReview")
516
- return []
667
+ if not title or not title.strip():
668
+ return []
669
+
670
+ logger.debug(f"Searching OpenReview for: {title}")
671
+
672
+ # Clean title for search
673
+ search_title = clean_title_for_search(title)
674
+
675
+ # Try API search first
676
+ results = self._search_via_api(search_title, authors, year)
677
+ if results:
678
+ return results
679
+
680
+ # If API search fails, try web search as fallback
681
+ return self._search_via_web(search_title, authors, year)
682
+
683
+ def _search_via_api(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
684
+ """
685
+ Search using OpenReview API
686
+
687
+ Args:
688
+ title: Clean title to search for
689
+ authors: List of author names (optional)
690
+ year: Publication year (optional)
691
+
692
+ Returns:
693
+ List of matching paper dictionaries
694
+ """
695
+ try:
696
+ # The OpenReview API requires specific parameters
697
+ # We'll search by content.title or content.venue (for venue-based search)
698
+ search_params = {
699
+ 'limit': 20, # Limit results to avoid overwhelming the API
700
+ 'details': 'directReplies' # Get basic details
701
+ }
702
+
703
+ # Try searching by venue first if year suggests recent conferences
704
+ if year and year >= 2017: # OpenReview started around 2017
705
+ venues_by_year = {
706
+ 2025: ['ICLR 2025'],
707
+ 2024: ['ICLR 2024', 'NeurIPS 2024', 'ICML 2024'],
708
+ 2023: ['ICLR 2023', 'NeurIPS 2023', 'ICML 2023'],
709
+ 2022: ['ICLR 2022', 'NeurIPS 2022', 'ICML 2022'],
710
+ 2021: ['ICLR 2021', 'NeurIPS 2021', 'ICML 2021'],
711
+ 2020: ['ICLR 2020', 'NeurIPS 2020', 'ICML 2020'],
712
+ 2019: ['ICLR 2019', 'NeurIPS 2019', 'ICML 2019'],
713
+ 2018: ['ICLR 2018', 'NeurIPS 2018', 'ICML 2018'],
714
+ 2017: ['ICLR 2017']
715
+ }
716
+
717
+ possible_venues = venues_by_year.get(year, [])
718
+
719
+ results = []
720
+ for venue in possible_venues:
721
+ # Search by venue and then filter by title
722
+ venue_params = search_params.copy()
723
+ venue_params['content.venue'] = venue
724
+
725
+ api_url = f"{self.api_url}/notes"
726
+ response = self._respectful_request(api_url, params=venue_params)
727
+
728
+ if response and response.status_code == 200:
729
+ try:
730
+ data = response.json()
731
+ if 'notes' in data and data['notes']:
732
+ for note in data['notes']:
733
+ try:
734
+ metadata = self._parse_api_response(note)
735
+ if metadata and self._is_good_match(metadata, title, authors, year):
736
+ results.append(metadata)
737
+ if len(results) >= 5: # Limit results
738
+ break
739
+ except Exception as e:
740
+ logger.debug(f"Error parsing note: {e}")
741
+ continue
742
+
743
+ if results:
744
+ break # Found results, no need to search other venues
745
+
746
+ except (json.JSONDecodeError, KeyError) as e:
747
+ logger.debug(f"Failed to parse venue search response: {e}")
748
+ continue
749
+ else:
750
+ logger.debug(f"Venue search failed for {venue}: {response.status_code if response else 'No response'}")
751
+
752
+ if results:
753
+ logger.debug(f"OpenReview API search found {len(results)} matches via venue search")
754
+ return results
755
+
756
+ # If venue search didn't work, try other approaches
757
+ # OpenReview API is quite restrictive, so we might need to fall back to web scraping
758
+ logger.debug("OpenReview API venue search returned no results, trying web search")
759
+ return []
760
+
761
+ except Exception as e:
762
+ logger.debug(f"OpenReview API search error: {e}")
763
+ return []
764
+
765
+ def _search_via_web(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
766
+ """
767
+ Search using OpenReview web interface (fallback)
768
+
769
+ Args:
770
+ title: Clean title to search for
771
+ authors: List of author names (optional)
772
+ year: Publication year (optional)
773
+
774
+ Returns:
775
+ List of matching paper dictionaries
776
+ """
777
+ try:
778
+ # Build search URL
779
+ search_query = title.replace(' ', '+')
780
+ search_url = f"{self.base_url}/search?term={search_query}"
781
+
782
+ response = self._respectful_request(search_url)
783
+ if not response or response.status_code != 200:
784
+ return []
785
+
786
+ # Parse search results page
787
+ soup = BeautifulSoup(response.text, 'html.parser')
788
+
789
+ # Look for paper links in search results
790
+ # OpenReview search results typically contain links to forum pages
791
+ results = []
792
+
793
+ # Find links that look like OpenReview paper URLs
794
+ for link in soup.find_all('a', href=True):
795
+ href = link.get('href', '')
796
+ if '/forum?id=' in href:
797
+ paper_id = self.extract_paper_id(href)
798
+ if paper_id:
799
+ # Get full metadata for this paper
800
+ metadata = self.get_paper_metadata(paper_id)
801
+ if metadata and self._is_good_match(metadata, title, authors, year):
802
+ results.append(metadata)
803
+ if len(results) >= 5: # Limit results
804
+ break
805
+
806
+ logger.debug(f"OpenReview web search found {len(results)} matches")
807
+ return results
808
+
809
+ except Exception as e:
810
+ logger.debug(f"OpenReview web search error: {e}")
811
+ return []
812
+
813
+ def _is_good_match(self, metadata: Dict[str, Any], search_title: str, authors: List[str] = None, year: int = None) -> bool:
814
+ """
815
+ Check if the found paper is a good match for the search criteria
816
+
817
+ Args:
818
+ metadata: Paper metadata from OpenReview
819
+ search_title: Title we're searching for
820
+ authors: Authors we're looking for (optional)
821
+ year: Year we're looking for (optional)
822
+
823
+ Returns:
824
+ True if it's a good match
825
+ """
826
+ paper_title = metadata.get('title', '')
827
+ if not paper_title:
828
+ return False
829
+
830
+ # Check title similarity
831
+ title_similarity = calculate_title_similarity(search_title, paper_title)
832
+ if title_similarity < 0.7: # Require at least 70% similarity
833
+ return False
834
+
835
+ # Check year if provided
836
+ if year:
837
+ paper_year = metadata.get('year')
838
+ if paper_year and abs(int(paper_year) - year) > 1: # Allow 1 year difference
839
+ return False
840
+
841
+ # Check authors if provided
842
+ if authors and len(authors) > 0:
843
+ paper_authors = metadata.get('authors', [])
844
+ if paper_authors:
845
+ # Check if at least one author matches
846
+ author_match = False
847
+ for search_author in authors[:2]: # Check first 2 authors
848
+ for paper_author in paper_authors[:3]: # Check first 3 paper authors
849
+ if is_name_match(search_author, paper_author):
850
+ author_match = True
851
+ break
852
+ if author_match:
853
+ break
854
+
855
+ if not author_match:
856
+ return False
857
+
858
+ return True
859
+
860
+ def search_by_title(self, title: str, max_results: int = 5) -> List[Dict[str, Any]]:
861
+ """
862
+ Search OpenReview for papers by title using the working search API.
863
+
864
+ Args:
865
+ title: Paper title to search for
866
+ max_results: Maximum number of results to return
867
+
868
+ Returns:
869
+ List of paper data dictionaries
870
+ """
871
+ try:
872
+ # Use OpenReview's search API with term parameter (this works!)
873
+ params = {
874
+ 'term': title,
875
+ 'limit': max_results
876
+ }
877
+
878
+ response = self._respectful_request(f"{self.api_url}/notes/search", params=params)
879
+ if not response or response.status_code != 200:
880
+ logger.debug(f"OpenReview search API failed with status {response.status_code if response else 'None'}")
881
+ return []
882
+
883
+ data = response.json()
884
+ papers = []
885
+
886
+ for note in data.get('notes', []):
887
+ # Filter to exact or close title matches
888
+ note_title = note.get('content', {}).get('title', '')
889
+ if self._is_title_match(title, note_title):
890
+ paper_data = self._parse_api_response(note)
891
+ if paper_data:
892
+ papers.append(paper_data)
893
+
894
+ logger.debug(f"OpenReview search found {len(papers)} matching papers for '{title}'")
895
+ return papers
896
+
897
+ except Exception as e:
898
+ logger.error(f"Error searching OpenReview by title '{title}': {e}")
899
+ return []
900
+
901
+ def _is_title_match(self, search_title: str, found_title: str, threshold: float = 0.8) -> bool:
902
+ """
903
+ Check if two titles match closely enough.
904
+
905
+ Args:
906
+ search_title: Title we're searching for
907
+ found_title: Title found in search results
908
+ threshold: Similarity threshold (0.0 to 1.0)
909
+
910
+ Returns:
911
+ True if titles match closely enough
912
+ """
913
+ if not search_title or not found_title:
914
+ return False
915
+
916
+ # Exact match
917
+ if search_title.lower().strip() == found_title.lower().strip():
918
+ return True
919
+
920
+ # Check if one contains the other (for cases where one is longer)
921
+ search_clean = search_title.lower().strip()
922
+ found_clean = found_title.lower().strip()
923
+
924
+ if search_clean in found_clean or found_clean in search_clean:
925
+ return True
926
+
927
+ # Use similarity calculation from text_utils
928
+ try:
929
+ from utils.text_utils import calculate_title_similarity
930
+ similarity = calculate_title_similarity(search_title, found_title)
931
+ return similarity >= threshold
932
+ except ImportError:
933
+ # Fallback to simple word matching
934
+ search_words = set(search_clean.split())
935
+ found_words = set(found_clean.split())
936
+
937
+ if not search_words or not found_words:
938
+ return False
939
+
940
+ intersection = search_words.intersection(found_words)
941
+ union = search_words.union(found_words)
942
+
943
+ jaccard_similarity = len(intersection) / len(union) if union else 0
944
+ return jaccard_similarity >= threshold
945
+
946
+ def verify_reference_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
947
+ """
948
+ Verify a reference by searching OpenReview (for papers without URLs).
949
+
950
+ Args:
951
+ reference: Reference data dictionary
952
+
953
+ Returns:
954
+ Tuple of (verified_data, errors_and_warnings, debug_info)
955
+ """
956
+ title = reference.get('title', '').strip()
957
+ if not title:
958
+ return None, [], "No title provided for search"
959
+
960
+ # Search for the paper
961
+ search_results = self.search_by_title(title)
962
+
963
+ if not search_results:
964
+ return None, [], f"No papers found on OpenReview for title: {title}"
965
+
966
+ # Take the best match (first result, as search is already filtered)
967
+ best_match = search_results[0]
968
+
969
+ # Use the existing verify_reference method with the found URL
970
+ forum_url = best_match.get('forum_url')
971
+ if forum_url:
972
+ # Create a reference with the OpenReview URL for verification
973
+ reference_with_url = reference.copy()
974
+ reference_with_url['url'] = forum_url
975
+
976
+ return self.verify_reference(reference_with_url)
977
+
978
+ # If no URL, return the metadata as verification
979
+ return best_match, [], f"Found on OpenReview: {best_match.get('title')}"
@@ -583,8 +583,8 @@ class NonArxivReferenceChecker:
583
583
 
584
584
  if not (has_arxiv_url or has_arxiv_doi):
585
585
  errors.append({
586
- 'warning_type': 'url',
587
- 'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
586
+ 'info_type': 'url',
587
+ 'info_details': f"Reference could include arXiv URL: {arxiv_url}",
588
588
  'ref_url_correct': arxiv_url
589
589
  })
590
590
 
@@ -340,7 +340,7 @@ class ParallelReferenceProcessor:
340
340
  # Display errors and warnings
341
341
  if result.errors:
342
342
  # Check if there's an unverified error
343
- has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' for e in result.errors)
343
+ has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' or e.get('info_type') == 'unverified' for e in result.errors)
344
344
 
345
345
  if has_unverified_error:
346
346
  # Use the centralized unverified error display function from base checker
@@ -348,9 +348,9 @@ class ParallelReferenceProcessor:
348
348
 
349
349
  # Display all non-unverified errors and warnings
350
350
  for error in result.errors:
351
- if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified':
352
- error_type = error.get('error_type') or error.get('warning_type')
353
- error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
351
+ if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified' and error.get('info_type') != 'unverified':
352
+ error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
353
+ error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
354
354
 
355
355
  from utils.error_utils import print_labeled_multiline
356
356
 
@@ -359,8 +359,10 @@ class ParallelReferenceProcessor:
359
359
  print(f" ❌ {error_details}")
360
360
  elif 'error_type' in error:
361
361
  print_labeled_multiline("❌ Error", error_details)
362
- else:
362
+ elif 'warning_type' in error:
363
363
  print_labeled_multiline("⚠️ Warning", error_details)
364
+ else:
365
+ print_labeled_multiline("ℹ️ Information", error_details)
364
366
 
365
367
  # Show timing info for slow references
366
368
  if result.processing_time > 5.0:
@@ -2033,6 +2033,9 @@ class ArxivReferenceChecker:
2033
2033
  elif 'warning_type' in error:
2034
2034
  formatted_error['warning_type'] = error['warning_type']
2035
2035
  formatted_error['warning_details'] = error['warning_details']
2036
+ elif 'info_type' in error:
2037
+ formatted_error['info_type'] = error['info_type']
2038
+ formatted_error['info_details'] = error['info_details']
2036
2039
 
2037
2040
  # Add correct information based on error type
2038
2041
  if error.get('error_type') == 'author':
@@ -2042,6 +2045,8 @@ class ArxivReferenceChecker:
2042
2045
  elif error.get('error_type') == 'doi':
2043
2046
  from utils.doi_utils import construct_doi_url
2044
2047
  formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))
2048
+ elif error.get('info_type') == 'url':
2049
+ formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')
2045
2050
 
2046
2051
  formatted_errors.append(formatted_error)
2047
2052
 
@@ -2153,17 +2158,22 @@ class ArxivReferenceChecker:
2153
2158
  for error in errors:
2154
2159
  formatted_error = {}
2155
2160
 
2156
- # Handle error_type and warning_type properly
2161
+ # Handle error_type, warning_type, and info_type properly
2157
2162
  if 'error_type' in error:
2158
2163
  formatted_error['error_type'] = error['error_type']
2159
2164
  formatted_error['error_details'] = error['error_details']
2160
2165
  elif 'warning_type' in error:
2161
2166
  formatted_error['warning_type'] = error['warning_type']
2162
2167
  formatted_error['warning_details'] = error['warning_details']
2168
+ elif 'info_type' in error:
2169
+ formatted_error['info_type'] = error['info_type']
2170
+ formatted_error['info_details'] = error['info_details']
2163
2171
 
2164
2172
  # Add correct information based on error type
2165
2173
  if error.get('warning_type') == 'year':
2166
2174
  formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
2175
+ elif error.get('info_type') == 'url':
2176
+ formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')
2167
2177
 
2168
2178
  formatted_errors.append(formatted_error)
2169
2179
 
@@ -2214,13 +2224,16 @@ class ArxivReferenceChecker:
2214
2224
  for error in errors:
2215
2225
  formatted_error = {}
2216
2226
 
2217
- # Handle error_type and warning_type properly
2227
+ # Handle error_type, warning_type, and info_type properly
2218
2228
  if 'error_type' in error:
2219
2229
  formatted_error['error_type'] = error['error_type']
2220
2230
  formatted_error['error_details'] = error['error_details']
2221
2231
  elif 'warning_type' in error:
2222
2232
  formatted_error['warning_type'] = error['warning_type']
2223
2233
  formatted_error['warning_details'] = error['warning_details']
2234
+ elif 'info_type' in error:
2235
+ formatted_error['info_type'] = error['info_type']
2236
+ formatted_error['info_details'] = error['info_details']
2224
2237
 
2225
2238
  formatted_errors.append(formatted_error)
2226
2239
 
@@ -2335,13 +2348,16 @@ class ArxivReferenceChecker:
2335
2348
  logger.debug(f"DEBUG: Error {i}: {error}")
2336
2349
  formatted_error = {}
2337
2350
 
2338
- # Handle error_type and warning_type properly
2351
+ # Handle error_type, warning_type, and info_type properly
2339
2352
  if 'error_type' in error:
2340
2353
  formatted_error['error_type'] = error['error_type']
2341
2354
  formatted_error['error_details'] = error['error_details']
2342
2355
  elif 'warning_type' in error:
2343
2356
  formatted_error['warning_type'] = error['warning_type']
2344
2357
  formatted_error['warning_details'] = error['warning_details']
2358
+ elif 'info_type' in error:
2359
+ formatted_error['info_type'] = error['info_type']
2360
+ formatted_error['info_details'] = error['info_details']
2345
2361
 
2346
2362
  # Add correct information based on error type
2347
2363
  if error.get('error_type') == 'author':
@@ -2637,9 +2653,19 @@ class ArxivReferenceChecker:
2637
2653
 
2638
2654
  # Generate corrected reference using all available corrections
2639
2655
  corrected_data = self._extract_corrected_data_from_error(consolidated_entry, verified_data)
2640
- corrected_format = format_corrected_reference(reference, corrected_data, consolidated_entry)
2641
- if corrected_format:
2642
- consolidated_entry['ref_corrected_format'] = corrected_format
2656
+
2657
+ # Generate all three formats for user convenience
2658
+ from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
2659
+ plaintext_format = format_corrected_plaintext(reference, corrected_data, consolidated_entry)
2660
+ bibtex_format = format_corrected_bibtex(reference, corrected_data, consolidated_entry)
2661
+ bibitem_format = format_corrected_bibitem(reference, corrected_data, consolidated_entry)
2662
+
2663
+ if plaintext_format:
2664
+ consolidated_entry['ref_corrected_plaintext'] = plaintext_format
2665
+ if bibtex_format:
2666
+ consolidated_entry['ref_corrected_bibtex'] = bibtex_format
2667
+ if bibitem_format:
2668
+ consolidated_entry['ref_corrected_bibitem'] = bibitem_format
2643
2669
 
2644
2670
  # Store the consolidated entry (write to file at end of run)
2645
2671
  self.errors.append(consolidated_entry)
@@ -2647,8 +2673,8 @@ class ArxivReferenceChecker:
2647
2673
  else:
2648
2674
  # Single error - handle as before
2649
2675
  error = errors[0]
2650
- error_type = error.get('error_type') or error.get('warning_type', 'unknown')
2651
- error_details = error.get('error_details') or error.get('warning_details', '')
2676
+ error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type', 'unknown')
2677
+ error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', '')
2652
2678
 
2653
2679
  error_entry = {
2654
2680
  # Source paper metadata
@@ -2696,11 +2722,21 @@ class ArxivReferenceChecker:
2696
2722
  if error_type != 'unverified':
2697
2723
  error_entry['ref_standard_format'] = self.format_standard_reference(error)
2698
2724
 
2699
- # Generate corrected reference in original format
2725
+ # Generate corrected reference in all formats for user convenience
2700
2726
  corrected_data = self._extract_corrected_data_from_error(error, verified_data)
2701
- corrected_format = format_corrected_reference(reference, corrected_data, error_entry)
2702
- if corrected_format:
2703
- error_entry['ref_corrected_format'] = corrected_format
2727
+
2728
+ # Generate all three formats
2729
+ from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
2730
+ plaintext_format = format_corrected_plaintext(reference, corrected_data, error_entry)
2731
+ bibtex_format = format_corrected_bibtex(reference, corrected_data, error_entry)
2732
+ bibitem_format = format_corrected_bibitem(reference, corrected_data, error_entry)
2733
+
2734
+ if plaintext_format:
2735
+ error_entry['ref_corrected_plaintext'] = plaintext_format
2736
+ if bibtex_format:
2737
+ error_entry['ref_corrected_bibtex'] = bibtex_format
2738
+ if bibitem_format:
2739
+ error_entry['ref_corrected_bibitem'] = bibitem_format
2704
2740
  else:
2705
2741
  error_entry['ref_standard_format'] = None
2706
2742
 
@@ -2755,7 +2791,9 @@ class ArxivReferenceChecker:
2755
2791
  emoji = "❓"
2756
2792
  elif error_type in ['year', 'venue']: # Warning types
2757
2793
  emoji = "⚠️"
2758
- else: # Error types (title, author, doi, url, multiple, etc.)
2794
+ elif error_type == 'url': # Info type (ArXiv URL suggestion)
2795
+ emoji = "ℹ️"
2796
+ else: # Error types (title, author, doi, multiple, etc.)
2759
2797
  emoji = "❌"
2760
2798
 
2761
2799
  f.write(f"Type: {emoji} {error_entry['error_type']}\n")
@@ -2772,8 +2810,29 @@ class ArxivReferenceChecker:
2772
2810
  f.write(f" {error_entry['ref_verified_url']}\n")
2773
2811
  f.write("\n")
2774
2812
 
2775
- # Show corrected reference in original format if available
2776
- if error_entry.get('ref_corrected_format'):
2813
+ # Show corrected reference in all formats if available
2814
+ formats_written = False
2815
+
2816
+ # Plain text format
2817
+ if error_entry.get('ref_corrected_plaintext'):
2818
+ f.write("CORRECTED REFERENCE (Plain Text):\n")
2819
+ f.write(f"{error_entry['ref_corrected_plaintext']}\n\n")
2820
+ formats_written = True
2821
+
2822
+ # BibTeX format
2823
+ if error_entry.get('ref_corrected_bibtex'):
2824
+ f.write("CORRECTED REFERENCE (BibTeX):\n")
2825
+ f.write(f"{error_entry['ref_corrected_bibtex']}\n\n")
2826
+ formats_written = True
2827
+
2828
+ # Bibitem/LaTeX format
2829
+ if error_entry.get('ref_corrected_bibitem'):
2830
+ f.write("CORRECTED REFERENCE (LaTeX/Biblatex):\n")
2831
+ f.write(f"{error_entry['ref_corrected_bibitem']}\n\n")
2832
+ formats_written = True
2833
+
2834
+ # Fallback to legacy format if no new formats available
2835
+ if not formats_written and error_entry.get('ref_corrected_format'):
2777
2836
  f.write("CORRECTED REFERENCE:\n")
2778
2837
  f.write(f"{error_entry['ref_corrected_format']}\n\n")
2779
2838
 
@@ -2865,8 +2924,10 @@ class ArxivReferenceChecker:
2865
2924
  self.total_references_processed = 0
2866
2925
  self.papers_with_errors = 0
2867
2926
  self.papers_with_warnings = 0
2927
+ self.papers_with_info = 0
2868
2928
  self.total_errors_found = 0
2869
2929
  self.total_warnings_found = 0
2930
+ self.total_info_found = 0
2870
2931
  self.total_arxiv_refs = 0
2871
2932
  self.total_non_arxiv_refs = 0
2872
2933
  self.total_other_refs = 0
@@ -3025,18 +3086,21 @@ class ArxivReferenceChecker:
3025
3086
  # Separate actual errors from warnings for paper classification
3026
3087
  actual_errors = [e for e in paper_errors if 'error_type' in e and e['error_type'] != 'unverified']
3027
3088
  warnings_only = [e for e in paper_errors if 'warning_type' in e]
3089
+ info_only = [e for e in paper_errors if 'info_type' in e]
3028
3090
 
3029
3091
  if self.single_paper_mode:
3030
3092
  # Single paper mode - show simple summary
3031
- if actual_errors or warnings_only:
3093
+ if actual_errors or warnings_only or info_only:
3032
3094
  summary_parts = []
3033
3095
  if actual_errors:
3034
3096
  summary_parts.append(f"{len(actual_errors)} errors")
3035
3097
  if warnings_only:
3036
3098
  summary_parts.append(f"{len(warnings_only)} warnings")
3099
+ if info_only:
3100
+ summary_parts.append(f"{len(info_only)} information")
3037
3101
  else:
3038
3102
  # Multi-paper mode - track paper statistics
3039
- if actual_errors or warnings_only:
3103
+ if actual_errors or warnings_only or info_only:
3040
3104
  summary_parts = []
3041
3105
  if actual_errors:
3042
3106
  summary_parts.append(f"{len(actual_errors)} errors")
@@ -3045,6 +3109,10 @@ class ArxivReferenceChecker:
3045
3109
  summary_parts.append(f"{len(warnings_only)} warnings")
3046
3110
  # Count as paper with warnings if it has warnings (regardless of errors)
3047
3111
  self.papers_with_warnings += 1
3112
+ if info_only:
3113
+ summary_parts.append(f"{len(info_only)} information")
3114
+ # Count as paper with info if it has info messages (regardless of errors/warnings)
3115
+ self.papers_with_info += 1
3048
3116
 
3049
3117
  except Exception as e:
3050
3118
  logger.error(f"Error processing paper {paper_id}: {str(e)}")
@@ -3086,9 +3154,11 @@ class ArxivReferenceChecker:
3086
3154
  print(f"❌ Total errors: {self.total_errors_found}")
3087
3155
  if self.total_warnings_found > 0:
3088
3156
  print(f"⚠️ Total warnings: {self.total_warnings_found}")
3157
+ if self.total_info_found > 0:
3158
+ print(f"ℹ️ Total information: {self.total_info_found}")
3089
3159
  if self.total_unverified_refs > 0:
3090
3160
  print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
3091
- if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_unverified_refs == 0:
3161
+ if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_info_found == 0 and self.total_unverified_refs == 0:
3092
3162
  print(f"✅ All references verified successfully!")
3093
3163
 
3094
3164
  # Show warning if unreliable extraction was used and there are many errors
@@ -3108,6 +3178,8 @@ class ArxivReferenceChecker:
3108
3178
  print(f" Total errors: {self.total_errors_found}")
3109
3179
  print(f"⚠️ Papers with warnings: {self.papers_with_warnings}")
3110
3180
  print(f" Total warnings: {self.total_warnings_found}")
3181
+ print(f"ℹ️ Papers with information: {self.papers_with_info}")
3182
+ print(f" Total information: {self.total_info_found}")
3111
3183
  print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
3112
3184
 
3113
3185
  # Show warning if unreliable extraction was used and there are many errors
@@ -5307,7 +5379,7 @@ class ArxivReferenceChecker:
5307
5379
  # If errors found, add to dataset and optionally print details
5308
5380
  if errors:
5309
5381
  # Check if there's an unverified error among the errors
5310
- has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' for e in errors)
5382
+ has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' or e.get('info_type') == 'unverified' for e in errors)
5311
5383
 
5312
5384
  if has_unverified_error:
5313
5385
  self.total_unverified_refs += 1
@@ -5317,11 +5389,13 @@ class ArxivReferenceChecker:
5317
5389
  self.add_error_to_dataset(paper, reference, errors, reference_url, verified_data)
5318
5390
  paper_errors.extend(errors)
5319
5391
 
5320
- # Count errors vs warnings
5392
+ # Count errors vs warnings vs info
5321
5393
  error_count = sum(1 for e in errors if 'error_type' in e and e['error_type'] != 'unverified')
5322
5394
  warning_count = sum(1 for e in errors if 'warning_type' in e)
5395
+ info_count = sum(1 for e in errors if 'info_type' in e)
5323
5396
  self.total_errors_found += error_count
5324
5397
  self.total_warnings_found += warning_count
5398
+ self.total_info_found += info_count
5325
5399
 
5326
5400
  # Display all non-unverified errors and warnings
5327
5401
  self._display_non_unverified_errors(errors, debug_mode, print_output)
@@ -5468,9 +5542,9 @@ class ArxivReferenceChecker:
5468
5542
  """Display all non-unverified errors and warnings"""
5469
5543
  if not debug_mode and print_output:
5470
5544
  for error in errors:
5471
- if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified':
5472
- error_type = error.get('error_type') or error.get('warning_type')
5473
- error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
5545
+ if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified' and error.get('info_type') != 'unverified':
5546
+ error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
5547
+ error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
5474
5548
 
5475
5549
  from utils.error_utils import print_labeled_multiline
5476
5550
 
@@ -5478,8 +5552,10 @@ class ArxivReferenceChecker:
5478
5552
  print(f" ❌ {error_details}")
5479
5553
  elif 'error_type' in error:
5480
5554
  print_labeled_multiline("❌ Error", error_details)
5481
- else:
5555
+ elif 'warning_type' in error:
5482
5556
  print_labeled_multiline("⚠️ Warning", error_details)
5557
+ else:
5558
+ print_labeled_multiline("ℹ️ Information", error_details)
5483
5559
 
5484
5560
  def _output_reference_errors(self, reference, errors, url):
5485
5561
  """
@@ -392,41 +392,43 @@ def get_bibtex_content(paper):
392
392
  logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
393
393
  tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
394
394
 
395
- # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
396
- # .bbl files are processed biblatex output that reflects exactly what was cited
395
+ # Choose between .bib and .bbl files based on what the main TeX file actually uses
396
+ # Check the main TeX file to see if it uses \bibliography{...} (BibTeX) or not (BBL)
397
+ uses_bibtex = False
398
+ if tex_content:
399
+ # Look for \bibliography{...} commands in the main TeX file
400
+ bib_pattern = r'\\bibliography\{([^}]+)\}'
401
+ bib_matches = re.findall(bib_pattern, tex_content)
402
+ if bib_matches:
403
+ uses_bibtex = True
404
+ referenced_bibs = []
405
+ for match in bib_matches:
406
+ bib_names = [name.strip() for name in match.split(',')]
407
+ referenced_bibs.extend(bib_names)
408
+ logger.debug(f"Main TeX file references BibTeX files: {referenced_bibs}")
409
+
397
410
  if bib_content and bbl_content:
398
411
  # Count entries in both for logging
399
412
  bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
400
- bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
413
+ bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
401
414
 
402
415
  logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
403
416
 
404
- # Only use .bbl if it actually contains bibliography entries
405
- if bbl_entry_count > 0:
406
- logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
417
+ if uses_bibtex and bib_entry_count > 0:
418
+ logger.info(f"Using .bib files from ArXiv source (main TeX uses \\bibliography{{...}})")
419
+ return bib_content
420
+ elif bbl_entry_count > 0:
421
+ logger.info(f"Using .bbl files from ArXiv source (main TeX doesn't use \\bibliography or .bib is empty)")
407
422
  return bbl_content
408
- else:
423
+ elif bib_entry_count > 0:
409
424
  logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
410
- # If we have LaTeX content, filter BibTeX by cited keys
411
- if tex_content:
412
- cited_keys = extract_cited_keys_from_tex({}, tex_content)
413
- if cited_keys:
414
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
415
- filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
416
- return filtered_content
417
425
  return bib_content
426
+ else:
427
+ logger.warning(f"Both .bib and .bbl files appear to be empty")
428
+ return bib_content # Default to bib_content as fallback
418
429
 
419
430
  elif bib_content:
420
431
  logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
421
-
422
- # If we have LaTeX content, filter BibTeX by cited keys
423
- if tex_content:
424
- cited_keys = extract_cited_keys_from_tex({}, tex_content)
425
- if cited_keys:
426
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
427
- filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
428
- return filtered_content
429
-
430
432
  return bib_content
431
433
 
432
434
  elif bbl_content:
@@ -294,6 +294,39 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
294
294
  return warning_dict
295
295
 
296
296
 
297
+ def create_generic_info(info_type: str, info_details: str, **kwargs) -> Dict[str, Any]:
298
+ """
299
+ Create a generic info dictionary with custom fields.
300
+
301
+ Args:
302
+ info_type: Type of info (e.g., 'url')
303
+ info_details: Description of the information
304
+ **kwargs: Additional fields to include in the info dictionary
305
+
306
+ Returns:
307
+ Standardized info dictionary
308
+ """
309
+ info_dict = {
310
+ 'info_type': info_type,
311
+ 'info_details': info_details
312
+ }
313
+
314
+ info_dict.update(kwargs)
315
+ return info_dict
316
+
317
+
318
+ def create_info_message(reference, reason, arxiv_url=None):
319
+ """Create a standardized info message structure."""
320
+ info_msg = {
321
+ 'info_type': 'arxiv_url_available',
322
+ 'reference': reference,
323
+ 'reason': reason
324
+ }
325
+ if arxiv_url:
326
+ info_msg['arxiv_url'] = arxiv_url
327
+ return info_msg
328
+
329
+
297
330
  def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
298
331
  """
299
332
  Format a three-line author mismatch message.
@@ -2102,7 +2102,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2102
2102
  # Use standardized three-line formatting for author mismatch
2103
2103
  cited_display = format_author_for_display(cited_author)
2104
2104
  full_author_list = ', '.join(correct_names)
2105
- error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"Correct authors: {full_author_list}")
2105
+ error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"{full_author_list}")
2106
2106
  return False, error_msg
2107
2107
 
2108
2108
  return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
@@ -2337,6 +2337,9 @@ def format_author_for_display(author_name):
2337
2337
  if not author_name:
2338
2338
  return author_name
2339
2339
 
2340
+ # First clean the author name to remove asterisks and other unwanted characters
2341
+ author_name = clean_author_name(author_name)
2342
+
2340
2343
  # Clean up any stray punctuation that might have been attached during parsing
2341
2344
  author_name = author_name.strip()
2342
2345
  # Remove trailing semicolons that sometimes get attached during bibliographic parsing