academic-refchecker 1.2.50__py3-none-any.whl → 1.2.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,7 +28,7 @@ import time
 import logging
 import re
 from typing import Dict, List, Tuple, Optional, Any, Union
-from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
+from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
 from utils.error_utils import format_title_mismatch
 from config.settings import get_config
 
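Note: neither new helper's implementation appears in this diff. As a rough sketch of the behavior implied by the call sites below (brace groups such as `{LLM}s` collapse to `LLMs`, and cleaning happens before similarity scoring), it plausibly looks like the following; only the names come from the import line above, the bodies are assumptions:

```python
# Minimal sketch, NOT the shipped implementation in utils.text_utils.
import re

def strip_latex_commands(text: str) -> str:
    # Assumed: drop simple LaTeX markup for display, e.g. '{LLM}s' -> 'LLMs'
    # and '\textbf{robust}' -> 'robust'.
    text = re.sub(r'\\[a-zA-Z]+\s*\{([^{}]*)\}', r'\1', text)  # \cmd{arg} -> arg
    return text.replace('{', '').replace('}', '')              # stray braces

def compare_titles_with_latex_cleaning(cited: str, found: str) -> float:
    # Assumed: clean both titles, then defer to the existing case-insensitive
    # scorer (calculate_title_similarity, imported on the same line above).
    return calculate_title_similarity(
        strip_latex_commands(cited).lower(),
        strip_latex_commands(found).lower(),
    )
```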
@@ -353,7 +353,7 @@ class NonArxivReferenceChecker:
         cited_title = title.strip()
 
         if cited_title and result_title:
-            title_similarity = calculate_title_similarity(cited_title.lower(), result_title.lower())
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, result_title)
             logger.debug(f"Semantic Scholar ArXiv search title similarity: {title_similarity:.3f}")
             logger.debug(f"Cited title: '{cited_title}'")
             logger.debug(f"Found title: '{result_title}'")
@@ -385,7 +385,7 @@ class NonArxivReferenceChecker:
         logger.debug(f"DEBUG: cited_title='{cited_title}', arxiv_title='{arxiv_title}'")
 
         if cited_title and arxiv_title:
-            title_similarity = calculate_title_similarity(cited_title.lower(), arxiv_title.lower())
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, arxiv_title)
             logger.debug(f"ArXiv API title similarity: {title_similarity:.3f}")
             logger.debug(f"Cited title: '{cited_title}'")
             logger.debug(f"ArXiv title: '{arxiv_title}'")
@@ -419,7 +419,7 @@ class NonArxivReferenceChecker:
         arxiv_title_check = arxiv_paper_check.get('title', '').strip()
         cited_title_check = title.strip()
         if cited_title_check and arxiv_title_check:
-            title_similarity_check = calculate_title_similarity(cited_title_check.lower(), arxiv_title_check.lower())
+            title_similarity_check = compare_titles_with_latex_cleaning(cited_title_check, arxiv_title_check)
             if title_similarity_check < SIMILARITY_THRESHOLD:
                 logger.debug(f"Detected ArXiv ID mismatch before raw text search - skipping unnecessary searches")
                 arxiv_id_mismatch_detected = True
@@ -468,11 +468,13 @@ class NonArxivReferenceChecker:
             return None, [], None
 
         # Check title using similarity function to handle formatting differences
-        title_similarity = calculate_title_similarity(title, found_title) if found_title else 0.0
+        title_similarity = compare_titles_with_latex_cleaning(title, found_title) if found_title else 0.0
         if found_title and title_similarity < SIMILARITY_THRESHOLD:
+            # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
+            clean_cited_title = strip_latex_commands(title)
             errors.append({
                 'error_type': 'title',
-                'error_details': format_title_mismatch(title, found_title),
+                'error_details': format_title_mismatch(clean_cited_title, found_title),
                 'ref_title_correct': paper_data.get('title', '')
             })
 
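Concretely, the display change means a LaTeX-braced cited title no longer leaks into the mismatch message. A hypothetical example (titles invented; the exact message format of format_title_mismatch is not shown in this diff):

```python
cited = "Evaluating {LLM}s on {NLP} Benchmarks"
found = "Evaluating LLMs on NLP Benchmarks: A Survey"

clean_cited = strip_latex_commands(cited)  # -> "Evaluating LLMs on NLP Benchmarks"
message = format_title_mismatch(clean_cited, found)  # braces no longer shown to the user
```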
@@ -7,6 +7,7 @@ from urllib.parse import urlparse, urljoin
 from typing import Dict, Optional, Tuple, List, Any
 from bs4 import BeautifulSoup
 import time
+from utils.text_utils import strip_latex_commands
 
 logger = logging.getLogger(__name__)
 
@@ -185,9 +186,11 @@ class WebPageChecker:
         if cited_title and page_title:
             if not self._check_title_match(cited_title, page_title, page_description):
                 from utils.error_utils import format_title_mismatch
+                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                clean_cited_title = strip_latex_commands(cited_title)
                 errors.append({
                     "warning_type": "title",
-                    "warning_details": format_title_mismatch(cited_title, page_title)
+                    "warning_details": format_title_mismatch(clean_cited_title, page_title)
                 })
 
         # Check if this is a documentation page for the cited topic
@@ -509,4 +512,427 @@ class WebPageChecker:
                 "warning_details": "Access blocked by site but domain is trusted (likely bot protection)"
             })
 
-        return verified_data, errors, web_url
+        return verified_data, errors, web_url
+
+    def check_unverified_url_reference(self, reference: Dict[str, Any]) -> str:
+        """
+        Check a URL from an unverified reference to determine the specific unverified reason
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            String with the specific unverified reason:
+            - "non-existent web page" if the page doesn't exist
+            - "paper not found and URL doesn't reference it" if page exists but doesn't contain title
+            - "paper not verified but URL references paper" if page exists and contains title
+        """
+        logger.debug(f"Checking unverified URL reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return "paper not found and URL doesn't reference it"  # No URL to check
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return "non-existent web page"
+
+        if response.status_code == 404:
+            return "non-existent web page"
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            return "paper not verified but URL references paper"
+        elif response.status_code != 200:
+            return "non-existent web page"
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, we can't search content, so assume it's referenced if accessible
+                return "paper not verified but URL references paper"
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return "paper not found and URL doesn't reference it"
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                return "paper not verified but URL references paper"
+
+            # Search for key words from the title
+            cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                              if len(word.strip('.,;:()[]{}')) > 3)
+
+            # Check if significant portion of title words appear in page
+            page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                             if len(word.strip('.,;:()[]{}')) > 3)
+
+            common_words = cited_words.intersection(page_words)
+
+            # If most of the title words are found, consider it referenced
+            if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                return "paper not verified but URL references paper"
+
+            # Also check the extracted title and description specifically
+            if page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    return "paper not verified but URL references paper"
+
+            # Title not found in page content
+            return "paper not found and URL doesn't reference it"
+
+        except Exception as e:
+            logger.error(f"Error checking unverified URL {web_url}: {e}")
+            return "paper not found and URL doesn't reference it"
+
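A hypothetical call, for orientation (constructing WebPageChecker with no arguments is an assumption; the constructor is not part of this diff):

```python
checker = WebPageChecker()  # assumed default constructor
reason = checker.check_unverified_url_reference({
    'title': 'Attention Is All You Need',
    'url': 'https://example.com/some-page',
})
# reason is exactly one of the three strings from the docstring:
#   "non-existent web page"
#   "paper not found and URL doesn't reference it"
#   "paper not verified but URL references paper"
```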
+    def verify_raw_url_for_unverified_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries with specific unverified reasons
+            - url: The URL that was checked
+        """
+        logger.debug(f"Verifying raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], None
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        if response.status_code == 404:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            # If no venue, treat as verified since URL is accessible
+            if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                verified_data = {
+                    'title': reference.get('title', ''),
+                    'authors': reference.get('authors', []),
+                    'year': reference.get('year'),
+                    'venue': 'Web Page',
+                    'url': web_url,
+                    'web_metadata': {
+                        'status_code': 403,
+                        'access_blocked': True
+                    }
+                }
+                return verified_data, [], web_url
+            else:
+                return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+        elif response.status_code != 200:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, if no venue specified, treat as verified
+                if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': 'PDF Document',
+                        'url': web_url,
+                        'web_metadata': {
+                            'content_type': response.headers.get('content-type', ''),
+                            'status_code': response.status_code
+                        }
+                    }
+                    return verified_data, [], web_url
+                else:
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+            title_found = False
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                title_found = True
+
+            # Search for key words from the title
+            if not title_found:
+                cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                                  if len(word.strip('.,;:()[]{}')) > 3)
+
+                # Check if significant portion of title words appear in page
+                page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                                 if len(word.strip('.,;:()[]{}')) > 3)
+
+                common_words = cited_words.intersection(page_words)
+
+                # If most of the title words are found, consider it referenced
+                if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                    title_found = True
+
+            # Also check the extracted title and description specifically
+            if not title_found and page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    title_found = True
+
+            # Determine if this should be verified or unverified
+            if title_found:
+                # Check if reference should be verified based on venue type
+                venue_field = reference.get('journal') or reference.get('venue') or reference.get('booktitle')
+
+                if not venue_field:
+                    # No venue specified - verify with URL as venue
+                    site_info = self._extract_site_info(soup, web_url)
+                    venue = site_info.get('organization', 'Web Page') if site_info.get('organization') != site_info.get('domain') else 'Web Page'
+
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue,
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': site_info,
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid source for reference without venue: {web_url}")
+                    return verified_data, [], web_url
+                elif self._is_web_content_venue(venue_field, web_url):
+                    # Has venue but it's a web content venue (news, blog, etc.) - verify it
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue_field,  # Keep the original venue
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': self._extract_site_info(soup, web_url),
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid web content source: {web_url}")
+                    return verified_data, [], web_url
+                else:
+                    # Has academic venue but URL references paper - still unverified (needs proper paper verification)
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+            else:
+                # Title not found in page content
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+        except Exception as e:
+            logger.error(f"Error checking raw URL {web_url}: {e}")
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
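Unlike the method above, this one can promote a reference to verified. A sketch of how a caller might consume the tuple (reference values invented):

```python
verified, errors, url = checker.verify_raw_url_for_unverified_reference({
    'title': 'Provincial Child Care Funding Update',
    'url': 'https://example.org/news/funding-update',
    'venue': 'Example Daily News',  # a web-content venue per _is_web_content_venue below
})
if verified is not None:
    print(f"verified via {url}, venue={verified['venue']!r}")
else:
    print(errors[0]['error_details'])  # one of the specific unverified reasons
```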
+    def _is_web_content_venue(self, venue: str, url: str) -> bool:
+        """
+        Determine if a venue represents web content rather than academic publication
+
+        Args:
+            venue: The venue string (journal, venue, or booktitle)
+            url: The URL being checked (for additional context)
+
+        Returns:
+            True if this represents web content that can be verified via URL
+        """
+        if not venue:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # News organizations and media outlets
+        news_indicators = [
+            'news', 'cbc', 'bbc', 'cnn', 'reuters', 'associated press', 'ap news',
+            'npr', 'pbs', 'abc news', 'nbc news', 'fox news', 'guardian', 'times',
+            'post', 'herald', 'tribune', 'gazette', 'chronicle', 'observer',
+            'magazine', 'weekly', 'daily', 'today', 'report', 'wire', 'press'
+        ]
+
+        # Special case for Wall Street Journal
+        if any(word in venue_lower for word in ['wall street', 'wsj']):
+            news_indicators.append('journal')
+
+        # Technology and industry publications
+        tech_publications = [
+            'techcrunch', 'wired', 'ars technica', 'the verge', 'engadget',
+            'zdnet', 'cnet', 'computerworld', 'infoworld', 'pcmag', 'pcworld',
+            'ieee spectrum', 'mit technology review', 'scientific american'
+        ]
+
+        # Blogs and web platforms
+        blog_platforms = [
+            'blog', 'medium', 'substack', 'wordpress', 'blogspot', 'tumblr',
+            'linkedin', 'facebook', 'twitter', 'reddit', 'stack overflow',
+            'github pages', 'personal website', 'company blog'
+        ]
+
+        # Government and organizational websites
+        org_indicators = [
+            'government', 'gov', '.org', 'agency', 'department', 'ministry',
+            'commission', 'bureau', 'office', 'administration', 'institute',
+            'foundation', 'association', 'society', 'center', 'centre',
+            'council', 'committee', 'board', 'union', 'federation', 'alliance',
+            'coalition', 'consortium', 'network', 'group', 'organization',
+            'organisation', 'corp', 'corporation', 'company', 'ltd', 'inc'
+        ]
+
+        # Documentation and technical resources
+        tech_resources = [
+            'documentation', 'docs', 'api', 'reference', 'guide', 'tutorial',
+            'manual', 'readme', 'wiki', 'help', 'support', 'developer',
+            'technical', 'white paper', 'whitepaper', 'brief', 'overview',
+            'policy', 'strategy', 'report', 'study', 'analysis', 'research'
+        ]
+
+        # Check URL domain for additional context
+        url_lower = url.lower() if url else ''
+
+        # Known web content domains in URL
+        web_domains = [
+            'cbc.ca', 'bbc.com', 'cnn.com', 'reuters.com', 'npr.org', 'pbs.org',
+            'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'wsj.com',
+            'techcrunch.com', 'wired.com', 'theverge.com', 'arstechnica.com',
+            'medium.com', 'substack.com', 'linkedin.com', 'github.io',
+            'readthedocs.io', 'stackoverflow.com', 'reddit.com'
+        ]
+
+        # Combine all indicators
+        all_indicators = news_indicators + tech_publications + blog_platforms + org_indicators + tech_resources
+
+        # Academic venue indicators that should NOT be considered web content
+        academic_indicators = [
+            'proceedings', 'conference', 'symposium', 'workshop', 'transactions',
+            'journal of', 'international journal', 'acm', 'ieee', 'springer',
+            'nature', 'science', 'cell', 'lancet', 'plos', 'arxiv', 'pubmed',
+            'artificial intelligence', 'machine learning', 'computer vision',
+            'neural', 'computing', 'robotics', 'bioinformatics'
+        ]
+
+        # Check if venue is clearly academic (should not be treated as web content)
+        is_academic = any(indicator in venue_lower for indicator in academic_indicators)
+        if is_academic:
+            return False
+
+        # Check if venue matches any web content indicators
+        venue_matches = any(indicator and indicator in venue_lower for indicator in all_indicators)
+
+        # Check if URL domain suggests web content
+        url_matches = any(domain in url_lower for domain in web_domains)
+
+        # Special case: if URL contains news/blog/docs indicators, lean towards web content
+        url_content_indicators = ['news', 'blog', 'post', 'article', 'docs', 'help', 'guide', 'resources', 'policy', 'strategy']
+        url_has_content_indicators = any(indicator in url_lower for indicator in url_content_indicators)
+
+        # Special case: Check if venue is an organizational acronym/name that matches the URL domain
+        # This handles cases like "AECEA" on aecea.ca domain
+        organizational_match = self._check_organizational_venue_match(venue, url_lower)
+
+        return venue_matches or url_matches or url_has_content_indicators or organizational_match
+
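Expected decisions, reading the indicator lists above (illustrative, not from a test suite):

```python
checker._is_web_content_venue('CBC News', 'https://www.cbc.ca/news/some-story')
# -> True: 'news' and 'cbc' indicators match, and cbc.ca is a known web domain.

checker._is_web_content_venue('Proceedings of NeurIPS', 'https://example.com/paper')
# -> False: 'proceedings' is an academic indicator, which short-circuits first.

checker._is_web_content_venue('Wall Street Journal', 'https://www.wsj.com/articles/x')
# -> True: 'journal' counts only because the 'wall street'/'wsj' special case adds it.
```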
+    def _check_organizational_venue_match(self, venue: str, url_lower: str) -> bool:
+        """
+        Check if the venue represents an organization that matches the URL domain
+
+        Args:
+            venue: The venue string
+            url_lower: The lowercased URL
+
+        Returns:
+            True if venue appears to be the organization publishing on their own domain
+        """
+        if not venue or not url_lower:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # Extract domain from URL
+        from urllib.parse import urlparse
+        try:
+            parsed_url = urlparse(url_lower)
+            domain = parsed_url.netloc.lower()
+
+            # Remove common prefixes
+            domain = domain.replace('www.', '')
+
+            # Check if venue is likely an acronym (short, all caps or mixed case)
+            is_likely_acronym = (len(venue) <= 10 and
+                                 (venue.isupper() or
+                                  any(c.isupper() for c in venue) and len(venue.split()) == 1))
+
+            # Check if venue appears in domain
+            venue_clean = ''.join(c for c in venue_lower if c.isalnum())
+
+            if venue_clean and venue_clean in domain:
+                return True
+
+            # For acronyms, check if the acronym could match the domain
+            if is_likely_acronym:
+                # Split venue into words and check if initials match domain
+                venue_words = venue_lower.replace('.', ' ').split()
+                if len(venue_words) == 1 and len(venue_words[0]) <= 6:
+                    # Single word acronym - check if it's in the domain
+                    if venue_words[0] in domain:
+                        return True
+
+            # Check for educational/professional associations with .ca, .org, .edu domains
+            if any(domain.endswith(tld) for tld in ['.ca', '.org', '.edu', '.gov']):
+                # These domains often host organizational content
+                if any(org_word in venue_lower for org_word in [
+                    'association', 'society', 'institute', 'foundation', 'center',
+                    'centre', 'council', 'committee', 'board', 'agency', 'department'
+                ]):
+                    return True
+
+                # Check if venue is a short organizational name/acronym
+                if is_likely_acronym:
+                    return True
+
+            return False
+
+        except Exception:
+            return False
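Illustrative behavior for the domain-matching heuristic (the AECEA case comes from the comment in _is_web_content_venue):

```python
checker._check_organizational_venue_match('AECEA', 'https://aecea.ca/resources/update')
# -> True: 'aecea' appears directly in the aecea.ca domain.

checker._check_organizational_venue_match('Nature', 'https://www.nature.com/articles/x')
# -> True here too ('nature' is in nature.com), which is why the academic-indicator
#    check in _is_web_content_venue runs before this heuristic is ever consulted.
```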
@@ -275,7 +275,10 @@ class ParallelReferenceProcessor:
         reference = result.reference
 
         # Print reference info in the same format as sequential mode
-        title = reference.get('title', 'Untitled')
+        raw_title = reference.get('title', 'Untitled')
+        # Clean LaTeX commands from title for display
+        from utils.text_utils import strip_latex_commands
+        title = strip_latex_commands(raw_title)
         from utils.text_utils import format_authors_for_display
         authors = format_authors_for_display(reference.get('authors', []))
         year = reference.get('year', '')