academic-refchecker 1.2.50-py3-none-any.whl → 1.2.52-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/METADATA +10 -1
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/RECORD +15 -14
- checkers/github_checker.py +4 -1
- checkers/openreview_checker.py +10 -5
- checkers/pdf_paper_checker.py +493 -0
- checkers/semantic_scholar.py +8 -6
- checkers/webpage_checker.py +428 -2
- core/parallel_processor.py +4 -1
- core/refchecker.py +172 -75
- utils/text_utils.py +134 -13
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/top_level.txt +0 -0
checkers/semantic_scholar.py
CHANGED

@@ -28,7 +28,7 @@ import time
 import logging
 import re
 from typing import Dict, List, Tuple, Optional, Any, Union
-from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
+from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
 from utils.error_utils import format_title_mismatch
 from config.settings import get_config

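The two helpers added to this import, strip_latex_commands and compare_titles_with_latex_cleaning, live in utils/text_utils.py and are not shown in this diff. Judging from the call sites below and the "{LLM}s -> LLMs" comments, they behave roughly as follows; this is a sketch of assumed behavior, not the package's implementation:

# Hypothetical sketch of the new text_utils helpers (assumed behavior only).
import re
from difflib import SequenceMatcher

def strip_latex_commands(text: str) -> str:
    # Drop LaTeX commands such as \textbf and protective braces,
    # e.g. '{LLM}s are Few-Shot Learners' -> 'LLMs are Few-Shot Learners'.
    text = re.sub(r'\\[a-zA-Z]+', '', text)
    return text.replace('{', '').replace('}', '').strip()

def compare_titles_with_latex_cleaning(cited: str, found: str) -> float:
    # Similarity in [0, 1] computed after both titles are LaTeX-cleaned and
    # lower-cased, so brace markup alone never triggers a mismatch.
    a = strip_latex_commands(cited).lower()
    b = strip_latex_commands(found).lower()
    return SequenceMatcher(None, a, b).ratio()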
@@ -353,7 +353,7 @@ class NonArxivReferenceChecker:
         cited_title = title.strip()

         if cited_title and result_title:
-            title_similarity =
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, result_title)
             logger.debug(f"Semantic Scholar ArXiv search title similarity: {title_similarity:.3f}")
             logger.debug(f"Cited title: '{cited_title}'")
             logger.debug(f"Found title: '{result_title}'")

@@ -385,7 +385,7 @@
         logger.debug(f"DEBUG: cited_title='{cited_title}', arxiv_title='{arxiv_title}'")

         if cited_title and arxiv_title:
-            title_similarity =
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, arxiv_title)
             logger.debug(f"ArXiv API title similarity: {title_similarity:.3f}")
             logger.debug(f"Cited title: '{cited_title}'")
             logger.debug(f"ArXiv title: '{arxiv_title}'")

@@ -419,7 +419,7 @@
         arxiv_title_check = arxiv_paper_check.get('title', '').strip()
         cited_title_check = title.strip()
         if cited_title_check and arxiv_title_check:
-            title_similarity_check =
+            title_similarity_check = compare_titles_with_latex_cleaning(cited_title_check, arxiv_title_check)
             if title_similarity_check < SIMILARITY_THRESHOLD:
                 logger.debug(f"Detected ArXiv ID mismatch before raw text search - skipping unnecessary searches")
                 arxiv_id_mismatch_detected = True
@@ -468,11 +468,13 @@
             return None, [], None

         # Check title using similarity function to handle formatting differences
-        title_similarity =
+        title_similarity = compare_titles_with_latex_cleaning(title, found_title) if found_title else 0.0
         if found_title and title_similarity < SIMILARITY_THRESHOLD:
+            # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
+            clean_cited_title = strip_latex_commands(title)
             errors.append({
                 'error_type': 'title',
-                'error_details': format_title_mismatch(
+                'error_details': format_title_mismatch(clean_cited_title, found_title),
                 'ref_title_correct': paper_data.get('title', '')
             })

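The practical effect of these hunks: a cited title that differs from the found record only by BibTeX-style brace protection no longer trips a title error, and when a genuine mismatch is reported, the braces are stripped before the title is displayed. A small walk-through using the sketched helpers above (values are illustrative; SIMILARITY_THRESHOLD and format_title_mismatch come from the package and are only assumed here):

# Illustrative only; relies on the sketched helpers above.
cited_title = "Improving {LLM}s with Retrieval"    # as written in the .bib entry
found_title = "Improving LLMs with Retrieval"      # as returned by Semantic Scholar

compare_titles_with_latex_cleaning(cited_title, found_title)
# -> 1.0 after cleaning, so no 'title' error is appended

strip_latex_commands(cited_title)
# -> 'Improving LLMs with Retrieval', the form now passed to format_title_mismatch()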
checkers/webpage_checker.py
CHANGED

@@ -7,6 +7,7 @@ from urllib.parse import urlparse, urljoin
 from typing import Dict, Optional, Tuple, List, Any
 from bs4 import BeautifulSoup
 import time
+from utils.text_utils import strip_latex_commands

 logger = logging.getLogger(__name__)

@@ -185,9 +186,11 @@ class WebPageChecker:
         if cited_title and page_title:
             if not self._check_title_match(cited_title, page_title, page_description):
                 from utils.error_utils import format_title_mismatch
+                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                clean_cited_title = strip_latex_commands(cited_title)
                 errors.append({
                     "warning_type": "title",
-                    "warning_details": format_title_mismatch(
+                    "warning_details": format_title_mismatch(clean_cited_title, page_title)
                 })

         # Check if this is a documentation page for the cited topic
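Note how the two checkers label mismatches differently: semantic_scholar.py records a hard error ('error_type'/'error_details' plus the correct title), while webpage_checker.py records a softer warning ('warning_type'/'warning_details'). Illustrative shapes only; the exact message text produced by format_title_mismatch is not shown in this diff:

# Hypothetical mismatch records, with assumed message formats.
error_entry = {                                    # appended by semantic_scholar.py
    'error_type': 'title',
    'error_details': 'cited "Improving LLMs with Retrieval" vs. found "Improving LLM Systems with Retrieval"',
    'ref_title_correct': 'Improving LLM Systems with Retrieval',
}
warning_entry = {                                  # appended by webpage_checker.py
    "warning_type": "title",
    "warning_details": 'cited "Improving LLMs with Retrieval" vs. page title "Retrieval for LLMs - Project Docs"',
}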
@@ -509,4 +512,427 @@
             "warning_details": "Access blocked by site but domain is trusted (likely bot protection)"
         })

-        return verified_data, errors, web_url
+        return verified_data, errors, web_url
+
+    def check_unverified_url_reference(self, reference: Dict[str, Any]) -> str:
+        """
+        Check a URL from an unverified reference to determine the specific unverified reason
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            String with the specific unverified reason:
+            - "non-existent web page" if the page doesn't exist
+            - "paper not found and URL doesn't reference it" if page exists but doesn't contain title
+            - "paper not verified but URL references paper" if page exists and contains title
+        """
+        logger.debug(f"Checking unverified URL reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return "paper not found and URL doesn't reference it"  # No URL to check
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return "non-existent web page"
+
+        if response.status_code == 404:
+            return "non-existent web page"
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            return "paper not verified but URL references paper"
+        elif response.status_code != 200:
+            return "non-existent web page"
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, we can't search content, so assume it's referenced if accessible
+                return "paper not verified but URL references paper"
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return "paper not found and URL doesn't reference it"
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                return "paper not verified but URL references paper"
+
+            # Search for key words from the title
+            cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                              if len(word.strip('.,;:()[]{}')) > 3)
+
+            # Check if significant portion of title words appear in page
+            page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                             if len(word.strip('.,;:()[]{}')) > 3)
+
+            common_words = cited_words.intersection(page_words)
+
+            # If most of the title words are found, consider it referenced
+            if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                return "paper not verified but URL references paper"
+
+            # Also check the extracted title and description specifically
+            if page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    return "paper not verified but URL references paper"
+
+            # Title not found in page content
+            return "paper not found and URL doesn't reference it"
+
+        except Exception as e:
+            logger.error(f"Error checking unverified URL {web_url}: {e}")
+            return "paper not found and URL doesn't reference it"
+
+    def verify_raw_url_for_unverified_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries with specific unverified reasons
+            - url: The URL that was checked
+        """
+        logger.debug(f"Verifying raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], None
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        if response.status_code == 404:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            # If no venue, treat as verified since URL is accessible
+            if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                verified_data = {
+                    'title': reference.get('title', ''),
+                    'authors': reference.get('authors', []),
+                    'year': reference.get('year'),
+                    'venue': 'Web Page',
+                    'url': web_url,
+                    'web_metadata': {
+                        'status_code': 403,
+                        'access_blocked': True
+                    }
+                }
+                return verified_data, [], web_url
+            else:
+                return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+        elif response.status_code != 200:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, if no venue specified, treat as verified
+                if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': 'PDF Document',
+                        'url': web_url,
+                        'web_metadata': {
+                            'content_type': response.headers.get('content-type', ''),
+                            'status_code': response.status_code
+                        }
+                    }
+                    return verified_data, [], web_url
+                else:
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+            title_found = False
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                title_found = True
+
+            # Search for key words from the title
+            if not title_found:
+                cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                                  if len(word.strip('.,;:()[]{}')) > 3)
+
+                # Check if significant portion of title words appear in page
+                page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                                 if len(word.strip('.,;:()[]{}')) > 3)
+
+                common_words = cited_words.intersection(page_words)
+
+                # If most of the title words are found, consider it referenced
+                if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                    title_found = True
+
+            # Also check the extracted title and description specifically
+            if not title_found and page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    title_found = True
+
+            # Determine if this should be verified or unverified
+            if title_found:
+                # Check if reference should be verified based on venue type
+                venue_field = reference.get('journal') or reference.get('venue') or reference.get('booktitle')
+
+                if not venue_field:
+                    # No venue specified - verify with URL as venue
+                    site_info = self._extract_site_info(soup, web_url)
+                    venue = site_info.get('organization', 'Web Page') if site_info.get('organization') != site_info.get('domain') else 'Web Page'
+
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue,
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': site_info,
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid source for reference without venue: {web_url}")
+                    return verified_data, [], web_url
+                elif self._is_web_content_venue(venue_field, web_url):
+                    # Has venue but it's a web content venue (news, blog, etc.) - verify it
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue_field,  # Keep the original venue
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': self._extract_site_info(soup, web_url),
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid web content source: {web_url}")
+                    return verified_data, [], web_url
+                else:
+                    # Has academic venue but URL references paper - still unverified (needs proper paper verification)
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+            else:
+                # Title not found in page content
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+        except Exception as e:
+            logger.error(f"Error checking raw URL {web_url}: {e}")
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+    def _is_web_content_venue(self, venue: str, url: str) -> bool:
+        """
+        Determine if a venue represents web content rather than academic publication
+
+        Args:
+            venue: The venue string (journal, venue, or booktitle)
+            url: The URL being checked (for additional context)
+
+        Returns:
+            True if this represents web content that can be verified via URL
+        """
+        if not venue:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # News organizations and media outlets
+        news_indicators = [
+            'news', 'cbc', 'bbc', 'cnn', 'reuters', 'associated press', 'ap news',
+            'npr', 'pbs', 'abc news', 'nbc news', 'fox news', 'guardian', 'times',
+            'post', 'herald', 'tribune', 'gazette', 'chronicle', 'observer',
+            'magazine', 'weekly', 'daily', 'today', 'report', 'wire', 'press'
+        ]
+
+        # Special case for Wall Street Journal
+        if any(word in venue_lower for word in ['wall street', 'wsj']):
+            news_indicators.append('journal')
+
+        # Technology and industry publications
+        tech_publications = [
+            'techcrunch', 'wired', 'ars technica', 'the verge', 'engadget',
+            'zdnet', 'cnet', 'computerworld', 'infoworld', 'pcmag', 'pcworld',
+            'ieee spectrum', 'mit technology review', 'scientific american'
+        ]
+
+        # Blogs and web platforms
+        blog_platforms = [
+            'blog', 'medium', 'substack', 'wordpress', 'blogspot', 'tumblr',
+            'linkedin', 'facebook', 'twitter', 'reddit', 'stack overflow',
+            'github pages', 'personal website', 'company blog'
+        ]
+
+        # Government and organizational websites
+        org_indicators = [
+            'government', 'gov', '.org', 'agency', 'department', 'ministry',
+            'commission', 'bureau', 'office', 'administration', 'institute',
+            'foundation', 'association', 'society', 'center', 'centre',
+            'council', 'committee', 'board', 'union', 'federation', 'alliance',
+            'coalition', 'consortium', 'network', 'group', 'organization',
+            'organisation', 'corp', 'corporation', 'company', 'ltd', 'inc'
+        ]
+
+        # Documentation and technical resources
+        tech_resources = [
+            'documentation', 'docs', 'api', 'reference', 'guide', 'tutorial',
+            'manual', 'readme', 'wiki', 'help', 'support', 'developer',
+            'technical', 'white paper', 'whitepaper', 'brief', 'overview',
+            'policy', 'strategy', 'report', 'study', 'analysis', 'research'
+        ]
+
+        # Check URL domain for additional context
+        url_lower = url.lower() if url else ''
+
+        # Known web content domains in URL
+        web_domains = [
+            'cbc.ca', 'bbc.com', 'cnn.com', 'reuters.com', 'npr.org', 'pbs.org',
+            'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'wsj.com',
+            'techcrunch.com', 'wired.com', 'theverge.com', 'arstechnica.com',
+            'medium.com', 'substack.com', 'linkedin.com', 'github.io',
+            'readthedocs.io', 'stackoverflow.com', 'reddit.com'
+        ]
+
+        # Combine all indicators
+        all_indicators = news_indicators + tech_publications + blog_platforms + org_indicators + tech_resources
+
+        # Academic venue indicators that should NOT be considered web content
+        academic_indicators = [
+            'proceedings', 'conference', 'symposium', 'workshop', 'transactions',
+            'journal of', 'international journal', 'acm', 'ieee', 'springer',
+            'nature', 'science', 'cell', 'lancet', 'plos', 'arxiv', 'pubmed',
+            'artificial intelligence', 'machine learning', 'computer vision',
+            'neural', 'computing', 'robotics', 'bioinformatics'
+        ]
+
+        # Check if venue is clearly academic (should not be treated as web content)
+        is_academic = any(indicator in venue_lower for indicator in academic_indicators)
+        if is_academic:
+            return False
+
+        # Check if venue matches any web content indicators
+        venue_matches = any(indicator and indicator in venue_lower for indicator in all_indicators)
+
+        # Check if URL domain suggests web content
+        url_matches = any(domain in url_lower for domain in web_domains)
+
+        # Special case: if URL contains news/blog/docs indicators, lean towards web content
+        url_content_indicators = ['news', 'blog', 'post', 'article', 'docs', 'help', 'guide', 'resources', 'policy', 'strategy']
+        url_has_content_indicators = any(indicator in url_lower for indicator in url_content_indicators)
+
+        # Special case: Check if venue is an organizational acronym/name that matches the URL domain
+        # This handles cases like "AECEA" on aecea.ca domain
+        organizational_match = self._check_organizational_venue_match(venue, url_lower)
+
+        return venue_matches or url_matches or url_has_content_indicators or organizational_match
+
+    def _check_organizational_venue_match(self, venue: str, url_lower: str) -> bool:
+        """
+        Check if the venue represents an organization that matches the URL domain
+
+        Args:
+            venue: The venue string
+            url_lower: The lowercased URL
+
+        Returns:
+            True if venue appears to be the organization publishing on their own domain
+        """
+        if not venue or not url_lower:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # Extract domain from URL
+        from urllib.parse import urlparse
+        try:
+            parsed_url = urlparse(url_lower)
+            domain = parsed_url.netloc.lower()
+
+            # Remove common prefixes
+            domain = domain.replace('www.', '')
+
+            # Check if venue is likely an acronym (short, all caps or mixed case)
+            is_likely_acronym = (len(venue) <= 10 and
+                                 (venue.isupper() or
+                                  any(c.isupper() for c in venue) and len(venue.split()) == 1))
+
+            # Check if venue appears in domain
+            venue_clean = ''.join(c for c in venue_lower if c.isalnum())
+
+            if venue_clean and venue_clean in domain:
+                return True
+
+            # For acronyms, check if the acronym could match the domain
+            if is_likely_acronym:
+                # Split venue into words and check if initials match domain
+                venue_words = venue_lower.replace('.', ' ').split()
+                if len(venue_words) == 1 and len(venue_words[0]) <= 6:
+                    # Single word acronym - check if it's in the domain
+                    if venue_words[0] in domain:
+                        return True
+
+            # Check for educational/professional associations with .ca, .org, .edu domains
+            if any(domain.endswith(tld) for tld in ['.ca', '.org', '.edu', '.gov']):
+                # These domains often host organizational content
+                if any(org_word in venue_lower for org_word in [
+                    'association', 'society', 'institute', 'foundation', 'center',
+                    'centre', 'council', 'committee', 'board', 'agency', 'department'
+                ]):
+                    return True
+
+                # Check if venue is a short organizational name/acronym
+                if is_likely_acronym:
+                    return True
+
+            return False
+
+        except Exception:
+            return False
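Taken together, the four new methods give WebPageChecker a fallback path for references that could not be verified against a bibliographic source but do carry a URL: a page counts as referencing the paper when the cited title appears verbatim in the page text, when at least 60% of its words longer than three characters appear there, or when the extracted page title/description match. A hypothetical caller might wire this up as follows (the import path, the no-argument constructor, and all concrete values are assumptions for illustration):

# Illustrative driver for the new unverified-URL fallback; not part of the package.
from checkers.webpage_checker import WebPageChecker   # import path assumed

checker = WebPageChecker()                             # assuming a no-argument constructor

reference = {
    'title': 'Early Childhood Educator Funding Brief',
    'authors': ['AECEA Policy Team'],
    'year': 2023,
    'venue': 'AECEA',   # organizational acronym, not an academic venue
    'url': 'https://aecea.ca/resources/funding-brief',
}

# Fine-grained reason string for reporting:
reason = checker.check_unverified_url_reference(reference)
# -> one of "non-existent web page",
#           "paper not found and URL doesn't reference it",
#           "paper not verified but URL references paper"

# Or try to promote the reference to verified, using the page itself as the source:
verified_data, errors, url = checker.verify_raw_url_for_unverified_reference(reference)
if verified_data is not None:
    # If the page mentions the cited title, the venue "AECEA" on aecea.ca passes
    # _is_web_content_venue() via the organizational-domain match, so the
    # reference is accepted as web content rather than left unverified.
    print(verified_data['venue'], verified_data['web_metadata']['status_code'])
else:
    print(errors[0]['error_details'])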
core/parallel_processor.py
CHANGED

@@ -275,7 +275,10 @@ class ParallelReferenceProcessor:
         reference = result.reference

         # Print reference info in the same format as sequential mode
-
+        raw_title = reference.get('title', 'Untitled')
+        # Clean LaTeX commands from title for display
+        from utils.text_utils import strip_latex_commands
+        title = strip_latex_commands(raw_title)
         from utils.text_utils import format_authors_for_display
         authors = format_authors_for_display(reference.get('authors', []))
         year = reference.get('year', '')