academic-refchecker 1.2.55__tar.gz → 1.2.57__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.55/src/academic_refchecker.egg-info → academic_refchecker-1.2.57}/PKG-INFO +1 -1
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/requirements.txt +1 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- academic_refchecker-1.2.57/src/refchecker/__version__.py +5 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/local_semantic_scholar.py +4 -5
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/semantic_scholar.py +38 -24
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/refchecker.py +13 -17
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/providers.py +17 -1
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/pdf_processor.py +22 -2
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/error_utils.py +8 -8
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/url_utils.py +8 -5
- academic_refchecker-1.2.55/src/refchecker/__version__.py +0 -5
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/LICENSE +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/README.md +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/pyproject.toml +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/setup.cfg +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/__main__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/logging.conf +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/settings.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/database/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/base.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/__init__.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/arxiv_utils.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/text_utils.py +0 -0
- {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/unicode_utils.py +0 -0
|
@@ -469,15 +469,14 @@ class LocalNonArxivReferenceChecker:
|
|
|
469
469
|
# since this is a Semantic Scholar database checker
|
|
470
470
|
external_ids = paper_data.get('externalIds', {})
|
|
471
471
|
|
|
472
|
-
# First try to get the Semantic Scholar URL
|
|
473
|
-
if
|
|
474
|
-
|
|
475
|
-
paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
|
|
472
|
+
# First try to get the Semantic Scholar URL using paperId (SHA hash)
|
|
473
|
+
if paper_data.get('paperId'):
|
|
474
|
+
paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
|
|
476
475
|
logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
|
|
477
476
|
else:
|
|
478
477
|
# Fallback to best available URL if Semantic Scholar URL not available
|
|
479
478
|
open_access_pdf = paper_data.get('openAccessPdf')
|
|
480
|
-
paper_url = get_best_available_url(external_ids, open_access_pdf)
|
|
479
|
+
paper_url = get_best_available_url(external_ids, open_access_pdf, paper_data.get('paperId'))
|
|
481
480
|
if paper_url:
|
|
482
481
|
logger.debug(f"Using fallback URL: {paper_url}")
|
|
483
482
|
|
|
@@ -85,7 +85,7 @@ class NonArxivReferenceChecker:
|
|
|
85
85
|
params = {
|
|
86
86
|
"query": query,
|
|
87
87
|
"limit": 10,
|
|
88
|
-
"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal",
|
|
88
|
+
"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal",
|
|
89
89
|
"sort": "relevance" # Ensure consistent ordering
|
|
90
90
|
}
|
|
91
91
|
|
|
@@ -135,7 +135,7 @@ class NonArxivReferenceChecker:
|
|
|
135
135
|
endpoint = f"{self.base_url}/paper/DOI:{doi}"
|
|
136
136
|
|
|
137
137
|
params = {
|
|
138
|
-
"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal"
|
|
138
|
+
"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"
|
|
139
139
|
}
|
|
140
140
|
|
|
141
141
|
# Make the request with retries and backoff
|
|
@@ -260,7 +260,7 @@ class NonArxivReferenceChecker:
|
|
|
260
260
|
corpus_id = corpus_match.group(1)
|
|
261
261
|
# Try to get the paper directly by CorpusID
|
|
262
262
|
endpoint = f"{self.base_url}/paper/CorpusId:{corpus_id}"
|
|
263
|
-
params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal"}
|
|
263
|
+
params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"}
|
|
264
264
|
|
|
265
265
|
for attempt in range(self.max_retries):
|
|
266
266
|
try:
|
|
@@ -537,12 +537,33 @@ class NonArxivReferenceChecker:
|
|
|
537
537
|
|
|
538
538
|
# Verify venue
|
|
539
539
|
cited_venue = reference.get('journal', '') or reference.get('venue', '')
|
|
540
|
-
paper_venue = paper_data.get('venue') or paper_data.get('journal')
|
|
541
540
|
|
|
542
|
-
#
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
541
|
+
# Extract venue from paper_data - check multiple fields since Semantic Scholar
|
|
542
|
+
# returns venue info in different fields depending on publication type
|
|
543
|
+
paper_venue = None
|
|
544
|
+
|
|
545
|
+
# First try the simple 'venue' field (string)
|
|
546
|
+
if paper_data.get('venue'):
|
|
547
|
+
paper_venue = paper_data.get('venue')
|
|
548
|
+
|
|
549
|
+
# If no venue, try publicationVenue object
|
|
550
|
+
if not paper_venue and paper_data.get('publicationVenue'):
|
|
551
|
+
pub_venue = paper_data.get('publicationVenue')
|
|
552
|
+
if isinstance(pub_venue, dict):
|
|
553
|
+
paper_venue = pub_venue.get('name', '')
|
|
554
|
+
elif isinstance(pub_venue, str):
|
|
555
|
+
paper_venue = pub_venue
|
|
556
|
+
|
|
557
|
+
# If still no venue, try journal object
|
|
558
|
+
if not paper_venue and paper_data.get('journal'):
|
|
559
|
+
journal = paper_data.get('journal')
|
|
560
|
+
if isinstance(journal, dict):
|
|
561
|
+
paper_venue = journal.get('name', '')
|
|
562
|
+
elif isinstance(journal, str):
|
|
563
|
+
paper_venue = journal
|
|
564
|
+
|
|
565
|
+
# Ensure paper_venue is a string
|
|
566
|
+
if paper_venue and not isinstance(paper_venue, str):
|
|
546
567
|
paper_venue = str(paper_venue)
|
|
547
568
|
|
|
548
569
|
# Check venue mismatches
|
|
@@ -552,18 +573,12 @@ class NonArxivReferenceChecker:
|
|
|
552
573
|
from refchecker.utils.error_utils import create_venue_warning
|
|
553
574
|
errors.append(create_venue_warning(cited_venue, paper_venue))
|
|
554
575
|
elif not cited_venue and paper_venue:
|
|
555
|
-
#
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
# Venue is present in raw text but missing from parsed reference
|
|
562
|
-
errors.append({
|
|
563
|
-
'warning_type': 'venue',
|
|
564
|
-
'warning_details': f"Venue missing: should include '{paper_venue}'",
|
|
565
|
-
'ref_venue_correct': paper_venue
|
|
566
|
-
})
|
|
576
|
+
# Reference has no venue but paper has one - always warn about missing venue
|
|
577
|
+
errors.append({
|
|
578
|
+
'warning_type': 'venue',
|
|
579
|
+
'warning_details': f"Venue missing: should include '{paper_venue}'",
|
|
580
|
+
'ref_venue_correct': paper_venue
|
|
581
|
+
})
|
|
567
582
|
|
|
568
583
|
# Always check for missing arXiv URLs when paper has arXiv ID
|
|
569
584
|
external_ids = paper_data.get('externalIds', {})
|
|
@@ -612,10 +627,9 @@ class NonArxivReferenceChecker:
|
|
|
612
627
|
logger.debug(f"Semantic Scholar - Extracting URL from paper data: {list(paper_data.keys())}")
|
|
613
628
|
|
|
614
629
|
# Return the Semantic Scholar URL that was actually used for verification
|
|
615
|
-
# First priority: Semantic Scholar URL
|
|
616
|
-
if
|
|
617
|
-
|
|
618
|
-
paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
|
|
630
|
+
# First priority: Semantic Scholar URL using paperId (SHA hash, works in web URLs)
|
|
631
|
+
if paper_data.get('paperId'):
|
|
632
|
+
paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
|
|
619
633
|
logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
|
|
620
634
|
|
|
621
635
|
# Second priority: DOI URL (if this was verified through DOI)
|
|
@@ -2089,14 +2089,12 @@ class ArxivReferenceChecker:
|
|
|
2089
2089
|
if correct_paper_data:
|
|
2090
2090
|
logger.debug(f"Database mode: Found correct paper: '{correct_paper_data.get('title', '')}'")
|
|
2091
2091
|
# Use the CORRECT paper's Semantic Scholar URL
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
from refchecker.utils.url_utils import construct_semantic_scholar_url
|
|
2095
|
-
correct_paper_url = construct_semantic_scholar_url(correct_external_ids['CorpusId'])
|
|
2092
|
+
if correct_paper_data.get('paperId'):
|
|
2093
|
+
correct_paper_url = f"https://www.semanticscholar.org/paper/{correct_paper_data['paperId']}"
|
|
2096
2094
|
paper_url = correct_paper_url # Update the main URL
|
|
2097
2095
|
logger.debug(f"Database mode: Using correct paper's Semantic Scholar URL for ArXiv ID mismatch: {paper_url}")
|
|
2098
2096
|
else:
|
|
2099
|
-
logger.debug("Database mode: Correct paper found but no
|
|
2097
|
+
logger.debug("Database mode: Correct paper found but no paperId available")
|
|
2100
2098
|
else:
|
|
2101
2099
|
logger.debug("Database mode: Could not find correct paper by title/authors")
|
|
2102
2100
|
except Exception as e:
|
|
@@ -2117,12 +2115,11 @@ class ArxivReferenceChecker:
|
|
|
2117
2115
|
formatted_errors.append(formatted_error)
|
|
2118
2116
|
|
|
2119
2117
|
# Fallback to wrong paper's URL if we couldn't find the correct one
|
|
2120
|
-
if not correct_paper_data and verified_data and verified_data.get('
|
|
2121
|
-
|
|
2122
|
-
paper_url = construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
|
|
2118
|
+
if not correct_paper_data and verified_data and verified_data.get('paperId'):
|
|
2119
|
+
paper_url = f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
|
|
2123
2120
|
logger.debug(f"Database mode: Fallback to wrong paper's Semantic Scholar URL: {paper_url}")
|
|
2124
2121
|
elif not correct_paper_data:
|
|
2125
|
-
logger.debug(f"Database mode: No
|
|
2122
|
+
logger.debug(f"Database mode: No paperId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")
|
|
2126
2123
|
|
|
2127
2124
|
return formatted_errors if formatted_errors else None, paper_url, verified_data
|
|
2128
2125
|
else:
|
|
@@ -5521,10 +5518,9 @@ class ArxivReferenceChecker:
|
|
|
5521
5518
|
if verified_data and verified_data.get('url') and 'arxiv.org' not in verified_data['url']:
|
|
5522
5519
|
return verified_data['url']
|
|
5523
5520
|
|
|
5524
|
-
# Second priority: Semantic Scholar URL from
|
|
5525
|
-
if verified_data and verified_data.get('
|
|
5526
|
-
|
|
5527
|
-
return construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
|
|
5521
|
+
# Second priority: Semantic Scholar URL from paperId (if no direct URL available)
|
|
5522
|
+
if verified_data and verified_data.get('paperId'):
|
|
5523
|
+
return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
|
|
5528
5524
|
|
|
5529
5525
|
# Third priority: DOI URL from verified data (more reliable than potentially wrong ArXiv URLs)
|
|
5530
5526
|
if verified_data and verified_data.get('externalIds', {}).get('DOI'):
|
|
@@ -5576,11 +5572,11 @@ class ArxivReferenceChecker:
|
|
|
5576
5572
|
# Non-ArXiv URL, probably safe to use
|
|
5577
5573
|
return reference_url
|
|
5578
5574
|
|
|
5579
|
-
def _get_fallback_url(self, external_ids):
|
|
5575
|
+
def _get_fallback_url(self, external_ids, verified_data=None):
|
|
5580
5576
|
"""Get fallback URL from external IDs (Semantic Scholar or DOI)"""
|
|
5581
|
-
|
|
5582
|
-
|
|
5583
|
-
return
|
|
5577
|
+
# Prefer paperId for Semantic Scholar URLs
|
|
5578
|
+
if verified_data and verified_data.get('paperId'):
|
|
5579
|
+
return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
|
|
5584
5580
|
elif external_ids.get('DOI'):
|
|
5585
5581
|
from refchecker.utils.doi_utils import construct_doi_url
|
|
5586
5582
|
return construct_doi_url(external_ids['DOI'])
|
|
@@ -318,7 +318,23 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
|
|
|
318
318
|
}
|
|
319
319
|
)
|
|
320
320
|
|
|
321
|
-
|
|
321
|
+
# Handle empty responses (content safety filter or other issues)
|
|
322
|
+
if not response.candidates:
|
|
323
|
+
logger.warning("Google API returned empty candidates (possibly content filtered)")
|
|
324
|
+
return ""
|
|
325
|
+
|
|
326
|
+
# Safely access the text
|
|
327
|
+
try:
|
|
328
|
+
return response.text or ""
|
|
329
|
+
except (ValueError, AttributeError) as e:
|
|
330
|
+
# response.text raises ValueError if multiple candidates or no text
|
|
331
|
+
logger.warning(f"Could not get text from Google response: {e}")
|
|
332
|
+
# Try to extract text from first candidate manually
|
|
333
|
+
if response.candidates and hasattr(response.candidates[0], 'content'):
|
|
334
|
+
content = response.candidates[0].content
|
|
335
|
+
if hasattr(content, 'parts') and content.parts:
|
|
336
|
+
return content.parts[0].text or ""
|
|
337
|
+
return ""
|
|
322
338
|
|
|
323
339
|
except Exception as e:
|
|
324
340
|
logger.error(f"Google API call failed: {e}")
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/pdf_processor.py
RENAMED
|
@@ -69,10 +69,30 @@ class PDFProcessor:
|
|
|
69
69
|
with open(pdf_path, 'rb') as file:
|
|
70
70
|
pdf_reader = pypdf.PdfReader(file)
|
|
71
71
|
text = ""
|
|
72
|
+
failed_pages = []
|
|
72
73
|
|
|
73
74
|
for page_num in range(len(pdf_reader.pages)):
|
|
74
|
-
|
|
75
|
-
|
|
75
|
+
try:
|
|
76
|
+
page = pdf_reader.pages[page_num]
|
|
77
|
+
page_text = page.extract_text()
|
|
78
|
+
if page_text:
|
|
79
|
+
text += page_text + "\n"
|
|
80
|
+
except TypeError as e:
|
|
81
|
+
# Handle pypdf errors like "NumberObject is not iterable"
|
|
82
|
+
# which can occur with malformed PDF pages
|
|
83
|
+
failed_pages.append(page_num + 1) # 1-indexed for logging
|
|
84
|
+
logger.warning(f"Skipping page {page_num + 1} due to PDF parsing error: {e}")
|
|
85
|
+
continue
|
|
86
|
+
except Exception as e:
|
|
87
|
+
failed_pages.append(page_num + 1)
|
|
88
|
+
logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
|
|
89
|
+
continue
|
|
90
|
+
|
|
91
|
+
if failed_pages:
|
|
92
|
+
logger.warning(f"Failed to extract text from {len(failed_pages)} pages: {failed_pages[:10]}{'...' if len(failed_pages) > 10 else ''}")
|
|
93
|
+
|
|
94
|
+
if not text.strip():
|
|
95
|
+
raise ValueError(f"No text could be extracted from any pages of {pdf_path}")
|
|
76
96
|
|
|
77
97
|
# Cache the result
|
|
78
98
|
self.cache[pdf_path] = text
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/error_utils.py
RENAMED
|
@@ -42,8 +42,8 @@ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str
|
|
|
42
42
|
|
|
43
43
|
Example:
|
|
44
44
|
Title mismatch:
|
|
45
|
-
|
|
46
|
-
|
|
45
|
+
cited: 'Cited Title'
|
|
46
|
+
actual: 'Correct Title'
|
|
47
47
|
|
|
48
48
|
Args:
|
|
49
49
|
mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch")
|
|
@@ -57,11 +57,10 @@ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str
|
|
|
57
57
|
if not mismatch_type.endswith(":"):
|
|
58
58
|
mismatch_type = mismatch_type.rstrip() + ":"
|
|
59
59
|
|
|
60
|
-
# Use fixed indentation for
|
|
61
|
-
|
|
62
|
-
vs_indent = "" # vs: starts at column 0 for clear visual separation
|
|
60
|
+
# Use fixed indentation for labels, keeping detail column aligned
|
|
61
|
+
label_indent = "       " # 7 spaces to indent labels
|
|
63
62
|
|
|
64
|
-
return f"{mismatch_type}\n{
|
|
63
|
+
return f"{mismatch_type}\n{label_indent}cited: {left}\n{label_indent}actual: {right}"
|
|
65
64
|
|
|
66
65
|
|
|
67
66
|
def format_title_mismatch(cited_title: str, verified_title: str) -> str:
|
|
@@ -187,8 +186,9 @@ def format_missing_venue(correct_venue: str) -> str:
|
|
|
187
186
|
"""
|
|
188
187
|
Format a missing venue message with only the actual value.
|
|
189
188
|
"""
|
|
190
|
-
# Only show the actual venue
|
|
191
|
-
|
|
189
|
+
# Only show the actual venue with indented label
|
|
190
|
+
label_indent = "       " # 7 spaces to indent labels
|
|
191
|
+
return f"Missing venue:\n{label_indent}actual: {correct_venue}"
|
|
192
192
|
|
|
193
193
|
|
|
194
194
|
def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
|
|
@@ -102,7 +102,9 @@ def construct_semantic_scholar_url(paper_id: str) -> str:
|
|
|
102
102
|
Construct a Semantic Scholar URL from a paper ID.
|
|
103
103
|
|
|
104
104
|
Args:
|
|
105
|
-
paper_id: Semantic Scholar paper ID
|
|
105
|
+
paper_id: Semantic Scholar paper ID (SHA hash, NOT CorpusId)
|
|
106
|
+
The paperId is the 40-character hex hash that works in web URLs.
|
|
107
|
+
CorpusId (numeric) does NOT work in web URLs.
|
|
106
108
|
|
|
107
109
|
Returns:
|
|
108
110
|
Full Semantic Scholar URL
|
|
@@ -151,7 +153,7 @@ def construct_pubmed_url(pmid: str) -> str:
|
|
|
151
153
|
return f"https://pubmed.ncbi.nlm.nih.gov/{clean_pmid}/"
|
|
152
154
|
|
|
153
155
|
|
|
154
|
-
def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None) -> Optional[str]:
|
|
156
|
+
def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None, paper_id: Optional[str] = None) -> Optional[str]:
|
|
155
157
|
"""
|
|
156
158
|
Get the best available URL from a paper's external IDs and open access information.
|
|
157
159
|
Priority: Open Access PDF > DOI > ArXiv > Semantic Scholar > OpenAlex > PubMed
|
|
@@ -159,6 +161,7 @@ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] =
|
|
|
159
161
|
Args:
|
|
160
162
|
external_ids: Dictionary of external identifiers
|
|
161
163
|
open_access_pdf: Open access PDF URL if available
|
|
164
|
+
paper_id: Semantic Scholar paperId (SHA hash) if available
|
|
162
165
|
|
|
163
166
|
Returns:
|
|
164
167
|
Best available URL or None if no valid URL found
|
|
@@ -175,9 +178,9 @@ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] =
|
|
|
175
178
|
if external_ids.get('ArXiv'):
|
|
176
179
|
return construct_arxiv_url(external_ids['ArXiv'])
|
|
177
180
|
|
|
178
|
-
# Priority 4: Semantic Scholar URL
|
|
179
|
-
if
|
|
180
|
-
return construct_semantic_scholar_url(
|
|
181
|
+
# Priority 4: Semantic Scholar URL (using paperId, not CorpusId)
|
|
182
|
+
if paper_id:
|
|
183
|
+
return construct_semantic_scholar_url(paper_id)
|
|
181
184
|
|
|
182
185
|
# Priority 5: OpenAlex URL
|
|
183
186
|
if external_ids.get('OpenAlex'):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/__init__.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/crossref.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/github_checker.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/openalex.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/webpage_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/logging.conf
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/db_connection_pool.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/parallel_processor.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/database/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/scripts/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/arxiv_utils.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/author_utils.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/biblatex_parser.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibliography_utils.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibtex_parser.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/config_validator.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/mock_objects.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/text_utils.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/unicode_utils.py
RENAMED
|
File without changes
|