academic-refchecker 2.0.13.tar.gz → 2.0.14.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.13/academic_refchecker.egg-info → academic_refchecker-2.0.14}/PKG-INFO +1 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/main.py +33 -5
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/refchecker_wrapper.py +42 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/thumbnail.py +117 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/__version__.py +1 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/arxiv_citation.py +181 -49
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/enhanced_hybrid_checker.py +117 -4
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/semantic_scholar.py +43 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/author_utils.py +15 -2
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/bibliography_utils.py +2 -2
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/LICENSE +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/MANIFEST.in +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/README.md +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/__main__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/cli.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/concurrency.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/database.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/models.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/assets/index-2P6L_39v.css +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/assets/index-hk21nqxR.js +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/favicon.svg +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/index.html +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/vite.svg +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/websocket_manager.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/pyproject.toml +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/requirements.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/scripts/download_db.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/scripts/run_tests.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/setup.cfg +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/__main__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/crossref.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/github_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/openalex.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/openreview_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/webpage_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/config/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/config/logging.conf +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/config/settings.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/db_connection_pool.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/parallel_processor.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/refchecker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/database/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/llm/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/llm/base.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/llm/providers.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/scripts/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/services/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/services/pdf_processor.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/arxiv_rate_limiter.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/arxiv_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/biblatex_parser.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/bibtex_parser.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/config_validator.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/db_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/doi_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/error_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/mock_objects.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/text_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/unicode_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/url_utils.py +0 -0
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/main.py
RENAMED

@@ -27,6 +27,7 @@ from .thumbnail import (
     generate_pdf_thumbnail_async,
     generate_pdf_preview_async,
     get_text_thumbnail_async,
+    get_text_preview_async,
     get_thumbnail_cache_path,
     get_preview_cache_path
 )
@@ -220,12 +221,15 @@ async def start_check(
     elif source_type == "text":
         if not source_text:
             raise HTTPException(status_code=400, detail="No text provided")
+        # Normalize line endings - remove all \r to prevent double carriage returns
+        # Browser may send \r\n, and Windows file writing can add extra \r
+        normalized_text = source_text.replace('\r\n', '\n').replace('\r', '\n')
         # Save pasted text to a file for later retrieval and thumbnail generation
         text_dir = Path(tempfile.gettempdir()) / "refchecker_texts"
         text_dir.mkdir(parents=True, exist_ok=True)
         text_file_path = text_dir / f"pasted_{session_id}.txt"
-        with open(text_file_path, "w", encoding="utf-8") as f:
-            f.write(source_text)
+        with open(text_file_path, "w", encoding="utf-8", newline='\n') as f:
+            f.write(normalized_text)
         paper_source = str(text_file_path)
         paper_title = "Pasted Text"
     elif source_type == "url":
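A quick standalone sketch of why the normalization above matters (sample text and file name invented): Python's text mode on Windows translates every '\n' back to '\r\n' on write, so any '\r\n' left in the payload would round-trip as '\r\r\n'.

# Collapse all carriage-return variants to '\n', exactly as the patch does.
text = "line one\r\nline two\rline three"
normalized = text.replace('\r\n', '\n').replace('\r', '\n')
assert normalized == "line one\nline two\nline three"

# newline='\n' disables platform newline translation, so the saved file
# contains exactly the normalized bytes on every OS.
with open("pasted_demo.txt", "w", encoding="utf-8", newline='\n') as f:
    f.write(normalized)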
@@ -646,9 +650,33 @@ async def get_preview(check_id: int):
                 media_type="image/png",
                 headers={"Cache-Control": "public, max-age=86400"}  # Cache for 1 day
             )
-
-
-
+
+        # For text sources, generate a high-resolution text preview for overlay display
+        if source_type == 'text':
+            logger.info(f"Generating text preview for check {check_id}")
+            preview_path = await get_text_preview_async(check_id, "", paper_source)
+            if preview_path and os.path.exists(preview_path):
+                return FileResponse(
+                    preview_path,
+                    media_type="image/png",
+                    headers={"Cache-Control": "public, max-age=86400"}
+                )
+
+        # For non-PDF file uploads, also generate a text preview
+        if source_type == 'file' and not paper_source.lower().endswith('.pdf'):
+            logger.info(f"Generating text preview for uploaded file check {check_id}")
+            if os.path.exists(paper_source):
+                preview_path = await get_text_preview_async(check_id, "", paper_source)
+            else:
+                preview_path = await get_text_preview_async(check_id, "Uploaded file")
+            if preview_path and os.path.exists(preview_path):
+                return FileResponse(
+                    preview_path,
+                    media_type="image/png",
+                    headers={"Cache-Control": "public, max-age=86400"}
+                )
+
+        raise HTTPException(status_code=404, detail="Could not generate preview")
 
     except HTTPException:
         raise
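The patched endpoint now resolves a preview in a fixed order: cached PDF preview, text preview for pasted text, text preview for non-PDF uploads, then a 404. A hypothetical standalone sketch of that order (the function name and labels are invented; source_type and paper_source mirror the handler's variables):

def pick_preview_kind(source_type: str, paper_source: str) -> str:
    # Pasted text and non-PDF uploads get a rendered text preview.
    if source_type == 'text':
        return 'text-preview'
    if source_type == 'file' and not paper_source.lower().endswith('.pdf'):
        return 'text-preview'
    return 'pdf-preview'  # PDFs keep the existing rendering path

assert pick_preview_kind('text', '/tmp/pasted_1.txt') == 'text-preview'
assert pick_preview_kind('file', 'refs.bib') == 'text-preview'
assert pick_preview_kind('file', 'paper.pdf') == 'pdf-preview'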
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/refchecker_wrapper.py
RENAMED

@@ -3,6 +3,7 @@ Wrapper around refchecker library with progress callbacks for real-time updates
 """
 import sys
 import os
+import re
 import asyncio
 import logging
 import tempfile
@@ -238,6 +239,18 @@ class ProgressRefChecker:
         if not any(u.get('url') == doi_url for u in authoritative_urls):
             authoritative_urls.append({"type": "doi", "url": doi_url})
 
+        # Add Semantic Scholar URL if available
+        s2_paper_id = external_ids.get('S2PaperId')
+        if s2_paper_id:
+            s2_url = f"https://www.semanticscholar.org/paper/{s2_paper_id}"
+            if not any(u.get('url') == s2_url for u in authoritative_urls):
+                authoritative_urls.append({"type": "semantic_scholar", "url": s2_url})
+
+        # Also check for inline S2 URL (from merged data)
+        s2_inline_url = verified_data.get('_semantic_scholar_url')
+        if s2_inline_url and not any(u.get('url') == s2_inline_url for u in authoritative_urls):
+            authoritative_urls.append({"type": "semantic_scholar", "url": s2_inline_url})
+
         # Format errors, warnings, and suggestions
         formatted_errors = []
         formatted_warnings = []
@@ -462,11 +475,20 @@ class ProgressRefChecker:
                     raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
                 pdf_processor = PDFProcessor()
                 paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
-            elif paper_source.lower().endswith(('.tex', '.txt')):
+            elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
                 def read_file():
                     with open(paper_source, 'r', encoding='utf-8') as f:
                         return f.read()
                 paper_text = await asyncio.to_thread(read_file)
+
+                # For .bib files, extract references directly using BibTeX parser
+                if paper_source.lower().endswith('.bib'):
+                    logger.info("Processing uploaded .bib file as BibTeX")
+                    refs_result = await self._extract_references_from_bibtex(paper_text)
+                    if refs_result and refs_result[0]:
+                        arxiv_source_references = refs_result[0]
+                        extraction_method = 'bib'
+                        logger.info(f"Extracted {len(arxiv_source_references)} references from .bib file")
             else:
                 raise ValueError(f"Unsupported file type: {paper_source}")
         elif source_type == "text":
@@ -494,6 +516,25 @@ class ProgressRefChecker:
                     arxiv_source_references = refs_result[0]
                     extraction_method = 'bbl'  # Mark as bbl extraction
                     logger.info(f"Extracted {len(arxiv_source_references)} references from pasted .bbl content")
+            # Check if the pasted text is BibTeX format (@article, @misc, @inproceedings, etc.)
+            elif re.search(r'@\s*(article|book|inproceedings|incollection|misc|techreport|phdthesis|mastersthesis|conference|inbook|proceedings)\s*\{', paper_text, re.IGNORECASE):
+                logger.info("Detected BibTeX format in pasted text")
+                refs_result = await self._extract_references_from_bibtex(paper_text)
+                if refs_result and refs_result[0]:
+                    arxiv_source_references = refs_result[0]
+                    extraction_method = 'bib'  # Mark as bib extraction
+                    logger.info(f"Extracted {len(arxiv_source_references)} references from pasted BibTeX content")
+            # Fallback: Try BibTeX parsing anyway for partial/malformed content
+            # This handles cases like incomplete paste, or BibTeX-like content without standard entry types
+            elif any(marker in paper_text for marker in ['title={', 'author={', 'year={', 'eprint={', '@']):
+                logger.info("Detected possible BibTeX-like content, attempting parse")
+                refs_result = await self._extract_references_from_bibtex(paper_text)
+                if refs_result and refs_result[0]:
+                    arxiv_source_references = refs_result[0]
+                    extraction_method = 'bib'
+                    logger.info(f"Extracted {len(arxiv_source_references)} references from partial BibTeX content")
+                else:
+                    logger.warning("BibTeX-like content detected but parsing failed, will try LLM extraction")
             # Don't update title for pasted text - keep the placeholder
         else:
             raise ValueError(f"Unsupported source type: {source_type}")
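Both detection stages can be exercised on their own; this sketch reuses the exact patterns from the patch (the sample pastes are invented):

import re

BIBTEX_ENTRY = re.compile(
    r'@\s*(article|book|inproceedings|incollection|misc|techreport|'
    r'phdthesis|mastersthesis|conference|inbook|proceedings)\s*\{',
    re.IGNORECASE)

# Stage 1: a standard entry type opens the paste.
pasted = "@InProceedings{smith2023,\n  title={An Example},\n  year={2023}\n}"
assert BIBTEX_ENTRY.search(pasted)

# Stage 2 fallback: bare field markers catch truncated or nonstandard pastes.
partial = "title={An Example},\n  author={J. Smith}"
assert any(m in partial for m in ['title={', 'author={', 'year={', 'eprint={', '@'])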
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/thumbnail.py
RENAMED

@@ -416,6 +416,13 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
            except Exception as e:
                logger.warning(f"Could not read text file: {e}")
 
+        # Clean up text content - remove excessive blank lines that cause rendering issues
+        if text_content:
+            # Normalize line endings and remove consecutive blank lines
+            lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+            # Keep only non-empty lines
+            text_content = '\n'.join(line for line in lines if line.strip())
+
         # Create a document-like image with actual text content
         doc = fitz.open()
         page = doc.new_page(width=THUMBNAIL_WIDTH, height=int(THUMBNAIL_WIDTH * 1.4))
@@ -483,6 +490,116 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
         return None
 
 
+def get_text_preview(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+    """
+    Generate a high-resolution preview for pasted text showing actual content.
+
+    Creates a larger image (similar to PDF previews) with the text content.
+
+    Args:
+        check_id: Check ID for naming
+        text_preview: Optional first few lines of text to display
+        text_file_path: Optional path to the text file to read content from
+
+    Returns:
+        Path to the generated preview, or None if generation failed
+    """
+    try:
+        import fitz
+
+        output_path = get_preview_cache_path(f"text_{check_id}", check_id)
+
+        if output_path.exists():
+            return str(output_path)
+
+        # Try to read text content from file
+        text_content = text_preview
+        if text_file_path and os.path.exists(text_file_path):
+            try:
+                with open(text_file_path, 'r', encoding='utf-8') as f:
+                    text_content = f.read()
+            except Exception as e:
+                logger.warning(f"Could not read text file: {e}")
+
+        # Clean up text content - remove excessive blank lines that cause rendering issues
+        if text_content:
+            # Normalize line endings and remove consecutive blank lines
+            lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+            # Keep only non-empty lines
+            text_content = '\n'.join(line for line in lines if line.strip())
+
+        # Create a document-like image with actual text content at high resolution
+        doc = fitz.open()
+        page = doc.new_page(width=PREVIEW_WIDTH, height=int(PREVIEW_WIDTH * 1.4))
+
+        # Fill with white/off-white background
+        page.draw_rect(page.rect, color=(0.9, 0.9, 0.9), fill=(0.98, 0.98, 0.98))
+
+        # Draw border
+        page.draw_rect(page.rect, color=(0.7, 0.7, 0.7), width=2)
+
+        # Draw actual text content if available
+        margin = 40
+        if text_content:
+            # Create a text box for the content
+            text_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, int(PREVIEW_WIDTH * 1.4) - margin)
+
+            # Truncate to first ~4000 chars for preview
+            display_text = text_content[:4000]
+            if len(text_content) > 4000:
+                display_text += "\n\n..."
+
+            # Insert text with readable font size
+            page.insert_textbox(
+                text_rect,
+                display_text,
+                fontsize=14,
+                color=(0.15, 0.15, 0.15),
+                fontname="helv"
+            )
+        else:
+            # Fallback: Draw placeholder
+            header_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, margin + 60)
+            page.insert_textbox(header_rect, "Pasted Text", fontsize=36, color=(0.3, 0.3, 0.5))
+
+            # Draw placeholder lines
+            line_height = 24
+            y = margin + 100
+
+            for i in range(20):
+                line_width = PREVIEW_WIDTH - 2 * margin
+                if i % 3 == 2:
+                    line_width = line_width * 0.7
+
+                page.draw_line(
+                    fitz.Point(margin, y),
+                    fitz.Point(margin + line_width, y),
+                    color=(0.7, 0.7, 0.7),
+                    width=3
+                )
+                y += line_height
+
+        # Render to pixmap and save
+        pix = page.get_pixmap(alpha=False)
+        pix.save(str(output_path))
+        doc.close()
+
+        logger.info(f"Generated text preview: {output_path}")
+        return str(output_path)
+
+    except ImportError:
+        logger.error("PyMuPDF (fitz) is not installed")
+        return None
+    except Exception as e:
+        logger.error(f"Error generating text preview: {e}")
+        return None
+
+
+async def get_text_preview_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+    """Async wrapper for text preview generation."""
+    return await asyncio.to_thread(get_text_preview, check_id, text_preview, text_file_path)
+
+
 async def get_text_thumbnail_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
     """Async wrapper for text thumbnail generation."""
     return await asyncio.to_thread(get_text_thumbnail, check_id, text_preview, text_file_path)
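A usage sketch for the new helper, assuming the backend.thumbnail module layout shown above (check id and file path invented); because the PNG is cached, a second call returns the same path:

import asyncio
from backend.thumbnail import get_text_preview_async

async def demo():
    # Renders (or reuses) a high-resolution preview for a saved paste.
    path = await get_text_preview_async(42, "", "/tmp/refchecker_texts/pasted_42.txt")
    if path:
        print("preview written to", path)

asyncio.run(demo())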
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/arxiv_citation.py
RENAMED
@@ -8,8 +8,8 @@ for papers found on ArXiv, as it reflects the author-submitted metadata.
 
 Key features:
 - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
--
--
+- Checks reference against all historical versions when latest doesn't match
+- Annotates errors with version info when reference matches an older version
 - Parses BibTeX to extract normalized metadata matching refchecker schema
 
 Usage:
@@ -30,6 +30,7 @@ Usage:
 import re
 import logging
 import requests
+import html
 from typing import Dict, List, Tuple, Optional, Any
 
 import bibtexparser
@@ -88,6 +89,8 @@ class ArXivCitationChecker:
         # export.arxiv.org URLs
         r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
         r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        # DOI format
+        r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"
     ]
 
     def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
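What the added pattern buys: DataCite-style arXiv DOIs carry the paper ID after an "arXiv." prefix. A small sketch with an invented DOI, assuming the pattern is applied case-insensitively like the checker's other URL patterns:

import re

ARXIV_IN_DOI = r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"

m = re.search(ARXIV_IN_DOI, "10.48550/arXiv.2301.12345v2", re.IGNORECASE)
assert m and m.group(1) == "2301.12345" and m.group(2) == "v2"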
@@ -107,6 +110,8 @@
             reference.get('cited_url', ''),
             reference.get('raw_text', ''),
             reference.get('eprint', ''),  # BibTeX field
+            reference.get('journal', ''),
+            reference.get('doi', ''),  # DOI field (may contain arXiv ID)
         ]
 
         for source in sources:
@@ -324,35 +329,133 @@ class ArXivCitationChecker:
 
         return None
 
-    def
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
         """
-
+        Check if a reference is an ArXiv paper.
 
-
-
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def _fetch_version_metadata_from_html(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
+        """
+        Fetch and parse metadata for a specific version using HTML scraping.
 
         Args:
             arxiv_id: ArXiv ID without version
+            version_num: Version number to fetch (1, 2, 3, etc.)
 
         Returns:
-
+            Dictionary with version metadata or None if version doesn't exist
         """
-
-
-
-
-
+        version_str = f"v{version_num}"
+        url = f"{self.abs_url}/{arxiv_id}{version_str}"
+
+        self.rate_limiter.wait()
+        try:
+            logger.debug(f"Checking historical version: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            if response.status_code == 404:
+                return None  # Version does not exist
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse meta tags for metadata
+            # Title
+            title_match = re.search(r'<meta name="citation_title" content="(.*?)"', html_content)
+            title = html.unescape(title_match.group(1)).strip() if title_match else ""
+
+            # Authors
+            authors = []
+            for auth in re.findall(r'<meta name="citation_author" content="(.*?)"', html_content):
+                authors.append(html.unescape(auth).strip())
+
+            # Date/Year
+            date_match = re.search(r'<meta name="citation_date" content="(.*?)"', html_content)
+            year = None
+            if date_match:
+                ym = re.search(r'^(\d{4})', date_match.group(1))
+                if ym:
+                    year = int(ym.group(1))
+
+            return {
+                'version': version_str,
+                'version_num': version_num,
+                'title': title,
+                'authors': [{'name': a} for a in authors],
+                'year': year,
+                'url': url,
+            }
+        except Exception as e:
+            logger.warning(f"Failed to fetch history {version_str}: {e}")
+            return None
+
+    def _get_latest_version_number(self, arxiv_id: str) -> Optional[int]:
         """
-
+        Get the latest version number by fetching the abstract page.
 
         Args:
-
+            arxiv_id: ArXiv ID without version
 
         Returns:
-
+            Latest version number as integer, or None if couldn't determine
         """
-
-
+        url = f"{self.abs_url}/{arxiv_id}"
+
+        self.rate_limiter.wait()
+        try:
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Look for version links like "[v1]", "[v2]", etc.
+            versions = re.findall(r'\[v(\d+)\]', response.text)
+            if versions:
+                return max(int(v) for v in versions)
+            return None
+        except Exception as e:
+            logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
+            return None
+
+    def _compare_info_match(
+            self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
+            authoritative_title: str, authoritative_authors: List[str], authoritative_year: Optional[int]) -> bool:
+        """
+        Compare the information of a cited paper with the authoritative information.
+
+        Args:
+            cited_title: Title from the reference
+            cited_authors: Authors from the reference
+            cited_year: Year from the reference
+            authoritative_title: Title from ArXiv version
+            authoritative_authors: Authors from ArXiv version
+            authoritative_year: Year from ArXiv version
+
+        Returns:
+            True if the information matches, False otherwise.
+        """
+        # Compare title
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+            if title_similarity < SIMILARITY_THRESHOLD:
+                return False
+
+        # Compare authors
+        if cited_authors and authoritative_authors:
+            authors_match, _ = compare_authors(cited_authors, authoritative_authors)
+            if not authors_match:
+                return False
+
+        # Compare year
+        if cited_year and authoritative_year:
+            if cited_year != authoritative_year:
+                return False
+
+        return True
 
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
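The two scrapers above lean on arXiv's citation_* meta tags and the "[vN]" version links. An offline sketch of the same parsing against a canned fragment (the HTML below is a made-up stand-in for an arXiv abstract page):

import html
import re

page = '''
<meta name="citation_title" content="A Study of Foo &amp; Bar"/>
<meta name="citation_author" content="Doe, Jane"/>
<meta name="citation_date" content="2023/05/01"/>
<a href="/abs/2301.12345v1">[v1]</a> <a href="/abs/2301.12345v2">[v2]</a>
'''

title = html.unescape(re.search(r'<meta name="citation_title" content="(.*?)"', page).group(1)).strip()
authors = [html.unescape(a).strip() for a in re.findall(r'<meta name="citation_author" content="(.*?)"', page)]
latest = max(int(v) for v in re.findall(r'\[v(\d+)\]', page))
assert title == "A Study of Foo & Bar" and authors == ["Doe, Jane"] and latest == 2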
@@ -360,10 +463,10 @@
 
         This method:
         1. Extracts the ArXiv ID from the reference
-        2. Fetches the official BibTeX from ArXiv (
-        3.
-        4.
-        5.
+        2. Fetches the official BibTeX from ArXiv (latest version)
+        3. Compares cited metadata against latest version
+        4. If errors found, checks historical versions to find a match
+        5. Annotates errors with version info if reference matches an older version
 
         Args:
             reference: Reference dictionary with title, authors, year, url, etc.
@@ -385,34 +488,26 @@
 
         logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
 
-        #
+        # Extract information from reference for comparison
+        cited_title = reference.get('title', '').strip()
+        cited_authors = reference.get('authors', [])
+        cited_year = reference.get('year')
+
+        # Fetch authoritative BibTeX (latest version)
         bibtex_content = self.fetch_bibtex(arxiv_id)
 
         if not bibtex_content:
             logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
             return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
 
-
-        verified_data = self.parse_bibtex(bibtex_content)
+        latest_data = self.parse_bibtex(bibtex_content)
 
-        if not
+        if not latest_data:
             logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
             return None, [], None
-
-        #
-
-        # ArXiv BibTeX always returns latest version metadata
-        # We don't know the actual latest version number without additional API call,
-        # but we can warn that a specific version was cited
-        errors.append({
-            'warning_type': 'version',
-            'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
-        })
-        logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
-
-        # Compare title
-        cited_title = reference.get('title', '').strip()
-        authoritative_title = verified_data.get('title', '').strip()
+
+        # Compare against latest version
+        authoritative_title = latest_data.get('title', '').strip()
 
         if cited_title and authoritative_title:
             title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
@@ -426,9 +521,8 @@
             })
 
         # Compare authors
-        cited_authors = reference.get('authors', [])
         if cited_authors:
-            authoritative_authors =
+            authoritative_authors = latest_data.get('authors', [])
             authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
 
             if not authors_match:
@@ -440,9 +534,7 @@
             })
 
         # Compare year
-
-        authoritative_year = verified_data.get('year')
-
+        authoritative_year = latest_data.get('year')
         year_warning = validate_year(
             cited_year=cited_year,
             paper_year=authoritative_year,
@@ -451,10 +543,50 @@
         )
         if year_warning:
             errors.append(year_warning)
-
-        # Build URL
+
         paper_url = f"https://arxiv.org/abs/{arxiv_id}"
 
-
+        # If no errors against latest version, we're done
+        if len(errors) == 0:
+            logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with no errors")
+            return latest_data, errors, paper_url
+
+        # Check if reference matches a historical version
+        # Get latest version number first
+        latest_version_num = self._get_latest_version_number(arxiv_id)
+
+        if latest_version_num and latest_version_num > 1:
+            # Check historical versions (1 to latest-1)
+            for version_num in range(1, latest_version_num):
+                version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
+                if not version_data:
+                    continue
+
+                # Check if reference matches this historical version
+                if self._compare_info_match(
+                        cited_title, cited_authors, cited_year,
+                        version_data['title'], version_data['authors'], version_data['year']):
+
+                    logger.debug(f"ArXivCitationChecker: Reference matches historical version v{version_num}")
+
+                    # Convert errors to warnings with version update info
+                    # Version update issues are informational, not errors - the citation was correct for its time
+                    version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
+                    warnings = []
+                    for error in errors:
+                        warning = {
+                            'warning_type': error.get('error_type', 'unknown') + version_suffix,
+                            'warning_details': error.get('error_details', ''),
+                        }
+                        # Preserve correction hints
+                        for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
+                            if key in error:
+                                warning[key] = error[key]
+                        warnings.append(warning)
+
+                    # Return with warnings instead of errors - URL points to the matched version
+                    matched_url = f"https://arxiv.org/abs/{arxiv_id}v{version_num}"
+                    return latest_data, warnings, matched_url
 
-
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+        return latest_data, errors, paper_url
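The error-to-warning rewrite is easy to trace by hand; this sketch applies the same loop to a single invented error dict:

error = {'error_type': 'author',
         'error_details': 'author list differs',
         'ref_authors_correct': 'J. Doe, R. Roe'}

version_suffix = " (v1 vs v3 update)"
warning = {'warning_type': error.get('error_type', 'unknown') + version_suffix,
           'warning_details': error.get('error_details', '')}
for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
    if key in error:
        warning[key] = error[key]  # correction hints survive the rewrite

assert warning['warning_type'] == 'author (v1 vs v3 update)'
assert warning['ref_authors_correct'] == 'J. Doe, R. Roe'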
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/enhanced_hybrid_checker.py
RENAMED

@@ -257,6 +257,90 @@ class EnhancedHybridReferenceChecker:
 
         return True
 
+    def _merge_arxiv_with_semantic_scholar(
+        self,
+        arxiv_data: Dict[str, Any],
+        arxiv_errors: List[Dict[str, Any]],
+        arxiv_url: str,
+        ss_data: Dict[str, Any],
+        ss_errors: List[Dict[str, Any]],
+        ss_url: str,
+        reference: Dict[str, Any]
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Merge ArXiv verification results with Semantic Scholar data.
+
+        ArXiv is authoritative for title/author/year, but Semantic Scholar
+        provides venue information and additional URLs (DOI, S2 page).
+
+        Args:
+            arxiv_data: Verified data from ArXiv
+            arxiv_errors: Errors/warnings from ArXiv verification
+            arxiv_url: ArXiv URL
+            ss_data: Data from Semantic Scholar
+            ss_errors: Errors from Semantic Scholar (used for venue checking)
+            ss_url: Semantic Scholar URL
+            reference: Original reference
+
+        Returns:
+            Tuple of (merged_data, merged_errors)
+        """
+        merged_data = dict(arxiv_data) if arxiv_data else {}
+        merged_errors = list(arxiv_errors) if arxiv_errors else []
+
+        if not ss_data:
+            return merged_data, merged_errors
+
+        # Add Semantic Scholar URL to external IDs
+        if 'externalIds' not in merged_data:
+            merged_data['externalIds'] = {}
+
+        ss_external_ids = ss_data.get('externalIds', {})
+
+        # Add S2 paper ID
+        if ss_data.get('paperId'):
+            merged_data['externalIds']['S2PaperId'] = ss_data['paperId']
+
+        # Add DOI if available from Semantic Scholar
+        if ss_external_ids.get('DOI') and not merged_data['externalIds'].get('DOI'):
+            merged_data['externalIds']['DOI'] = ss_external_ids['DOI']
+
+        # Store Semantic Scholar URL
+        merged_data['_semantic_scholar_url'] = ss_url
+
+        # Check for venue mismatch - if paper was published at a venue but citation only says arXiv
+        ss_venue = ss_data.get('venue', '')
+        cited_venue = reference.get('venue', reference.get('journal', '')).strip().lower()
+
+        # Normalize ArXiv venue names
+        is_cited_as_arxiv = (
+            not cited_venue or
+            cited_venue in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint']
+        )
+
+        # Check if Semantic Scholar shows a real publication venue
+        if ss_venue and is_cited_as_arxiv:
+            # Ignore generic/empty venues
+            ss_venue_lower = ss_venue.lower().strip()
+            is_real_venue = (
+                ss_venue_lower and
+                ss_venue_lower not in ['arxiv', 'arxiv.org', 'preprint', ''] and
+                not ss_venue_lower.startswith('arxiv')
+            )
+
+            if is_real_venue:
+                # This paper was published at a venue but is only cited as arXiv
+                logger.debug(f"Enhanced Hybrid: Paper published at '{ss_venue}' but cited as arXiv")
+                merged_errors.append({
+                    'warning_type': 'venue',
+                    'warning_details': f"Paper was published at venue but cited as arXiv preprint:\n  cited: arXiv\n  actual: {ss_venue}",
+                    'ref_venue_correct': ss_venue
+                })
+                # Also add the venue to merged data
+                merged_data['venue'] = ss_venue
+
+        return merged_data, merged_errors
+
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
         Verify a non-arXiv reference using multiple APIs in priority order
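A standalone recap of just the venue check (inputs invented): a paper cited only as an arXiv preprint while Semantic Scholar reports a real venue yields a venue warning, while an arXiv-only venue does not:

def venue_warning(cited_venue, ss_venue):
    cited = (cited_venue or '').strip().lower()
    cited_as_arxiv = not cited or cited in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint']
    v = (ss_venue or '').strip().lower()
    is_real = bool(v) and v not in ['arxiv', 'arxiv.org', 'preprint'] and not v.startswith('arxiv')
    if cited_as_arxiv and is_real:
        return {'warning_type': 'venue', 'ref_venue_correct': ss_venue}
    return None

assert venue_warning('arXiv preprint', 'NeurIPS') == {'warning_type': 'venue', 'ref_venue_correct': 'NeurIPS'}
assert venue_warning('arXiv preprint', 'ArXiv') is None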
@@ -287,6 +371,9 @@ class EnhancedHybridReferenceChecker:
         # Track all APIs that failed and could be retried
         failed_apis = []
 
+        # Store ArXiv result for potential merging with Semantic Scholar
+        arxiv_result = None
+
         # PHASE 1: Try all APIs once in priority order
 
         # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
@@ -295,13 +382,15 @@ class EnhancedHybridReferenceChecker:
             logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
             verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
             if success:
-                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded
-
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded, also querying Semantic Scholar for venue/URLs")
+                arxiv_result = (verified_data, errors, url)
+                # Continue to Semantic Scholar to get venue and additional URLs
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
 
         # Strategy 1: Always try local database first (fastest)
-        if
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
+        if self.local_db and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
             if success:
                 return verified_data, errors, url
@@ -309,8 +398,9 @@ class EnhancedHybridReferenceChecker:
                 failed_apis.append(('local_db', self.local_db, failure_type))
 
         # Strategy 2: If reference has DOI, prioritize CrossRef
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
         crossref_result = None
-        if self._should_try_doi_apis_first(reference) and self.crossref:
+        if self._should_try_doi_apis_first(reference) and self.crossref and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
             if success:
                 # Check if the data is complete enough to use
@@ -327,11 +417,34 @@ class EnhancedHybridReferenceChecker:
         if self.semantic_scholar:
             verified_data, errors, url, success, failure_type = self._try_api('semantic_scholar', self.semantic_scholar, reference)
             if success:
+                # If we have ArXiv result, merge Semantic Scholar venue/URLs into it
+                if arxiv_result:
+                    # Check if SS data is valid and venue is not just arxiv
+                    # (skip merge if SS only found the arxiv version, no published venue)
+                    if verified_data:
+                        ss_venue = self.semantic_scholar.get_venue_from_paper_data(verified_data)
+                        if ss_venue and 'arxiv' in ss_venue.lower():
+                            # SS only found arxiv venue, skip merge and return arxiv result
+                            logger.debug("Enhanced Hybrid: Semantic Scholar only found ArXiv venue, skipping merge")
+                            return arxiv_result
+
+                    arxiv_data, arxiv_errors, arxiv_url = arxiv_result
+                    merged_data, merged_errors = self._merge_arxiv_with_semantic_scholar(
+                        arxiv_data, arxiv_errors, arxiv_url,
+                        verified_data, errors, url,
+                        reference
+                    )
+                    return merged_data, merged_errors, arxiv_url
                 return verified_data, errors, url
             # For Semantic Scholar, only retry retryable failures (not 'not_found')
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('semantic_scholar', self.semantic_scholar, failure_type))
 
+        # If ArXiv succeeded but Semantic Scholar failed, return ArXiv result
+        if arxiv_result:
+            logger.debug("Enhanced Hybrid: Returning ArXiv result (Semantic Scholar unavailable)")
+            return arxiv_result
+
         # Strategy 4: Try OpenAlex API (excellent reliability, replaces Google Scholar)
         openalex_result = None
         if self.openalex:
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/semantic_scholar.py
RENAMED

@@ -223,7 +223,49 @@ class NonArxivReferenceChecker:
         """
         return compare_authors(cited_authors, correct_authors)
 
-
+    def get_venue_from_paper_data(self, paper_data: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract venue from paper data dictionary.
+
+        Checks multiple fields since Semantic Scholar returns venue info
+        in different fields depending on publication type.
+
+        Args:
+            paper_data: Paper data dictionary from Semantic Scholar
+
+        Returns:
+            Venue string or None if not found
+        """
+        if not paper_data:
+            return None
+
+        paper_venue = None
+
+        # First try the simple 'venue' field (string)
+        if paper_data.get('venue'):
+            paper_venue = paper_data.get('venue')
+
+        # If no venue, try publicationVenue object
+        if not paper_venue and paper_data.get('publicationVenue'):
+            pub_venue = paper_data.get('publicationVenue')
+            if isinstance(pub_venue, dict):
+                paper_venue = pub_venue.get('name', '')
+            elif isinstance(pub_venue, str):
+                paper_venue = pub_venue
+
+        # If still no venue, try journal object
+        if not paper_venue and paper_data.get('journal'):
+            journal = paper_data.get('journal')
+            if isinstance(journal, dict):
+                paper_venue = journal.get('name', '')
+            elif isinstance(journal, str):
+                paper_venue = journal
+
+        # Ensure paper_venue is a string
+        if paper_venue and not isinstance(paper_venue, str):
+            paper_venue = str(paper_venue)
+
+        return paper_venue if paper_venue else None
 
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
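The helper exists because Semantic Scholar spreads venue information across three response fields. A condensed sketch of the same lookup order against invented sample payloads:

def first_venue(d):
    # venue (string), then publicationVenue.name, then journal.name
    v = d.get('venue')
    if not v and isinstance(d.get('publicationVenue'), dict):
        v = d['publicationVenue'].get('name', '')
    if not v and isinstance(d.get('journal'), dict):
        v = d['journal'].get('name', '')
    return v or None

assert first_venue({'venue': 'ICML'}) == 'ICML'
assert first_venue({'publicationVenue': {'name': 'ICML'}}) == 'ICML'
assert first_venue({'journal': {'name': 'JMLR'}}) == 'JMLR'
assert first_venue({}) is None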
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/author_utils.py
RENAMED
@@ -42,13 +42,26 @@ def compare_authors(cited_authors, correct_authors, threshold=0.8):
     Compare two author lists and return similarity metrics
 
     Args:
-        cited_authors: List of authors as cited
-        correct_authors: List of correct authors
+        cited_authors: List of authors as cited (can be strings or dicts with 'name' key)
+        correct_authors: List of correct authors (can be strings or dicts with 'name' key)
         threshold: Similarity threshold (0-1)
 
     Returns:
         Dictionary with comparison results
     """
+    # Normalize author lists to strings (handle dict format from APIs)
+    def normalize_author_list(authors):
+        result = []
+        for a in authors:
+            if isinstance(a, dict):
+                result.append(a.get('name', str(a)))
+            else:
+                result.append(str(a))
+        return result
+
+    cited_authors = normalize_author_list(cited_authors) if cited_authors else []
+    correct_authors = normalize_author_list(correct_authors) if correct_authors else []
+
     if not cited_authors or not correct_authors:
         return {
             'match': False,
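The practical effect of the normalization: author lists may now mix plain strings with the [{'name': ...}] dicts that API results (for example the HTML-scraped version metadata above) supply. A small sketch with invented names:

def normalize(authors):
    # Mirror of the patch's inner helper: dicts contribute their 'name'.
    return [a.get('name', str(a)) if isinstance(a, dict) else str(a) for a in authors]

cited = ['Jane Doe', 'Richard Roe']
correct = [{'name': 'Jane Doe'}, {'name': 'Richard Roe'}]
assert normalize(correct) == cited  # both sides end up as plain strings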
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/bibliography_utils.py
RENAMED
@@ -164,8 +164,8 @@ def _parse_bibtex_references(bibliography_text):
     Returns:
         List of reference dictionaries
     """
-    from refchecker.utils.bibtex_parser import
-    return
+    from refchecker.utils.bibtex_parser import parse_bibtex_references
+    return parse_bibtex_references(bibliography_text)
 
 
 def _parse_biblatex_references(bibliography_text):