academic-refchecker 2.0.11__tar.gz → 2.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.11/academic_refchecker.egg-info → academic_refchecker-2.0.13}/PKG-INFO +2 -1
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13/academic_refchecker.egg-info}/PKG-INFO +2 -1
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/SOURCES.txt +2 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/requires.txt +1 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/pyproject.toml +1 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/requirements.txt +1 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/__version__.py +1 -1
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/__init__.py +3 -1
- academic_refchecker-2.0.13/src/refchecker/checkers/arxiv_citation.py +460 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/enhanced_hybrid_checker.py +24 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/config/settings.py +8 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/llm/base.py +1 -15
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/llm/providers.py +102 -94
- academic_refchecker-2.0.13/src/refchecker/utils/arxiv_rate_limiter.py +133 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/text_utils.py +32 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/LICENSE +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/MANIFEST.in +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/README.md +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/__main__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/cli.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/concurrency.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/database.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/main.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/models.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/refchecker_wrapper.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/assets/index-2P6L_39v.css +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/assets/index-hk21nqxR.js +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/favicon.svg +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/index.html +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/static/vite.svg +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/thumbnail.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/backend/websocket_manager.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/scripts/download_db.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/scripts/run_tests.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/setup.cfg +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/__main__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/crossref.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/github_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/openalex.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/openreview_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/webpage_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/config/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/config/logging.conf +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/db_connection_pool.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/parallel_processor.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/core/refchecker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/database/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/llm/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/scripts/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/services/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/services/pdf_processor.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/arxiv_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/author_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/biblatex_parser.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/bibliography_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/bibtex_parser.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/config_validator.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/db_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/doi_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/error_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/mock_objects.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/unicode_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/utils/url_utils.py +0 -0
{academic_refchecker-2.0.11/academic_refchecker.egg-info → academic_refchecker-2.0.13}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: academic-refchecker
|
|
3
|
-
Version: 2.0.11
|
|
3
|
+
Version: 2.0.13
|
|
4
4
|
Summary: A comprehensive tool for validating reference accuracy in academic papers
|
|
5
5
|
Author-email: Mark Russinovich <markrussinovich@hotmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
|
|
|
32
32
|
Requires-Dist: pandas<2.4.0,>=1.3.0
|
|
33
33
|
Requires-Dist: numpy<2.0.0,>=1.22.4
|
|
34
34
|
Requires-Dist: pdfplumber>=0.6.0
|
|
35
|
+
Requires-Dist: bibtexparser>=1.4.0
|
|
35
36
|
Provides-Extra: dev
|
|
36
37
|
Requires-Dist: pytest>=6.0.0; extra == "dev"
|
|
37
38
|
Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
|
{academic_refchecker-2.0.11 → academic_refchecker-2.0.13/academic_refchecker.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: academic-refchecker
|
|
3
|
-
Version: 2.0.11
|
|
3
|
+
Version: 2.0.13
|
|
4
4
|
Summary: A comprehensive tool for validating reference accuracy in academic papers
|
|
5
5
|
Author-email: Mark Russinovich <markrussinovich@hotmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
|
|
|
32
32
|
Requires-Dist: pandas<2.4.0,>=1.3.0
|
|
33
33
|
Requires-Dist: numpy<2.0.0,>=1.22.4
|
|
34
34
|
Requires-Dist: pdfplumber>=0.6.0
|
|
35
|
+
Requires-Dist: bibtexparser>=1.4.0
|
|
35
36
|
Provides-Extra: dev
|
|
36
37
|
Requires-Dist: pytest>=6.0.0; extra == "dev"
|
|
37
38
|
Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
|
{academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/academic_refchecker.egg-info/SOURCES.txt
RENAMED
|
@@ -31,6 +31,7 @@ src/refchecker/__init__.py
|
|
|
31
31
|
src/refchecker/__main__.py
|
|
32
32
|
src/refchecker/__version__.py
|
|
33
33
|
src/refchecker/checkers/__init__.py
|
|
34
|
+
src/refchecker/checkers/arxiv_citation.py
|
|
34
35
|
src/refchecker/checkers/crossref.py
|
|
35
36
|
src/refchecker/checkers/enhanced_hybrid_checker.py
|
|
36
37
|
src/refchecker/checkers/github_checker.py
|
|
@@ -57,6 +58,7 @@ src/refchecker/scripts/start_vllm_server.py
|
|
|
57
58
|
src/refchecker/services/__init__.py
|
|
58
59
|
src/refchecker/services/pdf_processor.py
|
|
59
60
|
src/refchecker/utils/__init__.py
|
|
61
|
+
src/refchecker/utils/arxiv_rate_limiter.py
|
|
60
62
|
src/refchecker/utils/arxiv_utils.py
|
|
61
63
|
src/refchecker/utils/author_utils.py
|
|
62
64
|
src/refchecker/utils/biblatex_parser.py
|
{academic_refchecker-2.0.11 → academic_refchecker-2.0.13}/src/refchecker/checkers/__init__.py
RENAMED
|
@@ -7,11 +7,13 @@ from .local_semantic_scholar import LocalNonArxivReferenceChecker
|
|
|
7
7
|
from .enhanced_hybrid_checker import EnhancedHybridReferenceChecker
|
|
8
8
|
from .openalex import OpenAlexReferenceChecker
|
|
9
9
|
from .crossref import CrossRefReferenceChecker
|
|
10
|
+
from .arxiv_citation import ArXivCitationChecker
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"NonArxivReferenceChecker",
|
|
13
14
|
"LocalNonArxivReferenceChecker",
|
|
14
15
|
"EnhancedHybridReferenceChecker",
|
|
15
16
|
"OpenAlexReferenceChecker",
|
|
16
|
-
"CrossRefReferenceChecker"
|
|
17
|
+
"CrossRefReferenceChecker",
|
|
18
|
+
"ArXivCitationChecker",
|
|
17
19
|
]
|
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
ArXiv Citation Checker - Authoritative Source for ArXiv Papers
|
|
4
|
+
|
|
5
|
+
This module provides functionality to verify ArXiv papers by fetching the official
|
|
6
|
+
BibTeX citation directly from ArXiv. This is used as the authoritative metadata source
|
|
7
|
+
for papers found on ArXiv, as it reflects the author-submitted metadata.
|
|
8
|
+
|
|
9
|
+
Key features:
|
|
10
|
+
- Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
|
|
11
|
+
- Always uses the latest version metadata (strips version suffixes)
|
|
12
|
+
- Logs warnings when cited version differs from latest version
|
|
13
|
+
- Parses BibTeX to extract normalized metadata matching refchecker schema
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
from refchecker.checkers.arxiv_citation import ArXivCitationChecker
|
|
17
|
+
|
|
18
|
+
checker = ArXivCitationChecker()
|
|
19
|
+
|
|
20
|
+
reference = {
|
|
21
|
+
'title': 'Attention Is All You Need',
|
|
22
|
+
'authors': ['Ashish Vaswani', 'Noam Shazeer'],
|
|
23
|
+
'year': 2017,
|
|
24
|
+
'url': 'https://arxiv.org/abs/1706.03762v5',
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
verified_data, errors, url = checker.verify_reference(reference)
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import re
|
|
31
|
+
import logging
|
|
32
|
+
import requests
|
|
33
|
+
from typing import Dict, List, Tuple, Optional, Any
|
|
34
|
+
|
|
35
|
+
import bibtexparser
|
|
36
|
+
from bibtexparser.bparser import BibTexParser
|
|
37
|
+
from bibtexparser.customization import convert_to_unicode
|
|
38
|
+
|
|
39
|
+
from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
|
|
40
|
+
from refchecker.utils.text_utils import (
|
|
41
|
+
normalize_text,
|
|
42
|
+
compare_authors,
|
|
43
|
+
compare_titles_with_latex_cleaning,
|
|
44
|
+
strip_latex_commands,
|
|
45
|
+
)
|
|
46
|
+
from refchecker.utils.error_utils import format_title_mismatch, validate_year
|
|
47
|
+
from refchecker.config.settings import get_config
|
|
48
|
+
|
|
49
|
+
logger = logging.getLogger(__name__)
|
|
50
|
+
|
|
51
|
+
# Get configuration
|
|
52
|
+
config = get_config()
|
|
53
|
+
SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ArXivCitationChecker:
    """
    Reference checker that uses ArXiv's official BibTeX export as the authoritative source.

    This checker fetches the official BibTeX citation from ArXiv for papers identified
    by their ArXiv ID. The ID is looked up without its version suffix, so the metadata
    compared against is whatever the BibTeX endpoint returns for the un-versioned ID;
    a warning entry is emitted whenever the reference cited an explicit version.
    """

    # Default regexes used to locate an arXiv ID inside free text or a URL.
    # Group 1 is the bare ID, group 2 the optional "vN" suffix. All of them are
    # applied with re.IGNORECASE. Old-style IDs may carry a two-letter subject
    # class ("math.GT/0309136"), hence the optional "(?:\.[a-z]{2})?" part.
    _DEFAULT_ID_PATTERNS = (
        # Standard arxiv.org URLs
        r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
        r'arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
        # Old format with category
        r'arxiv\.org/abs/([a-z-]+(?:\.[a-z]{2})?/[0-9]{7})(v\d+)?',
        r'arxiv\.org/pdf/([a-z-]+(?:\.[a-z]{2})?/[0-9]{7})(v\d+)?',
        # arXiv: prefix in text
        r'arXiv:([0-9]{4}\.[0-9]{4,5})(v\d+)?',
        r'arXiv:([a-z-]+(?:\.[a-z]{2})?/[0-9]{7})(v\d+)?',
        # export.arxiv.org URLs
        r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
        r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
    )

    # A value that consists of nothing but an arXiv ID (optionally prefixed by
    # "arXiv:"), as found in a BibTeX ``eprint`` field. Used with fullmatch so
    # that arbitrary numbers inside longer text are never mistaken for an ID.
    _BARE_ID_PATTERN = (
        r'(?:arxiv:)?\s*'
        r'([0-9]{4}\.[0-9]{4,5}|[a-z-]+(?:\.[a-z]{2})?/[0-9]{7})'
        r'(v\d+)?'
    )

    def __init__(self, timeout: int = 30):
        """
        Initialize the ArXiv Citation Checker.

        Args:
            timeout: HTTP request timeout in seconds
        """
        self.base_url = "https://arxiv.org/bibtex"
        self.abs_url = "https://arxiv.org/abs"
        self.timeout = timeout
        self.rate_limiter = ArXivRateLimiter.get_instance()

        # Per-instance copy so callers can extend the list without affecting
        # other checker instances.
        self.arxiv_id_patterns = list(self._DEFAULT_ID_PATTERNS)

    @staticmethod
    def _strip_braces(text: str) -> str:
        """Remove every BibTeX grouping brace, including nested ones like ``{{BERT}}``."""
        return re.sub(r'[{}]', '', text)

    def _find_arxiv_id(self, text: str, allow_bare: bool = False) -> Optional[Tuple[str, Optional[str]]]:
        """
        Search one string for an arXiv ID.

        Args:
            text: URL, raw citation text, or field value to inspect
            allow_bare: When True, also accept a value that is *only* an ID
                (e.g. "1706.03762v5"), which is how BibTeX ``eprint`` fields look

        Returns:
            (arxiv_id_without_version, version_or_None), or None when nothing matched
        """
        for pattern in self.arxiv_id_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), match.group(2)

        if allow_bare:
            match = re.fullmatch(self._BARE_ID_PATTERN, text.strip(), re.IGNORECASE)
            if match:
                return match.group(1), match.group(2)

        return None

    def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
        """
        Extract ArXiv ID from a reference, returning both the base ID and version.

        The fields ``url``, ``cited_url``, ``raw_text`` and ``eprint`` are inspected
        in that order; the first one that yields an ID wins. Missing, empty, or
        non-string values are skipped.

        Args:
            reference: Reference dictionary containing url, raw_text, etc.

        Returns:
            Tuple of (arxiv_id_without_version, version_string_or_None)
            For example: ("2301.12345", "v2") or ("2301.12345", None)
        """
        for field in ('url', 'cited_url', 'raw_text', 'eprint'):
            source = reference.get(field)
            if not source or not isinstance(source, str):
                continue

            # Only the BibTeX eprint field is expected to hold a bare ID with no
            # "arXiv:" prefix or URL around it.
            found = self._find_arxiv_id(source, allow_bare=(field == 'eprint'))
            if found is not None:
                arxiv_id, version = found
                logger.debug(f"Extracted ArXiv ID: {arxiv_id}, version: {version}")
                return arxiv_id, version

        return None, None

    def fetch_bibtex(self, arxiv_id: str) -> Optional[str]:
        """
        Fetch the official BibTeX citation from ArXiv.

        The request is made for the un-versioned ID and goes through the shared
        rate limiter first.

        Args:
            arxiv_id: ArXiv ID without version suffix (e.g., "2301.12345")

        Returns:
            BibTeX string, or None if the request failed, timed out, or the body
            does not start with "@"
        """
        url = f"{self.base_url}/{arxiv_id}"

        # Wait for rate limit
        self.rate_limiter.wait()

        try:
            logger.debug(f"Fetching ArXiv BibTeX from: {url}")
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.Timeout:
            logger.warning(f"Timeout fetching ArXiv BibTeX for {arxiv_id}")
            return None
        except requests.exceptions.RequestException as e:
            logger.warning(f"Failed to fetch ArXiv BibTeX for {arxiv_id}: {e}")
            return None

        bibtex_content = response.text.strip()

        # Validate it looks like BibTeX (an entry always starts with "@type{")
        if bibtex_content.startswith('@'):
            logger.debug(f"Successfully fetched BibTeX for ArXiv paper {arxiv_id}")
            return bibtex_content

        logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
        return None

    def parse_bibtex(self, bibtex_str: str) -> Optional[Dict[str, Any]]:
        """
        Parse BibTeX string and extract metadata in refchecker schema format.

        Only the first entry of the BibTeX text is used.

        Args:
            bibtex_str: BibTeX content string

        Returns:
            Dictionary with parsed metadata or None if parsing failed
        """
        try:
            # Configure parser
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode

            # Parse BibTeX
            bib_database = bibtexparser.loads(bibtex_str, parser=parser)

            if not bib_database.entries:
                logger.debug("No entries found in BibTeX")
                return None

            entry = bib_database.entries[0]

            # Clean title: drop capitalization-protection braces (at any nesting
            # depth) and collapse line breaks/indentation from multi-line values.
            title = self._strip_braces(entry.get('title', ''))
            title = re.sub(r'\s+', ' ', title).strip()

            # Extract authors
            authors = self._parse_authors(entry.get('author', ''))

            # Extract year - prefer year from eprint ID (original submission)
            # over BibTeX year (latest revision)
            arxiv_id = (entry.get('eprint') or '').strip()
            year = self._extract_year_from_eprint(arxiv_id)

            # Fall back to BibTeX year field if eprint year extraction fails
            if not year and entry.get('year'):
                try:
                    year = int(entry['year'])
                except ValueError:
                    # Non-numeric year field: leave the year unset rather than guess
                    pass

            # Build result in refchecker schema format
            result = {
                'title': title,
                'authors': [{'name': author} for author in authors],
                'year': year,
                'venue': 'arXiv',
                'externalIds': {
                    'ArXiv': arxiv_id,
                },
                'url': f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else None,
                'isOpenAccess': True,
                'openAccessPdf': {
                    'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else None
                },
                # Store original bibtex for reference
                '_bibtex_entry': entry,
                '_source': 'ArXiv BibTeX Reference',
                '_source_url': f"https://arxiv.org/bibtex/{arxiv_id}" if arxiv_id else None,
            }

            # Add DOI if present (some ArXiv papers have DOIs)
            if entry.get('doi'):
                result['externalIds']['DOI'] = entry['doi']

            logger.debug(f"Parsed ArXiv BibTeX: title='{title[:50]}...', authors={len(authors)}, year={year}")
            return result

        except Exception as e:
            # Deliberately broad: any parser failure is reported as "could not
            # parse" so the caller can fall back to other sources.
            logger.warning(f"Failed to parse BibTeX: {e}")
            return None

    def _parse_authors(self, authors_str: str) -> List[str]:
        """
        Parse BibTeX author string into list of author names.

        Supported per-author forms:
            "First Last"        -> kept as is
            "Last, First"       -> "First Last"
            "Last, Jr, First"   -> "First Last Jr"

        Args:
            authors_str: BibTeX author field value (authors separated by " and ")

        Returns:
            List of author names in "First Last" format
        """
        if not authors_str:
            return []

        authors = []

        # Split by " and " (BibTeX convention)
        for part in re.split(r'\s+and\s+', authors_str):
            part = part.strip()
            if not part:
                continue

            pieces = [piece.strip() for piece in part.split(',')]
            if len(pieces) == 3:
                # "Last, Jr, First": the middle piece is a suffix
                last, suffix, first = pieces
                name = f"{first} {last} {suffix}"
            elif len(pieces) >= 2:
                # "Last, First" (anything after the first comma is the given part)
                last, first = (piece.strip() for piece in part.split(',', 1))
                name = f"{first} {last}"
            else:
                # Already in "First Last" format
                name = part

            # Clean up the name: braces first, so no double spaces are left behind
            name = self._strip_braces(name)
            name = re.sub(r'\s+', ' ', name).strip()

            if name:
                authors.append(name)

        return authors

    def _extract_year_from_eprint(self, eprint: str) -> Optional[int]:
        """
        Extract year from ArXiv eprint ID.

        New format (YYMM.NNNNN): First two digits are year (always 20YY)
        Old format (cat-name/YYMMNNN or cat.SC/YYMMNNN): Digits after slash,
        first two are year; 91-99 map to 19YY, everything else to 20YY

        Args:
            eprint: ArXiv eprint ID

        Returns:
            Year as integer or None
        """
        if not eprint or not isinstance(eprint, str):
            return None

        eprint = eprint.strip()

        # New format: 2301.12345 (introduced in 2007, so the century is always 2000)
        match = re.match(r'^(\d{2})\d{2}\.\d{4,5}', eprint)
        if match:
            return 2000 + int(match.group(1))

        # Old format: hep-th/9901001 or math.GT/0309136
        match = re.match(r'^[a-z-]+(?:\.[a-z]{2})?/(\d{2})\d+', eprint, re.IGNORECASE)
        if match:
            yy = int(match.group(1))
            century = 1900 if yy >= 91 else 2000  # ArXiv started in 1991
            return century + yy

        return None

    def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
        """
        Get the latest version number for an ArXiv paper.

        Placeholder: determining the version would require fetching the abstract
        page, which is not done. The checker relies on the BibTeX fetched for the
        un-versioned ID instead.

        Args:
            arxiv_id: ArXiv ID without version

        Returns:
            Always None (latest version is not determined)
        """
        return None

    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
        """
        Check if a reference is an ArXiv paper.

        Args:
            reference: Reference dictionary

        Returns:
            True if an ArXiv ID can be extracted from the reference
        """
        arxiv_id, _ = self.extract_arxiv_id(reference)
        return arxiv_id is not None

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a reference using ArXiv's official BibTeX as authoritative source.

        This method:
        1. Extracts the ArXiv ID from the reference
        2. Fetches the official BibTeX from ArXiv (un-versioned ID)
        3. Parses the BibTeX to get authoritative metadata
        4. Compares cited title, authors and year against that metadata
        5. Adds a warning entry when a specific version was cited

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, url)
            - verified_data: Authoritative paper metadata from ArXiv or None
            - errors: List of error/warning dictionaries
            - url: ArXiv URL for the paper, or None when verification did not run
        """
        errors = []

        # Extract ArXiv ID
        arxiv_id, cited_version = self.extract_arxiv_id(reference)

        if not arxiv_id:
            logger.debug("ArXivCitationChecker: No ArXiv ID found in reference")
            return None, [], None

        logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")

        # Fetch authoritative BibTeX
        bibtex_content = self.fetch_bibtex(arxiv_id)

        if not bibtex_content:
            logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
            return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None

        # Parse BibTeX
        verified_data = self.parse_bibtex(bibtex_content)

        if not verified_data:
            logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
            return None, [], None

        if cited_version:
            # The actual latest version number is unknown without an additional
            # request, so the warning only records that a specific version was cited.
            errors.append({
                'warning_type': 'version',
                'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
            })
            logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")

        # Compare title ("or ''" guards against an explicit None value)
        cited_title = (reference.get('title') or '').strip()
        authoritative_title = (verified_data.get('title') or '').strip()

        if cited_title and authoritative_title:
            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)

            if title_similarity < SIMILARITY_THRESHOLD:
                clean_cited_title = strip_latex_commands(cited_title)
                errors.append({
                    'error_type': 'title',
                    'error_details': format_title_mismatch(clean_cited_title, authoritative_title),
                    'ref_title_correct': authoritative_title
                })

        # Compare authors (skipped when the reference lists none)
        cited_authors = reference.get('authors', [])
        if cited_authors:
            authoritative_authors = verified_data.get('authors', [])
            authors_match, author_error = compare_authors(cited_authors, authoritative_authors)

            if not authors_match:
                correct_author_names = ', '.join([a.get('name', '') for a in authoritative_authors])
                errors.append({
                    'error_type': 'author',
                    'error_details': author_error,
                    'ref_authors_correct': correct_author_names
                })

        # Compare year
        year_warning = validate_year(
            cited_year=reference.get('year'),
            paper_year=verified_data.get('year'),
            use_flexible_validation=True,
            context={'arxiv_match': True}
        )
        if year_warning:
            errors.append(year_warning)

        # Build URL from the configured abstract-page base
        paper_url = f"{self.abs_url}/{arxiv_id}"

        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")

        return verified_data, errors, paper_url
|
|
@@ -43,6 +43,7 @@ class EnhancedHybridReferenceChecker:
|
|
|
43
43
|
contact_email: Optional[str] = None,
|
|
44
44
|
enable_openalex: bool = True,
|
|
45
45
|
enable_crossref: bool = True,
|
|
46
|
+
enable_arxiv_citation: bool = True,
|
|
46
47
|
debug_mode: bool = False):
|
|
47
48
|
"""
|
|
48
49
|
Initialize the enhanced hybrid reference checker
|
|
@@ -53,11 +54,22 @@ class EnhancedHybridReferenceChecker:
|
|
|
53
54
|
contact_email: Email for polite pool access to APIs
|
|
54
55
|
enable_openalex: Whether to use OpenAlex API
|
|
55
56
|
enable_crossref: Whether to use CrossRef API
|
|
57
|
+
enable_arxiv_citation: Whether to use ArXiv Citation checker as authoritative source
|
|
56
58
|
debug_mode: Whether to enable debug logging
|
|
57
59
|
"""
|
|
58
60
|
self.contact_email = contact_email
|
|
59
61
|
self.debug_mode = debug_mode
|
|
60
62
|
|
|
63
|
+
# Initialize ArXiv Citation checker (authoritative source for ArXiv papers)
|
|
64
|
+
self.arxiv_citation = None
|
|
65
|
+
if enable_arxiv_citation:
|
|
66
|
+
try:
|
|
67
|
+
from .arxiv_citation import ArXivCitationChecker
|
|
68
|
+
self.arxiv_citation = ArXivCitationChecker()
|
|
69
|
+
logger.debug("Enhanced Hybrid: ArXiv Citation checker initialized")
|
|
70
|
+
except Exception as e:
|
|
71
|
+
logger.warning(f"Enhanced Hybrid: Failed to initialize ArXiv Citation checker: {e}")
|
|
72
|
+
|
|
61
73
|
# Initialize local database checker if available
|
|
62
74
|
self.local_db = None
|
|
63
75
|
if db_path:
|
|
@@ -112,6 +124,7 @@ class EnhancedHybridReferenceChecker:
|
|
|
112
124
|
|
|
113
125
|
# Track API performance for adaptive selection
|
|
114
126
|
self.api_stats = {
|
|
127
|
+
'arxiv_citation': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
|
|
115
128
|
'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
|
|
116
129
|
'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
|
|
117
130
|
'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
|
|
@@ -276,6 +289,17 @@ class EnhancedHybridReferenceChecker:
|
|
|
276
289
|
|
|
277
290
|
# PHASE 1: Try all APIs once in priority order
|
|
278
291
|
|
|
292
|
+
# Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
|
|
293
|
+
# This fetches the official BibTeX from ArXiv which is the author-submitted metadata
|
|
294
|
+
if self.arxiv_citation and self.arxiv_citation.is_arxiv_reference(reference):
|
|
295
|
+
logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
|
|
296
|
+
verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
|
|
297
|
+
if success:
|
|
298
|
+
logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded as authoritative source")
|
|
299
|
+
return verified_data, errors, url
|
|
300
|
+
if failure_type in ['throttled', 'timeout', 'server_error']:
|
|
301
|
+
failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
|
|
302
|
+
|
|
279
303
|
# Strategy 1: Always try local database first (fastest)
|
|
280
304
|
if self.local_db:
|
|
281
305
|
verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
|
|
@@ -22,6 +22,14 @@ DEFAULT_CONFIG = {
|
|
|
22
22
|
"timeout": 30,
|
|
23
23
|
},
|
|
24
24
|
|
|
25
|
+
"arxiv_citation": {
|
|
26
|
+
"base_url": "https://arxiv.org/bibtex",
|
|
27
|
+
"rate_limit_delay": 3.0, # Share rate limiting with other ArXiv endpoints
|
|
28
|
+
"timeout": 30,
|
|
29
|
+
"use_as_authoritative": True, # Use ArXiv BibTeX as authoritative source
|
|
30
|
+
"enabled": True, # Enable ArXiv citation checker in hybrid checker
|
|
31
|
+
},
|
|
32
|
+
|
|
25
33
|
# Processing Settings
|
|
26
34
|
"processing": {
|
|
27
35
|
"max_papers": 50,
|
|
@@ -110,21 +110,7 @@ class LLMProvider(ABC):
|
|
|
110
110
|
|
|
111
111
|
logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
|
|
112
112
|
return chunks
|
|
113
|
-
|
|
114
|
-
def _parse_llm_response(self, response_text: str) -> List[str]:
|
|
115
|
-
"""Parse LLM response and extract individual references"""
|
|
116
|
-
if not response_text:
|
|
117
|
-
return []
|
|
118
|
-
|
|
119
|
-
# Split by newlines and filter out empty lines
|
|
120
|
-
references = []
|
|
121
|
-
for line in response_text.strip().split('\n'):
|
|
122
|
-
line = line.strip()
|
|
123
|
-
if line and not line.startswith('#') and len(line) > 10: # Basic filtering
|
|
124
|
-
references.append(line)
|
|
125
|
-
|
|
126
|
-
return references
|
|
127
|
-
|
|
113
|
+
|
|
128
114
|
def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
|
|
129
115
|
"""
|
|
130
116
|
Template method that handles chunking for all providers.
|