academic_refchecker-2.0.7-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/utils/db_utils.py

@@ -0,0 +1,210 @@
#!/usr/bin/env python3
"""
Database Processing Utilities for Reference Checking

This module provides utilities for processing database results,
particularly for Semantic Scholar data processing.
"""

import json
import logging
from typing import Dict, List, Any, Optional

logger = logging.getLogger(__name__)


def process_semantic_scholar_result(paper_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a single Semantic Scholar database result by parsing JSON fields
    and reconstructing the paper data structure.

    Args:
        paper_data: Raw paper data dictionary from database

    Returns:
        Processed paper data dictionary
    """
    try:
        # Extract authors from JSON
        if paper_data.get('authors'):
            if isinstance(paper_data['authors'], str):
                paper_data['authors'] = json.loads(paper_data['authors'])
        else:
            paper_data['authors'] = []

        # Reconstruct external IDs from flattened columns
        external_ids = {}
        for key, value in paper_data.items():
            if key.startswith('externalIds_') and value:
                external_id_type = key.replace('externalIds_', '')
                external_ids[external_id_type] = value
        paper_data['externalIds'] = external_ids

        # Add other JSON fields
        if paper_data.get('s2FieldsOfStudy'):
            if isinstance(paper_data['s2FieldsOfStudy'], str):
                paper_data['s2FieldsOfStudy'] = json.loads(paper_data['s2FieldsOfStudy'])

        if paper_data.get('publicationTypes'):
            if isinstance(paper_data['publicationTypes'], str):
                paper_data['publicationTypes'] = json.loads(paper_data['publicationTypes'])

        return paper_data

    except Exception as e:
        logger.warning(f"Error processing database result: {e}")
        return paper_data


def process_semantic_scholar_results(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Process multiple Semantic Scholar database results.

    Args:
        results: List of raw database row dictionaries

    Returns:
        List of processed paper data dictionaries
    """
    processed_results = []

    for paper_data in results:
        processed_result = process_semantic_scholar_result(paper_data)
        if processed_result:
            processed_results.append(processed_result)

    return processed_results


def extract_external_ids(paper_data: Dict[str, Any]) -> Dict[str, str]:
    """
    Extract external IDs from flattened database columns.

    Args:
        paper_data: Paper data dictionary from database

    Returns:
        Dictionary of external IDs
    """
    external_ids = {}

    for key, value in paper_data.items():
        if key.startswith('externalIds_') and value:
            external_id_type = key.replace('externalIds_', '')
            external_ids[external_id_type] = value

    return external_ids


def parse_json_field(data: Dict[str, Any], field_name: str) -> Any:
    """
    Parse a JSON field from database data, handling both string and already-parsed data.

    Args:
        data: Database record dictionary
        field_name: Name of the field to parse

    Returns:
        Parsed data or empty list/dict if parsing fails
    """
    try:
        field_data = data.get(field_name)
        if not field_data:
            return [] if field_name in ['authors', 's2FieldsOfStudy', 'publicationTypes'] else {}

        if isinstance(field_data, str):
            return json.loads(field_data)
        else:
            return field_data

    except (json.JSONDecodeError, TypeError) as e:
        logger.warning(f"Failed to parse JSON field '{field_name}': {e}")
        return [] if field_name in ['authors', 's2FieldsOfStudy', 'publicationTypes'] else {}


def reconstruct_paper_structure(row_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Reconstruct the full paper data structure from flattened database row.

    Args:
        row_data: Raw database row data

    Returns:
        Reconstructed paper data structure
    """
    # Start with the row data
    paper_data = dict(row_data)

    # Parse JSON fields
    paper_data['authors'] = parse_json_field(paper_data, 'authors')
    paper_data['s2FieldsOfStudy'] = parse_json_field(paper_data, 's2FieldsOfStudy')
    paper_data['publicationTypes'] = parse_json_field(paper_data, 'publicationTypes')

    # Reconstruct external IDs
    paper_data['externalIds'] = extract_external_ids(paper_data)

    return paper_data


def safe_json_loads(json_string: str, default_value: Any = None) -> Any:
    """
    Safely load JSON string with fallback to default value.

    Args:
        json_string: JSON string to parse
        default_value: Default value if parsing fails

    Returns:
        Parsed JSON data or default value
    """
    if not json_string:
        return default_value

    try:
        return json.loads(json_string)
    except (json.JSONDecodeError, TypeError) as e:
        logger.debug(f"Failed to parse JSON: {e}")
        return default_value


def flatten_external_ids(external_ids: Dict[str, str]) -> Dict[str, str]:
    """
    Flatten external IDs dictionary into database column format.

    Args:
        external_ids: Dictionary of external IDs

    Returns:
        Flattened dictionary with externalIds_ prefix
    """
    flattened = {}

    for id_type, id_value in external_ids.items():
        flattened[f'externalIds_{id_type}'] = id_value

    return flattened


def validate_paper_data(paper_data: Dict[str, Any]) -> bool:
    """
    Validate that paper data has required fields.

    Args:
        paper_data: Paper data dictionary to validate

    Returns:
        True if data appears valid, False otherwise
    """
    # Check for essential fields
    required_fields = ['title']

    for field in required_fields:
        if not paper_data.get(field):
            return False

    # Validate authors field
    authors = paper_data.get('authors', [])
    if not isinstance(authors, list):
        return False

    return True
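For orientation, here is a minimal usage sketch of the helpers above. The sample row (column names such as externalIds_DOI and externalIds_ArXiv, and the JSON-encoded field values) is illustrative only, and the import assumes the wheel is installed so refchecker.utils.db_utils is importable.

import json

from refchecker.utils.db_utils import (
    flatten_external_ids,
    reconstruct_paper_structure,
    validate_paper_data,
)

# Hypothetical flattened row as it might be read from the local Semantic Scholar table:
# JSON fields stored as strings, external IDs spread across externalIds_* columns.
row = {
    'title': 'Attention Is All You Need',
    'authors': json.dumps([{'name': 'Ashish Vaswani'}]),
    's2FieldsOfStudy': json.dumps([{'category': 'Computer Science'}]),
    'publicationTypes': json.dumps(['JournalArticle']),
    'externalIds_DOI': '10.48550/arxiv.1706.03762',
    'externalIds_ArXiv': '1706.03762',
}

paper = reconstruct_paper_structure(row)
print(validate_paper_data(paper))   # True: has a title and a list of authors
print(paper['externalIds'])         # {'DOI': '10.48550/arxiv.1706.03762', 'ArXiv': '1706.03762'}

# flatten_external_ids is the inverse mapping, back to externalIds_-prefixed columns.
print(flatten_external_ids(paper['externalIds']))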
refchecker/utils/doi_utils.py

@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
DOI Utilities for Reference Checking

This module provides utilities for DOI handling, extraction, and validation.
"""

import re
from typing import Optional


def extract_doi_from_url(url: str) -> Optional[str]:
    """
    Extract DOI from a URL using comprehensive pattern matching.

    Args:
        url: URL that might contain a DOI

    Returns:
        Extracted DOI or None if not found
    """
    if not url:
        return None

    # Only extract DOIs from actual DOI URLs, not from other domains
    # This prevents false positives from URLs like aclanthology.org
    if 'doi.org' not in url and 'doi:' not in url:
        return None

    # DOI patterns ordered by specificity and reliability
    doi_patterns = [
        r'doi\.org/([^/\s\?#]+(?:/[^/\s\?#]+)*)',  # Full DOI pattern from doi.org
        r'doi:([^/\s\?#]+(?:/[^/\s\?#]+)*)',  # doi: prefix format
    ]

    for pattern in doi_patterns:
        match = re.search(pattern, url)
        if match:
            doi_candidate = match.group(1)
            # DOIs must start with "10." and have at least one slash
            if doi_candidate.startswith('10.') and '/' in doi_candidate and len(doi_candidate) > 6:
                return doi_candidate

    return None


def normalize_doi(doi: str) -> str:
    """
    Normalize a DOI by removing common prefixes, cleaning whitespace, and converting to lowercase.

    DOI suffixes are case-insensitive according to the DOI specification, so we normalize
    to lowercase to ensure consistent URL generation across all checkers.

    Args:
        doi: DOI string to normalize

    Returns:
        Normalized DOI string in lowercase
    """
    if not doi:
        return ""

    # Remove common URL prefixes
    normalized = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
    normalized = normalized.replace('doi:', '')

    # Remove hash fragments and query parameters
    normalized = normalized.split('#')[0].split('?')[0]

    # Clean whitespace and trailing punctuation
    normalized = normalized.strip()

    # Remove trailing punctuation that might be included in extraction
    normalized = normalized.rstrip('.,;:)')

    # Convert to lowercase for consistency (DOI suffixes are case-insensitive)
    return normalized.lower()


def is_valid_doi_format(doi: str) -> bool:
    """
    Check if a string matches the basic DOI format.

    Args:
        doi: String to validate as DOI

    Returns:
        True if the string matches DOI format, False otherwise
    """
    if not doi:
        return False

    # Basic DOI format: starts with "10." followed by at least one slash
    doi_format_pattern = r'^10\.\d+/.+'
    return bool(re.match(doi_format_pattern, doi))


def compare_dois(doi1: str, doi2: str) -> bool:
    """
    Compare two DOIs for equality, handling different formats and prefixes.

    This function performs exact matching after normalization, with support
    for partial DOI citations where a shorter DOI is a valid prefix of a longer one.

    Args:
        doi1: First DOI to compare
        doi2: Second DOI to compare

    Returns:
        True if DOIs are equivalent, False otherwise
    """
    if not doi1 or not doi2:
        return False

    # Normalize both DOIs (handles prefixes, case, punctuation)
    norm_doi1 = normalize_doi(doi1)
    norm_doi2 = normalize_doi(doi2)

    # First try exact match
    if norm_doi1 == norm_doi2:
        return True

    # Handle partial DOI citations - if one DOI is a prefix of the other, consider it a match
    # This handles cases like "10.1007" being cited instead of the full "10.1007/s10458-025-09691-y"
    if len(norm_doi1) != len(norm_doi2):
        shorter_doi = norm_doi1 if len(norm_doi1) < len(norm_doi2) else norm_doi2
        longer_doi = norm_doi2 if len(norm_doi1) < len(norm_doi2) else norm_doi1

        # Only consider it a valid partial match if:
        # 1. The shorter DOI is at least 7 characters (e.g., "10.1007")
        # 2. The longer DOI starts with the shorter DOI
        # 3. The next character in the longer DOI is '/' or '.' (valid DOI separators)
        if (len(shorter_doi) >= 7 and
                longer_doi.startswith(shorter_doi) and
                len(longer_doi) > len(shorter_doi) and
                longer_doi[len(shorter_doi)] in ['/', '.']):
            return True

    return False


def construct_doi_url(doi: str) -> str:
    """
    Construct a proper DOI URL from a DOI string.

    Args:
        doi: DOI string

    Returns:
        Full DOI URL
    """
    if not doi:
        return ""

    # Normalize the DOI first
    normalized_doi = normalize_doi(doi)

    # Construct URL
    return f"https://doi.org/{normalized_doi}"


def validate_doi_resolves(doi: str, timeout: float = 5.0) -> bool:
    """
    Validate that a DOI resolves by checking if doi.org returns a redirect.

    This is useful for determining if a DOI is valid, even if it's different
    from what a verification source has stored (e.g., arXiv DOI vs conference DOI).

    Args:
        doi: DOI string to validate
        timeout: Request timeout in seconds

    Returns:
        True if DOI resolves (returns 302/301/200), False otherwise
    """
    if not doi or not is_valid_doi_format(normalize_doi(doi)):
        return False

    try:
        import requests
        url = construct_doi_url(doi)
        # Use HEAD request first (faster), fall back to GET if needed
        response = requests.head(url, allow_redirects=False, timeout=timeout)
        # DOI.org returns 302 for valid DOIs that redirect to the paper
        # Some may return 301 (permanent redirect) or 200 (direct response)
        return response.status_code in (200, 301, 302, 303, 307, 308)
    except Exception:
        # On any error (timeout, connection error, etc.), assume DOI might be valid
        # to avoid false negatives due to network issues
        return True
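A short sketch of how these DOI helpers compose. The DOI strings are illustrative examples, and the import assumes the wheel is installed so refchecker.utils.doi_utils resolves; validate_doi_resolves is left out because it performs a live network request.

from refchecker.utils.doi_utils import (
    compare_dois,
    construct_doi_url,
    extract_doi_from_url,
    normalize_doi,
)

# Extraction only considers doi.org / doi: style URLs; other hosts return None.
doi = extract_doi_from_url('https://doi.org/10.1007/s10458-025-09691-y?utm_source=x')
print(doi)                                                                 # 10.1007/s10458-025-09691-y
print(extract_doi_from_url('https://aclanthology.org/2024.acl-long.1/'))   # None

# Normalization strips the doi: prefix, trailing punctuation, and case differences.
print(normalize_doi('doi:10.1007/S10458-025-09691-Y.'))                    # 10.1007/s10458-025-09691-y

# Comparison tolerates a truncated citation that is a prefix of the full DOI.
print(compare_dois('10.1007', '10.1007/s10458-025-09691-y'))               # True
print(compare_dois('10.1007/s10458-025-09691-y', '10.18653/v1/2024.acl-long.1'))  # False

print(construct_doi_url(doi))                                              # https://doi.org/10.1007/s10458-025-09691-y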