academic-refchecker 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
import re
|
|
5
|
+
import logging
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
from typing import Dict, Optional, Tuple, List, Any
|
|
8
|
+
from refchecker.utils.text_utils import strip_latex_commands
|
|
9
|
+
|
|
10
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
class GitHubChecker:
    """
    Checker for verifying GitHub repository references

    A reference is resolved to an ``owner/repo`` pair, looked up through the
    GitHub REST API, and its cited title, authors and year are compared with
    the metadata returned for the repository.
    """

    # URL shapes accepted as repository links.  The HTTP(S) form tolerates an
    # optional "www." host prefix, a trailing ".git", and any sub-path, query
    # string or fragment after the repository name.
    _REPO_URL_PATTERNS = (
        re.compile(
            r'https?://(?:www\.)?github\.com/([^/?#]+)/([^/?#]+?)(?:\.git)?(?:[/?#].*)?$',
            re.IGNORECASE,
        ),
        re.compile(r'git@github\.com:([^/]+)/([^/]+?)(?:\.git)?$', re.IGNORECASE),
    )

    # Punctuation trimmed from words before they are compared.
    _WORD_PUNCTUATION = '.,;:()[]'

    # Technical terms that, when present in both the cited title and the
    # repository description, are taken as evidence of the same project.
    _KEY_TERMS = (
        'tensorflow', 'pytorch', 'transformers', 'autogen', 'machine learning',
        'deep learning', 'neural', 'ai', 'llm',
    )

    # One whole-word pattern per key term (optional plural "s").  Whole-word
    # matching matters: a raw substring test makes "ai" match "training".
    _KEY_TERM_PATTERNS = tuple(
        re.compile(r'(?<![a-z0-9])' + re.escape(term) + r's?(?![a-z0-9])')
        for term in _KEY_TERMS
    )

    # Organisation login fragment -> spellings/abbreviations seen in citations.
    _ORG_ALIASES = {
        'huggingface': ['h.f.', 'hf', 'hugging', 'h. f.', 'hugging face'],
        'microsoft': ['m.', 'ms', 'msft', 'm. a. team', 'microsoft'],
        'google': ['g.', 'g. b. team', 'google', 'brain team', 'alphabet'],
        'tensorflow': ['t.', 't. contributors', 'tensorflow', 'g. b. team'],
        'pytorch': ['pytorch team', 'facebook', 'meta'],
        'openai': ['openai', 'o.a.', 'open ai'],
    }

    def __init__(self, github_token: Optional[str] = None):
        """
        Initialize GitHub checker

        Args:
            github_token: Optional GitHub API token for higher rate limits
        """
        self.github_token = github_token
        self.base_headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'refchecker-academic-tool'
        }
        if github_token:
            self.base_headers['Authorization'] = f'token {github_token}'

    def extract_github_repo_info(self, url: str) -> Optional[Tuple[str, str]]:
        """
        Extract owner and repository name from GitHub URL

        Args:
            url: GitHub URL (https, http or git@ SSH form)

        Returns:
            Tuple of (owner, repo) or None if not a valid GitHub URL
        """
        if not url or not isinstance(url, str):
            return None

        url = url.strip().rstrip('/')

        for pattern in self._REPO_URL_PATTERNS:
            match = pattern.match(url)
            if match:
                # The patterns already drop a trailing ".git"; the name must
                # not be edited further, or repositories such as
                # "user.github.io" would be corrupted.
                owner, repo = match.groups()
                return owner, repo

        return None

    def is_github_url(self, url: str) -> bool:
        """
        Check if URL is a GitHub repository URL

        Args:
            url: URL to check

        Returns:
            True if it's a GitHub repository URL
        """
        return self.extract_github_repo_info(url) is not None

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a GitHub repository reference

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, paper_url) where:
            - verified_data: Dict with verified repository information or None
            - errors: List of error/warning dictionaries
            - paper_url: The GitHub repository URL
        """
        logger.debug(f"Verifying GitHub reference: {reference.get('title', 'Untitled')}")

        # Extract GitHub URL from reference
        github_url = self._find_github_url(reference)
        if not github_url:
            logger.debug("No GitHub URL found in reference")
            return None, [], None

        # Extract repository information
        repo_info = self.extract_github_repo_info(github_url)
        if not repo_info:
            logger.debug(f"Could not parse GitHub URL: {github_url}")
            return None, [{"error_type": "unverified", "error_details": "Invalid GitHub URL format"}], github_url

        owner, repo = repo_info
        api_url = f'https://api.github.com/repos/{owner}/{repo}'

        try:
            # Make API request
            response = requests.get(api_url, headers=self.base_headers, timeout=10)

            if response.status_code == 404:
                logger.debug(f"GitHub repository not found: {owner}/{repo}")
                return None, [{"error_type": "unverified", "error_details": "Repository not found or is private"}], github_url
            elif response.status_code in (403, 429):
                # GitHub signals an exhausted rate limit with either status.
                logger.warning("GitHub API rate limit exceeded")
                return None, [{"error_type": "unverified", "error_details": "GitHub API rate limit exceeded"}], github_url
            elif response.status_code != 200:
                logger.warning(f"GitHub API error {response.status_code} for {owner}/{repo}")
                return None, [{"error_type": "unverified", "error_details": f"GitHub API error: {response.status_code}"}], github_url

            repo_data = response.json()

            # Extract repository metadata.  "or" fallbacks cover fields that
            # are present in the payload but set to null.
            owner_data = repo_data.get('owner') or {}
            actual_name = repo_data.get('name') or ''
            actual_description = repo_data.get('description') or ''
            actual_owner = owner_data.get('login') or ''
            actual_owner_name = owner_data.get('name') or actual_owner
            archived = repo_data.get('archived', False)
            creation_year = self._parse_creation_year(repo_data.get('created_at') or '')

            # Create verified data structure
            verified_data = {
                'title': actual_description if actual_description else actual_name,
                'authors': [actual_owner_name] if actual_owner_name else [actual_owner],
                'year': creation_year,
                'venue': 'GitHub Repository',
                'url': github_url,
                'github_metadata': {
                    'name': actual_name,
                    'description': actual_description,
                    'owner': actual_owner,
                    'owner_name': actual_owner_name,
                    'created_year': creation_year,
                    'stars': repo_data.get('stargazers_count', 0),
                    'language': repo_data.get('language', ''),
                    'license': (repo_data.get('license') or {}).get('name', ''),
                    'archived': archived
                }
            }

            errors = []

            # Verify title
            cited_title = (reference.get('title') or '').strip()
            if cited_title and not self._check_title_match(cited_title, actual_name, actual_description):
                from refchecker.utils.error_utils import format_title_mismatch
                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
                clean_cited_title = strip_latex_commands(cited_title)
                details = format_title_mismatch(clean_cited_title, actual_name)
                if actual_description:
                    snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
                    details += f" ({snippet})"
                errors.append({
                    "warning_type": "title",
                    "warning_details": details
                })

            # Verify authors
            cited_authors = reference.get('authors', [])
            if cited_authors:
                author_str = self._authors_to_text(cited_authors)
                if not self._check_author_match(author_str, actual_owner, actual_owner_name):
                    from refchecker.utils.error_utils import format_three_line_mismatch
                    left = author_str
                    right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
                    details = format_three_line_mismatch("Author mismatch", left, right)
                    errors.append({
                        "warning_type": "author",
                        "warning_details": details
                    })

            # Verify year: only a citation dated before the repository was
            # created is flagged.
            cited_year = reference.get('year')
            if cited_year and creation_year:
                try:
                    cited_year_int = int(cited_year)
                except (ValueError, TypeError):
                    # Best effort: a non-numeric cited year is simply not checked.
                    cited_year_int = None
                if cited_year_int is not None and cited_year_int < creation_year:
                    from refchecker.utils.error_utils import format_year_mismatch
                    errors.append({
                        "warning_type": "year",
                        "warning_details": format_year_mismatch(cited_year, creation_year),
                        "ref_year_correct": str(creation_year)
                    })

            # Add notes for archived repositories
            if archived:
                errors.append({
                    "warning_type": "status",
                    "warning_details": "Repository is archived (no longer actively maintained)"
                })

            logger.debug(f"GitHub verification successful for {owner}/{repo}")
            return verified_data, errors, github_url

        except requests.exceptions.RequestException as e:
            logger.error(f"Network error accessing GitHub API for {owner}/{repo}: {e}")
            return None, [{"error_type": "unverified", "error_details": f"Network error: {str(e)}"}], github_url
        except Exception as e:
            # Boundary handler: any other failure is reported as "unverified"
            # instead of propagating to the caller.
            logger.error(f"Unexpected error verifying GitHub repository {owner}/{repo}: {e}")
            return None, [{"error_type": "unverified", "error_details": f"Unexpected error: {str(e)}"}], github_url

    def _find_github_url(self, reference: Dict[str, Any]) -> Optional[str]:
        """
        Locate a GitHub repository URL in a reference

        Args:
            reference: Reference dictionary

        Returns:
            The 'url' field if it is a repository URL, otherwise the first
            whitespace-separated token of 'venue' that is one, otherwise None
        """
        url = reference.get('url')
        if url and self.is_github_url(url):
            return url

        # Sometimes GitHub URLs are in the venue field
        venue = reference.get('venue')
        if isinstance(venue, str) and 'github.com' in venue.lower():
            for part in venue.split():
                if self.is_github_url(part):
                    return part

        return None

    @staticmethod
    def _parse_creation_year(created_at: str) -> Optional[int]:
        """
        Return the leading year of a 'YYYY-...' timestamp, or None if absent or malformed
        """
        if not created_at:
            return None
        try:
            return int(created_at.split('-')[0])
        except (ValueError, IndexError, AttributeError):
            return None

    @staticmethod
    def _authors_to_text(cited_authors: Any) -> str:
        """
        Flatten cited authors into one comma-separated string

        List or tuple entries may be plain strings or dicts carrying a 'name'
        key; any other value is converted with str().
        """
        if isinstance(cited_authors, (list, tuple)):
            names = []
            for author in cited_authors:
                if isinstance(author, dict):
                    author = author.get('name') or ''
                names.append(str(author))
            return ', '.join(names)
        return str(cited_authors)

    def _significant_words(self, text: str) -> set:
        """
        Return the words of text longer than 3 characters, with edge punctuation trimmed
        """
        trimmed = (word.strip(self._WORD_PUNCTUATION) for word in text.split())
        return {word for word in trimmed if len(word) > 3}

    def _check_title_match(self, cited_title: str, repo_name: str, repo_description: str) -> bool:
        """
        Check if cited title matches repository name or description

        Args:
            cited_title: Title as cited in reference
            repo_name: Repository name
            repo_description: Repository description

        Returns:
            True if title matches reasonably well
        """
        cited_lower = (cited_title or '').lower().strip()
        repo_name_lower = (repo_name or '').lower().strip()
        repo_desc_lower = (repo_description or '').lower().strip()

        # Direct name match.  An empty repository name is skipped because the
        # empty string is contained in every title.
        if repo_name_lower and (cited_lower in repo_name_lower or repo_name_lower in cited_lower):
            return True

        # Check against description if available
        if repo_desc_lower:
            # Significant word overlap (at least 2 shared words)
            shared = self._significant_words(cited_lower) & self._significant_words(repo_desc_lower)
            if len(shared) >= 2:
                return True

            # Key technical terms present, as whole words, on both sides
            for term_pattern in self._KEY_TERM_PATTERNS:
                if term_pattern.search(cited_lower) and term_pattern.search(repo_desc_lower):
                    return True

        return False

    def _check_author_match(self, cited_authors: str, repo_owner: str, repo_owner_name: str) -> bool:
        """
        Check if cited authors match repository owner

        Args:
            cited_authors: Authors as cited in reference
            repo_owner: Repository owner username
            repo_owner_name: Repository owner display name

        Returns:
            True if authors match reasonably well
        """
        cited_lower = (cited_authors or '').lower().strip()
        owner_lower = (repo_owner or '').lower().strip()
        owner_name_lower = (repo_owner_name or '').lower().strip()

        # Direct matches.  Empty owner values are skipped because the empty
        # string is contained in every author string.
        if owner_lower and (cited_lower in owner_lower or owner_lower in cited_lower):
            return True
        if owner_name_lower and (cited_lower in owner_name_lower or owner_name_lower in cited_lower):
            return True

        # Handle common abbreviation patterns
        for org, aliases in self._ORG_ALIASES.items():
            if org in owner_lower and any(alias in cited_lower for alias in aliases):
                return True

        # Check initials against organization name: only applies when the
        # login splits into at least two words on '-' or '_'.
        org_words = re.sub(r'[_-]', ' ', repo_owner or '').split()
        if len(org_words) >= 2:
            initials = ''.join(word[0] for word in org_words).lower()
            initials_variants = [
                initials,
                '. '.join(initials) + '.',
                ' '.join(initials),
            ]
            if any(variant in cited_lower for variant in initials_variants):
                return True

        return False
|