academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,326 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import requests
4
+ import re
5
+ import logging
6
+ from urllib.parse import urlparse
7
+ from typing import Dict, Optional, Tuple, List, Any
8
+ from refchecker.utils.text_utils import strip_latex_commands
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class GitHubChecker:
    """
    Checker for verifying GitHub repository references.

    The checker locates a GitHub repository URL in a reference, fetches the
    repository metadata from the GitHub REST API and compares the cited
    title, authors and year against that metadata.
    """

    # Accepted GitHub URL shapes. Group 1 is the owner, group 2 the repository.
    _REPO_URL_PATTERNS = (
        # Web URL: optional "www.", optional ".git" suffix, then an optional
        # trailing path, query string or fragment that is ignored.
        re.compile(
            r'https?://(?:www\.)?github\.com/([^/?#]+)/([^/?#]+?)(?:\.git)?(?:[/?#].*)?$',
            re.IGNORECASE,
        ),
        # SSH clone URL.
        re.compile(r'git@github\.com:([^/]+)/([^/]+?)(?:\.git)?$', re.IGNORECASE),
    )

    # Characters stripped from the edges of words before comparing titles.
    _WORD_PUNCTUATION = '.,;:()[]'

    # Terms whose presence in both the cited title and the repository
    # description is treated as evidence of the same project.
    _KEY_TERMS = frozenset({
        'tensorflow', 'pytorch', 'transformers', 'autogen', 'machine learning',
        'deep learning', 'neural', 'ai', 'llm',
    })

    # Known organisations (matched against the owner login) and the ways they
    # are commonly written or abbreviated in citations.
    _ORG_ABBREVIATIONS = {
        'huggingface': ['h.f.', 'hf', 'hugging', 'h. f.', 'hugging face'],
        'microsoft': ['m.', 'ms', 'msft', 'm. a. team', 'microsoft'],
        'google': ['g.', 'g. b. team', 'google', 'brain team', 'alphabet'],
        'tensorflow': ['t.', 't. contributors', 'tensorflow', 'g. b. team'],
        'pytorch': ['pytorch team', 'facebook', 'meta'],
        'openai': ['openai', 'o.a.', 'open ai'],
    }

    def __init__(self, github_token: Optional[str] = None):
        """
        Initialize GitHub checker

        Args:
            github_token: Optional GitHub API token for higher rate limits
        """
        self.github_token = github_token
        self.base_headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'refchecker-academic-tool'
        }
        if github_token:
            self.base_headers['Authorization'] = f'token {github_token}'

    def extract_github_repo_info(self, url: str) -> Optional[Tuple[str, str]]:
        """
        Extract owner and repository name from GitHub URL

        Supports ``http(s)://[www.]github.com/<owner>/<repo>`` (with an
        optional ``.git`` suffix and any trailing path, query string or
        fragment) and ``git@github.com:<owner>/<repo>[.git]``.

        Args:
            url: GitHub URL

        Returns:
            Tuple of (owner, repo) or None if not a valid GitHub URL
        """
        if not url:
            return None

        url = url.strip().rstrip('/')

        for pattern in self._REPO_URL_PATTERNS:
            match = pattern.match(url)
            if not match:
                continue
            owner, repo = match.groups()
            # Only a *trailing* ".git" is a clone suffix. Removing ".git"
            # anywhere in the name would corrupt repositories such as
            # "user.github.io".
            if len(repo) > 4 and repo.lower().endswith('.git'):
                repo = repo[:-4]
            return owner, repo

        return None

    def is_github_url(self, url: str) -> bool:
        """
        Check if URL is a GitHub repository URL

        Args:
            url: URL to check

        Returns:
            True if it's a GitHub repository URL
        """
        return self.extract_github_repo_info(url) is not None

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a GitHub repository reference

        The repository URL is taken from ``reference['url']`` or, failing
        that, from a whitespace-separated token of ``reference['venue']``.
        Network and API failures are never raised; they are reported as a
        single ``unverified`` entry in the returned error list.

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, paper_url) where:
            - verified_data: Dict with verified repository information or None
            - errors: List of error/warning dictionaries
            - paper_url: The GitHub repository URL
        """
        logger.debug(f"Verifying GitHub reference: {reference.get('title', 'Untitled')}")

        # Extract GitHub URL from reference
        github_url = None
        if reference.get('url') and self.is_github_url(reference['url']):
            github_url = reference['url']
        elif reference.get('venue') and 'github.com' in reference.get('venue', ''):
            # Sometimes GitHub URLs are in the venue field
            for part in reference['venue'].split():
                if self.is_github_url(part):
                    github_url = part
                    break

        if not github_url:
            logger.debug("No GitHub URL found in reference")
            return None, [], None

        # Extract repository information
        repo_info = self.extract_github_repo_info(github_url)
        if not repo_info:
            logger.debug(f"Could not parse GitHub URL: {github_url}")
            return None, [{"error_type": "unverified", "error_details": "Invalid GitHub URL format"}], github_url

        owner, repo = repo_info
        api_url = f'https://api.github.com/repos/{owner}/{repo}'

        try:
            # Make API request
            response = requests.get(api_url, headers=self.base_headers, timeout=10)

            if response.status_code == 404:
                logger.debug(f"GitHub repository not found: {owner}/{repo}")
                return None, [{"error_type": "unverified", "error_details": "Repository not found or is private"}], github_url
            elif response.status_code in (403, 429):
                # GitHub signals an exceeded rate limit with either 403 or 429.
                logger.warning("GitHub API rate limit exceeded")
                return None, [{"error_type": "unverified", "error_details": "GitHub API rate limit exceeded"}], github_url
            elif response.status_code != 200:
                logger.warning(f"GitHub API error {response.status_code} for {owner}/{repo}")
                return None, [{"error_type": "unverified", "error_details": f"GitHub API error: {response.status_code}"}], github_url

            repo_data = response.json()

            # Extract repository metadata. JSON nulls are normalised here so
            # a missing owner/description does not abort the verification.
            owner_data = repo_data.get('owner') or {}
            license_data = repo_data.get('license') or {}
            actual_name = repo_data.get('name') or ''
            actual_description = repo_data.get('description') or ''
            actual_owner = owner_data.get('login') or ''
            actual_owner_name = owner_data.get('name') or actual_owner
            created_at = repo_data.get('created_at') or ''
            archived = repo_data.get('archived', False)

            # Parse creation year (first component of the created_at timestamp)
            creation_year = None
            if created_at:
                try:
                    creation_year = int(created_at.split('-')[0])
                except (ValueError, IndexError):
                    pass

            # Create verified data structure
            verified_data = {
                'title': actual_description if actual_description else actual_name,
                'authors': [actual_owner_name] if actual_owner_name else [actual_owner],
                'year': creation_year,
                'venue': 'GitHub Repository',
                'url': github_url,
                'github_metadata': {
                    'name': actual_name,
                    'description': actual_description,
                    'owner': actual_owner,
                    'owner_name': actual_owner_name,
                    'created_year': creation_year,
                    'stars': repo_data.get('stargazers_count', 0),
                    'language': repo_data.get('language', ''),
                    'license': license_data.get('name', ''),
                    'archived': archived
                }
            }

            errors = []

            # Verify title (a missing or None title is simply not checked)
            cited_title = (reference.get('title') or '').strip()
            if cited_title and not self._check_title_match(cited_title, actual_name, actual_description):
                from refchecker.utils.error_utils import format_title_mismatch
                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
                clean_cited_title = strip_latex_commands(cited_title)
                details = format_title_mismatch(clean_cited_title, actual_name)
                if actual_description:
                    snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
                    details += f" ({snippet})"
                errors.append({
                    "warning_type": "title",
                    "warning_details": details
                })

            # Verify authors
            cited_authors = reference.get('authors', [])
            if cited_authors:
                author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
                if not self._check_author_match(author_str, actual_owner, actual_owner_name):
                    from refchecker.utils.error_utils import format_three_line_mismatch
                    right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
                    errors.append({
                        "warning_type": "author",
                        "warning_details": format_three_line_mismatch("Author mismatch", author_str, right)
                    })

            # Verify year: only a citation dated *before* the repository was
            # created is flagged; a later year is accepted.
            cited_year = reference.get('year')
            if cited_year and creation_year:
                try:
                    cited_year_int = int(cited_year)
                    if cited_year_int < creation_year:
                        from refchecker.utils.error_utils import format_year_mismatch
                        errors.append({
                            "warning_type": "year",
                            "warning_details": format_year_mismatch(cited_year, creation_year),
                            "ref_year_correct": str(creation_year)
                        })
                except (ValueError, TypeError):
                    # Non-numeric cited year: nothing to compare against.
                    pass

            # Add notes for archived repositories
            if archived:
                errors.append({
                    "warning_type": "status",
                    "warning_details": "Repository is archived (no longer actively maintained)"
                })

            logger.debug(f"GitHub verification successful for {owner}/{repo}")
            return verified_data, errors, github_url

        except requests.exceptions.RequestException as e:
            logger.error(f"Network error accessing GitHub API for {owner}/{repo}: {e}")
            return None, [{"error_type": "unverified", "error_details": f"Network error: {str(e)}"}], github_url
        except Exception as e:
            # Deliberate catch-all: a verification failure must not abort the
            # caller's processing of the remaining references.
            logger.error(f"Unexpected error verifying GitHub repository {owner}/{repo}: {e}")
            return None, [{"error_type": "unverified", "error_details": f"Unexpected error: {str(e)}"}], github_url

    def _check_title_match(self, cited_title: str, repo_name: str, repo_description: str) -> bool:
        """
        Check if cited title matches repository name or description

        A match is any of: one of title/name being a substring of the other,
        at least two shared words longer than three characters between the
        title and the description, or a shared key technical term.

        Args:
            cited_title: Title as cited in reference
            repo_name: Repository name
            repo_description: Repository description

        Returns:
            True if title matches reasonably well
        """
        cited_lower = cited_title.lower().strip()
        repo_name_lower = repo_name.lower().strip()
        repo_desc_lower = repo_description.lower().strip() if repo_description else ''

        # Direct name match
        if cited_lower in repo_name_lower or repo_name_lower in cited_lower:
            return True

        # Check against description if available
        if repo_desc_lower:
            cited_words = self._significant_words(cited_lower)
            desc_words = self._significant_words(repo_desc_lower)

            # Check for significant word overlap (at least 2 words)
            if len(cited_words & desc_words) >= 2:
                return True

            # Check for key technical terms that indicate the same project
            if any(term in cited_lower and term in repo_desc_lower for term in self._KEY_TERMS):
                return True

        return False

    def _significant_words(self, text: str) -> set:
        """Return the words of *text* longer than 3 characters, edge punctuation removed."""
        stripped = (word.strip(self._WORD_PUNCTUATION) for word in text.split())
        return {word for word in stripped if len(word) > 3}

    def _check_author_match(self, cited_authors: str, repo_owner: str, repo_owner_name: str) -> bool:
        """
        Check if cited authors match repository owner

        Tries, in order: substring match against the owner login or display
        name, known organisation abbreviations, a "<owner> team" pattern and
        initials derived from a multi-word owner login.

        Args:
            cited_authors: Authors as cited in reference
            repo_owner: Repository owner username
            repo_owner_name: Repository owner display name

        Returns:
            True if authors match reasonably well
        """
        cited_lower = cited_authors.lower().strip()
        owner_lower = repo_owner.lower().strip()
        owner_name_lower = repo_owner_name.lower().strip() if repo_owner_name else ''

        # Direct matches
        if cited_lower in owner_lower or owner_lower in cited_lower:
            return True
        if owner_name_lower and (cited_lower in owner_name_lower or owner_name_lower in cited_lower):
            return True

        # Handle common abbreviation patterns
        for org, abbrevs in self._ORG_ABBREVIATIONS.items():
            if org in owner_lower and any(abbrev in cited_lower for abbrev in abbrevs):
                return True

        # Check for team patterns
        if 'team' in cited_lower and owner_lower in cited_lower:
            return True

        # Check initials against organization name
        if len(repo_owner) >= 2:
            # Extract words from organization name ("_" and "-" act as separators)
            org_words = re.sub(r'[_-]', ' ', repo_owner).split()
            if len(org_words) >= 2:
                initials = ''.join(word[0] for word in org_words).lower()
                initials_variants = [
                    initials,                        # "hf"
                    '. '.join(initials) + '.',       # "h. f."
                    ' '.join(initials),              # "h f"
                ]
                if any(variant in cited_lower for variant in initials_variants):
                    return True

        return False