academic-refchecker 2.0.18__tar.gz → 2.0.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.19/PKG-INFO +366 -0
- academic_refchecker-2.0.19/README.md +294 -0
- academic_refchecker-2.0.19/academic_refchecker.egg-info/PKG-INFO +366 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/__version__.py +1 -1
- academic_refchecker-2.0.18/PKG-INFO +0 -877
- academic_refchecker-2.0.18/README.md +0 -805
- academic_refchecker-2.0.18/academic_refchecker.egg-info/PKG-INFO +0 -877
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/LICENSE +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/MANIFEST.in +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/__main__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/cli.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/concurrency.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/database.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/main.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/models.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/refchecker_wrapper.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/assets/index-2P6L_39v.css +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/assets/index-B92lKsA8.js +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/assets/index-BuguAhjS.css +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/assets/index-DMZJNrR0.js +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/assets/index-hk21nqxR.js +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/favicon.svg +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/index.html +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/static/vite.svg +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/thumbnail.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/backend/websocket_manager.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/pyproject.toml +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/requirements.txt +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/scripts/download_db.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/scripts/run_tests.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/setup.cfg +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/__main__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/arxiv_citation.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/crossref.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/github_checker.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/openalex.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/openreview_checker.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/checkers/webpage_checker.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/config/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/config/logging.conf +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/config/settings.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/core/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/core/db_connection_pool.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/core/parallel_processor.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/core/refchecker.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/database/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/llm/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/llm/base.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/llm/providers.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/scripts/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/services/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/services/pdf_processor.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/__init__.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/arxiv_rate_limiter.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/arxiv_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/author_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/biblatex_parser.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/bibliography_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/bibtex_parser.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/config_validator.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/db_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/doi_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/error_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/mock_objects.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/text_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/unicode_utils.py +0 -0
- {academic_refchecker-2.0.18 → academic_refchecker-2.0.19}/src/refchecker/utils/url_utils.py +0 -0
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: academic-refchecker
|
|
3
|
+
Version: 2.0.19
|
|
4
|
+
Summary: A comprehensive tool for validating reference accuracy in academic papers
|
|
5
|
+
Author-email: Mark Russinovich <markrussinovich@hotmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/markrussinovich/refchecker
|
|
8
|
+
Project-URL: Repository, https://github.com/markrussinovich/refchecker
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/markrussinovich/refchecker/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.7
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: requests>=2.25.0
|
|
24
|
+
Requires-Dist: beautifulsoup4>=4.9.0
|
|
25
|
+
Requires-Dist: pypdf>=5.0.0
|
|
26
|
+
Requires-Dist: arxiv>=1.4.0
|
|
27
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
28
|
+
Requires-Dist: tqdm>=4.60.0
|
|
29
|
+
Requires-Dist: colorama>=0.4.4
|
|
30
|
+
Requires-Dist: fuzzywuzzy>=0.18.0
|
|
31
|
+
Requires-Dist: python-Levenshtein>=0.12.0
|
|
32
|
+
Requires-Dist: pandas<2.4.0,>=1.3.0
|
|
33
|
+
Requires-Dist: numpy<2.0.0,>=1.22.4
|
|
34
|
+
Requires-Dist: pdfplumber>=0.6.0
|
|
35
|
+
Requires-Dist: bibtexparser>=1.4.0
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest>=6.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
|
|
39
|
+
Requires-Dist: black>=21.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: isort>=5.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: flake8>=3.9.0; extra == "dev"
|
|
42
|
+
Requires-Dist: mypy>=0.910; extra == "dev"
|
|
43
|
+
Provides-Extra: docs
|
|
44
|
+
Requires-Dist: sphinx>=4.0.0; extra == "docs"
|
|
45
|
+
Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == "docs"
|
|
46
|
+
Provides-Extra: llm
|
|
47
|
+
Requires-Dist: openai>=1.0.0; extra == "llm"
|
|
48
|
+
Requires-Dist: anthropic>=0.7.0; extra == "llm"
|
|
49
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == "llm"
|
|
50
|
+
Provides-Extra: optional
|
|
51
|
+
Requires-Dist: lxml>=4.6.0; extra == "optional"
|
|
52
|
+
Requires-Dist: selenium>=4.0.0; extra == "optional"
|
|
53
|
+
Requires-Dist: pikepdf>=5.0.0; extra == "optional"
|
|
54
|
+
Requires-Dist: nltk>=3.6.0; extra == "optional"
|
|
55
|
+
Requires-Dist: scikit-learn>=1.0.0; extra == "optional"
|
|
56
|
+
Requires-Dist: joblib>=1.1.0; extra == "optional"
|
|
57
|
+
Provides-Extra: vllm
|
|
58
|
+
Requires-Dist: vllm>=0.3.0; extra == "vllm"
|
|
59
|
+
Requires-Dist: huggingface_hub>=0.17.0; extra == "vllm"
|
|
60
|
+
Requires-Dist: torch>=2.0.0; extra == "vllm"
|
|
61
|
+
Provides-Extra: webui
|
|
62
|
+
Requires-Dist: fastapi>=0.100.0; extra == "webui"
|
|
63
|
+
Requires-Dist: uvicorn[standard]>=0.22.0; extra == "webui"
|
|
64
|
+
Requires-Dist: pydantic>=2.0.0; extra == "webui"
|
|
65
|
+
Requires-Dist: aiosqlite>=0.19.0; extra == "webui"
|
|
66
|
+
Requires-Dist: httpx>=0.24.0; extra == "webui"
|
|
67
|
+
Requires-Dist: cryptography>=42.0.0; extra == "webui"
|
|
68
|
+
Requires-Dist: pymupdf>=1.23.0; extra == "webui"
|
|
69
|
+
Requires-Dist: Pillow>=9.0.0; extra == "webui"
|
|
70
|
+
Requires-Dist: python-multipart>=0.0.6; extra == "webui"
|
|
71
|
+
Dynamic: license-file
|
|
72
|
+
|
|
73
|
+
# RefChecker
|
|
74
|
+
|
|
75
|
+
Validate reference accuracy in academic papers. Useful for authors checking bibliographies and reviewers ensuring citations are authentic. RefChecker verifies citations against Semantic Scholar, OpenAlex, and CrossRef.
|
|
76
|
+
|
|
77
|
+
*Built by Mark Russinovich with AI assistants (Cursor, GitHub Copilot, Claude Code). [Watch the deep dive video](https://www.youtube.com/watch?v=n929Alz-fjo).*
|
|
78
|
+
|
|
79
|
+
## Contents
|
|
80
|
+
|
|
81
|
+
- [Quick Start](#quick-start)
|
|
82
|
+
- [Features](#features)
|
|
83
|
+
- [Sample Output](#sample-output)
|
|
84
|
+
- [Install](#install)
|
|
85
|
+
- [Run](#run)
|
|
86
|
+
- [Output](#output)
|
|
87
|
+
- [Configure](#configure)
|
|
88
|
+
- [Docker](#docker)
|
|
89
|
+
- [Local Database](#local-database)
|
|
90
|
+
- [Testing](#testing)
|
|
91
|
+
- [License](#license)
|
|
92
|
+
|
|
93
|
+
## Quick Start
|
|
94
|
+
|
|
95
|
+
### Web UI (Docker)
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
docker run -p 8000:8000 ghcr.io/markrussinovich/refchecker:latest
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Open **http://localhost:8000** in your browser.
|
|
102
|
+
|
|
103
|
+
### Web UI (pip)
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install "academic-refchecker[llm,webui]"
|
|
107
|
+
refchecker-webui
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### CLI (pip)
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
pip install "academic-refchecker[llm]"
|
|
114
|
+
academic-refchecker --paper 1706.03762
|
|
115
|
+
academic-refchecker --paper /path/to/paper.pdf
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
> **Performance**: Set `SEMANTIC_SCHOLAR_API_KEY` to verify each reference in about 1-2s, compared with 5-10s without an API key.
|
|
119
|
+
|
|
120
|
+
## Features
|
|
121
|
+
|
|
122
|
+
- **Multiple formats**: ArXiv papers, PDFs, LaTeX, BibTeX, text files
|
|
123
|
+
- **LLM-powered extraction**: OpenAI, Anthropic, Google, Azure, vLLM
|
|
124
|
+
- **Multi-source verification**: Semantic Scholar, OpenAlex, CrossRef
|
|
125
|
+
- **Comprehensive checks**: Titles, authors, years, venues, DOIs, ArXiv IDs
|
|
126
|
+
- **Smart matching**: Handles formatting variations (BERT vs B-ERT, pre-trained vs pretrained)
|
|
127
|
+
- **Detailed reports**: Errors, warnings, corrected references
|
|
128
|
+
|
|
129
|
+
## Sample Output
|
|
130
|
+
|
|
131
|
+
**Web UI**
|
|
132
|
+
|
|
133
|
+

|
|
134
|
+
|
|
135
|
+
**CLI**
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
📄 Processing: Attention Is All You Need
|
|
139
|
+
URL: https://arxiv.org/abs/1706.03762
|
|
140
|
+
|
|
141
|
+
[1/45] Neural machine translation in linear time
|
|
142
|
+
Nal Kalchbrenner et al. | 2017
|
|
143
|
+
⚠️ Warning: Year mismatch: cited '2017', actual '2016'
|
|
144
|
+
|
|
145
|
+
[2/45] Effective approaches to attention-based neural machine translation
|
|
146
|
+
Minh-Thang Luong et al. | 2015
|
|
147
|
+
❌ Error: First author mismatch: cited 'Minh-Thang Luong', actual 'Thang Luong'
|
|
148
|
+
|
|
149
|
+
[3/45] Deep Residual Learning for Image Recognition
|
|
150
|
+
Kaiming He et al. | 2016 | https://doi.org/10.1109/CVPR.2016.91
|
|
151
|
+
❌ Error: DOI mismatch: cited '10.1109/CVPR.2016.91', actual '10.1109/CVPR.2016.90'
|
|
152
|
+
|
|
153
|
+
============================================================
|
|
154
|
+
📋 SUMMARY
|
|
155
|
+
📚 Total references processed: 68
|
|
156
|
+
❌ Total errors: 55 ⚠️ Total warnings: 16 ❓ Unverified: 15
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Install
|
|
160
|
+
|
|
161
|
+
### PyPI (Recommended)
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
pip install "academic-refchecker[llm,webui]" # Web UI + CLI + LLM providers
|
|
165
|
+
pip install academic-refchecker # CLI only
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### From Source (Development)
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
git clone https://github.com/markrussinovich/refchecker.git && cd refchecker
|
|
172
|
+
python -m venv .venv && source .venv/bin/activate
|
|
173
|
+
pip install -e ".[llm,webui]"
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**Requirements:** Python 3.7+ (3.10+ recommended). Node.js 18+ is only needed for Web UI development.
|
|
177
|
+
|
|
178
|
+
## Run
|
|
179
|
+
|
|
180
|
+
### Web UI
|
|
181
|
+
|
|
182
|
+
The Web UI shows live progress and check history, and lets you export results (including corrected values).
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
refchecker-webui --port 8000
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
#### Development (frontend)
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
cd web-ui
|
|
192
|
+
npm install
|
|
193
|
+
npm start
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Open **http://localhost:5173**.
|
|
197
|
+
|
|
198
|
+
Alternative (separate servers):
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# Terminal 1
|
|
202
|
+
python -m uvicorn backend.main:app --reload --port 8000
|
|
203
|
+
|
|
204
|
+
# Terminal 2
|
|
205
|
+
cd web-ui
|
|
206
|
+
npm run dev
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Verify the backend is running:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
curl http://localhost:8000/
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
Web UI documentation: see [web-ui/README.md](web-ui/README.md).
|
|
216
|
+
|
|
217
|
+
### CLI
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
# ArXiv (ID or URL)
|
|
221
|
+
academic-refchecker --paper 1706.03762
|
|
222
|
+
academic-refchecker --paper https://arxiv.org/abs/1706.03762
|
|
223
|
+
|
|
224
|
+
# Local files
|
|
225
|
+
academic-refchecker --paper paper.pdf
|
|
226
|
+
academic-refchecker --paper paper.tex
|
|
227
|
+
academic-refchecker --paper paper.txt
|
|
228
|
+
academic-refchecker --paper refs.bib
|
|
229
|
+
|
|
230
|
+
# Faster/offline verification (local DB)
|
|
231
|
+
academic-refchecker --paper paper.pdf --db-path semantic_scholar_db/semantic_scholar.db
|
|
232
|
+
|
|
233
|
+
# Save results
|
|
234
|
+
academic-refchecker --paper 1706.03762 --output-file errors.txt
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Output
|
|
238
|
+
|
|
239
|
+
RefChecker reports these result types:
|
|
240
|
+
|
|
241
|
+
| Type | Description | Examples |
|
|
242
|
+
|------|-------------|----------|
|
|
243
|
+
| ❌ **Error** | Critical issues needing correction | Author/title/DOI mismatches, incorrect ArXiv IDs |
|
|
244
|
+
| ⚠️ **Warning** | Minor issues to review | Year differences, venue variations |
|
|
245
|
+
| ℹ️ **Suggestion** | Recommended improvements | Add missing ArXiv/DOI URLs, small metadata fixes |
|
|
246
|
+
| ❓ **Unverified** | Could not verify against any source | Rare publications, preprints |
|
|
247
|
+
|
|
248
|
+
Verified references include discovered URLs (Semantic Scholar, ArXiv, DOI). Suggestions are non-blocking improvements.
|
|
249
|
+
|
|
250
|
+
<details>
|
|
251
|
+
<summary>Detailed examples</summary>
|
|
252
|
+
|
|
253
|
+
```
|
|
254
|
+
❌ Error: First author mismatch: cited 'T. Xie', actual 'Zhao Xu'
|
|
255
|
+
❌ Error: DOI mismatch: cited '10.5555/3295222.3295349', actual '10.48550/arXiv.1706.03762'
|
|
256
|
+
⚠️ Warning: Year mismatch: cited '2024', actual '2023'
|
|
257
|
+
ℹ️ Suggestion: Add ArXiv URL https://arxiv.org/abs/1706.03762
|
|
258
|
+
❓ Could not verify: Llama guard (M. A. Research, 2024)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
</details>
|
|
262
|
+
|
|
263
|
+
## Configure
|
|
264
|
+
|
|
265
|
+
### LLM
|
|
266
|
+
|
|
267
|
+
LLM-powered extraction improves accuracy with complex bibliographies. Claude Sonnet 4 performs best; GPT-4o may hallucinate DOIs.
|
|
268
|
+
|
|
269
|
+
| Provider | Env Variable | Example Model |
|
|
270
|
+
|----------|--------------|---------------|
|
|
271
|
+
| Anthropic | `ANTHROPIC_API_KEY` | `claude-sonnet-4-20250514` |
|
|
272
|
+
| OpenAI | `OPENAI_API_KEY` | `gpt-4o` |
|
|
273
|
+
| Google | `GOOGLE_API_KEY` | `gemini-2.5-flash` |
|
|
274
|
+
| Azure | `AZURE_OPENAI_API_KEY` | `gpt-4` |
|
|
275
|
+
| vLLM | (local) | `meta-llama/Llama-3.1-8B-Instruct` |
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
export ANTHROPIC_API_KEY=your_key
|
|
279
|
+
academic-refchecker --paper 1706.03762 --llm-provider anthropic
|
|
280
|
+
|
|
281
|
+
academic-refchecker --paper paper.pdf --llm-provider openai --llm-model gpt-4o
|
|
282
|
+
academic-refchecker --paper paper.pdf --llm-provider vllm --llm-model meta-llama/Llama-3.1-8B-Instruct
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
#### Local models (vLLM)
|
|
286
|
+
|
|
287
|
+
There is no separate “GPU Docker image”. For local inference, install the vLLM extra and run an OpenAI-compatible vLLM server:
|
|
288
|
+
|
|
289
|
+
```bash
|
|
290
|
+
pip install "academic-refchecker[vllm]"
|
|
291
|
+
python scripts/start_vllm_server.py --model meta-llama/Llama-3.1-8B-Instruct --port 8001
|
|
292
|
+
academic-refchecker --paper paper.pdf --llm-provider vllm --llm-endpoint http://localhost:8001/v1
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Command Line
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
--paper PAPER # ArXiv ID, URL, or file path
|
|
299
|
+
--llm-provider PROVIDER # openai, anthropic, google, azure, vllm
|
|
300
|
+
--llm-model MODEL # Override default model
|
|
301
|
+
--db-path PATH # Local database for offline verification
|
|
302
|
+
--output-file [PATH] # Save results (default: reference_errors.txt)
|
|
303
|
+
--debug # Verbose output
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### Environment Variables
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
# LLM
|
|
310
|
+
export REFCHECKER_LLM_PROVIDER=anthropic
|
|
311
|
+
export ANTHROPIC_API_KEY=your_key # Also: OPENAI_API_KEY, GOOGLE_API_KEY, AZURE_OPENAI_API_KEY
|
|
312
|
+
|
|
313
|
+
# Performance
|
|
314
|
+
export SEMANTIC_SCHOLAR_API_KEY=your_key # Higher rate limits / faster verification
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Docker
|
|
318
|
+
|
|
319
|
+
Pre-built images are published to GitHub Container Registry.
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
docker run -p 8000:8000 \
|
|
323
|
+
-e ANTHROPIC_API_KEY=your_key \
|
|
324
|
+
-v refchecker-data:/app/data \
|
|
325
|
+
ghcr.io/markrussinovich/refchecker:latest
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
Docker Compose:
|
|
329
|
+
|
|
330
|
+
```bash
|
|
331
|
+
git clone https://github.com/markrussinovich/refchecker.git && cd refchecker
|
|
332
|
+
cp .env.example .env # Add your API keys
|
|
333
|
+
docker compose up -d
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
| Tag | Description | Arch | Size |
|
|
337
|
+
|-----|-------------|------|------|
|
|
338
|
+
| `latest` | RefChecker (Web UI + API-based LLM support) | amd64, arm64 | ~800MB |
|
|
339
|
+
|
|
340
|
+
## Local Database
|
|
341
|
+
|
|
342
|
+
For offline verification or faster processing:
|
|
343
|
+
|
|
344
|
+
```bash
|
|
345
|
+
python scripts/download_db.py \
|
|
346
|
+
--field "computer science" \
|
|
347
|
+
--start-year 2020 --end-year 2024
|
|
348
|
+
|
|
349
|
+
academic-refchecker --paper paper.pdf --db-path semantic_scholar_db/semantic_scholar.db
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
## Testing
|
|
353
|
+
|
|
354
|
+
490+ tests covering unit, integration, and end-to-end scenarios.
|
|
355
|
+
|
|
356
|
+
```bash
|
|
357
|
+
pytest tests/ # All tests
|
|
358
|
+
pytest tests/unit/ # Unit only
|
|
359
|
+
pytest --cov=src tests/ # With coverage
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
See [tests/README.md](tests/README.md) for details.
|
|
363
|
+
|
|
364
|
+
## License
|
|
365
|
+
|
|
366
|
+
MIT License - see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
# RefChecker
|
|
2
|
+
|
|
3
|
+
Validate reference accuracy in academic papers. Useful for authors checking bibliographies and reviewers ensuring citations are authentic. RefChecker verifies citations against Semantic Scholar, OpenAlex, and CrossRef.
|
|
4
|
+
|
|
5
|
+
*Built by Mark Russinovich with AI assistants (Cursor, GitHub Copilot, Claude Code). [Watch the deep dive video](https://www.youtube.com/watch?v=n929Alz-fjo).*
|
|
6
|
+
|
|
7
|
+
## Contents
|
|
8
|
+
|
|
9
|
+
- [Quick Start](#quick-start)
|
|
10
|
+
- [Features](#features)
|
|
11
|
+
- [Sample Output](#sample-output)
|
|
12
|
+
- [Install](#install)
|
|
13
|
+
- [Run](#run)
|
|
14
|
+
- [Output](#output)
|
|
15
|
+
- [Configure](#configure)
|
|
16
|
+
- [Docker](#docker)
|
|
17
|
+
- [Local Database](#local-database)
|
|
18
|
+
- [Testing](#testing)
|
|
19
|
+
- [License](#license)
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
### Web UI (Docker)
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
docker run -p 8000:8000 ghcr.io/markrussinovich/refchecker:latest
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Open **http://localhost:8000** in your browser.
|
|
30
|
+
|
|
31
|
+
### Web UI (pip)
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install "academic-refchecker[llm,webui]"
|
|
35
|
+
refchecker-webui
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### CLI (pip)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install "academic-refchecker[llm]"
|
|
42
|
+
academic-refchecker --paper 1706.03762
|
|
43
|
+
academic-refchecker --paper /path/to/paper.pdf
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
> **Performance**: Set `SEMANTIC_SCHOLAR_API_KEY` to verify each reference in about 1-2s, compared with 5-10s without an API key.
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
- **Multiple formats**: ArXiv papers, PDFs, LaTeX, BibTeX, text files
|
|
51
|
+
- **LLM-powered extraction**: OpenAI, Anthropic, Google, Azure, vLLM
|
|
52
|
+
- **Multi-source verification**: Semantic Scholar, OpenAlex, CrossRef
|
|
53
|
+
- **Comprehensive checks**: Titles, authors, years, venues, DOIs, ArXiv IDs
|
|
54
|
+
- **Smart matching**: Handles formatting variations (BERT vs B-ERT, pre-trained vs pretrained)
|
|
55
|
+
- **Detailed reports**: Errors, warnings, corrected references
|
|
56
|
+
|
|
57
|
+
## Sample Output
|
|
58
|
+
|
|
59
|
+
**Web UI**
|
|
60
|
+
|
|
61
|
+

|
|
62
|
+
|
|
63
|
+
**CLI**
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
📄 Processing: Attention Is All You Need
|
|
67
|
+
URL: https://arxiv.org/abs/1706.03762
|
|
68
|
+
|
|
69
|
+
[1/45] Neural machine translation in linear time
|
|
70
|
+
Nal Kalchbrenner et al. | 2017
|
|
71
|
+
⚠️ Warning: Year mismatch: cited '2017', actual '2016'
|
|
72
|
+
|
|
73
|
+
[2/45] Effective approaches to attention-based neural machine translation
|
|
74
|
+
Minh-Thang Luong et al. | 2015
|
|
75
|
+
❌ Error: First author mismatch: cited 'Minh-Thang Luong', actual 'Thang Luong'
|
|
76
|
+
|
|
77
|
+
[3/45] Deep Residual Learning for Image Recognition
|
|
78
|
+
Kaiming He et al. | 2016 | https://doi.org/10.1109/CVPR.2016.91
|
|
79
|
+
❌ Error: DOI mismatch: cited '10.1109/CVPR.2016.91', actual '10.1109/CVPR.2016.90'
|
|
80
|
+
|
|
81
|
+
============================================================
|
|
82
|
+
📋 SUMMARY
|
|
83
|
+
📚 Total references processed: 68
|
|
84
|
+
❌ Total errors: 55 ⚠️ Total warnings: 16 ❓ Unverified: 15
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Install
|
|
88
|
+
|
|
89
|
+
### PyPI (Recommended)
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install "academic-refchecker[llm,webui]" # Web UI + CLI + LLM providers
|
|
93
|
+
pip install academic-refchecker # CLI only
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### From Source (Development)
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
git clone https://github.com/markrussinovich/refchecker.git && cd refchecker
|
|
100
|
+
python -m venv .venv && source .venv/bin/activate
|
|
101
|
+
pip install -e ".[llm,webui]"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Requirements:** Python 3.7+ (3.10+ recommended). Node.js 18+ is only needed for Web UI development.
|
|
105
|
+
|
|
106
|
+
## Run
|
|
107
|
+
|
|
108
|
+
### Web UI
|
|
109
|
+
|
|
110
|
+
The Web UI shows live progress and check history, and lets you export results (including corrected values).
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
refchecker-webui --port 8000
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
#### Development (frontend)
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
cd web-ui
|
|
120
|
+
npm install
|
|
121
|
+
npm start
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Open **http://localhost:5173**.
|
|
125
|
+
|
|
126
|
+
Alternative (separate servers):
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# Terminal 1
|
|
130
|
+
python -m uvicorn backend.main:app --reload --port 8000
|
|
131
|
+
|
|
132
|
+
# Terminal 2
|
|
133
|
+
cd web-ui
|
|
134
|
+
npm run dev
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Verify the backend is running:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
curl http://localhost:8000/
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Web UI documentation: see [web-ui/README.md](web-ui/README.md).
|
|
144
|
+
|
|
145
|
+
### CLI
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# ArXiv (ID or URL)
|
|
149
|
+
academic-refchecker --paper 1706.03762
|
|
150
|
+
academic-refchecker --paper https://arxiv.org/abs/1706.03762
|
|
151
|
+
|
|
152
|
+
# Local files
|
|
153
|
+
academic-refchecker --paper paper.pdf
|
|
154
|
+
academic-refchecker --paper paper.tex
|
|
155
|
+
academic-refchecker --paper paper.txt
|
|
156
|
+
academic-refchecker --paper refs.bib
|
|
157
|
+
|
|
158
|
+
# Faster/offline verification (local DB)
|
|
159
|
+
academic-refchecker --paper paper.pdf --db-path semantic_scholar_db/semantic_scholar.db
|
|
160
|
+
|
|
161
|
+
# Save results
|
|
162
|
+
academic-refchecker --paper 1706.03762 --output-file errors.txt
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Output
|
|
166
|
+
|
|
167
|
+
RefChecker reports these result types:
|
|
168
|
+
|
|
169
|
+
| Type | Description | Examples |
|
|
170
|
+
|------|-------------|----------|
|
|
171
|
+
| ❌ **Error** | Critical issues needing correction | Author/title/DOI mismatches, incorrect ArXiv IDs |
|
|
172
|
+
| ⚠️ **Warning** | Minor issues to review | Year differences, venue variations |
|
|
173
|
+
| ℹ️ **Suggestion** | Recommended improvements | Add missing ArXiv/DOI URLs, small metadata fixes |
|
|
174
|
+
| ❓ **Unverified** | Could not verify against any source | Rare publications, preprints |
|
|
175
|
+
|
|
176
|
+
Verified references include discovered URLs (Semantic Scholar, ArXiv, DOI). Suggestions are non-blocking improvements.
|
|
177
|
+
|
|
178
|
+
<details>
|
|
179
|
+
<summary>Detailed examples</summary>
|
|
180
|
+
|
|
181
|
+
```
|
|
182
|
+
❌ Error: First author mismatch: cited 'T. Xie', actual 'Zhao Xu'
|
|
183
|
+
❌ Error: DOI mismatch: cited '10.5555/3295222.3295349', actual '10.48550/arXiv.1706.03762'
|
|
184
|
+
⚠️ Warning: Year mismatch: cited '2024', actual '2023'
|
|
185
|
+
ℹ️ Suggestion: Add ArXiv URL https://arxiv.org/abs/1706.03762
|
|
186
|
+
❓ Could not verify: Llama guard (M. A. Research, 2024)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
</details>
|
|
190
|
+
|
|
191
|
+
## Configure
|
|
192
|
+
|
|
193
|
+
### LLM
|
|
194
|
+
|
|
195
|
+
LLM-powered extraction improves accuracy with complex bibliographies. Claude Sonnet 4 performs best; GPT-4o may hallucinate DOIs.
|
|
196
|
+
|
|
197
|
+
| Provider | Env Variable | Example Model |
|
|
198
|
+
|----------|--------------|---------------|
|
|
199
|
+
| Anthropic | `ANTHROPIC_API_KEY` | `claude-sonnet-4-20250514` |
|
|
200
|
+
| OpenAI | `OPENAI_API_KEY` | `gpt-4o` |
|
|
201
|
+
| Google | `GOOGLE_API_KEY` | `gemini-2.5-flash` |
|
|
202
|
+
| Azure | `AZURE_OPENAI_API_KEY` | `gpt-4` |
|
|
203
|
+
| vLLM | (local) | `meta-llama/Llama-3.1-8B-Instruct` |
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
export ANTHROPIC_API_KEY=your_key
|
|
207
|
+
academic-refchecker --paper 1706.03762 --llm-provider anthropic
|
|
208
|
+
|
|
209
|
+
academic-refchecker --paper paper.pdf --llm-provider openai --llm-model gpt-4o
|
|
210
|
+
academic-refchecker --paper paper.pdf --llm-provider vllm --llm-model meta-llama/Llama-3.1-8B-Instruct
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
#### Local models (vLLM)
|
|
214
|
+
|
|
215
|
+
There is no separate “GPU Docker image”. For local inference, install the vLLM extra and run an OpenAI-compatible vLLM server:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
pip install "academic-refchecker[vllm]"
|
|
219
|
+
python scripts/start_vllm_server.py --model meta-llama/Llama-3.1-8B-Instruct --port 8001
|
|
220
|
+
academic-refchecker --paper paper.pdf --llm-provider vllm --llm-endpoint http://localhost:8001/v1
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### Command Line
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
--paper PAPER # ArXiv ID, URL, or file path
|
|
227
|
+
--llm-provider PROVIDER # openai, anthropic, google, azure, vllm
|
|
228
|
+
--llm-model MODEL # Override default model
|
|
229
|
+
--db-path PATH # Local database for offline verification
|
|
230
|
+
--output-file [PATH] # Save results (default: reference_errors.txt)
|
|
231
|
+
--debug # Verbose output
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Environment Variables
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
# LLM
|
|
238
|
+
export REFCHECKER_LLM_PROVIDER=anthropic
|
|
239
|
+
export ANTHROPIC_API_KEY=your_key # Also: OPENAI_API_KEY, GOOGLE_API_KEY, AZURE_OPENAI_API_KEY
|
|
240
|
+
|
|
241
|
+
# Performance
|
|
242
|
+
export SEMANTIC_SCHOLAR_API_KEY=your_key # Higher rate limits / faster verification
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Docker
|
|
246
|
+
|
|
247
|
+
Pre-built images are published to GitHub Container Registry.
|
|
248
|
+
|
|
249
|
+
```bash
|
|
250
|
+
docker run -p 8000:8000 \
|
|
251
|
+
-e ANTHROPIC_API_KEY=your_key \
|
|
252
|
+
-v refchecker-data:/app/data \
|
|
253
|
+
ghcr.io/markrussinovich/refchecker:latest
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Docker Compose:
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
git clone https://github.com/markrussinovich/refchecker.git && cd refchecker
|
|
260
|
+
cp .env.example .env # Add your API keys
|
|
261
|
+
docker compose up -d
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
| Tag | Description | Arch | Size |
|
|
265
|
+
|-----|-------------|------|------|
|
|
266
|
+
| `latest` | RefChecker (Web UI + API-based LLM support) | amd64, arm64 | ~800MB |
|
|
267
|
+
|
|
268
|
+
## Local Database
|
|
269
|
+
|
|
270
|
+
For offline verification or faster processing:
|
|
271
|
+
|
|
272
|
+
```bash
|
|
273
|
+
python scripts/download_db.py \
|
|
274
|
+
--field "computer science" \
|
|
275
|
+
--start-year 2020 --end-year 2024
|
|
276
|
+
|
|
277
|
+
academic-refchecker --paper paper.pdf --db-path semantic_scholar_db/semantic_scholar.db
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## Testing
|
|
281
|
+
|
|
282
|
+
490+ tests covering unit, integration, and end-to-end scenarios.
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
pytest tests/ # All tests
|
|
286
|
+
pytest tests/unit/ # Unit only
|
|
287
|
+
pytest --cov=src tests/ # With coverage
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
See [tests/README.md](tests/README.md) for details.
|
|
291
|
+
|
|
292
|
+
## License
|
|
293
|
+
|
|
294
|
+
MIT License - see [LICENSE](LICENSE).
|