academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,738 @@
1
+ Metadata-Version: 2.4
2
+ Name: academic-refchecker
3
+ Version: 2.0.7
4
+ Summary: A comprehensive tool for validating reference accuracy in academic papers
5
+ Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/markrussinovich/refchecker
8
+ Project-URL: Repository, https://github.com/markrussinovich/refchecker
9
+ Project-URL: Bug Tracker, https://github.com/markrussinovich/refchecker/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.7
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Operating System :: OS Independent
20
+ Requires-Python: >=3.7
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: requests>=2.25.0
24
+ Requires-Dist: beautifulsoup4>=4.9.0
25
+ Requires-Dist: pypdf>=5.0.0
26
+ Requires-Dist: arxiv>=1.4.0
27
+ Requires-Dist: python-dateutil>=2.8.0
28
+ Requires-Dist: tqdm>=4.60.0
29
+ Requires-Dist: colorama>=0.4.4
30
+ Requires-Dist: fuzzywuzzy>=0.18.0
31
+ Requires-Dist: python-Levenshtein>=0.12.0
32
+ Requires-Dist: pandas<2.4.0,>=1.3.0
33
+ Requires-Dist: numpy<2.0.0,>=1.22.4
34
+ Requires-Dist: pdfplumber>=0.6.0
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=6.0.0; extra == "dev"
37
+ Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
38
+ Requires-Dist: black>=21.0.0; extra == "dev"
39
+ Requires-Dist: isort>=5.0.0; extra == "dev"
40
+ Requires-Dist: flake8>=3.9.0; extra == "dev"
41
+ Requires-Dist: mypy>=0.910; extra == "dev"
42
+ Provides-Extra: docs
43
+ Requires-Dist: sphinx>=4.0.0; extra == "docs"
44
+ Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == "docs"
45
+ Provides-Extra: llm
46
+ Requires-Dist: openai>=1.0.0; extra == "llm"
47
+ Requires-Dist: anthropic>=0.7.0; extra == "llm"
48
+ Requires-Dist: google-generativeai>=0.3.0; extra == "llm"
49
+ Provides-Extra: optional
50
+ Requires-Dist: lxml>=4.6.0; extra == "optional"
51
+ Requires-Dist: selenium>=4.0.0; extra == "optional"
52
+ Requires-Dist: pikepdf>=5.0.0; extra == "optional"
53
+ Requires-Dist: nltk>=3.6.0; extra == "optional"
54
+ Requires-Dist: scikit-learn>=1.0.0; extra == "optional"
55
+ Requires-Dist: joblib>=1.1.0; extra == "optional"
56
+ Provides-Extra: vllm
57
+ Requires-Dist: vllm>=0.3.0; extra == "vllm"
58
+ Requires-Dist: huggingface_hub>=0.17.0; extra == "vllm"
59
+ Requires-Dist: torch>=2.0.0; extra == "vllm"
60
+ Provides-Extra: webui
61
+ Requires-Dist: fastapi>=0.100.0; extra == "webui"
62
+ Requires-Dist: uvicorn[standard]>=0.22.0; extra == "webui"
63
+ Requires-Dist: pydantic>=2.0.0; extra == "webui"
64
+ Requires-Dist: aiosqlite>=0.19.0; extra == "webui"
65
+ Requires-Dist: httpx>=0.24.0; extra == "webui"
66
+ Requires-Dist: cryptography>=42.0.0; extra == "webui"
67
+ Requires-Dist: pymupdf>=1.23.0; extra == "webui"
68
+ Requires-Dist: Pillow>=9.0.0; extra == "webui"
69
+ Requires-Dist: python-multipart>=0.0.6; extra == "webui"
70
+ Dynamic: license-file
71
+
72
+ # 📚 Academic Paper Reference Checker
73
+
74
+ *Developed by Mark Russinovich with various AI assistants, including Cursor, GitHub Copilot and Claude Code*
75
+
76
+ A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.
77
+
78
+ ## 🎥 Project Deep Dive
79
+
80
+ Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
81
+
82
+ **[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
83
+
84
+ *This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
85
+
86
+ ## 📊 Sample Output
87
+
88
+ ```
89
+ 📄 Processing: Attention Is All You Need
90
+ URL: https://arxiv.org/abs/1706.03762
91
+
92
+ [1/45] Neural machine translation in linear time
93
+ Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, Koray Kavukcuoglu
94
+ 2017
95
+
96
+ Verified URL: https://www.semanticscholar.org/paper/5f4ac1ac7ca4b17d3db1b52d9aafd9e8b26c0d7
97
+ ArXiv URL: https://arxiv.org/abs/1610.10099
98
+ DOI URL: https://doi.org/10.48550/arxiv.1610.10099
99
+ ⚠️ Warning: Year mismatch:
100
+ cited: '2017'
101
+ actual: '2016'
102
+
103
+ [2/45] Effective approaches to attention-based neural machine translation
104
+ Minh-Thang Luong, Hieu Pham, Christopher D. Manning
105
+ 2015
106
+
107
+ Verified URL: https://www.semanticscholar.org/paper/93499a7c7f699b6630a86fad964536f9423bb6d0
108
+ ArXiv URL: https://arxiv.org/abs/1508.04025
109
+ DOI URL: https://doi.org/10.18653/v1/d15-1166
110
+ ❌ Error: First author mismatch:
111
+ cited: 'Minh-Thang Luong'
112
+ actual: 'Thang Luong'
113
+
114
+ [3/45] Deep Residual Learning for Image Recognition
115
+ Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
116
+ Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition
117
+ 2016
118
+ https://doi.org/10.1109/CVPR.2016.91
119
+
120
+ Verified URL: https://www.semanticscholar.org/paper/2c03df8b48bf3fa39054345bafabfeff15bfd11d
121
+ ArXiv URL: https://arxiv.org/abs/1512.03385
122
+ DOI URL: https://doi.org/10.1109/CVPR.2016.90
123
+ ❌ Error: DOI mismatch:
124
+ cited: '10.1109/CVPR.2016.91'
125
+ actual: '10.1109/CVPR.2016.90'
126
+
127
+ ============================================================
128
+ 📋 SUMMARY
129
+ ============================================================
130
+ 📚 Total references processed: 68
131
+ ❌ Total errors: 55
132
+ ⚠️ Total warnings: 16
133
+ ❓ References that couldn't be verified: 15
134
+ ```
135
+
136
+ ## 📋 Table of Contents
137
+
138
+ - [🎥 Project Deep Dive](#-project-deep-dive)
139
+ - [📊 Sample Output](#-sample-output)
140
+ - [🎯 Features](#-features)
141
+ - [🚀 Quick Start](#-quick-start)
142
+ - [🌐 Web UI](#-web-ui)
143
+ - [🤖 LLM-Enhanced Reference Extraction](#-llm-enhanced-reference-extraction)
144
+ - [📦 Installation](#-installation)
145
+ - [📖 Usage](#-usage)
146
+ - [📊 Output and Results](#-output-and-results)
147
+ - [⚙️ Configuration](#-configuration)
148
+ - [🗄️ Local Database Setup](#-local-database-setup)
149
+ - [🧪 Testing](#-testing)
150
+ - [📄 License](#-license)
151
+
152
+ ## 🎯 Features
153
+
154
+ - **📄 Multiple Input Formats**: Process ArXiv papers, local PDFs, LaTeX files, and text documents
155
+ - **🔍 Advanced Bibliography Detection**: Uses intelligent pattern matching to identify bibliography sections
156
+ - **🤖 LLM-Enhanced Reference Extraction**: Recommended AI-powered bibliography parsing with support for OpenAI, Anthropic, Google, Azure, and local vLLM
157
+ - **✅ Comprehensive Error Detection**: Identifies issues with titles, authors, years, venues, URLs, and DOIs
158
+ - **🔄 Multi-Tier Verification Sources**: Uses a prioritized check of Semantic Scholar, OpenAlex, and CrossRef with intelligent retry logic
159
+ - **🔗 Enhanced URL Discovery**: Automatically discovers and displays additional authoritative URLs (Semantic Scholar, ArXiv, DOI) obtained through verification
160
+ - **🧠 Smart Title Matching**: Advanced similarity algorithms handle common academic formatting variations (BERT vs B-ERT, pre-trained vs pretrained)
161
+ - **🏢 Venue Normalization**: Recognizes common journal and conference abbreviation patterns
162
+ - **📊 Detailed Reporting**: Generates comprehensive error reports with drop-in corrected references
163
+
164
+ ## 🚀 Quick Start
165
+
166
+ ### Check Your First Paper
167
+
168
+ 1. **Check a famous paper:**
169
+ ```bash
170
+ python run_refchecker.py --paper 1706.03762
171
+ ```
172
+
173
+ 2. **Check your own PDF:**
174
+ ```bash
175
+ python run_refchecker.py --paper /path/to/your/paper.pdf
176
+ ```
177
+
178
+ 3. **For faster processing with a local database** (see [Local Database Setup](#-local-database-setup)):
179
+ ```bash
180
+ python run_refchecker.py --paper 1706.03762 --db-path semantic_scholar_db/semantic_scholar.db
181
+ ```
182
+
183
+ > **⚡ Performance Tip**: Reference verification takes 5-10 seconds per reference without a Semantic Scholar API key due to rate limiting. With an API key, verification speeds up to 1-2 seconds per reference. Set the `SEMANTIC_SCHOLAR_API_KEY` environment variable or use `--semantic-scholar-api-key` for faster processing.
184
+
185
+ ## 🌐 Web UI
186
+
187
+ RefChecker also includes a modern web interface with real-time progress updates, check history, and export options.
188
+
189
+ ![RefChecker Web UI](assets/webui.png)
190
+
191
+ ### Option 1: Install from PyPI (Recommended)
192
+
193
+ The simplest way to run the Web UI is using the pip-installed package:
194
+
195
+ ```bash
196
+ # Install RefChecker with Web UI support
197
+ pip install academic-refchecker[llm,webui]
198
+
199
+ # Start the web server
200
+ refchecker-webui
201
+ ```
202
+
203
+ Then open **http://localhost:8000** in your browser.
204
+
205
+ The `refchecker-webui` command starts a complete web server with both the API backend and the pre-built frontend.
206
+
207
+ **Options:**
208
+ ```bash
209
+ refchecker-webui --port 8080 # Use a different port
210
+ refchecker-webui --host 0.0.0.0 # Allow external connections
211
+ ```
212
+
213
+ ### Option 2: Run from Cloned Repository (Development)
214
+
215
+ If you're developing or modifying the Web UI:
216
+
217
+ **Prerequisites:**
218
+ - **Python 3.8+** with dependencies installed
219
+ - **Node.js 18+** and npm
220
+
221
+ ```bash
222
+ # Clone the repository
223
+ git clone https://github.com/markrussinovich/refchecker.git
224
+ cd refchecker
225
+
226
+ # Install Python dependencies
227
+ pip install -e ".[llm,webui]"
228
+
229
+ # Install and run the frontend development server
230
+ cd web-ui
231
+ npm install # First time only
232
+ npm start # Starts both backend and frontend
233
+ ```
234
+
235
+ Then open **http://localhost:5173** in your browser.
236
+
237
+ **Alternative: Start Servers Separately**
238
+
239
+ *Terminal 1 - Backend:*
240
+ ```bash
241
+ python -m uvicorn backend.main:app --reload --port 8000
242
+ ```
243
+
244
+ *Terminal 2 - Frontend:*
245
+ ```bash
246
+ cd web-ui
247
+ npm run dev
248
+ ```
249
+
250
+ ### Features
251
+
252
+ - ✨ Real-time validation with live progress updates
253
+ - 📄 Support for ArXiv URLs and file uploads (PDF, LaTeX, text)
254
+ - 📊 Live statistics with filtering by status
255
+ - 📋 Export references as Markdown, plain text, or BibTeX (with corrected values)
256
+ - 📚 Persistent check history
257
+ - 🌓 Automatic dark/light mode
258
+
259
+ For complete Web UI documentation, see **[web-ui/README.md](web-ui/README.md)**.
260
+
261
+ ## 🤖 LLM-Enhanced Reference Extraction
262
+
263
+ RefChecker supports AI-powered bibliography parsing using Large Language Models (LLMs) for improved accuracy with complex citation formats. While models as small as Llama 3.1-8B are fairly reliable at reference extraction, they can struggle with non-standard bibliographies. GPT-4o frequently hallucinates DOIs while Sonnet 4 has shown the best performance on large, complex bibliographies.
264
+
265
+ ### Supported LLM Providers
266
+
267
+ - **OpenAI** e.g., GPT-4.1, o3
268
+ - **Anthropic** e.g., Claude Sonnet 4
269
+ - **Google** e.g., Gemini 2.5
270
+ - **Azure OpenAI** e.g., GPT-4o, o3
271
+ - **vLLM** e.g., Local Hugging Face models via OpenAI-compatible server
272
+
273
+ ### Quick LLM Setup
274
+
275
+ 1. **Using Environment Variables**:
276
+ ```bash
277
+ # Enable LLM with Anthropic Claude
278
+ export REFCHECKER_USE_LLM=true
279
+ export REFCHECKER_LLM_PROVIDER=anthropic
280
+ export ANTHROPIC_API_KEY=your_api_key_here
281
+
282
+ python run_refchecker.py --paper 1706.03762
283
+ ```
284
+
285
+ 2. **Using Command Line Arguments**:
286
+ ```bash
287
+ # Enable LLM with specific provider and model
288
+ python run_refchecker.py --paper 1706.03762 \
289
+ --llm-provider anthropic \
290
+ --llm-model claude-sonnet-4-20250514
291
+ ```
292
+ API keys are obtained from environment variables, or if not found, the tool will prompt you interactively to enter them securely.
293
+
294
+ ### LLM Examples
295
+
296
+ #### OpenAI GPT-4
297
+
298
+ With `OPENAI_API_KEY` environment variable:
299
+
300
+ ```bash
301
+ python run_refchecker.py --paper /path/to/paper.pdf \
302
+ --llm-provider openai \
303
+ --llm-model gpt-4o
304
+ ```
305
+
306
+ #### Anthropic Claude
307
+
308
+ With `ANTHROPIC_API_KEY` environment variable:
309
+
310
+ ```bash
311
+ python run_refchecker.py --paper https://arxiv.org/abs/1706.03762 \
312
+ --llm-provider anthropic \
313
+ --llm-model claude-sonnet-4-20250514
314
+ ```
315
+
316
+ #### Google Gemini
317
+
318
+ ```bash
319
+ python run_refchecker.py --paper paper.tex \
320
+ --llm-provider google \
321
+ --llm-model gemini-2.5-flash
322
+ ```
323
+
324
+ #### Azure OpenAI
325
+
326
+ ```bash
327
+ python run_refchecker.py --paper paper.txt \
328
+ --llm-provider azure \
329
+ --llm-model gpt-4 \
330
+ --llm-endpoint https://your-resource.openai.azure.com/
331
+ ```
332
+
333
+ #### vLLM (Local Models)
334
+
335
+ For running models locally:
336
+
337
+ ```bash
338
+ # Automatic Hugging Face model download with vLLM server launch
339
+ python run_refchecker.py --paper paper.pdf \
340
+ --llm-provider vllm \
341
+ --llm-model meta-llama/Llama-3.1-8B-Instruct
342
+ ```
343
+
344
+ You can debug vLLM server issues by running RefChecker with the `--debug` flag.
345
+
346
+ ## 📦 Installation
347
+
348
+ ### Prerequisites
349
+
350
+ - **Python 3.8+** (3.10+ recommended)
351
+ - **Node.js 18+** and npm (only required for Web UI)
352
+
353
+ ### Option 1: Install from PyPI (Recommended)
354
+
355
+ For the latest stable release with all features:
356
+
357
+ ```bash
358
+ pip install academic-refchecker[llm,webui]
359
+ ```
360
+
361
+ This installs RefChecker with:
362
+ - **llm**: Support for OpenAI, Anthropic, Google, Azure, and vLLM providers
363
+ - **webui**: Web interface dependencies (FastAPI, uvicorn, etc.)
364
+
365
+ For a minimal installation (CLI only, no LLM or Web UI):
366
+ ```bash
367
+ pip install academic-refchecker
368
+ ```
369
+
370
+ Other optional extras:
371
+ - **dev**: Development tools (pytest, black, flake8, mypy)
372
+ - **optional**: Enhanced features (lxml, selenium, pikepdf, nltk, scikit-learn)
373
+ - **vllm**: Local model inference with vLLM
374
+
375
+ ### Option 2: Install from Source
376
+
377
+ #### 1. Clone the Repository
378
+
379
+ ```bash
380
+ git clone https://github.com/markrussinovich/refchecker.git
381
+ cd refchecker
382
+ ```
383
+
384
+ #### 2. Create and Activate Virtual Environment (Recommended)
385
+
386
+ ```bash
387
+ python -m venv .venv
388
+ # On Windows:
389
+ .venv\Scripts\activate
390
+ # On macOS/Linux:
391
+ source .venv/bin/activate
392
+ ```
393
+
394
+ #### 3. Install Dependencies
395
+
396
+ ```bash
397
+ # Install all dependencies including LLM and Web UI support
398
+ pip install -e ".[llm,webui]"
399
+
400
+ # Or install from requirements.txt
401
+ pip install -r requirements.txt
402
+ ```
403
+
404
+ #### 4. (Optional) Install Additional Dependencies
405
+
406
+ For enhanced performance and LLM support, you can install optional dependencies:
407
+
408
+ ```bash
409
+ # For LLM providers
410
+ pip install openai # For OpenAI GPT models
411
+ pip install anthropic # For Anthropic Claude models
412
+ pip install google-generativeai # For Google Gemini models
413
+
414
+ # For faster XML/HTML parsing
415
+ pip install lxml
416
+
417
+ # For dynamic web scraping (if needed)
418
+ pip install selenium
419
+
420
+ # For better PDF processing
421
+ pip install pikepdf
422
+ ```
423
+
424
+ ### Web UI Installation
425
+
426
+ The Web UI requires Node.js 18+ in addition to the Python dependencies:
427
+
428
+ ```bash
429
+ cd web-ui
430
+ npm install
431
+ ```
432
+
433
+ ## 📖 Usage
434
+
435
+ Check papers in various formats and online locations:
436
+
437
+ #### ArXiv Papers
438
+
439
+ ```bash
440
+ # Check a specific ArXiv paper by ID
441
+ python run_refchecker.py --paper 1706.03762
442
+
443
+ # Check by ArXiv URL
444
+ python run_refchecker.py --paper https://arxiv.org/abs/1706.03762
445
+
446
+ # Check by ArXiv PDF URL
447
+ python run_refchecker.py --paper https://arxiv.org/pdf/1706.03762.pdf
448
+ ```
449
+
450
+ #### Local PDF Files
451
+
452
+ ```bash
453
+ # Check a local PDF file
454
+ python run_refchecker.py --paper /path/to/your/paper.pdf
455
+
456
+ # Check with offline database for faster processing
457
+ python run_refchecker.py --paper /path/to/your/paper.pdf --db-path semantic_scholar_db/semantic_scholar.db
458
+ ```
459
+
460
+ #### LaTeX Files
461
+
462
+ ```bash
463
+ # Check a LaTeX document
464
+ python run_refchecker.py --paper /path/to/your/paper.tex
465
+
466
+ # Check with debug mode for detailed processing info
467
+ python run_refchecker.py --paper /path/to/your/paper.tex --debug
468
+ ```
469
+
470
+ #### Text Files
471
+
472
+ ```bash
473
+ # Check a plain text file containing paper content
474
+ python run_refchecker.py --paper /path/to/your/paper.txt
475
+
476
+ # Combine with local database for offline verification
477
+ python run_refchecker.py --paper /path/to/your/paper.txt --db-path semantic_scholar_db/semantic_scholar.db
478
+ ```
479
+
480
+
481
+ ## 📊 Output and Results
482
+
483
+ ### Generated Files
484
+
485
+ By default, no files are generated. To save detailed results, use the `--output-file` option:
486
+
487
+ ```bash
488
+ # Save to default filename (reference_errors.txt)
489
+ python run_refchecker.py --paper 1706.03762 --output-file
490
+
491
+ # Save to custom filename
492
+ python run_refchecker.py --paper 1706.03762 --output-file my_errors.txt
493
+ ```
494
+
495
+ The output file contains a detailed report of references with errors and warnings, including corrected references.
496
+
497
+ ### Enhanced URL Display
498
+
499
+ RefChecker automatically discovers and displays authoritative URLs for verified references:
500
+
501
+ - **Verified URL**: The primary authoritative source (typically Semantic Scholar)
502
+ - **ArXiv URL**: Direct link to the ArXiv preprint when available
503
+ - **DOI URL**: Digital Object Identifier link when available
504
+ - **Additional URLs**: Other relevant sources discovered during verification
505
+
506
+ This enhanced URL display helps users access multiple authoritative sources for each reference and provides comprehensive citation information.
507
+
508
+ ### Error Types
509
+
510
+ - **❌ Errors**: Critical issues that need correction
511
+ - `author`: Author name mismatches
512
+ ```
513
+ [16/19] Bag of tricks: Benchmarking of jailbreak attacks on llms
514
+ T. Xie, X. Qi, Y. Zeng, Y. Huang, U. M. Sehwag, K. Huang, L. He, B. Wei, D. Li, Y. Sheng et al
515
+
516
+ Verified URL: https://www.semanticscholar.org/paper/a1b2c3d4e5f6789012345678901234567890abcd
517
+ ArXiv URL: https://arxiv.org/abs/2312.02119
518
+ DOI URL: https://doi.org/10.48550/arxiv.2312.02119
519
+ ❌ Error: First author mismatch:
520
+ cited: 'T. Xie'
521
+ actual: 'Zhao Xu'
522
+ ```
523
+ - `title`: Title discrepancies
524
+ ```
525
+ [8/19] BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
526
+ J. Devlin, M.-W. Chang, K. Lee, K. Toutanova
527
+
528
+ Verified URL: https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992
529
+ ArXiv URL: https://arxiv.org/abs/1810.04805
530
+ DOI URL: https://doi.org/10.18653/v1/n19-1423
531
+ ❌ Error: Title mismatch:
532
+ cited: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'
533
+ actual: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
534
+ ```
535
+ - `arxiv_id`: Incorrect URLs or arXiv IDs
536
+ ```
537
+ [5/19] Jbshield: Neural representation-level defense against adversarial prompts in large language models
538
+ W. Zhang, M. Li, H. Wang
539
+ https://arxiv.org/abs/2503.01234
540
+
541
+ Verified URL: https://www.semanticscholar.org/paper/e1f2a3b4c5d6e7f8901234567890123456789012
542
+ DOI URL: https://doi.org/10.48550/arxiv.2401.12345
543
+ ❌ Error: Incorrect ArXiv ID: ArXiv ID 2503.01234 points to 'Self-Adaptive Gamma Context-Aware SSM-based Model for Metal Defect Detection'
544
+ ```
545
+ - `doi`: DOI mismatches
546
+ ```
547
+ [12/19] Attention Is All You Need
548
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin
549
+ Neural Information Processing Systems
550
+ 2017
551
+ https://doi.org/10.5555/3295222.3295349
552
+
553
+ Verified URL: https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776
554
+ ArXiv URL: https://arxiv.org/abs/1706.03762
555
+ DOI URL: https://doi.org/10.48550/arXiv.1706.03762
556
+ ❌ Error: DOI mismatch:
557
+ cited: '10.5555/3295222.3295349'
558
+ actual: '10.48550/arXiv.1706.03762'
559
+ ```
560
+
561
+ - **⚠️ Warnings**: Minor issues that may need attention
562
+ - `year`: Publication year differences (common due to multiple paper versions)
563
+ ```
564
+ [14/19] Smoothllm: Defending large language models against jailbreaking attacks
565
+ A. Robey, E. Wong, H. Hassani, G. J. Pappas
566
+ 2024
567
+
568
+ Verified URL: https://www.semanticscholar.org/paper/f1a2b3c4d5e6f7890123456789012345678901ab
569
+ ArXiv URL: https://arxiv.org/abs/2310.03684
570
+ DOI URL: https://doi.org/10.48550/arxiv.2310.03684
571
+ ⚠️ Warning: Year mismatch:
572
+ cited: '2024'
573
+ actual: '2023'
574
+ ```
575
+ - `venue`: Venue format variations
576
+ ```
577
+ [2/19] Gradient cuff: Detecting jailbreak attacks on large language models by exploring refusal loss landscapes
578
+ X. Hu, P.-Y. Chen, T.-Y. Ho
579
+ arXiv, 2024
580
+
581
+ Verified URL: https://www.semanticscholar.org/paper/c1d2e3f4a5b6c7d8e9f0123456789012345678ab
582
+ ArXiv URL: https://arxiv.org/abs/2403.02151
583
+ DOI URL: https://doi.org/10.48550/arxiv.2403.02151
584
+ ⚠️ Warning: Venue mismatch:
585
+ cited: 'arXiv, 2024'
586
+ actual: 'Neural Information Processing Systems'
587
+ ```
588
+
589
+ - **❓ Unverified**: References that couldn't be verified with any of the checker APIs
590
+ ```
591
+ [15/19] Llama guard: A fine-tuned safety model for prompt moderation
592
+ M. A. Research
593
+ ❓ Could not verify: Llama guard: A fine-tuned safety model for prompt moderation
594
+ Cited as: M. A. Research (2024)
595
+ URL: https://research.meta.com/publications/llama-guard-a-fine-tuned-safety-model-for-prompt-moderation/
596
+ ```
597
+
598
+ ## ⚙️ Configuration
599
+
600
+ ### Command Line Arguments
601
+
602
+ ```bash
603
+ # Basic options
604
+ --paper PAPER # Paper to check (ArXiv ID, URL, or file path)
605
+ --debug # Enable debug mode
606
+ --semantic-scholar-api-key KEY # Semantic Scholar API key (1-2s vs 5-10s without key; can also use SEMANTIC_SCHOLAR_API_KEY env var)
607
+ --db-path PATH # Local database path
608
+ --output-file [PATH] # Path to output file for reference discrepancies (default: reference_errors.txt if flag provided, no file if not provided)
609
+
610
+ # LLM options
611
+ --llm-provider {openai,anthropic,google,azure,vllm} # Enable LLM with provider
612
+ --llm-model MODEL # Override default model
613
+ --llm-endpoint URL # Override endpoint (for Azure/vLLM)
614
+ ```
615
+
616
+ ### API Key Handling
617
+
618
+ The refchecker tool automatically handles API keys for LLM providers in the following order:
619
+
620
+ 1. **Environment Variables** (recommended): The tool checks for provider-specific environment variables
621
+ 2. **Interactive Prompts**: If no API key is found in environment variables, the tool will securely prompt you to enter it
622
+
623
+ When you use an LLM provider without setting the corresponding environment variable, you'll see a prompt like:
624
+ ```
625
+ OpenAI API key not found in environment variables.
626
+ Checked environment variables: REFCHECKER_OPENAI_API_KEY, OPENAI_API_KEY
627
+ Please enter your OpenAI API key (input will be hidden):
628
+ API key: [your input is hidden]
629
+ ```
630
+
631
+ This approach ensures your API keys are never exposed in command line history while providing a seamless user experience.
632
+
633
+ ### Environment Variables
634
+
635
+ ```bash
636
+ # Enable/disable LLM
637
+ export REFCHECKER_USE_LLM=true
638
+
639
+ # Provider selection
640
+ export REFCHECKER_LLM_PROVIDER=anthropic # openai, anthropic, google, azure, vllm
641
+
642
+ # Semantic Scholar API key (for higher rate limits and faster verification: 1-2s vs 5-10s without key)
643
+ export SEMANTIC_SCHOLAR_API_KEY=your_key
644
+
645
+ # Provider-specific API keys (native environment variables preferred)
646
+ export OPENAI_API_KEY=your_key # or REFCHECKER_OPENAI_API_KEY
647
+ export ANTHROPIC_API_KEY=your_key # or REFCHECKER_ANTHROPIC_API_KEY
648
+ export GOOGLE_API_KEY=your_key # or REFCHECKER_GOOGLE_API_KEY
649
+ export AZURE_OPENAI_API_KEY=your_key # or REFCHECKER_AZURE_API_KEY
650
+ export AZURE_OPENAI_ENDPOINT=your_endpoint # or REFCHECKER_AZURE_ENDPOINT
651
+
652
+ # Model configuration
653
+ export REFCHECKER_LLM_MODEL=claude-sonnet-4-20250514
654
+ export REFCHECKER_LLM_MAX_TOKENS=4000
655
+ export REFCHECKER_LLM_TEMPERATURE=0.1
656
+ ```
657
+
658
+
659
+ ## 🗄️ Local Database Setup
660
+
661
+ ### Downloading the Database
662
+
663
+ Create a local database for offline verification:
664
+
665
+ ```bash
666
+ # Download recent computer science papers
667
+ python download_semantic_scholar_db.py \
668
+ --field "computer science" \
669
+ --start-year 2020 \
670
+ --end-year 2024 \
671
+ --batch-size 100
672
+
673
+ # Download papers matching a specific query
674
+ python download_semantic_scholar_db.py \
675
+ --query "attention is all you need" \
676
+ --batch-size 50
677
+
678
+ # Download with API key for higher rate limits
679
+ python download_semantic_scholar_db.py \
680
+ --api-key YOUR_API_KEY \
681
+ --field "machine learning" \
682
+ --start-year 2023
683
+ ```
684
+
685
+ ### Database Options
686
+
687
+ - **`--output-dir`**: Directory to store database (default: `semantic_scholar_db`)
688
+ - **`--batch-size`**: Papers per batch (default: 100)
689
+ - **`--api-key`**: Semantic Scholar API key for higher limits
690
+ - **`--fields`**: Metadata fields to include
691
+ - **`--query`**: Search query for specific papers
692
+ - **`--start-year`/`--end-year`**: Year range filter
693
+
694
+ ## 🧪 Testing
695
+
696
+ RefChecker includes a comprehensive test suite with **490+ tests** covering unit, integration, and end-to-end scenarios. The tests ensure reliability across all components and provide examples of how to use the system.
697
+
698
+ ### Quick Test Run
699
+
700
+ ```bash
701
+ # Run all tests
702
+ pytest tests/
703
+
704
+ # Run specific test categories
705
+ pytest tests/unit/ # Unit tests only
706
+ pytest tests/integration/ # Integration tests only
707
+ pytest tests/e2e/ # End-to-end tests only
708
+
709
+ # Run with coverage
710
+ pytest --cov=src --cov-report=html tests/
711
+
712
+ # Run tests in parallel (if pytest-xdist installed)
713
+ pytest -n auto tests/
714
+ ```
715
+
716
+ ### Test Categories
717
+
718
+ - **Unit Tests**: Individual components like text utilities, error handling, and reference extraction
719
+ - **Integration Tests**: API interactions, LLM providers, and component integration
720
+ - **End-to-End Tests**: Complete workflows, performance testing, and edge cases
721
+
722
+ ### Test Structure
723
+
724
+ ```
725
+ tests/
726
+ ├── unit/ # Unit tests for individual components
727
+ ├── integration/ # Integration tests for APIs and services
728
+ ├── e2e/ # End-to-end workflow tests
729
+ ├── fixtures/ # Test data and mock objects
730
+ └── README.md # Detailed testing documentation
731
+ ```
732
+
733
+ For detailed testing information, test execution options, and guidance on writing new tests, see the **[Testing Documentation](tests/README.md)**.
734
+
735
+
736
+ ## 📄 License
737
+
738
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.