academic-refchecker 1.2.53__py3-none-any.whl → 1.2.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/METADATA +23 -23
- academic_refchecker-1.2.55.dist-info/RECORD +49 -0
- academic_refchecker-1.2.55.dist-info/entry_points.txt +2 -0
- academic_refchecker-1.2.55.dist-info/top_level.txt +1 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +5 -0
- {checkers → refchecker/checkers}/crossref.py +5 -5
- {checkers → refchecker/checkers}/enhanced_hybrid_checker.py +1 -1
- {checkers → refchecker/checkers}/github_checker.py +4 -4
- {checkers → refchecker/checkers}/local_semantic_scholar.py +7 -7
- {checkers → refchecker/checkers}/openalex.py +6 -6
- {checkers → refchecker/checkers}/openreview_checker.py +8 -8
- {checkers → refchecker/checkers}/pdf_paper_checker.py +1 -1
- {checkers → refchecker/checkers}/semantic_scholar.py +10 -10
- {checkers → refchecker/checkers}/webpage_checker.py +3 -3
- {core → refchecker/core}/parallel_processor.py +6 -6
- {core → refchecker/core}/refchecker.py +63 -63
- {utils → refchecker/utils}/arxiv_utils.py +3 -3
- {utils → refchecker/utils}/biblatex_parser.py +4 -4
- {utils → refchecker/utils}/bibliography_utils.py +5 -5
- {utils → refchecker/utils}/bibtex_parser.py +5 -5
- {utils → refchecker/utils}/error_utils.py +1 -1
- {utils → refchecker/utils}/text_utils.py +62 -13
- __version__.py +0 -3
- academic_refchecker-1.2.53.dist-info/RECORD +0 -47
- academic_refchecker-1.2.53.dist-info/entry_points.txt +0 -2
- academic_refchecker-1.2.53.dist-info/top_level.txt +0 -9
- {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/licenses/LICENSE +0 -0
- {checkers → refchecker/checkers}/__init__.py +0 -0
- {config → refchecker/config}/__init__.py +0 -0
- {config → refchecker/config}/logging.conf +0 -0
- {config → refchecker/config}/settings.py +0 -0
- {core → refchecker/core}/__init__.py +0 -0
- {core → refchecker/core}/db_connection_pool.py +0 -0
- {database → refchecker/database}/__init__.py +0 -0
- {database → refchecker/database}/download_semantic_scholar_db.py +0 -0
- {llm → refchecker/llm}/__init__.py +0 -0
- {llm → refchecker/llm}/base.py +0 -0
- {llm → refchecker/llm}/providers.py +0 -0
- {scripts → refchecker/scripts}/__init__.py +0 -0
- {scripts → refchecker/scripts}/start_vllm_server.py +0 -0
- {services → refchecker/services}/__init__.py +0 -0
- {services → refchecker/services}/pdf_processor.py +0 -0
- {utils → refchecker/utils}/__init__.py +0 -0
- {utils → refchecker/utils}/author_utils.py +0 -0
- {utils → refchecker/utils}/config_validator.py +0 -0
- {utils → refchecker/utils}/db_utils.py +0 -0
- {utils → refchecker/utils}/doi_utils.py +0 -0
- {utils → refchecker/utils}/mock_objects.py +0 -0
- {utils → refchecker/utils}/unicode_utils.py +0 -0
- {utils → refchecker/utils}/url_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: academic-refchecker
|
|
3
|
-
Version: 1.2.53
|
|
3
|
+
Version: 1.2.55
|
|
4
4
|
Summary: A comprehensive tool for validating reference accuracy in academic papers
|
|
5
5
|
Author-email: Mark Russinovich <markrussinovich@hotmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -156,17 +156,17 @@ Learn about RefChecker's design philosophy and development process in this detai
|
|
|
156
156
|
|
|
157
157
|
1. **Check a famous paper:**
|
|
158
158
|
```bash
|
|
159
|
-
python refchecker.py --paper 1706.03762
|
|
159
|
+
python run_refchecker.py --paper 1706.03762
|
|
160
160
|
```
|
|
161
161
|
|
|
162
162
|
2. **Check your own PDF:**
|
|
163
163
|
```bash
|
|
164
|
-
python refchecker.py --paper /path/to/your/paper.pdf
|
|
164
|
+
python run_refchecker.py --paper /path/to/your/paper.pdf
|
|
165
165
|
```
|
|
166
166
|
|
|
167
167
|
3. **For faster processing with local database** (see [Local Database Setup](#local-database-setup)):
|
|
168
168
|
```bash
|
|
169
|
-
python refchecker.py --paper 1706.03762 --db-path semantic_scholar_db/semantic_scholar.db
|
|
169
|
+
python run_refchecker.py --paper 1706.03762 --db-path semantic_scholar_db/semantic_scholar.db
|
|
170
170
|
```
|
|
171
171
|
|
|
172
172
|
> **⚡ Performance Tip**: Reference verification takes 5-10 seconds per reference without a Semantic Scholar API key due to rate limiting. With an API key, verification speeds up to 1-2 seconds per reference. Set `SEMANTIC_SCHOLAR_API_KEY` environment variable or use `--semantic-scholar-api-key` for faster processing.
|
|
@@ -192,13 +192,13 @@ RefChecker supports AI-powered bibliography parsing using Large Language Models
|
|
|
192
192
|
export REFCHECKER_LLM_PROVIDER=anthropic
|
|
193
193
|
export ANTHROPIC_API_KEY=your_api_key_here
|
|
194
194
|
|
|
195
|
-
python refchecker.py --paper 1706.03762
|
|
195
|
+
python run_refchecker.py --paper 1706.03762
|
|
196
196
|
```
|
|
197
197
|
|
|
198
198
|
2. **Using Command Line Arguments**:
|
|
199
199
|
```bash
|
|
200
200
|
# Enable LLM with specific provider and model
|
|
201
|
-
python refchecker.py --paper 1706.03762 \
|
|
201
|
+
python run_refchecker.py --paper 1706.03762 \
|
|
202
202
|
--llm-provider anthropic \
|
|
203
203
|
--llm-model claude-sonnet-4-20250514 \
|
|
204
204
|
```
|
|
@@ -211,7 +211,7 @@ RefChecker supports AI-powered bibliography parsing using Large Language Models
|
|
|
211
211
|
With `OPENAI_API_KEY` environment variable:
|
|
212
212
|
|
|
213
213
|
```bash
|
|
214
|
-
python refchecker.py --paper /path/to/paper.pdf \
|
|
214
|
+
python run_refchecker.py --paper /path/to/paper.pdf \
|
|
215
215
|
--llm-provider openai \
|
|
216
216
|
--llm-model gpt-4o \
|
|
217
217
|
```
|
|
@@ -221,7 +221,7 @@ python refchecker.py --paper /path/to/paper.pdf \
|
|
|
221
221
|
With `ANTHROPIC_API_KEY` environment variable:
|
|
222
222
|
|
|
223
223
|
```bash
|
|
224
|
-
python refchecker.py --paper https://arxiv.org/abs/1706.03762 \
|
|
224
|
+
python run_refchecker.py --paper https://arxiv.org/abs/1706.03762 \
|
|
225
225
|
--llm-provider anthropic \
|
|
226
226
|
--llm-model claude-sonnet-4-20250514 \
|
|
227
227
|
```
|
|
@@ -229,7 +229,7 @@ python refchecker.py --paper https://arxiv.org/abs/1706.03762 \
|
|
|
229
229
|
#### Google Gemini
|
|
230
230
|
|
|
231
231
|
```bash
|
|
232
|
-
python refchecker.py --paper paper.tex \
|
|
232
|
+
python run_refchecker.py --paper paper.tex \
|
|
233
233
|
--llm-provider google \
|
|
234
234
|
--llm-model gemini-2.5-flash
|
|
235
235
|
```
|
|
@@ -237,7 +237,7 @@ python refchecker.py --paper paper.tex \
|
|
|
237
237
|
#### Azure OpenAI
|
|
238
238
|
|
|
239
239
|
```bash
|
|
240
|
-
python refchecker.py --paper paper.txt \
|
|
240
|
+
python run_refchecker.py --paper paper.txt \
|
|
241
241
|
--llm-provider azure \
|
|
242
242
|
--llm-model gpt-4 \
|
|
243
243
|
--llm-endpoint https://your-resource.openai.azure.com/
|
|
@@ -249,7 +249,7 @@ For running models locally:
|
|
|
249
249
|
|
|
250
250
|
```bash
|
|
251
251
|
# automatic Huggingface model download with VLLM server launch
|
|
252
|
-
python refchecker.py --paper paper.pdf \
|
|
252
|
+
python run_refchecker.py --paper paper.pdf \
|
|
253
253
|
--llm-provider vllm \
|
|
254
254
|
--llm-model meta-llama/Llama-3.1-8B-Instruct
|
|
255
255
|
```
|
|
@@ -319,43 +319,43 @@ Check papers in various formats and online locations:
|
|
|
319
319
|
|
|
320
320
|
```bash
|
|
321
321
|
# Check a specific ArXiv paper by ID
|
|
322
|
-
python refchecker.py --paper 1706.03762
|
|
322
|
+
python run_refchecker.py --paper 1706.03762
|
|
323
323
|
|
|
324
324
|
# Check by ArXiv URL
|
|
325
|
-
python refchecker.py --paper https://arxiv.org/abs/1706.03762
|
|
325
|
+
python run_refchecker.py --paper https://arxiv.org/abs/1706.03762
|
|
326
326
|
|
|
327
327
|
# Check by ArXiv PDF URL
|
|
328
|
-
python refchecker.py --paper https://arxiv.org/pdf/1706.03762.pdf
|
|
328
|
+
python run_refchecker.py --paper https://arxiv.org/pdf/1706.03762.pdf
|
|
329
329
|
```
|
|
330
330
|
|
|
331
331
|
#### Local PDF Files
|
|
332
332
|
|
|
333
333
|
```bash
|
|
334
334
|
# Check a local PDF file
|
|
335
|
-
python refchecker.py --paper /path/to/your/paper.pdf
|
|
335
|
+
python run_refchecker.py --paper /path/to/your/paper.pdf
|
|
336
336
|
|
|
337
337
|
# Check with offline database for faster processing
|
|
338
|
-
python refchecker.py --paper /path/to/your/paper.pdf --db-path semantic_scholar_db/semantic_scholar.db
|
|
338
|
+
python run_refchecker.py --paper /path/to/your/paper.pdf --db-path semantic_scholar_db/semantic_scholar.db
|
|
339
339
|
```
|
|
340
340
|
|
|
341
341
|
#### LaTeX Files
|
|
342
342
|
|
|
343
343
|
```bash
|
|
344
344
|
# Check a LaTeX document
|
|
345
|
-
python refchecker.py --paper /path/to/your/paper.tex
|
|
345
|
+
python run_refchecker.py --paper /path/to/your/paper.tex
|
|
346
346
|
|
|
347
347
|
# Check with debug mode for detailed processing info
|
|
348
|
-
python refchecker.py --paper /path/to/your/paper.tex --debug
|
|
348
|
+
python run_refchecker.py --paper /path/to/your/paper.tex --debug
|
|
349
349
|
```
|
|
350
350
|
|
|
351
351
|
#### Text Files
|
|
352
352
|
|
|
353
353
|
```bash
|
|
354
354
|
# Check a plain text file containing paper content
|
|
355
|
-
python refchecker.py --paper /path/to/your/paper.txt
|
|
355
|
+
python run_refchecker.py --paper /path/to/your/paper.txt
|
|
356
356
|
|
|
357
357
|
# Combine with local database for offline verification
|
|
358
|
-
python refchecker.py --paper /path/to/your/paper.txt --db-path semantic_scholar_db/semantic_scholar.db
|
|
358
|
+
python run_refchecker.py --paper /path/to/your/paper.txt --db-path semantic_scholar_db/semantic_scholar.db
|
|
359
359
|
```
|
|
360
360
|
|
|
361
361
|
|
|
@@ -367,10 +367,10 @@ By default, no files are generated. To save detailed results, use the `--output-
|
|
|
367
367
|
|
|
368
368
|
```bash
|
|
369
369
|
# Save to default filename (reference_errors.txt)
|
|
370
|
-
python refchecker.py --paper 1706.03762 --output-file
|
|
370
|
+
python run_refchecker.py --paper 1706.03762 --output-file
|
|
371
371
|
|
|
372
372
|
# Save to custom filename
|
|
373
|
-
python refchecker.py --paper 1706.03762 --output-file my_errors.txt
|
|
373
|
+
python run_refchecker.py --paper 1706.03762 --output-file my_errors.txt
|
|
374
374
|
```
|
|
375
375
|
|
|
376
376
|
The output file contains a detailed report of references with errors and warnings, including corrected references.
|
|
@@ -574,7 +574,7 @@ python download_semantic_scholar_db.py \
|
|
|
574
574
|
|
|
575
575
|
## 🧪 Testing
|
|
576
576
|
|
|
577
|
-
RefChecker includes a comprehensive test suite with
|
|
577
|
+
RefChecker includes a comprehensive test suite with **490+ tests** covering unit, integration, and end-to-end scenarios. The tests ensure reliability across all components and provide examples of how to use the system.
|
|
578
578
|
|
|
579
579
|
### Quick Test Run
|
|
580
580
|
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
academic_refchecker-1.2.55.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
|
|
2
|
+
refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
|
|
3
|
+
refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
|
|
4
|
+
refchecker/__version__.py,sha256=Xg3VGFPggqe_vQC3vz8fNW_FXpXAhzc4wLE7rwOBHjw,89
|
|
5
|
+
refchecker/checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
|
|
6
|
+
refchecker/checkers/crossref.py,sha256=5BeSCK8K_S_-iwgQaNAbxZGNsxaxOyBzUQ3AD0Rc6nU,20433
|
|
7
|
+
refchecker/checkers/enhanced_hybrid_checker.py,sha256=c5I_h8w6xD7XkBNkbneffeAnrO8B-uXH99edWBJvDMo,27788
|
|
8
|
+
refchecker/checkers/github_checker.py,sha256=YJ2sLj22qezw3uWjA0jhtDO0fOW4HUwcVbv2DQ4LjR0,14277
|
|
9
|
+
refchecker/checkers/local_semantic_scholar.py,sha256=nrAJhm0VNEl4RwJWAEOGNCRE31h7CneLc9zSqY5zrHY,21092
|
|
10
|
+
refchecker/checkers/openalex.py,sha256=omMQbZOnkDndMJSl9SQVtiETzpv1w1pt93YjlFTq8WA,19616
|
|
11
|
+
refchecker/checkers/openreview_checker.py,sha256=0IHZe4Nscy8fle28rmhy1hhsofR5g0FFSakk8FFH_0A,40540
|
|
12
|
+
refchecker/checkers/pdf_paper_checker.py,sha256=lrg09poNJBz9FNMrUoEjQ6CJbdYZAVANw0bCaTSb5oo,19904
|
|
13
|
+
refchecker/checkers/semantic_scholar.py,sha256=CCrOMdOCfazX8bkikU209dz0xsV_xkeeYcmxO-K9-6I,35072
|
|
14
|
+
refchecker/checkers/webpage_checker.py,sha256=A_d5kg3OOsyliC00OVq_l0J-RJ4Ln7hUoURk21aO2fs,43653
|
|
15
|
+
refchecker/config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
|
|
16
|
+
refchecker/config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
|
|
17
|
+
refchecker/config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
|
|
18
|
+
refchecker/core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
|
|
19
|
+
refchecker/core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
|
|
20
|
+
refchecker/core/parallel_processor.py,sha256=HpVFEMwPBiP2FRjvGqlaXpjV5S0qP-hxdB_Wdl_lACo,17704
|
|
21
|
+
refchecker/core/refchecker.py,sha256=IAxetrSC0Z7EzVR5coIL2g8MqhWlsZzQiDnceDE4_uc,287102
|
|
22
|
+
refchecker/database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
|
|
23
|
+
refchecker/database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
|
|
24
|
+
refchecker/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
+
refchecker/llm/base.py,sha256=uMF-KOqZ9ZQ7rccOQLpKJiW9sEMMxr7ePXBSF0yYDJY,16782
|
|
26
|
+
refchecker/llm/providers.py,sha256=A0usJpprCO5D-VX0hqaQzBfi4DG3rdjA39vu02XJsGw,40092
|
|
27
|
+
refchecker/scripts/__init__.py,sha256=xJwo6afG8s7S888BK2Bxw2d7FX8aLkbl0l_ZoJOFibE,37
|
|
28
|
+
refchecker/scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,4213
|
|
29
|
+
refchecker/services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
|
|
30
|
+
refchecker/services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
|
|
31
|
+
refchecker/utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
|
|
32
|
+
refchecker/utils/arxiv_utils.py,sha256=idlCzkTApYwH-kdTiH9nrfo4GMmwdtUAv7cAGtoEG-0,19799
|
|
33
|
+
refchecker/utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
|
|
34
|
+
refchecker/utils/biblatex_parser.py,sha256=IKRUMtRsjdXIktyk9XGArt_ms0asmqP549uhFvvumuE,25581
|
|
35
|
+
refchecker/utils/bibliography_utils.py,sha256=d6kqDOQou_PX6WQkOzrGyN5GpzaOjhu54w9wGfBRQZw,11760
|
|
36
|
+
refchecker/utils/bibtex_parser.py,sha256=xY0dEqT8lBZF-W21YRpG28lp_F2ikLan7nK70WiCU2o,15286
|
|
37
|
+
refchecker/utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
|
|
38
|
+
refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
|
|
39
|
+
refchecker/utils/doi_utils.py,sha256=ezUiRnYRpoO0U_Rqgxv1FxqmeTwPh6X8gLgSDbqg5sY,4874
|
|
40
|
+
refchecker/utils/error_utils.py,sha256=Mm4ZqP_4FCRT9x4J_2IKSIAKRWaGLym-wbZqhj1wYzc,12512
|
|
41
|
+
refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
|
|
42
|
+
refchecker/utils/text_utils.py,sha256=d_X4r1nVvkL7i0DhxfLaVK3CzbMP2oZvqX3kxfDudQw,220978
|
|
43
|
+
refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
|
|
44
|
+
refchecker/utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
|
|
45
|
+
academic_refchecker-1.2.55.dist-info/METADATA,sha256=q0soTYffNjsEJBWQVvIXMGqqVeQn-1ayHP4EOhTFDvk,23345
|
|
46
|
+
academic_refchecker-1.2.55.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
47
|
+
academic_refchecker-1.2.55.dist-info/entry_points.txt,sha256=kG6k2JwFIRvmKe0oZTr2RYStyfl79BirJxyaO6kjIxA,72
|
|
48
|
+
academic_refchecker-1.2.55.dist-info/top_level.txt,sha256=ZdIg_PFHiATpVT5Uvp4L17Q0d8mk8ZBsINXKf1tE0bo,11
|
|
49
|
+
academic_refchecker-1.2.55.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
refchecker
|
refchecker/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RefChecker - Academic Paper Reference Validation Tool
|
|
3
|
+
|
|
4
|
+
A comprehensive tool for validating reference accuracy in academic papers.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "1.2.1"
|
|
8
|
+
__author__ = "RefChecker Team"
|
|
9
|
+
__email__ = "markrussinovich@hotmail.com"
|
|
10
|
+
|
|
11
|
+
from .core.refchecker import ArxivReferenceChecker
|
|
12
|
+
|
|
13
|
+
__all__ = ["ArxivReferenceChecker"]
|
refchecker/__main__.py
ADDED
|
@@ -30,9 +30,9 @@ import logging
|
|
|
30
30
|
import re
|
|
31
31
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
32
32
|
from urllib.parse import quote_plus
|
|
33
|
-
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
|
|
34
|
-
from utils.error_utils import format_year_mismatch, format_doi_mismatch
|
|
35
|
-
from config.settings import get_config
|
|
33
|
+
from refchecker.utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
|
|
34
|
+
from refchecker.utils.error_utils import format_year_mismatch, format_doi_mismatch
|
|
35
|
+
from refchecker.config.settings import get_config
|
|
36
36
|
|
|
37
37
|
# Set up logging
|
|
38
38
|
logger = logging.getLogger(__name__)
|
|
@@ -358,7 +358,7 @@ class CrossRefReferenceChecker:
|
|
|
358
358
|
# Check for DOI
|
|
359
359
|
doi = work_data.get('DOI')
|
|
360
360
|
if doi:
|
|
361
|
-
from utils.doi_utils import construct_doi_url
|
|
361
|
+
from refchecker.utils.doi_utils import construct_doi_url
|
|
362
362
|
doi_url = construct_doi_url(doi)
|
|
363
363
|
logger.debug(f"Generated DOI URL: {doi_url}")
|
|
364
364
|
return doi_url
|
|
@@ -487,7 +487,7 @@ class CrossRefReferenceChecker:
|
|
|
487
487
|
work_doi = work_data.get('DOI')
|
|
488
488
|
if doi and work_doi:
|
|
489
489
|
# Compare DOIs using the proper comparison function
|
|
490
|
-
from utils.doi_utils import compare_dois
|
|
490
|
+
from refchecker.utils.doi_utils import compare_dois
|
|
491
491
|
if not compare_dois(doi, work_doi):
|
|
492
492
|
errors.append({
|
|
493
493
|
'error_type': 'doi',
|
|
@@ -542,7 +542,7 @@ class EnhancedHybridReferenceChecker:
|
|
|
542
542
|
"""
|
|
543
543
|
Compare author lists (delegates to shared utility)
|
|
544
544
|
"""
|
|
545
|
-
from utils.text_utils import compare_authors
|
|
545
|
+
from refchecker.utils.text_utils import compare_authors
|
|
546
546
|
return compare_authors(cited_authors, correct_authors)
|
|
547
547
|
|
|
548
548
|
# Backward compatibility alias
|
|
@@ -5,7 +5,7 @@ import re
|
|
|
5
5
|
import logging
|
|
6
6
|
from urllib.parse import urlparse
|
|
7
7
|
from typing import Dict, Optional, Tuple, List, Any
|
|
8
|
-
from utils.text_utils import strip_latex_commands
|
|
8
|
+
from refchecker.utils.text_utils import strip_latex_commands
|
|
9
9
|
|
|
10
10
|
logger = logging.getLogger(__name__)
|
|
11
11
|
|
|
@@ -170,7 +170,7 @@ class GitHubChecker:
|
|
|
170
170
|
if cited_title:
|
|
171
171
|
title_match = self._check_title_match(cited_title, actual_name, actual_description)
|
|
172
172
|
if not title_match:
|
|
173
|
-
from utils.error_utils import format_title_mismatch
|
|
173
|
+
from refchecker.utils.error_utils import format_title_mismatch
|
|
174
174
|
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
175
175
|
clean_cited_title = strip_latex_commands(cited_title)
|
|
176
176
|
details = format_title_mismatch(clean_cited_title, actual_name)
|
|
@@ -188,7 +188,7 @@ class GitHubChecker:
|
|
|
188
188
|
author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
|
|
189
189
|
author_match = self._check_author_match(author_str, actual_owner, actual_owner_name)
|
|
190
190
|
if not author_match:
|
|
191
|
-
from utils.error_utils import format_three_line_mismatch
|
|
191
|
+
from refchecker.utils.error_utils import format_three_line_mismatch
|
|
192
192
|
left = author_str
|
|
193
193
|
right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
|
|
194
194
|
details = format_three_line_mismatch("Author mismatch", left, right)
|
|
@@ -203,7 +203,7 @@ class GitHubChecker:
|
|
|
203
203
|
try:
|
|
204
204
|
cited_year_int = int(cited_year)
|
|
205
205
|
if cited_year_int < creation_year:
|
|
206
|
-
from utils.error_utils import format_year_mismatch
|
|
206
|
+
from refchecker.utils.error_utils import format_year_mismatch
|
|
207
207
|
errors.append({
|
|
208
208
|
"warning_type": "year",
|
|
209
209
|
"warning_details": format_year_mismatch(cited_year, creation_year),
|
|
@@ -35,12 +35,12 @@ import sys
|
|
|
35
35
|
import os
|
|
36
36
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
37
37
|
|
|
38
|
-
from utils.doi_utils import extract_doi_from_url, compare_dois, construct_doi_url
|
|
39
|
-
from utils.error_utils import create_author_error, create_year_warning, create_doi_error
|
|
40
|
-
from utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity, extract_arxiv_id_from_url
|
|
41
|
-
from utils.db_utils import process_semantic_scholar_result, process_semantic_scholar_results
|
|
42
|
-
from utils.url_utils import get_best_available_url
|
|
43
|
-
from config.settings import get_config
|
|
38
|
+
from refchecker.utils.doi_utils import extract_doi_from_url, compare_dois, construct_doi_url
|
|
39
|
+
from refchecker.utils.error_utils import create_author_error, create_year_warning, create_doi_error
|
|
40
|
+
from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity, extract_arxiv_id_from_url
|
|
41
|
+
from refchecker.utils.db_utils import process_semantic_scholar_result, process_semantic_scholar_results
|
|
42
|
+
from refchecker.utils.url_utils import get_best_available_url
|
|
43
|
+
from refchecker.config.settings import get_config
|
|
44
44
|
|
|
45
45
|
# Set up logging
|
|
46
46
|
logger = logging.getLogger(__name__)
|
|
@@ -471,7 +471,7 @@ class LocalNonArxivReferenceChecker:
|
|
|
471
471
|
|
|
472
472
|
# First try to get the Semantic Scholar URL since that's what we used for verification
|
|
473
473
|
if external_ids.get('CorpusId'):
|
|
474
|
-
from utils.url_utils import construct_semantic_scholar_url
|
|
474
|
+
from refchecker.utils.url_utils import construct_semantic_scholar_url
|
|
475
475
|
paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
|
|
476
476
|
logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
|
|
477
477
|
else:
|
|
@@ -32,9 +32,9 @@ import logging
|
|
|
32
32
|
import re
|
|
33
33
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
34
34
|
from urllib.parse import quote_plus
|
|
35
|
-
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
|
|
36
|
-
from utils.error_utils import format_year_mismatch, format_doi_mismatch
|
|
37
|
-
from config.settings import get_config
|
|
35
|
+
from refchecker.utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
|
|
36
|
+
from refchecker.utils.error_utils import format_year_mismatch, format_doi_mismatch
|
|
37
|
+
from refchecker.config.settings import get_config
|
|
38
38
|
|
|
39
39
|
# Set up logging
|
|
40
40
|
logger = logging.getLogger(__name__)
|
|
@@ -343,7 +343,7 @@ class OpenAlexReferenceChecker:
|
|
|
343
343
|
# Fall back to DOI URL
|
|
344
344
|
doi = work_data.get('doi')
|
|
345
345
|
if doi:
|
|
346
|
-
from utils.doi_utils import construct_doi_url
|
|
346
|
+
from refchecker.utils.doi_utils import construct_doi_url
|
|
347
347
|
doi_url = construct_doi_url(doi)
|
|
348
348
|
logger.debug(f"Generated DOI URL: {doi_url}")
|
|
349
349
|
return doi_url
|
|
@@ -351,7 +351,7 @@ class OpenAlexReferenceChecker:
|
|
|
351
351
|
# Check ids for other identifiers
|
|
352
352
|
ids = work_data.get('ids', {})
|
|
353
353
|
if ids.get('doi'):
|
|
354
|
-
from utils.doi_utils import construct_doi_url
|
|
354
|
+
from refchecker.utils.doi_utils import construct_doi_url
|
|
355
355
|
doi_url = construct_doi_url(ids['doi'])
|
|
356
356
|
logger.debug(f"Generated DOI URL from ids: {doi_url}")
|
|
357
357
|
return doi_url
|
|
@@ -460,7 +460,7 @@ class OpenAlexReferenceChecker:
|
|
|
460
460
|
|
|
461
461
|
if doi and work_doi:
|
|
462
462
|
# Compare DOIs using the proper comparison function
|
|
463
|
-
from utils.doi_utils import compare_dois
|
|
463
|
+
from refchecker.utils.doi_utils import compare_dois
|
|
464
464
|
if not compare_dois(doi, work_doi):
|
|
465
465
|
errors.append({
|
|
466
466
|
'error_type': 'doi',
|
|
@@ -32,7 +32,7 @@ import json
|
|
|
32
32
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
33
33
|
from urllib.parse import urlparse, parse_qs
|
|
34
34
|
from bs4 import BeautifulSoup
|
|
35
|
-
from utils.text_utils import (
|
|
35
|
+
from refchecker.utils.text_utils import (
|
|
36
36
|
normalize_text, clean_title_basic, is_name_match,
|
|
37
37
|
calculate_title_similarity, compare_authors,
|
|
38
38
|
clean_title_for_search, are_venues_substantially_different,
|
|
@@ -426,7 +426,7 @@ class OpenReviewReferenceChecker:
|
|
|
426
426
|
if cited_title and paper_title:
|
|
427
427
|
similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
|
|
428
428
|
if similarity < 0.7: # Using a reasonable threshold
|
|
429
|
-
from utils.error_utils import format_title_mismatch
|
|
429
|
+
from refchecker.utils.error_utils import format_title_mismatch
|
|
430
430
|
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
431
431
|
clean_cited_title = strip_latex_commands(cited_title)
|
|
432
432
|
details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
|
|
@@ -465,7 +465,7 @@ class OpenReviewReferenceChecker:
|
|
|
465
465
|
|
|
466
466
|
is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
|
|
467
467
|
if is_different and year_message:
|
|
468
|
-
from utils.error_utils import format_year_mismatch
|
|
468
|
+
from refchecker.utils.error_utils import format_year_mismatch
|
|
469
469
|
errors.append({
|
|
470
470
|
"warning_type": "year",
|
|
471
471
|
"warning_details": format_year_mismatch(cited_year_int, paper_year_int)
|
|
@@ -479,7 +479,7 @@ class OpenReviewReferenceChecker:
|
|
|
479
479
|
|
|
480
480
|
if cited_venue and paper_venue:
|
|
481
481
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
482
|
-
from utils.error_utils import format_venue_mismatch
|
|
482
|
+
from refchecker.utils.error_utils import format_venue_mismatch
|
|
483
483
|
errors.append({
|
|
484
484
|
"warning_type": "venue",
|
|
485
485
|
"warning_details": format_venue_mismatch(cited_venue, paper_venue)
|
|
@@ -552,7 +552,7 @@ class OpenReviewReferenceChecker:
|
|
|
552
552
|
if cited_title and paper_title:
|
|
553
553
|
similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
|
|
554
554
|
if similarity < 0.8: # Slightly higher threshold for search results
|
|
555
|
-
from utils.error_utils import format_title_mismatch
|
|
555
|
+
from refchecker.utils.error_utils import format_title_mismatch
|
|
556
556
|
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
557
557
|
clean_cited_title = strip_latex_commands(cited_title)
|
|
558
558
|
details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
|
|
@@ -591,7 +591,7 @@ class OpenReviewReferenceChecker:
|
|
|
591
591
|
|
|
592
592
|
is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
|
|
593
593
|
if is_different and year_message:
|
|
594
|
-
from utils.error_utils import format_year_mismatch
|
|
594
|
+
from refchecker.utils.error_utils import format_year_mismatch
|
|
595
595
|
errors.append({
|
|
596
596
|
"warning_type": "year",
|
|
597
597
|
"warning_details": format_year_mismatch(cited_year_int, paper_year_int)
|
|
@@ -605,7 +605,7 @@ class OpenReviewReferenceChecker:
|
|
|
605
605
|
|
|
606
606
|
if cited_venue and paper_venue:
|
|
607
607
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
608
|
-
from utils.error_utils import format_venue_mismatch
|
|
608
|
+
from refchecker.utils.error_utils import format_venue_mismatch
|
|
609
609
|
errors.append({
|
|
610
610
|
"warning_type": "venue",
|
|
611
611
|
"warning_details": format_venue_mismatch(cited_venue, paper_venue)
|
|
@@ -931,7 +931,7 @@ class OpenReviewReferenceChecker:
|
|
|
931
931
|
|
|
932
932
|
# Use similarity calculation from text_utils
|
|
933
933
|
try:
|
|
934
|
-
from utils.text_utils import calculate_title_similarity
|
|
934
|
+
from refchecker.utils.text_utils import calculate_title_similarity
|
|
935
935
|
similarity = calculate_title_similarity(search_title, found_title)
|
|
936
936
|
return similarity >= threshold
|
|
937
937
|
except ImportError:
|
|
@@ -15,7 +15,7 @@ from pypdf import PdfReader
|
|
|
15
15
|
from fuzzywuzzy import fuzz
|
|
16
16
|
from bs4 import BeautifulSoup
|
|
17
17
|
|
|
18
|
-
from utils.text_utils import normalize_text, calculate_title_similarity
|
|
18
|
+
from refchecker.utils.text_utils import normalize_text, calculate_title_similarity
|
|
19
19
|
|
|
20
20
|
logger = logging.getLogger(__name__)
|
|
21
21
|
|
|
@@ -28,9 +28,9 @@ import time
|
|
|
28
28
|
import logging
|
|
29
29
|
import re
|
|
30
30
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
31
|
-
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
|
|
32
|
-
from utils.error_utils import format_title_mismatch
|
|
33
|
-
from config.settings import get_config
|
|
31
|
+
from refchecker.utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
|
|
32
|
+
from refchecker.utils.error_utils import format_title_mismatch
|
|
33
|
+
from refchecker.config.settings import get_config
|
|
34
34
|
|
|
35
35
|
# Set up logging
|
|
36
36
|
logger = logging.getLogger(__name__)
|
|
@@ -523,12 +523,12 @@ class NonArxivReferenceChecker:
|
|
|
523
523
|
arxiv_id_match = (cited_arxiv_id == found_arxiv_id)
|
|
524
524
|
|
|
525
525
|
# Use flexible year validation
|
|
526
|
-
from utils.text_utils import is_year_substantially_different
|
|
526
|
+
from refchecker.utils.text_utils import is_year_substantially_different
|
|
527
527
|
context = {'arxiv_match': arxiv_id_match}
|
|
528
528
|
is_different, warning_message = is_year_substantially_different(year, paper_year, context)
|
|
529
529
|
|
|
530
530
|
if is_different and warning_message:
|
|
531
|
-
from utils.error_utils import format_year_mismatch
|
|
531
|
+
from refchecker.utils.error_utils import format_year_mismatch
|
|
532
532
|
errors.append({
|
|
533
533
|
'warning_type': 'year',
|
|
534
534
|
'warning_details': format_year_mismatch(year, paper_year),
|
|
@@ -549,7 +549,7 @@ class NonArxivReferenceChecker:
|
|
|
549
549
|
if cited_venue and paper_venue:
|
|
550
550
|
# Use the utility function to check if venues are substantially different
|
|
551
551
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
552
|
-
from utils.error_utils import create_venue_warning
|
|
552
|
+
from refchecker.utils.error_utils import create_venue_warning
|
|
553
553
|
errors.append(create_venue_warning(cited_venue, paper_venue))
|
|
554
554
|
elif not cited_venue and paper_venue:
|
|
555
555
|
# Original reference has the venue in raw text but not parsed correctly
|
|
@@ -597,9 +597,9 @@ class NonArxivReferenceChecker:
|
|
|
597
597
|
paper_doi = external_ids['DOI']
|
|
598
598
|
|
|
599
599
|
# Compare DOIs using the proper comparison function
|
|
600
|
-
from utils.doi_utils import compare_dois
|
|
600
|
+
from refchecker.utils.doi_utils import compare_dois
|
|
601
601
|
if doi and paper_doi and not compare_dois(doi, paper_doi):
|
|
602
|
-
from utils.error_utils import format_doi_mismatch
|
|
602
|
+
from refchecker.utils.error_utils import format_doi_mismatch
|
|
603
603
|
errors.append({
|
|
604
604
|
'error_type': 'doi',
|
|
605
605
|
'error_details': format_doi_mismatch(doi, paper_doi),
|
|
@@ -614,13 +614,13 @@ class NonArxivReferenceChecker:
|
|
|
614
614
|
# Return the Semantic Scholar URL that was actually used for verification
|
|
615
615
|
# First priority: Semantic Scholar URL since that's what we used for verification
|
|
616
616
|
if external_ids.get('CorpusId'):
|
|
617
|
-
from utils.url_utils import construct_semantic_scholar_url
|
|
617
|
+
from refchecker.utils.url_utils import construct_semantic_scholar_url
|
|
618
618
|
paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
|
|
619
619
|
logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
|
|
620
620
|
|
|
621
621
|
# Second priority: DOI URL (if this was verified through DOI)
|
|
622
622
|
elif external_ids.get('DOI'):
|
|
623
|
-
from utils.doi_utils import construct_doi_url
|
|
623
|
+
from refchecker.utils.doi_utils import construct_doi_url
|
|
624
624
|
paper_url = construct_doi_url(external_ids['DOI'])
|
|
625
625
|
logger.debug(f"Using DOI URL for verification: {paper_url}")
|
|
626
626
|
|
|
@@ -7,7 +7,7 @@ from urllib.parse import urlparse, urljoin
|
|
|
7
7
|
from typing import Dict, Optional, Tuple, List, Any
|
|
8
8
|
from bs4 import BeautifulSoup
|
|
9
9
|
import time
|
|
10
|
-
from utils.text_utils import strip_latex_commands
|
|
10
|
+
from refchecker.utils.text_utils import strip_latex_commands
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
@@ -185,7 +185,7 @@ class WebPageChecker:
|
|
|
185
185
|
# Check title match
|
|
186
186
|
if cited_title and page_title:
|
|
187
187
|
if not self._check_title_match(cited_title, page_title, page_description):
|
|
188
|
-
from utils.error_utils import format_title_mismatch
|
|
188
|
+
from refchecker.utils.error_utils import format_title_mismatch
|
|
189
189
|
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
190
190
|
clean_cited_title = strip_latex_commands(cited_title)
|
|
191
191
|
errors.append({
|
|
@@ -207,7 +207,7 @@ class WebPageChecker:
|
|
|
207
207
|
if cited_authors:
|
|
208
208
|
author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
|
|
209
209
|
if not self._check_author_match(author_str, site_info, web_url):
|
|
210
|
-
from utils.error_utils import format_three_line_mismatch
|
|
210
|
+
from refchecker.utils.error_utils import format_three_line_mismatch
|
|
211
211
|
left = author_str
|
|
212
212
|
right = site_info.get('organization', 'unknown')
|
|
213
213
|
details = format_three_line_mismatch("Author/organization mismatch", left, right)
|
|
@@ -13,7 +13,7 @@ from threading import Thread, Lock
|
|
|
13
13
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
14
14
|
from dataclasses import dataclass
|
|
15
15
|
from typing import List, Dict, Any, Optional, Tuple, Callable
|
|
16
|
-
from utils.text_utils import deduplicate_urls
|
|
16
|
+
from refchecker.utils.text_utils import deduplicate_urls
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
@@ -277,15 +277,15 @@ class ParallelReferenceProcessor:
|
|
|
277
277
|
# Print reference info in the same format as sequential mode
|
|
278
278
|
raw_title = reference.get('title', 'Untitled')
|
|
279
279
|
# Clean LaTeX commands from title for display
|
|
280
|
-
from utils.text_utils import strip_latex_commands
|
|
280
|
+
from refchecker.utils.text_utils import strip_latex_commands
|
|
281
281
|
title = strip_latex_commands(raw_title)
|
|
282
|
-
from utils.text_utils import format_authors_for_display
|
|
282
|
+
from refchecker.utils.text_utils import format_authors_for_display
|
|
283
283
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
284
284
|
year = reference.get('year', '')
|
|
285
285
|
# Get venue from either 'venue' or 'journal' field and clean it up
|
|
286
286
|
venue = reference.get('venue', '') or reference.get('journal', '')
|
|
287
287
|
if venue:
|
|
288
|
-
from utils.error_utils import clean_venue_for_comparison
|
|
288
|
+
from refchecker.utils.error_utils import clean_venue_for_comparison
|
|
289
289
|
venue = clean_venue_for_comparison(venue)
|
|
290
290
|
url = reference.get('url', '')
|
|
291
291
|
doi = reference.get('doi', '')
|
|
@@ -331,7 +331,7 @@ class ParallelReferenceProcessor:
|
|
|
331
331
|
|
|
332
332
|
# Show DOI URL if available and different from what's already shown
|
|
333
333
|
if external_ids.get('DOI'):
|
|
334
|
-
from utils.doi_utils import construct_doi_url
|
|
334
|
+
from refchecker.utils.doi_utils import construct_doi_url
|
|
335
335
|
doi_url = construct_doi_url(external_ids['DOI'])
|
|
336
336
|
if doi_url != verified_url_to_show and doi_url != url:
|
|
337
337
|
print(f" DOI URL: {doi_url}")
|
|
@@ -355,7 +355,7 @@ class ParallelReferenceProcessor:
|
|
|
355
355
|
error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
|
|
356
356
|
error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
|
|
357
357
|
|
|
358
|
-
from utils.error_utils import print_labeled_multiline
|
|
358
|
+
from refchecker.utils.error_utils import print_labeled_multiline
|
|
359
359
|
|
|
360
360
|
if error_type == 'arxiv_id':
|
|
361
361
|
# Keep existing style for arXiv ID errors
|