academic-refchecker 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Config module
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[loggers]
|
|
2
|
+
keys=root,refchecker
|
|
3
|
+
|
|
4
|
+
[handlers]
|
|
5
|
+
keys=consoleHandler,fileHandler
|
|
6
|
+
|
|
7
|
+
[formatters]
|
|
8
|
+
keys=simpleFormatter,detailedFormatter
|
|
9
|
+
|
|
10
|
+
[logger_root]
|
|
11
|
+
level=DEBUG
|
|
12
|
+
handlers=consoleHandler
|
|
13
|
+
|
|
14
|
+
[logger_refchecker]
|
|
15
|
+
level=DEBUG
|
|
16
|
+
handlers=consoleHandler,fileHandler
|
|
17
|
+
qualname=refchecker
|
|
18
|
+
propagate=0
|
|
19
|
+
|
|
20
|
+
[handler_consoleHandler]
|
|
21
|
+
class=StreamHandler
|
|
22
|
+
level=INFO
|
|
23
|
+
formatter=simpleFormatter
|
|
24
|
+
args=(sys.stdout,)
|
|
25
|
+
|
|
26
|
+
[handler_fileHandler]
|
|
27
|
+
class=FileHandler
|
|
28
|
+
level=DEBUG
|
|
29
|
+
formatter=detailedFormatter
|
|
30
|
+
args=('logs/refchecker.log',)
|
|
31
|
+
|
|
32
|
+
[formatter_simpleFormatter]
|
|
33
|
+
format=%(asctime)s - %(levelname)s - %(message)s
|
|
34
|
+
|
|
35
|
+
[formatter_detailedFormatter]
|
|
36
|
+
format=%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration settings for RefChecker
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from typing import Dict, Any
|
|
7
|
+
|
|
8
|
+
# Default configuration.
# NOTE: treat this mapping as read-only; get_config() hands out independent
# deep copies so that per-call overrides never leak back into the defaults.
DEFAULT_CONFIG = {
    # API Settings
    "semantic_scholar": {
        "base_url": "https://api.semanticscholar.org/graph/v1",
        "rate_limit_delay": 1.0,
        "max_retries": 3,
        "timeout": 30,
    },

    "arxiv": {
        "base_url": "https://export.arxiv.org/api/query",
        "rate_limit_delay": 3.0,
        "max_retries": 5,
        "timeout": 30,
    },

    # Processing Settings
    "processing": {
        "max_papers": 50,
        "days_back": 365,
        "batch_size": 100,
    },

    # Output Settings
    "output": {
        "debug_dir": "debug",
        "logs_dir": "logs",
        "output_dir": "output",
        "validation_output_dir": "validation_output",
    },

    # Database Settings
    "database": {
        "default_path": "semantic_scholar_db/semantic_scholar.db",
        "download_batch_size": 100,
    },

    # Text Processing Settings
    "text_processing": {
        "similarity_threshold": 0.8,
        "max_title_similarity": 0.8,
        "max_author_similarity": 0.7,
        "year_tolerance": 1,
    },

    # LLM Settings
    "llm": {
        "enabled": False,
        "provider": "openai",
        "fallback_enabled": True,
        "parallel_chunks": True,  # Enable parallel chunk processing
        "max_chunk_workers": 4,  # Maximum number of parallel workers for chunk processing
        "openai": {
            "model": "gpt-4.1",
            "max_tokens": 4000,
            "temperature": 0.1,
            "timeout": 30,
        },
        "anthropic": {
            "model": "claude-sonnet-4-20250514",
            "max_tokens": 4000,
            "temperature": 0.1,
            "timeout": 30,
        },
        "google": {
            "model": "gemini-2.5-flash",
            "max_tokens": 4000,
            "temperature": 0.1,
            "timeout": 30,
        },
        "azure": {
            "model": "gpt-4o",
            "max_tokens": 4000,
            "temperature": 0.1,
            "timeout": 30,
        },
        "vllm": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "max_tokens": 4000,
            "temperature": 0.1,
            "timeout": 30,
            "server_url": "http://localhost:8000",
            "download_path": "./models",
            "auto_download": True,
        }
    }
}


def get_config() -> Dict[str, Any]:
    """Return the default configuration with environment variable overrides.

    A deep copy of ``DEFAULT_CONFIG`` is made first, so the overrides applied
    here (API keys, LLM options, ...) never modify the module-level defaults
    and never leak from one call into the next.

    Environment variables that are unset or empty are ignored. Boolean
    variables are true only when their value equals ``"true"``
    (case-insensitive); any other non-empty value yields ``False``.

    Returns:
        A new, independent configuration dictionary.

    Raises:
        ValueError: If ``REFCHECKER_LLM_MAX_CHUNK_WORKERS``,
            ``REFCHECKER_LLM_MAX_TOKENS`` or ``REFCHECKER_LLM_TEMPERATURE``
            is set to a value that cannot be parsed as a number.
    """
    # Function-scope import: keeps the fix self-contained without touching
    # the module's import block.
    import copy

    # A shallow .copy() would share the nested dicts with DEFAULT_CONFIG, so
    # every override below would silently rewrite the defaults.
    config = copy.deepcopy(DEFAULT_CONFIG)
    llm = config["llm"]

    def first_set(names):
        """Return the first non-empty value among the given variable names."""
        for name in names:
            value = os.getenv(name)
            if value:
                return value
        return None

    # (target dict, key, candidate variables in priority order).
    # Provider API keys check the provider's native variable first, then the
    # REFCHECKER_-prefixed fallback.
    string_overrides = (
        (config["semantic_scholar"], "api_key", ("SEMANTIC_SCHOLAR_API_KEY",)),
        (config["output"], "output_dir", ("REFCHECKER_OUTPUT_DIR",)),
        (llm, "provider", ("REFCHECKER_LLM_PROVIDER",)),
        (llm["openai"], "api_key",
         ("OPENAI_API_KEY", "REFCHECKER_OPENAI_API_KEY")),
        (llm["anthropic"], "api_key",
         ("ANTHROPIC_API_KEY", "REFCHECKER_ANTHROPIC_API_KEY")),
        (llm["google"], "api_key",
         ("GOOGLE_API_KEY", "REFCHECKER_GOOGLE_API_KEY")),
        (llm["azure"], "api_key",
         ("AZURE_OPENAI_API_KEY", "REFCHECKER_AZURE_API_KEY")),
        (llm["azure"], "endpoint",
         ("AZURE_OPENAI_ENDPOINT", "REFCHECKER_AZURE_ENDPOINT")),
        (llm["vllm"], "server_url", ("REFCHECKER_VLLM_SERVER_URL",)),
        (llm["vllm"], "download_path", ("REFCHECKER_VLLM_DOWNLOAD_PATH",)),
    )
    for target, key, names in string_overrides:
        value = first_set(names)
        if value:
            target[key] = value

    # (target dict, key, variable) for boolean switches.
    bool_overrides = (
        (config, "debug", "REFCHECKER_DEBUG"),
        (llm, "enabled", "REFCHECKER_USE_LLM"),
        (llm, "fallback_enabled", "REFCHECKER_LLM_FALLBACK_ON_ERROR"),
        (llm["vllm"], "auto_download", "REFCHECKER_VLLM_AUTO_DOWNLOAD"),
        (llm, "parallel_chunks", "REFCHECKER_LLM_PARALLEL_CHUNKS"),
    )
    for target, key, name in bool_overrides:
        value = os.getenv(name)
        if value:
            target[key] = value.lower() == "true"

    # Parallel processing configuration
    workers = os.getenv("REFCHECKER_LLM_MAX_CHUNK_WORKERS")
    if workers:
        llm["max_chunk_workers"] = int(workers)

    # Model configuration applies to the selected provider only. The
    # isinstance check matters: config["llm"] also holds scalar keys such as
    # "enabled", and a provider name colliding with one of them must not be
    # treated as a provider section.
    provider_settings = llm.get(llm["provider"])
    if isinstance(provider_settings, dict):
        provider_overrides = (
            ("model", "REFCHECKER_LLM_MODEL", str),
            ("max_tokens", "REFCHECKER_LLM_MAX_TOKENS", int),
            ("temperature", "REFCHECKER_LLM_TEMPERATURE", float),
        )
        for key, name, convert in provider_overrides:
            value = os.getenv(name)
            if value:
                provider_settings[key] = convert(value)

    return config
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Thread-safe database connection pool for parallel reference checking.
|
|
3
|
+
|
|
4
|
+
This module provides a connection pool that allows multiple worker threads
|
|
5
|
+
to safely access SQLite databases in parallel by maintaining per-thread connections.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import threading
|
|
9
|
+
import sqlite3
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Dict, Optional
|
|
12
|
+
from contextlib import contextmanager
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)


class DatabaseConnectionPool:
    """
    Thread-safe SQLite connection pool.

    Each thread gets its own database connection to avoid SQLite's
    "objects created in a thread can only be used in that same thread" restriction.
    Connections are stored in a dict keyed by ``threading.get_ident()`` and
    every access to that dict is guarded by a lock.
    """

    def __init__(self, db_path: str):
        """
        Initialize the connection pool.

        No connection is opened here; connections are created lazily by
        ``get_connection``.

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        self._connections: Dict[int, sqlite3.Connection] = {}
        self._lock = threading.Lock()

    def get_connection(self) -> sqlite3.Connection:
        """
        Get a database connection for the current thread.

        The first call made from a thread opens a new connection; later
        calls from the same thread return that same object.

        Returns:
            SQLite connection object for the current thread
        """
        thread_id = threading.get_ident()

        with self._lock:
            conn = self._connections.get(thread_id)
            if conn is None:
                logger.debug("Creating new database connection for thread %s", thread_id)
                conn = sqlite3.connect(self.db_path)
                # sqlite3.Row allows access by column name as well as index
                conn.row_factory = sqlite3.Row
                self._connections[thread_id] = conn

            return conn

    @contextmanager
    def connection(self):
        """
        Context manager for database connections.

        Yields the current thread's connection, commits when the body
        completes normally, and rolls back and re-raises when it raises an
        ``Exception``. The connection stays open (and pooled) afterwards.

        Usage:
            with pool.connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT ...")
        """
        conn = self.get_connection()
        try:
            yield conn
        except Exception:
            # Rollback on error
            conn.rollback()
            raise
        else:
            # Commit on success
            conn.commit()

    def close_all(self):
        """
        Close every connection that can be closed and empty the pool.

        The calling thread's own connection is closed explicitly. With the
        default ``check_same_thread`` setting used by ``get_connection``,
        sqlite3 refuses to close a connection from a thread other than the
        one that created it; such connections are simply dropped from the
        pool. Worker threads that want a deterministic close should call
        ``close_current_thread`` before they finish.
        """
        with self._lock:
            connections = dict(self._connections)
            self._connections.clear()

        # Close outside the lock so a slow close never blocks other threads
        # that are asking the pool for a connection.
        for thread_id, conn in connections.items():
            try:
                conn.close()
                logger.debug("Closed database connection for thread %s", thread_id)
            except sqlite3.ProgrammingError:
                # Raised by sqlite3 when the connection belongs to another thread.
                logger.debug(
                    "Dropped connection for thread %s without closing it "
                    "(owned by a different thread)",
                    thread_id,
                )
            except Exception as e:
                logger.error(f"Error closing connection for thread {thread_id}: {e}")

    def close_current_thread(self):
        """Close the connection for the current thread, if it has one."""
        thread_id = threading.get_ident()

        with self._lock:
            # Pop first so a failing close() cannot leave a broken
            # connection behind in the pool.
            conn = self._connections.pop(thread_id, None)

        if conn is None:
            return

        try:
            conn.close()
            logger.debug(f"Closed database connection for current thread {thread_id}")
        except Exception as e:
            logger.error(f"Error closing connection for thread {thread_id}: {e}")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class ThreadSafeLocalChecker:
    """
    Thread-safe wrapper for LocalNonArxivReferenceChecker that uses connection pooling.

    A fresh checker object is built for every ``verify_reference`` call and
    wired to the calling thread's pooled SQLite connection.
    """

    def __init__(self, db_path: str):
        """
        Initialize the thread-safe checker.

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        self.connection_pool = DatabaseConnectionPool(db_path)

    def verify_reference(self, reference):
        """
        Thread-safe reference verification.

        Args:
            reference: Reference dictionary to verify

        Returns:
            Tuple of (verified_data, errors, paper_url)
        """
        # Import here to avoid circular imports. The original top-level
        # ``checkers`` name is tried first so existing setups behave exactly
        # as before; the package-qualified path is the fallback for the
        # installed layout, where the module lives under ``refchecker``.
        try:
            from checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
        except ImportError:
            from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker

        # Get thread-local connection
        conn = self.connection_pool.get_connection()

        # Bypass __init__ and build the instance by hand so the checker uses
        # the pooled connection passed in below.
        # NOTE(review): this assumes db_path and conn are the only attributes
        # verify_reference relies on - confirm against the checker's __init__.
        checker = LocalNonArxivReferenceChecker.__new__(LocalNonArxivReferenceChecker)
        checker.db_path = self.db_path
        # The pool already sets row_factory on the connections it creates
        checker.conn = conn

        # Call the verification method
        return checker.verify_reference(reference)

    def close(self):
        """Release the pool's connections by delegating to its ``close_all``."""
        self.connection_pool.close_all()
|