academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1 @@
1
+ # Config module
@@ -0,0 +1,36 @@
1
+ [loggers]
2
+ keys=root,refchecker
3
+
4
+ [handlers]
5
+ keys=consoleHandler,fileHandler
6
+
7
+ [formatters]
8
+ keys=simpleFormatter,detailedFormatter
9
+
10
+ [logger_root]
11
+ level=DEBUG
12
+ handlers=consoleHandler
13
+
14
+ [logger_refchecker]
15
+ level=DEBUG
16
+ handlers=consoleHandler,fileHandler
17
+ qualname=refchecker
18
+ propagate=0
19
+
20
+ [handler_consoleHandler]
21
+ class=StreamHandler
22
+ level=INFO
23
+ formatter=simpleFormatter
24
+ args=(sys.stdout,)
25
+
26
+ [handler_fileHandler]
27
+ class=FileHandler
28
+ level=DEBUG
29
+ formatter=detailedFormatter
30
+ args=('logs/refchecker.log',)
31
+
32
+ [formatter_simpleFormatter]
33
+ format=%(asctime)s - %(levelname)s - %(message)s
34
+
35
+ [formatter_detailedFormatter]
36
+ format=%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s
@@ -0,0 +1,170 @@
1
+ """
2
+ Configuration settings for RefChecker
3
+ """
4
+
5
+ import os
6
+ from typing import Dict, Any
7
+
8
+ # Default configuration
9
# Baseline settings for RefChecker. get_config() starts from these values and
# applies environment-variable overrides on top of them.
DEFAULT_CONFIG = dict(
    # ----- API settings -----
    semantic_scholar=dict(
        base_url="https://api.semanticscholar.org/graph/v1",
        rate_limit_delay=1.0,
        max_retries=3,
        timeout=30,
    ),
    arxiv=dict(
        base_url="https://export.arxiv.org/api/query",
        rate_limit_delay=3.0,
        max_retries=5,
        timeout=30,
    ),
    # ----- Processing settings -----
    processing=dict(
        max_papers=50,
        days_back=365,
        batch_size=100,
    ),
    # ----- Output settings (directory names) -----
    output=dict(
        debug_dir="debug",
        logs_dir="logs",
        output_dir="output",
        validation_output_dir="validation_output",
    ),
    # ----- Database settings -----
    database=dict(
        default_path="semantic_scholar_db/semantic_scholar.db",
        download_batch_size=100,
    ),
    # ----- Text processing settings -----
    text_processing=dict(
        similarity_threshold=0.8,
        max_title_similarity=0.8,
        max_author_similarity=0.7,
        year_tolerance=1,
    ),
    # ----- LLM settings -----
    # "provider" names which of the per-provider sub-sections below is active.
    llm=dict(
        enabled=False,
        provider="openai",
        fallback_enabled=True,
        parallel_chunks=True,  # Enable parallel chunk processing
        max_chunk_workers=4,  # Maximum number of parallel workers for chunk processing
        openai=dict(
            model="gpt-4.1",
            max_tokens=4000,
            temperature=0.1,
            timeout=30,
        ),
        anthropic=dict(
            model="claude-sonnet-4-20250514",
            max_tokens=4000,
            temperature=0.1,
            timeout=30,
        ),
        google=dict(
            model="gemini-2.5-flash",
            max_tokens=4000,
            temperature=0.1,
            timeout=30,
        ),
        azure=dict(
            model="gpt-4o",
            max_tokens=4000,
            temperature=0.1,
            timeout=30,
        ),
        vllm=dict(
            model="meta-llama/Llama-3.1-8B-Instruct",
            max_tokens=4000,
            temperature=0.1,
            timeout=30,
            server_url="http://localhost:8000",
            download_path="./models",
            auto_download=True,
        ),
    ),
)
96
+
97
def get_config() -> Dict[str, Any]:
    """
    Build the effective configuration: defaults plus environment overrides.

    The result is an independent deep copy of DEFAULT_CONFIG, so the
    overrides applied here (including API keys) never leak back into the
    module-level defaults or into the result of a later call.

    An environment variable only takes effect when it is set to a non-empty
    value. Boolean variables are true only when their value, lower-cased,
    equals "true"; any other non-empty value yields False.

    Returns:
        A new configuration dictionary that the caller may mutate freely.

    Raises:
        ValueError: If REFCHECKER_LLM_MAX_CHUNK_WORKERS, or (for a known
            provider) REFCHECKER_LLM_MAX_TOKENS / REFCHECKER_LLM_TEMPERATURE,
            is set to a value that is not a valid number.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import copy

    # A shallow DEFAULT_CONFIG.copy() would share every nested section dict
    # with the defaults, so the assignments below would silently rewrite
    # DEFAULT_CONFIG itself.
    config = copy.deepcopy(DEFAULT_CONFIG)
    llm = config["llm"]

    def as_flag(raw: str) -> bool:
        # Only the literal word "true" (any letter case) enables a flag.
        return raw.lower() == "true"

    # --- General overrides ---
    value = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
    if value:
        config["semantic_scholar"]["api_key"] = value

    value = os.getenv("REFCHECKER_DEBUG")
    if value:
        config["debug"] = as_flag(value)

    value = os.getenv("REFCHECKER_OUTPUT_DIR")
    if value:
        config["output"]["output_dir"] = value

    # --- LLM switches ---
    value = os.getenv("REFCHECKER_USE_LLM")
    if value:
        llm["enabled"] = as_flag(value)

    value = os.getenv("REFCHECKER_LLM_PROVIDER")
    if value:
        llm["provider"] = value

    value = os.getenv("REFCHECKER_LLM_FALLBACK_ON_ERROR")
    if value:
        llm["fallback_enabled"] = as_flag(value)

    # --- Provider credentials / endpoints ---
    # The provider's native variable wins; the REFCHECKER_-prefixed variable
    # is the fallback.
    for section, setting, native_var, prefixed_var in (
        ("openai", "api_key", "OPENAI_API_KEY", "REFCHECKER_OPENAI_API_KEY"),
        ("anthropic", "api_key", "ANTHROPIC_API_KEY", "REFCHECKER_ANTHROPIC_API_KEY"),
        ("google", "api_key", "GOOGLE_API_KEY", "REFCHECKER_GOOGLE_API_KEY"),
        ("azure", "api_key", "AZURE_OPENAI_API_KEY", "REFCHECKER_AZURE_API_KEY"),
        ("azure", "endpoint", "AZURE_OPENAI_ENDPOINT", "REFCHECKER_AZURE_ENDPOINT"),
    ):
        value = os.getenv(native_var) or os.getenv(prefixed_var)
        if value:
            llm[section][setting] = value

    # --- vLLM configuration ---
    value = os.getenv("REFCHECKER_VLLM_SERVER_URL")
    if value:
        llm["vllm"]["server_url"] = value

    value = os.getenv("REFCHECKER_VLLM_DOWNLOAD_PATH")
    if value:
        llm["vllm"]["download_path"] = value

    value = os.getenv("REFCHECKER_VLLM_AUTO_DOWNLOAD")
    if value:
        llm["vllm"]["auto_download"] = as_flag(value)

    # --- Parallel processing configuration ---
    value = os.getenv("REFCHECKER_LLM_PARALLEL_CHUNKS")
    if value:
        llm["parallel_chunks"] = as_flag(value)

    value = os.getenv("REFCHECKER_LLM_MAX_CHUNK_WORKERS")
    if value:
        llm["max_chunk_workers"] = int(value)

    # --- Model settings for the active provider ---
    # Resolved after REFCHECKER_LLM_PROVIDER so they land on the provider
    # selected above. An unknown provider is ignored, as before. The dict
    # check additionally skips a provider name that collides with a scalar
    # llm key (e.g. "enabled"), which used to raise TypeError.
    provider_settings = llm.get(llm["provider"])
    if isinstance(provider_settings, dict):
        for setting, env_var, convert in (
            ("model", "REFCHECKER_LLM_MODEL", str),
            ("max_tokens", "REFCHECKER_LLM_MAX_TOKENS", int),
            ("temperature", "REFCHECKER_LLM_TEMPERATURE", float),
        ):
            value = os.getenv(env_var)
            if value:
                provider_settings[setting] = convert(value)

    return config
@@ -0,0 +1,7 @@
1
+ """
2
+ Core functionality for RefChecker
3
+ """
4
+
5
+ from .refchecker import ArxivReferenceChecker, setup_logging
6
+
7
+ __all__ = ["ArxivReferenceChecker", "setup_logging"]
@@ -0,0 +1,141 @@
1
+ """
2
+ Thread-safe database connection pool for parallel reference checking.
3
+
4
+ This module provides a connection pool that allows multiple worker threads
5
+ to safely access SQLite databases in parallel by maintaining per-thread connections.
6
+ """
7
+
8
+ import threading
9
+ import sqlite3
10
+ import logging
11
+ from typing import Dict, Optional
12
+ from contextlib import contextmanager
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class DatabaseConnectionPool:
    """
    Thread-safe SQLite connection pool.

    Each thread gets its own database connection to avoid SQLite's
    "objects created in a thread can only be used in that same thread" restriction.
    Connections are created lazily on first use and kept in a dictionary keyed
    by ``threading.get_ident()``; access to that dictionary is serialized by a lock.
    """

    def __init__(self, db_path: str):
        """
        Initialize the connection pool.

        No connection is opened here; each thread's connection is created on
        its first call to get_connection().

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        # thread ident -> connection created by that thread; guarded by _lock
        self._connections: Dict[int, sqlite3.Connection] = {}
        self._lock = threading.Lock()

    def get_connection(self) -> sqlite3.Connection:
        """
        Get a database connection for the current thread.

        The first call from a thread opens a new connection to ``db_path``;
        later calls from the same thread return that same object.

        Returns:
            SQLite connection object for the current thread
        """
        thread_id = threading.get_ident()

        with self._lock:
            conn = self._connections.get(thread_id)
            if conn is None:
                logger.debug("Creating new database connection for thread %s", thread_id)
                conn = sqlite3.connect(self.db_path)
                # sqlite3.Row is not a dict, but it allows access by column
                # name as well as by index.
                conn.row_factory = sqlite3.Row
                self._connections[thread_id] = conn
            return conn

    @contextmanager
    def connection(self):
        """
        Context manager for database connections.

        Yields the current thread's connection. The transaction is committed
        when the block finishes normally; if the block raises an Exception the
        transaction is rolled back and the exception is re-raised. The
        connection itself stays open and pooled either way.

        Usage:
            with pool.connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT ...")
        """
        conn = self.get_connection()
        try:
            yield conn
        except Exception:
            # Rollback on error
            conn.rollback()
            raise
        else:
            # Commit on success
            conn.commit()

    def close_all(self):
        """
        Empty the pool.

        The calling thread's own connection (if it has one) is closed
        explicitly. Connections created by other threads are only dropped from
        the pool: SQLite doesn't allow closing a connection from a different
        thread, so they are left to be released once nothing references them.
        """
        thread_id = threading.get_ident()

        with self._lock:
            own_conn = self._connections.pop(thread_id, None)
            # Don't try to close connections from different threads - SQLite doesn't allow it
            logger.debug("Clearing connection pool (connections will close when threads exit)")
            self._connections.clear()

        # Closing happens outside the lock so a slow close cannot block other
        # threads that are asking for their connections.
        if own_conn is not None:
            try:
                own_conn.close()
            except Exception as e:
                # Best effort, mirroring close_current_thread(): report, don't raise.
                logger.error("Error closing connection for thread %s: %s", thread_id, e)

    def close_current_thread(self):
        """
        Close the connection for the current thread.

        Does nothing if the current thread has no pooled connection. The entry
        is removed from the pool before closing, so a failing close() cannot
        leave a broken connection registered for this thread. Errors raised by
        close() are logged rather than propagated.
        """
        thread_id = threading.get_ident()

        with self._lock:
            conn = self._connections.pop(thread_id, None)

        if conn is None:
            return

        try:
            conn.close()
            logger.debug("Closed database connection for current thread %s", thread_id)
        except Exception as e:
            logger.error("Error closing connection for thread %s: %s", thread_id, e)
95
+
96
+
97
class ThreadSafeLocalChecker:
    """
    Thread-safe wrapper for LocalNonArxivReferenceChecker that uses connection pooling.

    Every verification builds a throwaway checker bound to the calling thread's
    pooled SQLite connection, so worker threads never share a connection.
    """

    def __init__(self, db_path: str):
        """
        Initialize the thread-safe checker.

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        self.connection_pool = DatabaseConnectionPool(db_path)

    def verify_reference(self, reference):
        """
        Thread-safe reference verification.

        Args:
            reference: Reference dictionary to verify

        Returns:
            Tuple of (verified_data, errors, paper_url)

        Raises:
            ImportError: If LocalNonArxivReferenceChecker cannot be imported
                from either supported module path.
        """
        # Import here to avoid circular imports.
        # The flat "checkers" path is tried first so existing setups behave
        # exactly as before. It only resolves when that directory itself is on
        # sys.path, so fall back to the package-qualified path used when the
        # code is installed as the "refchecker" package.
        try:
            from checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
        except ImportError:
            from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker

        # Get thread-local connection
        conn = self.connection_pool.get_connection()

        # Allocate the checker without running its __init__ and attach this
        # thread's pooled connection instead.
        # NOTE(review): only db_path and conn are set here; presumably that is
        # everything verify_reference() needs - confirm against the checker's
        # __init__ whenever that class changes.
        checker = LocalNonArxivReferenceChecker.__new__(LocalNonArxivReferenceChecker)
        checker.db_path = self.db_path
        checker.conn = conn
        # The connection should already have row_factory set from the pool

        # Call the verification method
        return checker.verify_reference(reference)

    def close(self):
        """Release the pool's connections by delegating to its close_all()."""
        self.connection_pool.close_all()