academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1 @@
+ """Scripts for refchecker package."""
@@ -0,0 +1,121 @@
+ #!/usr/bin/env python3
+ """
+ Standalone vLLM server launcher script
+ Runs outside of debugger environment to avoid pydevd conflicts
+ """
+
+ import sys
+ import subprocess
+ import os
+ import time
+ import argparse
+ import signal
+
+ def start_vllm_server(model_name, port=8000, tensor_parallel_size=1, max_model_len=None, gpu_memory_util=0.9, daemon=False):
+     """Start vLLM server with specified parameters"""
+
+     # Kill any existing server on the port
+     try:
+         subprocess.run(["pkill", "-f", "vllm.entrypoints.openai.api_server"],
+                        timeout=10, capture_output=True)
+         time.sleep(2)
+     except Exception:  # best effort; ignore pkill failures or timeouts
+         pass
+
+     # Build command
+     cmd = [
+         sys.executable, "-m", "vllm.entrypoints.openai.api_server",
+         "--model", model_name,
+         "--host", "0.0.0.0",
+         "--port", str(port),
+         "--tensor-parallel-size", str(tensor_parallel_size),
+         "--gpu-memory-utilization", str(gpu_memory_util)
+     ]
+
+     if max_model_len:
+         cmd.extend(["--max-model-len", str(max_model_len)])
+
+     print(f"Starting vLLM server: {' '.join(cmd)}")
+
+     # Create clean environment without debugger variables
+     clean_env = {}
+     for key, value in os.environ.items():
+         if not any(debug_key in key.upper() for debug_key in ['DEBUGPY', 'PYDEVD']):
+             clean_env[key] = value
+
+     # Remove debugger paths from PYTHONPATH if present
+     if 'PYTHONPATH' in clean_env:
+         pythonpath_parts = clean_env['PYTHONPATH'].split(':')
+         clean_pythonpath = [p for p in pythonpath_parts if 'debugpy' not in p and 'pydevd' not in p]
+         if clean_pythonpath:
+             clean_env['PYTHONPATH'] = ':'.join(clean_pythonpath)
+         else:
+             clean_env.pop('PYTHONPATH', None)
+
+     # Start server
+     if daemon:
+         # For daemon mode, redirect output to /dev/null to avoid blocking
+         with open(os.devnull, 'w') as devnull:
+             process = subprocess.Popen(
+                 cmd,
+                 env=clean_env,
+                 start_new_session=True,
+                 stdout=devnull,
+                 stderr=devnull
+             )
+         return process
+     else:
+         # For non-daemon mode, keep stdout/stderr for monitoring with real-time streaming
+         process = subprocess.Popen(
+             cmd,
+             env=clean_env,
+             start_new_session=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             bufsize=0,  # Unbuffered for real-time output
+             universal_newlines=True
+         )
+         return process
+
+ def main():
+     parser = argparse.ArgumentParser(description="Start vLLM server")
+     parser.add_argument("--model", required=True, help="Model name to serve")
+     parser.add_argument("--port", type=int, default=8000, help="Port to serve on")
+     parser.add_argument("--tensor-parallel-size", type=int, default=1, help="Tensor parallel size")
+     parser.add_argument("--max-model-len", type=int, help="Maximum model length")
+     parser.add_argument("--gpu-memory-util", type=float, default=0.9, help="GPU memory utilization")
+     parser.add_argument("--daemon", action="store_true", help="Run as daemon")
+
+     args = parser.parse_args()
+
+     process = start_vllm_server(
+         model_name=args.model,
+         port=args.port,
+         tensor_parallel_size=args.tensor_parallel_size,
+         max_model_len=args.max_model_len,
+         gpu_memory_util=args.gpu_memory_util,
+         daemon=args.daemon
+     )
+
+     if args.daemon:
+         print(f"vLLM server started as daemon with PID: {process.pid}")
+         print(f"Server URL: http://localhost:{args.port}")
+         return 0
+     else:
+         print(f"vLLM server starting with PID: {process.pid}")
+         print(f"Server URL: http://localhost:{args.port}")
+         print("Press Ctrl+C to stop...")
+
+         try:
+             # Stream output with immediate flushing
+             for line in process.stdout:
+                 print(line.rstrip(), flush=True)
+         except KeyboardInterrupt:
+             print("\nStopping vLLM server...")
+             process.terminate()
+             process.wait()
+         return 0
+
+ if __name__ == "__main__":
+     sys.exit(main())
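A minimal usage sketch for the launcher above, assuming vLLM is installed and a GPU is available; the model ID below is an illustrative placeholder, not a project default:

    # Hypothetical example: start the server as a detached daemon and report its PID.
    from refchecker.scripts.start_vllm_server import start_vllm_server

    proc = start_vllm_server("Qwen/Qwen2.5-7B-Instruct", port=8000, daemon=True)
    print(f"vLLM serving on http://localhost:8000 (PID {proc.pid})")

The same entry point can be run from a shell as `python -m refchecker.scripts.start_vllm_server --model <model> --daemon`; vLLM's api_server then answers OpenAI-compatible requests under `/v1`.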
@@ -0,0 +1,8 @@
+ """
+ Services module for ArXiv Reference Checker
+ Contains service classes for modular functionality
+ """
+
+ from .pdf_processor import PDFProcessor, Paper
+
+ __all__ = ['PDFProcessor', 'Paper']
@@ -0,0 +1,268 @@
+ #!/usr/bin/env python3
+ """
+ PDF Processing Service for ArXiv Reference Checker
+ Extracted from core.refchecker to improve modularity
+ """
+
+ import os
+ import logging
+ from typing import Optional, Dict, Any
+ from dataclasses import dataclass
+
+ logger = logging.getLogger(__name__)
+
+ @dataclass
+ class Paper:
+     """Represents a paper with metadata"""
+     title: str
+     authors: list
+     abstract: str = ""
+     year: Optional[int] = None
+     venue: str = ""
+     url: str = ""
+     doi: str = ""
+     arxiv_id: str = ""
+     pdf_path: str = ""
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert paper to dictionary format"""
+         return {
+             'title': self.title,
+             'authors': self.authors,
+             'abstract': self.abstract,
+             'year': self.year,
+             'venue': self.venue,
+             'url': self.url,
+             'doi': self.doi,
+             'arxiv_id': self.arxiv_id,
+             'pdf_path': self.pdf_path
+         }
+
+ class PDFProcessor:
+     """Service for processing PDF files and extracting text"""
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None):
+         self.config = config or {}
+         self.cache = {}
+
+     def extract_text_from_pdf(self, pdf_path: str) -> str:
+         """
+         Extract text from PDF file
+
+         Args:
+             pdf_path: Path to PDF file
+
+         Returns:
+             Extracted text content
+         """
+         if not os.path.exists(pdf_path):
+             raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+         # Check cache first
+         if pdf_path in self.cache:
+             logger.debug(f"Using cached text for {pdf_path}")
+             return self.cache[pdf_path]
+
+         try:
+             import pypdf
+
+             with open(pdf_path, 'rb') as file:
+                 pdf_reader = pypdf.PdfReader(file)
+                 text = ""
+                 failed_pages = []
+
+                 for page_num in range(len(pdf_reader.pages)):
+                     try:
+                         page = pdf_reader.pages[page_num]
+                         page_text = page.extract_text()
+                         if page_text:
+                             text += page_text + "\n"
+                     except TypeError as e:
+                         # Handle pypdf errors like "NumberObject is not iterable"
+                         # which can occur with malformed PDF pages
+                         failed_pages.append(page_num + 1)  # 1-indexed for logging
+                         logger.warning(f"Skipping page {page_num + 1} due to PDF parsing error: {e}")
+                         continue
+                     except Exception as e:
+                         failed_pages.append(page_num + 1)
+                         logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
+                         continue
+
+                 if failed_pages:
+                     logger.warning(f"Failed to extract text from {len(failed_pages)} pages: {failed_pages[:10]}{'...' if len(failed_pages) > 10 else ''}")
+
+                 if not text.strip():
+                     raise ValueError(f"No text could be extracted from any pages of {pdf_path}")
+
+                 # Cache the result
+                 self.cache[pdf_path] = text
+                 logger.debug(f"Extracted {len(text)} characters from {pdf_path}")
+                 return text
+
+         except ImportError:
+             logger.error("pypdf not installed. Install with: pip install pypdf")
+             raise
+         except Exception as e:
+             logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
+             raise
+
+     def create_local_file_paper(self, file_path: str, metadata: Optional[Dict[str, Any]] = None) -> Paper:
+         """
+         Create a Paper object from a local file
+
+         Args:
+             file_path: Path to the file
+             metadata: Optional metadata dictionary
+
+         Returns:
+             Paper object
+         """
+         if not os.path.exists(file_path):
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         # Extract text if it's a PDF
+         text_content = ""
+         if file_path.lower().endswith('.pdf'):
+             try:
+                 text_content = self.extract_text_from_pdf(file_path)
+             except Exception as e:
+                 logger.warning(f"Could not extract text from {file_path}: {e}")
+
+         # Use metadata if provided, otherwise extract from filename
+         if metadata:
+             title = metadata.get('title', os.path.basename(file_path))
+             authors = metadata.get('authors', [])
+             abstract = metadata.get('abstract', '')
+             year = metadata.get('year')
+             venue = metadata.get('venue', '')
+             url = metadata.get('url', '')
+             doi = metadata.get('doi', '')
+             arxiv_id = metadata.get('arxiv_id', '')
+         else:
+             # Basic extraction from filename
+             title = os.path.splitext(os.path.basename(file_path))[0]
+             authors = []
+             abstract = text_content[:500] if text_content else ""  # First 500 chars as abstract
+             year = None
+             venue = ""
+             url = ""
+             doi = ""
+             arxiv_id = ""
+
+         return Paper(
+             title=title,
+             authors=authors,
+             abstract=abstract,
+             year=year,
+             venue=venue,
+             url=url,
+             doi=doi,
+             arxiv_id=arxiv_id,
+             pdf_path=file_path
+         )
+
+     def extract_bibliography_from_text(self, text: str) -> str:
+         """
+         Extract bibliography section from text
+
+         Args:
+             text: Full text content
+
+         Returns:
+             Bibliography section text
+         """
+         if not text:
+             return ""
+
+         # Common bibliography section headers
+         bib_headers = [
+             r'\n\s*REFERENCES\s*\n',
+             r'\n\s*References\s*\n',
+             r'\n\s*BIBLIOGRAPHY\s*\n',
+             r'\n\s*Bibliography\s*\n',
+             r'\n\s*WORKS CITED\s*\n',
+             r'\n\s*Works Cited\s*\n'
+         ]
+
+         import re
+
+         # Find bibliography section
+         for header in bib_headers:
+             match = re.search(header, text, re.IGNORECASE)
+             if match:
+                 # Extract from bibliography header
+                 bib_start = match.end()
+                 full_bib_text = text[bib_start:].strip()
+
+                 # Find the end of the bibliography section by looking for common section headers
+                 # that typically follow references
+                 end_markers = [
+                     r'\n\s*APPENDIX\s*[A-Z]?\s*\n',
+                     r'\n\s*Appendix\s*[A-Z]?\s*\n',
+                     r'\n\s*[A-Z]\s+[A-Z]{2,}.*\n',  # Pattern like "A LRE Dataset", "B ADDITIONAL RESULTS"
+                     r'\n\s*[A-Z]\.\d+\s+.*\n',  # Pattern like "A.1 Dataset Details"
+                     r'\nTable\s+\d+:.*\n[A-Z]\s+[A-Z]',  # Table followed by appendix section like "Table 7: ...\nA LRE"
+                     r'\n\s*SUPPLEMENTARY\s+MATERIAL\s*\n',
+                     r'\n\s*Supplementary\s+Material\s*\n',
+                     r'\n\s*SUPPLEMENTAL\s+MATERIAL\s*\n',
+                     r'\n\s*Supplemental\s+Material\s*\n',
+                     r'\n\s*ACKNOWLEDGMENTS?\s*\n',
+                     r'\n\s*Acknowledgments?\s*\n',
+                     r'\n\s*AUTHOR\s+CONTRIBUTIONS?\s*\n',
+                     r'\n\s*Author\s+Contributions?\s*\n',
+                     r'\n\s*FUNDING\s*\n',
+                     r'\n\s*Funding\s*\n',
+                     r'\n\s*ETHICS\s+STATEMENT\s*\n',
+                     r'\n\s*Ethics\s+Statement\s*\n',
+                     r'\n\s*CONFLICT\s+OF\s+INTEREST\s*\n',
+                     r'\n\s*Conflict\s+of\s+Interest\s*\n',
+                     r'\n\s*DATA\s+AVAILABILITY\s*\n',
+                     r'\n\s*Data\s+Availability\s*\n'
+                 ]
+
+                 bib_text = full_bib_text
+                 bib_end = len(full_bib_text)
+
+                 # Look for section markers that indicate end of bibliography
+                 for end_marker in end_markers:
+                     end_match = re.search(end_marker, full_bib_text, re.IGNORECASE)
+                     if end_match and end_match.start() < bib_end:
+                         bib_end = end_match.start()
+
+                 # If we found an end marker, truncate there
+                 if bib_end < len(full_bib_text):
+                     bib_text = full_bib_text[:bib_end].strip()
+                     logger.debug(f"Bibliography section truncated at position {bib_end}")
+
+                 # Also try to detect bibliography end by finding the last numbered reference
+                 # Look for the highest numbered reference in the text
+                 ref_numbers = re.findall(r'\[(\d+)\]', bib_text)
+                 if ref_numbers:
+                     max_ref_num = max(int(num) for num in ref_numbers)
+                     logger.debug(f"Found references up to [{max_ref_num}]")
+
+                     # Look for the end of the last numbered reference
+                     last_ref_pattern = rf'\[{max_ref_num}\][^[]*?(?=\n\s*[A-Z]{{2,}}|\n\s*\w+\s*\n\s*[A-Z]|\Z)'
+                     last_ref_match = re.search(last_ref_pattern, bib_text, re.DOTALL)
+                     if last_ref_match:
+                         potential_end = last_ref_match.end()
+                         # Only use this if it's before our section marker end
+                         if potential_end < bib_end:
+                             bib_text = bib_text[:potential_end].strip()
+                             logger.debug(f"Bibliography truncated after reference [{max_ref_num}]")
+
+                 # Final fallback: limit to reasonable length
+                 if len(bib_text) > 50000:  # Limit to ~50KB
+                     bib_text = bib_text[:50000]
+                     logger.debug("Bibliography section truncated to 50KB limit")
+
+                 logger.debug(f"Found bibliography section: {len(bib_text)} characters")
+                 return bib_text
+
+         logger.warning("No bibliography section found in text")
+         return ""
+
+     def clear_cache(self):
+         """Clear the text extraction cache"""
+         self.cache.clear()
+         logger.debug("PDF text cache cleared")
@@ -0,0 +1,27 @@
+ """
+ Utility functions for text processing, author comparison, mocking, and configuration validation
+ """
+
+ from .text_utils import (
+     clean_author_name, clean_title, normalize_text,
+     clean_conference_markers_from_title,
+     remove_year_from_title
+ )
+ from .url_utils import extract_arxiv_id_from_url
+ from .author_utils import compare_authors, levenshtein_distance, extract_authors_list
+ from .mock_objects import (
+     MockPaper, MockReference, MockLLMProvider, MockSemanticScholarAPI, MockArxivAPI,
+     create_mock_config, create_mock_paper, create_mock_reference,
+     create_mock_bibliography, create_mock_extracted_references
+ )
+ from .config_validator import ConfigValidator, ValidationResult
+
+ __all__ = [
+     "clean_author_name", "clean_title", "normalize_text",
+     "extract_arxiv_id_from_url", "clean_conference_markers_from_title",
+     "remove_year_from_title", "compare_authors", "levenshtein_distance",
+     "extract_authors_list", "MockPaper", "MockReference", "MockLLMProvider",
+     "MockSemanticScholarAPI", "MockArxivAPI", "create_mock_config",
+     "create_mock_paper", "create_mock_reference", "create_mock_bibliography",
+     "create_mock_extracted_references", "ConfigValidator", "ValidationResult"
+ ]
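These re-exports give callers a flat import surface, e.g.:

    from refchecker.utils import clean_title, compare_authors, ConfigValidator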