academic-refchecker 2.0.7 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/scripts/__init__.py
@@ -0,0 +1 @@
"""Scripts for refchecker package."""
refchecker/scripts/start_vllm_server.py
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""
Standalone vLLM server launcher script
Runs outside of debugger environment to avoid pydevd conflicts
"""

import sys
import subprocess
import os
import time
import argparse
import signal

def start_vllm_server(model_name, port=8000, tensor_parallel_size=1, max_model_len=None, gpu_memory_util=0.9, daemon=False):
    """Start vLLM server with specified parameters"""

    # Kill any existing server on the port
    try:
        subprocess.run(["pkill", "-f", "vllm.entrypoints.openai.api_server"],
                       timeout=10, capture_output=True)
        time.sleep(2)
    except:
        pass

    # Build command
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--host", "0.0.0.0",
        "--port", str(port),
        "--tensor-parallel-size", str(tensor_parallel_size),
        "--gpu-memory-utilization", str(gpu_memory_util)
    ]

    if max_model_len:
        cmd.extend(["--max-model-len", str(max_model_len)])

    print(f"Starting vLLM server: {' '.join(cmd)}")

    # Create clean environment without debugger variables
    clean_env = {}
    for key, value in os.environ.items():
        if not any(debug_key in key.upper() for debug_key in ['DEBUGPY', 'PYDEVD']):
            clean_env[key] = value

    # Remove debugger paths from PYTHONPATH if present
    if 'PYTHONPATH' in clean_env:
        pythonpath_parts = clean_env['PYTHONPATH'].split(':')
        clean_pythonpath = [p for p in pythonpath_parts if 'debugpy' not in p and 'pydevd' not in p]
        if clean_pythonpath:
            clean_env['PYTHONPATH'] = ':'.join(clean_pythonpath)
        else:
            clean_env.pop('PYTHONPATH', None)

    # Start server
    if daemon:
        # For daemon mode, redirect output to /dev/null to avoid blocking
        with open(os.devnull, 'w') as devnull:
            process = subprocess.Popen(
                cmd,
                env=clean_env,
                start_new_session=True,
                stdout=devnull,
                stderr=devnull
            )
        return process
    else:
        # For non-daemon mode, keep stdout/stderr for monitoring with real-time streaming
        process = subprocess.Popen(
            cmd,
            env=clean_env,
            start_new_session=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=0,  # Unbuffered for real-time output
            universal_newlines=True
        )
        return process

def main():
    parser = argparse.ArgumentParser(description="Start vLLM server")
    parser.add_argument("--model", required=True, help="Model name to serve")
    parser.add_argument("--port", type=int, default=8000, help="Port to serve on")
    parser.add_argument("--tensor-parallel-size", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("--max-model-len", type=int, help="Maximum model length")
    parser.add_argument("--gpu-memory-util", type=float, default=0.9, help="GPU memory utilization")
    parser.add_argument("--daemon", action="store_true", help="Run as daemon")

    args = parser.parse_args()

    process = start_vllm_server(
        model_name=args.model,
        port=args.port,
        tensor_parallel_size=args.tensor_parallel_size,
        max_model_len=args.max_model_len,
        gpu_memory_util=args.gpu_memory_util,
        daemon=args.daemon
    )

    if args.daemon:
        print(f"vLLM server started as daemon with PID: {process.pid}")
        print(f"Server URL: http://localhost:{args.port}")
        return 0
    else:
        print(f"vLLM server starting with PID: {process.pid}")
        print(f"Server URL: http://localhost:{args.port}")
        print("Press Ctrl+C to stop...")

        try:
            # Stream output with immediate flushing
            for line in process.stdout:
                print(line.rstrip(), flush=True)
        except KeyboardInterrupt:
            print("\nStopping vLLM server...")
            process.terminate()
            process.wait()
        return 0

if __name__ == "__main__":
    sys.exit(main())
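
For orientation, the launcher above can also be driven programmatically rather than through its argparse CLI. The sketch below is illustrative only: it assumes the package and vLLM are installed, and the model identifier is a placeholder rather than anything the package ships.

    from refchecker.scripts.start_vllm_server import start_vllm_server

    # Start the server detached (the daemon branch above): output is discarded
    # and the child runs in its own session.
    proc = start_vllm_server(
        model_name="my-org/my-model",  # placeholder model id, not part of the package
        port=8000,
        tensor_parallel_size=1,
        gpu_memory_util=0.9,
        daemon=True,
    )
    print(f"vLLM server started with PID {proc.pid} on http://localhost:8000")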
refchecker/services/pdf_processor.py
@@ -0,0 +1,268 @@
#!/usr/bin/env python3
"""
PDF Processing Service for ArXiv Reference Checker
Extracted from core.refchecker to improve modularity
"""

import os
import logging
from typing import Optional, Dict, Any
from dataclasses import dataclass

logger = logging.getLogger(__name__)

@dataclass
class Paper:
    """Represents a paper with metadata"""
    title: str
    authors: list
    abstract: str = ""
    year: Optional[int] = None
    venue: str = ""
    url: str = ""
    doi: str = ""
    arxiv_id: str = ""
    pdf_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Convert paper to dictionary format"""
        return {
            'title': self.title,
            'authors': self.authors,
            'abstract': self.abstract,
            'year': self.year,
            'venue': self.venue,
            'url': self.url,
            'doi': self.doi,
            'arxiv_id': self.arxiv_id,
            'pdf_path': self.pdf_path
        }

class PDFProcessor:
    """Service for processing PDF files and extracting text"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.cache = {}

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from PDF file

        Args:
            pdf_path: Path to PDF file

        Returns:
            Extracted text content
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Check cache first
        if pdf_path in self.cache:
            logger.debug(f"Using cached text for {pdf_path}")
            return self.cache[pdf_path]

        try:
            import pypdf

            with open(pdf_path, 'rb') as file:
                pdf_reader = pypdf.PdfReader(file)
                text = ""
                failed_pages = []

                for page_num in range(len(pdf_reader.pages)):
                    try:
                        page = pdf_reader.pages[page_num]
                        page_text = page.extract_text()
                        if page_text:
                            text += page_text + "\n"
                    except TypeError as e:
                        # Handle pypdf errors like "NumberObject is not iterable"
                        # which can occur with malformed PDF pages
                        failed_pages.append(page_num + 1)  # 1-indexed for logging
                        logger.warning(f"Skipping page {page_num + 1} due to PDF parsing error: {e}")
                        continue
                    except Exception as e:
                        failed_pages.append(page_num + 1)
                        logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
                        continue

                if failed_pages:
                    logger.warning(f"Failed to extract text from {len(failed_pages)} pages: {failed_pages[:10]}{'...' if len(failed_pages) > 10 else ''}")

                if not text.strip():
                    raise ValueError(f"No text could be extracted from any pages of {pdf_path}")

                # Cache the result
                self.cache[pdf_path] = text
                logger.debug(f"Extracted {len(text)} characters from {pdf_path}")
                return text

        except ImportError:
            logger.error("pypdf not installed. Install with: pip install pypdf")
            raise
        except Exception as e:
            logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
            raise

    def create_local_file_paper(self, file_path: str, metadata: Optional[Dict[str, Any]] = None) -> Paper:
        """
        Create a Paper object from a local file

        Args:
            file_path: Path to the file
            metadata: Optional metadata dictionary

        Returns:
            Paper object
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Extract text if it's a PDF
        text_content = ""
        if file_path.lower().endswith('.pdf'):
            try:
                text_content = self.extract_text_from_pdf(file_path)
            except Exception as e:
                logger.warning(f"Could not extract text from {file_path}: {e}")

        # Use metadata if provided, otherwise extract from filename
        if metadata:
            title = metadata.get('title', os.path.basename(file_path))
            authors = metadata.get('authors', [])
            abstract = metadata.get('abstract', '')
            year = metadata.get('year')
            venue = metadata.get('venue', '')
            url = metadata.get('url', '')
            doi = metadata.get('doi', '')
            arxiv_id = metadata.get('arxiv_id', '')
        else:
            # Basic extraction from filename
            title = os.path.splitext(os.path.basename(file_path))[0]
            authors = []
            abstract = text_content[:500] if text_content else ""  # First 500 chars as abstract
            year = None
            venue = ""
            url = ""
            doi = ""
            arxiv_id = ""

        return Paper(
            title=title,
            authors=authors,
            abstract=abstract,
            year=year,
            venue=venue,
            url=url,
            doi=doi,
            arxiv_id=arxiv_id,
            pdf_path=file_path
        )

    def extract_bibliography_from_text(self, text: str) -> str:
        """
        Extract bibliography section from text

        Args:
            text: Full text content

        Returns:
            Bibliography section text
        """
        if not text:
            return ""

        # Common bibliography section headers
        bib_headers = [
            r'\n\s*REFERENCES\s*\n',
            r'\n\s*References\s*\n',
            r'\n\s*BIBLIOGRAPHY\s*\n',
            r'\n\s*Bibliography\s*\n',
            r'\n\s*WORKS CITED\s*\n',
            r'\n\s*Works Cited\s*\n'
        ]

        import re

        # Find bibliography section
        for header in bib_headers:
            match = re.search(header, text, re.IGNORECASE)
            if match:
                # Extract from bibliography header
                bib_start = match.end()
                full_bib_text = text[bib_start:].strip()

                # Find the end of the bibliography section by looking for common section headers
                # that typically follow references
                end_markers = [
                    r'\n\s*APPENDIX\s*[A-Z]?\s*\n',
                    r'\n\s*Appendix\s*[A-Z]?\s*\n',
                    r'\n\s*[A-Z]\s+[A-Z]{2,}.*\n',  # Pattern like "A LRE Dataset", "B ADDITIONAL RESULTS"
                    r'\n\s*[A-Z]\.\d+\s+.*\n',  # Pattern like "A.1 Dataset Details"
                    r'\nTable\s+\d+:.*\n[A-Z]\s+[A-Z]',  # Table followed by appendix section like "Table 7: ...\nA LRE"
                    r'\n\s*SUPPLEMENTARY\s+MATERIAL\s*\n',
                    r'\n\s*Supplementary\s+Material\s*\n',
                    r'\n\s*SUPPLEMENTAL\s+MATERIAL\s*\n',
                    r'\n\s*Supplemental\s+Material\s*\n',
                    r'\n\s*ACKNOWLEDGMENTS?\s*\n',
                    r'\n\s*Acknowledgments?\s*\n',
                    r'\n\s*AUTHOR\s+CONTRIBUTIONS?\s*\n',
                    r'\n\s*Author\s+Contributions?\s*\n',
                    r'\n\s*FUNDING\s*\n',
                    r'\n\s*Funding\s*\n',
                    r'\n\s*ETHICS\s+STATEMENT\s*\n',
                    r'\n\s*Ethics\s+Statement\s*\n',
                    r'\n\s*CONFLICT\s+OF\s+INTEREST\s*\n',
                    r'\n\s*Conflict\s+of\s+Interest\s*\n',
                    r'\n\s*DATA\s+AVAILABILITY\s*\n',
                    r'\n\s*Data\s+Availability\s*\n'
                ]

                bib_text = full_bib_text
                bib_end = len(full_bib_text)

                # Look for section markers that indicate end of bibliography
                for end_marker in end_markers:
                    end_match = re.search(end_marker, full_bib_text, re.IGNORECASE)
                    if end_match and end_match.start() < bib_end:
                        bib_end = end_match.start()

                # If we found an end marker, truncate there
                if bib_end < len(full_bib_text):
                    bib_text = full_bib_text[:bib_end].strip()
                    logger.debug(f"Bibliography section truncated at position {bib_end}")

                # Also try to detect bibliography end by finding the last numbered reference
                # Look for the highest numbered reference in the text
                ref_numbers = re.findall(r'\[(\d+)\]', bib_text)
                if ref_numbers:
                    max_ref_num = max(int(num) for num in ref_numbers)
                    logger.debug(f"Found references up to [{max_ref_num}]")

                    # Look for the end of the last numbered reference
                    last_ref_pattern = rf'\[{max_ref_num}\][^[]*?(?=\n\s*[A-Z]{{2,}}|\n\s*\w+\s*\n\s*[A-Z]|\Z)'
                    last_ref_match = re.search(last_ref_pattern, bib_text, re.DOTALL)
                    if last_ref_match:
                        potential_end = last_ref_match.end()
                        # Only use this if it's before our section marker end
                        if potential_end < bib_end:
                            bib_text = bib_text[:potential_end].strip()
                            logger.debug(f"Bibliography truncated after reference [{max_ref_num}]")

                # Final fallback: limit to reasonable length
                if len(bib_text) > 50000:  # Limit to ~50KB
                    bib_text = bib_text[:50000]
                    logger.debug("Bibliography section truncated to 50KB limit")

                logger.debug(f"Found bibliography section: {len(bib_text)} characters")
                return bib_text

        logger.warning("No bibliography section found in text")
        return ""

    def clear_cache(self):
        """Clear the text extraction cache"""
        self.cache.clear()
        logger.debug("PDF text cache cleared")
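
A minimal usage sketch for the PDF service above, assuming a locally available PDF; the file path and metadata values are placeholders, not files shipped with the package.

    from refchecker.services.pdf_processor import PDFProcessor

    processor = PDFProcessor()

    # Extract the full text; repeated calls for the same path are served from the in-memory cache
    text = processor.extract_text_from_pdf("paper.pdf")  # placeholder path

    # Isolate the references section for downstream reference checking
    bibliography = processor.extract_bibliography_from_text(text)

    # Wrap the file in a Paper record, optionally overriding metadata
    paper = processor.create_local_file_paper("paper.pdf", metadata={"title": "Example Paper"})
    print(paper.to_dict()["title"])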
refchecker/utils/__init__.py
@@ -0,0 +1,27 @@
"""
Utility functions for text processing, author comparison, mocking, and configuration validation
"""

from .text_utils import (
    clean_author_name, clean_title, normalize_text,
    clean_conference_markers_from_title,
    remove_year_from_title
)
from .url_utils import extract_arxiv_id_from_url
from .author_utils import compare_authors, levenshtein_distance, extract_authors_list
from .mock_objects import (
    MockPaper, MockReference, MockLLMProvider, MockSemanticScholarAPI, MockArxivAPI,
    create_mock_config, create_mock_paper, create_mock_reference,
    create_mock_bibliography, create_mock_extracted_references
)
from .config_validator import ConfigValidator, ValidationResult

__all__ = [
    "clean_author_name", "clean_title", "normalize_text",
    "extract_arxiv_id_from_url", "clean_conference_markers_from_title",
    "remove_year_from_title", "compare_authors", "levenshtein_distance",
    "extract_authors_list", "MockPaper", "MockReference", "MockLLMProvider",
    "MockSemanticScholarAPI", "MockArxivAPI", "create_mock_config",
    "create_mock_paper", "create_mock_reference", "create_mock_bibliography",
    "create_mock_extracted_references", "ConfigValidator", "ValidationResult"
]
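
Because of the re-exports above, callers can import these helpers from refchecker.utils directly instead of the individual submodules. A small sketch follows; the argument values are made up and the exact signatures are assumptions inferred from the function names, not verified against the submodules.

    from refchecker.utils import clean_title, levenshtein_distance

    # Assumed: clean_title takes a raw title string and returns a normalized form
    print(clean_title("An Example Title (NeurIPS 2023)"))

    # Assumed: levenshtein_distance takes two strings and returns an integer edit distance
    print(levenshtein_distance("Smith, J.", "Smith, John"))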