academic_refchecker-2.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0

refchecker/utils/unicode_utils.py
@@ -0,0 +1,335 @@
#!/usr/bin/env python3
"""
Unicode parsing utility functions for handling text processing in pipelines.
Provides robust Unicode support for various text processing scenarios.
"""

import unicodedata
import re
import json
import codecs
from typing import Any, Dict, List, Optional, Union


def normalize_unicode_text(text: str, form: str = 'NFKC') -> str:
    """
    Normalize Unicode text to handle various Unicode forms.

    Args:
        text: Input text to normalize
        form: Unicode normalization form ('NFC', 'NFKC', 'NFD', 'NFKD')

    Returns:
        Normalized Unicode text
    """
    if not isinstance(text, str):
        text = str(text)

    try:
        # Normalize Unicode characters
        normalized = unicodedata.normalize(form, text)
        return normalized
    except Exception as e:
        print(f"Warning: Unicode normalization failed: {e}")
        return text


def clean_unicode_control_chars(text: str) -> str:
    """
    Remove or replace problematic Unicode control characters.

    Args:
        text: Input text to clean

    Returns:
        Cleaned text with control characters handled
    """
    if not isinstance(text, str):
        text = str(text)

    # Remove common problematic control characters
    # Keep essential whitespace characters (space, tab, newline, carriage return)
    control_char_pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
    cleaned = control_char_pattern.sub('', text)

    # Replace non-breaking spaces and similar with regular spaces
    cleaned = re.sub(r'[\u00A0\u2000-\u200B\u2028\u2029\u202F\u205F\u3000]', ' ', cleaned)

    return cleaned


def safe_encode_decode(text: str, encoding: str = 'utf-8', errors: str = 'replace') -> str:
    """
    Safely encode and decode text to handle encoding issues.

    Args:
        text: Input text
        encoding: Target encoding (default: utf-8)
        errors: Error handling strategy ('ignore', 'replace', 'strict')

    Returns:
        Safely encoded/decoded text
    """
    if not isinstance(text, str):
        text = str(text)

    try:
        # Encode then decode to handle any encoding issues
        encoded = text.encode(encoding, errors=errors)
        decoded = encoded.decode(encoding, errors=errors)
        return decoded
    except Exception as e:
        print(f"Warning: Encoding/decoding failed: {e}")
        return text


def fix_mojibake(text: str) -> str:
    """
    Attempt to fix common mojibake (character encoding corruption) issues.

    Args:
        text: Input text that may contain mojibake

    Returns:
        Text with mojibake corrections attempted
    """
    if not isinstance(text, str):
        text = str(text)

    # Common mojibake patterns and their fixes
    mojibake_fixes = {
        # UTF-8 interpreted as Latin-1 then re-encoded
        'Ã¡': 'á',
        'Ã©': 'é',
        'Ã\xad': 'í',  # second byte is a soft hyphen (U+00AD)
        'Ã³': 'ó',
        'Ãº': 'ú',
        'Ã±': 'ñ',
        'Ã¼': 'ü',
        'Â': '',  # Often spurious Â characters
        'â€™': "'",  # Right single quotation mark
        'â€œ': '"',  # Left double quotation mark
        'â€': '"',  # Right double quotation mark
        'â€\u201d': '—',  # Em dash
        'â€\u201c': '–',  # En dash
    }

    for broken, fixed in mojibake_fixes.items():
        text = text.replace(broken, fixed)

    return text


def safe_json_loads(text: str) -> Any:
    """
    Safely load JSON with Unicode handling.

    Args:
        text: JSON string to parse

    Returns:
        Parsed JSON object, or None if parsing fails
    """
    if not isinstance(text, str):
        text = str(text)

    try:
        # Clean the text first
        cleaned_text = normalize_unicode_text(text)
        cleaned_text = clean_unicode_control_chars(cleaned_text)

        # Try to parse JSON
        return json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        print(f"Warning: JSON parsing failed: {e}")
        # Try with mojibake fixes
        try:
            fixed_text = fix_mojibake(cleaned_text)
            return json.loads(fixed_text)
        except json.JSONDecodeError:
            print("Warning: JSON parsing failed even after mojibake fixes")
            return None
    except Exception as e:
        print(f"Warning: Unexpected error in JSON parsing: {e}")
        return None


def safe_file_read(file_path: str, encoding: str = 'utf-8', fallback_encodings: Optional[List[str]] = None) -> str:
    """
    Safely read a file with Unicode handling and encoding detection.

    Args:
        file_path: Path to file to read
        encoding: Primary encoding to try
        fallback_encodings: List of fallback encodings to try

    Returns:
        File contents as string
    """
    if fallback_encodings is None:
        fallback_encodings = ['utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']

    encodings_to_try = [encoding] + [enc for enc in fallback_encodings if enc != encoding]

    for enc in encodings_to_try:
        try:
            with codecs.open(file_path, 'r', encoding=enc, errors='replace') as f:
                content = f.read()

            # Clean and normalize the content
            content = normalize_unicode_text(content)
            content = clean_unicode_control_chars(content)

            return content
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"Warning: Error reading file with encoding {enc}: {e}")
            continue

    raise ValueError(f"Could not read file {file_path} with any of the attempted encodings")


def safe_file_write(file_path: str, content: str, encoding: str = 'utf-8') -> None:
    """
    Safely write content to file with Unicode handling.

    Args:
        file_path: Path to file to write
        content: Content to write
        encoding: Encoding to use for writing
    """
    if not isinstance(content, str):
        content = str(content)

    # Normalize content before writing
    content = normalize_unicode_text(content)

    try:
        with codecs.open(file_path, 'w', encoding=encoding, errors='replace') as f:
            f.write(content)
    except Exception as e:
        print(f"Warning: Error writing file {file_path}: {e}")
        # Fallback: write with error replacement
        with codecs.open(file_path, 'w', encoding=encoding, errors='replace') as f:
            f.write(content)


def process_text_robust(text: Union[str, bytes, Any],
                        normalize: bool = True,
                        clean_control_chars: bool = True,
                        fix_mojibake_issues: bool = True,
                        safe_encoding: bool = True) -> str:
    """
    Robustly process text with comprehensive Unicode handling.

    Args:
        text: Input text to process
        normalize: Whether to normalize Unicode
        clean_control_chars: Whether to clean control characters
        fix_mojibake_issues: Whether to attempt mojibake fixes
        safe_encoding: Whether to apply safe encoding/decoding

    Returns:
        Processed text string
    """
    # Handle bytes input
    if isinstance(text, bytes):
        try:
            text = text.decode('utf-8', errors='replace')
        except Exception:
            text = str(text)
    elif not isinstance(text, str):
        text = str(text)

    # Apply processing steps
    if normalize:
        text = normalize_unicode_text(text)

    if clean_control_chars:
        text = clean_unicode_control_chars(text)

    if fix_mojibake_issues:
        text = fix_mojibake(text)

    if safe_encoding:
        text = safe_encode_decode(text)

    return text


def validate_unicode_text(text: str) -> Dict[str, Any]:
    """
    Validate and analyze Unicode text for potential issues.

    Args:
        text: Text to validate

    Returns:
        Dictionary with validation results and statistics
    """
    if not isinstance(text, str):
        text = str(text)

    results = {
        'length': len(text),
        'is_ascii': text.isascii(),
        'encoding_issues': [],
        'control_chars_count': 0,
        'non_printable_count': 0,
        'unicode_categories': {},
    }

    # Count control characters
    control_char_pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
    results['control_chars_count'] = len(control_char_pattern.findall(text))

    # Count non-printable characters
    results['non_printable_count'] = sum(1 for c in text if not c.isprintable())

    # Analyze Unicode categories
    for char in text[:1000]:  # Sample first 1000 chars for performance
        category = unicodedata.category(char)
        results['unicode_categories'][category] = results['unicode_categories'].get(category, 0) + 1

    # Check for common encoding issues
    # ('\xad' is a soft hyphen, the second byte of the mojibake form of 'í')
    if 'Ã' in text and any(char in text for char in ['¡', '©', '\xad', '³', 'º', '±', '¼']):
        results['encoding_issues'].append('Possible UTF-8 to Latin-1 mojibake')

    if 'â€' in text:
        results['encoding_issues'].append('Possible smart quote encoding issues')

    return results


# Example usage and testing functions
def test_unicode_utils():
    """Test function to verify Unicode utilities work correctly."""

    # Test cases
    test_cases = [
        "Normal ASCII text",
        "Unicode: café, naïve, résumé",
        "Mojibake: caf√© na√Øve r√©sum√©",
        "Control chars: Hello\x00\x01World",
        "Smart quotes: \"Hello\" 'World'",
        "Mixed: café\u00A0with\u2000spaces",
    ]

    print("Testing Unicode utilities...")
    for i, test_text in enumerate(test_cases):
        print(f"\nTest {i+1}: {repr(test_text[:50])}")

        # Process the text
        processed = process_text_robust(test_text)
        print(f"Processed: {repr(processed[:50])}")

        # Validate the text
        validation = validate_unicode_text(test_text)
        print(f"Issues found: {len(validation['encoding_issues'])}")
        if validation['encoding_issues']:
            print(f"Encoding issues: {validation['encoding_issues']}")


if __name__ == "__main__":
    test_unicode_utils()
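
For orientation, here is a minimal usage sketch of the Unicode helpers added above. It is not part of the package: the refchecker.utils.unicode_utils import path is inferred from the file list, and the sample strings are illustrative.

# Usage sketch only; assumes the refchecker/utils/unicode_utils.py module listed above is importable.
from refchecker.utils.unicode_utils import (
    process_text_robust,
    safe_json_loads,
    validate_unicode_text,
)

# Bytes with smart quotes and a stray control character: the pipeline decodes,
# normalizes, strips control characters, and attempts mojibake fixes.
raw = b"Smart quotes: \xe2\x80\x9cHello\xe2\x80\x9d and a stray \x07 bell"
print(process_text_robust(raw))

# Text that looks like UTF-8 decoded as Latin-1 gets flagged in the report.
report = validate_unicode_text("cafÃ©")
print(report["encoding_issues"])  # e.g. ['Possible UTF-8 to Latin-1 mojibake']

# Returns the parsed object, or None (with a warning) instead of raising on malformed JSON.
print(safe_json_loads('{"title": "café"}'))
print(safe_json_loads("not json"))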

refchecker/utils/url_utils.py
@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
URL Utilities for Reference Checking

This module provides utilities for URL construction, validation, and manipulation
related to academic references.
"""

import re
from typing import Optional
from .doi_utils import normalize_doi


def construct_doi_url(doi: str) -> str:
    """
    Construct a proper DOI URL from a DOI string.

    Args:
        doi: DOI string

    Returns:
        Full DOI URL
    """
    if not doi:
        return ""

    # Normalize the DOI first
    normalized_doi = normalize_doi(doi)

    # Construct URL
    return f"https://doi.org/{normalized_doi}"


def extract_arxiv_id_from_url(url: str) -> Optional[str]:
    """
    Extract ArXiv ID from an ArXiv URL or text containing ArXiv reference.

    This is the common function that handles all ArXiv ID extraction patterns:
    - URLs: https://arxiv.org/abs/1234.5678, https://arxiv.org/pdf/1234.5678.pdf, https://arxiv.org/html/1234.5678
    - Text references: arXiv:1234.5678, arXiv preprint arXiv:1234.5678
    - Version handling: removes version numbers (v1, v2, etc.)

    Args:
        url: ArXiv URL or text containing ArXiv reference

    Returns:
        ArXiv ID (without version) if found, None otherwise
    """
    if not url or not isinstance(url, str):
        return None

    # Pattern 1: arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
    arxiv_text_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
    if arxiv_text_match:
        arxiv_id = arxiv_text_match.group(1)
        # Remove version number if present
        return re.sub(r'v\d+$', '', arxiv_id)

    # Pattern 2: arxiv.org URLs (abs, pdf, html)
    # Handle URLs with version numbers and various formats
    arxiv_url_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?\#]|$)', url, re.IGNORECASE)
    if arxiv_url_match:
        arxiv_id = arxiv_url_match.group(1)
        # Remove version number if present
        return re.sub(r'v\d+$', '', arxiv_id)

    # Pattern 3: Fallback for simpler URL patterns
    fallback_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^/?#]+)', url, re.IGNORECASE)
    if fallback_match:
        arxiv_id = fallback_match.group(1).replace('.pdf', '')
        # Remove version number if present
        return re.sub(r'v\d+$', '', arxiv_id)

    return None


def construct_arxiv_url(arxiv_id: str, url_type: str = "abs") -> str:
    """
    Construct an ArXiv URL from an ArXiv ID.

    Args:
        arxiv_id: ArXiv identifier
        url_type: Type of URL ('abs' for abstract, 'pdf' for PDF)

    Returns:
        Full ArXiv URL
    """
    if not arxiv_id:
        return ""

    # Remove version number if present for consistency
    clean_id = arxiv_id.replace('v1', '').replace('v2', '').replace('v3', '')

    if url_type == "pdf":
        return f"https://arxiv.org/pdf/{clean_id}.pdf"
    else:
        return f"https://arxiv.org/abs/{clean_id}"


def construct_semantic_scholar_url(paper_id: str) -> str:
    """
    Construct a Semantic Scholar URL from a paper ID.

    Args:
        paper_id: Semantic Scholar paper ID (SHA hash, NOT CorpusId)
                  The paperId is the 40-character hex hash that works in web URLs.
                  CorpusId (numeric) does NOT work in web URLs.

    Returns:
        Full Semantic Scholar URL
    """
    if not paper_id:
        return ""

    return f"https://www.semanticscholar.org/paper/{paper_id}"


def construct_openalex_url(work_id: str) -> str:
    """
    Construct an OpenAlex URL from a work ID.

    Args:
        work_id: OpenAlex work identifier

    Returns:
        Full OpenAlex URL
    """
    if not work_id:
        return ""

    # Remove prefix if present
    clean_id = work_id.replace('https://openalex.org/', '')

    return f"https://openalex.org/{clean_id}"


def construct_pubmed_url(pmid: str) -> str:
    """
    Construct a PubMed URL from a PMID.

    Args:
        pmid: PubMed identifier

    Returns:
        Full PubMed URL
    """
    if not pmid:
        return ""

    # Remove PMID prefix if present
    clean_pmid = pmid.replace('PMID:', '').strip()

    return f"https://pubmed.ncbi.nlm.nih.gov/{clean_pmid}/"


def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None, paper_id: Optional[str] = None) -> Optional[str]:
    """
    Get the best available URL from a paper's external IDs and open access information.
    Priority: Open Access PDF > DOI > ArXiv > Semantic Scholar > OpenAlex > PubMed

    Args:
        external_ids: Dictionary of external identifiers
        open_access_pdf: Open access PDF URL if available
        paper_id: Semantic Scholar paperId (SHA hash) if available

    Returns:
        Best available URL or None if no valid URL found
    """
    # Priority 1: Open access PDF
    if open_access_pdf:
        return open_access_pdf

    # Priority 2: DOI URL
    if external_ids.get('DOI'):
        return construct_doi_url(external_ids['DOI'])

    # Priority 3: ArXiv URL
    if external_ids.get('ArXiv'):
        return construct_arxiv_url(external_ids['ArXiv'])

    # Priority 4: Semantic Scholar URL (using paperId, not CorpusId)
    if paper_id:
        return construct_semantic_scholar_url(paper_id)

    # Priority 5: OpenAlex URL
    if external_ids.get('OpenAlex'):
        return construct_openalex_url(external_ids['OpenAlex'])

    # Priority 6: PubMed URL
    if external_ids.get('PubMed'):
        return construct_pubmed_url(external_ids['PubMed'])

    return None


def validate_url_format(url: str) -> bool:
    """
    Basic validation of URL format.

    Args:
        url: URL to validate

    Returns:
        True if URL appears to be valid, False otherwise
    """
    if not url:
        return False

    # Basic URL format check
    return url.startswith(('http://', 'https://')) and '.' in url


def clean_url(url: str) -> str:
    """
    Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.

    This function handles:
    - Whitespace trimming
    - Malformed LaTeX URL wrappers like \\url{https://...}
    - Markdown-style links like [text](url)
    - Trailing punctuation from academic references
    - DOI URL query parameter cleanup

    Args:
        url: URL to clean

    Returns:
        Cleaned URL
    """
    if not url:
        return ""

    # Remove leading/trailing whitespace
    url = url.strip()

    # Handle malformed URLs that contain \url{} wrappers within the URL text
    # e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
    import re
    url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
    url_match = re.search(url_pattern, url)
    if url_match:
        url = url_match.group(1)

    # Handle markdown-style links like [text](url) or [url](url)
    # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
    markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
    markdown_match = re.search(markdown_pattern, url)
    if markdown_match:
        # Use the URL from parentheses
        url = markdown_match.group(2)

    # Remove trailing punctuation that's commonly part of sentence structure
    # but preserve legitimate URL characters
    url = url.rstrip('.,;!?)')

    # Note: Preserving query parameters for all URLs now
    # Previously this function removed query parameters for non-DOI URLs,
    # but this was causing issues with OpenReview and other URLs that need their parameters
    # Only remove query parameters for DOI URLs where they're typically not needed
    if '?' in url and 'doi.org' in url:
        base_url, params = url.split('?', 1)
        url = base_url

    return url


def clean_url_punctuation(url: str) -> str:
    """
    Clean trailing punctuation from URLs that often gets included during extraction.

    This function removes trailing punctuation that commonly gets extracted with URLs
    from academic references (periods, commas, semicolons, etc.) while preserving
    legitimate URL characters including query parameters.

    Args:
        url: URL string that may have trailing punctuation

    Returns:
        Cleaned URL with trailing punctuation removed
    """
    if not url:
        return ""

    # Remove leading/trailing whitespace
    url = url.strip()

    # Handle malformed URLs that contain \\url{} wrappers within the URL text
    # e.g., "https://\\url{https://www.example.com/}" -> "https://www.example.com/"
    import re
    url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
    url_match = re.search(url_pattern, url)
    if url_match:
        url = url_match.group(1)

    # Handle markdown-style links like [text](url) or [url](url)
    # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
    markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
    markdown_match = re.search(markdown_pattern, url)
    if markdown_match:
        # Use the URL from parentheses
        url = markdown_match.group(2)

    # Remove trailing punctuation that's commonly part of sentence structure
    # but preserve legitimate URL characters
    url = url.rstrip('.,;!?)')

    return url
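
Likewise, a minimal usage sketch of the URL helpers added above. It is not part of the package: the refchecker.utils.url_utils import path is inferred from the file list, and the DOI and ArXiv identifiers are illustrative values, not real papers.

# Usage sketch only; assumes the refchecker/utils/url_utils.py module listed above is importable.
from refchecker.utils.url_utils import (
    clean_url,
    construct_doi_url,
    extract_arxiv_id_from_url,
    get_best_available_url,
)

print(extract_arxiv_id_from_url("https://arxiv.org/pdf/1610.10099v2.pdf"))   # -> "1610.10099"
print(clean_url("https://\\url{https://www.example.com/}"))                  # LaTeX wrapper stripped
print(clean_url("https://doi.org/10.1000/xyz123?via=publisher"))             # DOI query string dropped
print(construct_doi_url("10.1000/xyz123"))                                   # DOI resolver URL

# Follows the documented priority: open-access PDF, then DOI, ArXiv, Semantic Scholar, OpenAlex, PubMed.
external_ids = {"DOI": "10.1000/xyz123", "ArXiv": "1610.10099"}
print(get_best_available_url(external_ids, open_access_pdf=None))            # the DOI URL wins here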