academic_refchecker-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/utils/unicode_utils.py
@@ -0,0 +1,335 @@
+ #!/usr/bin/env python3
+ """
+ Unicode parsing utility functions for handling text processing in pipelines.
+ Provides robust Unicode support for various text processing scenarios.
+ """
+
+ import unicodedata
+ import re
+ import json
+ import codecs
+ from typing import Any, Dict, List, Optional, Union
+
+
+ def normalize_unicode_text(text: str, form: str = 'NFKC') -> str:
+     """
+     Normalize Unicode text to handle various Unicode forms.
+
+     Args:
+         text: Input text to normalize
+         form: Unicode normalization form ('NFC', 'NFKC', 'NFD', 'NFKD')
+
+     Returns:
+         Normalized Unicode text
+     """
+     if not isinstance(text, str):
+         text = str(text)
+
+     try:
+         # Normalize Unicode characters
+         normalized = unicodedata.normalize(form, text)
+         return normalized
+     except Exception as e:
+         print(f"Warning: Unicode normalization failed: {e}")
+         return text
+
+
+ def clean_unicode_control_chars(text: str) -> str:
+     """
+     Remove or replace problematic Unicode control characters.
+
+     Args:
+         text: Input text to clean
+
+     Returns:
+         Cleaned text with control characters handled
+     """
+     if not isinstance(text, str):
+         text = str(text)
+
+     # Remove common problematic control characters
+     # Keep essential whitespace characters (space, tab, newline, carriage return)
+     control_char_pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
+     cleaned = control_char_pattern.sub('', text)
+
+     # Replace non-breaking spaces and similar with regular spaces
+     cleaned = re.sub(r'[\u00A0\u2000-\u200B\u2028\u2029\u202F\u205F\u3000]', ' ', cleaned)
+
+     return cleaned
+
+
+ def safe_encode_decode(text: str, encoding: str = 'utf-8', errors: str = 'replace') -> str:
+     """
+     Safely encode and decode text to handle encoding issues.
+
+     Args:
+         text: Input text
+         encoding: Target encoding (default: utf-8)
+         errors: Error handling strategy ('ignore', 'replace', 'strict')
+
+     Returns:
+         Safely encoded/decoded text
+     """
+     if not isinstance(text, str):
+         text = str(text)
+
+     try:
+         # Encode then decode to handle any encoding issues
+         encoded = text.encode(encoding, errors=errors)
+         decoded = encoded.decode(encoding, errors=errors)
+         return decoded
+     except Exception as e:
+         print(f"Warning: Encoding/decoding failed: {e}")
+         return text
+
+
+ def fix_mojibake(text: str) -> str:
+     """
+     Attempt to fix common mojibake (character encoding corruption) issues.
+
+     Args:
+         text: Input text that may contain mojibake
+
+     Returns:
+         Text with mojibake corrections attempted
+     """
+     if not isinstance(text, str):
+         text = str(text)
+
+     # Common mojibake patterns and their fixes
+     mojibake_fixes = {
+         # UTF-8 interpreted as Latin-1/CP1252 then re-encoded
+         'Ã¡': 'á',
+         'Ã©': 'é',
+         'Ã\xad': 'í',  # 'Ã' followed by a soft hyphen
+         'Ã³': 'ó',
+         'Ãº': 'ú',
+         'Ã±': 'ñ',
+         'Ã¼': 'ü',
+         'Â': '',  # Often spurious Â characters
+         'â€™': "'",  # Right single quotation mark
+         'â€œ': '"',  # Left double quotation mark
+         'â€”': '—',  # Em dash
+         'â€“': '–',  # En dash
+         'â€': '"',  # Right double quotation mark (kept last: prefix of the entries above)
+     }
+
+     for broken, fixed in mojibake_fixes.items():
+         text = text.replace(broken, fixed)
+
+     return text
+
+
+ def safe_json_loads(text: str) -> Any:
+     """
+     Safely load JSON with Unicode handling.
+
+     Args:
+         text: JSON string to parse
+
+     Returns:
+         Parsed JSON object, or None if parsing fails
+     """
+     if not isinstance(text, str):
+         text = str(text)
+
+     try:
+         # Clean the text first
+         cleaned_text = normalize_unicode_text(text)
+         cleaned_text = clean_unicode_control_chars(cleaned_text)
+
+         # Try to parse JSON
+         return json.loads(cleaned_text)
+     except json.JSONDecodeError as e:
+         print(f"Warning: JSON parsing failed: {e}")
+         # Try with mojibake fixes
+         try:
+             fixed_text = fix_mojibake(cleaned_text)
+             return json.loads(fixed_text)
+         except json.JSONDecodeError:
+             print("Warning: JSON parsing failed even after mojibake fixes")
+             return None
+     except Exception as e:
+         print(f"Warning: Unexpected error in JSON parsing: {e}")
+         return None
+
+
+ def safe_file_read(file_path: str, encoding: str = 'utf-8', fallback_encodings: Optional[List[str]] = None) -> str:
+     """
+     Safely read a file with Unicode handling and encoding detection.
+
+     Args:
+         file_path: Path to file to read
+         encoding: Primary encoding to try
+         fallback_encodings: List of fallback encodings to try
+
+     Returns:
+         File contents as string
+     """
+     if fallback_encodings is None:
+         fallback_encodings = ['utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']
+
+     encodings_to_try = [encoding] + [enc for enc in fallback_encodings if enc != encoding]
+
+     for enc in encodings_to_try:
+         try:
+             with codecs.open(file_path, 'r', encoding=enc, errors='replace') as f:
+                 content = f.read()
+
+             # Clean and normalize the content
+             content = normalize_unicode_text(content)
+             content = clean_unicode_control_chars(content)
+
+             return content
+         except UnicodeDecodeError:
+             continue
+         except Exception as e:
+             print(f"Warning: Error reading file with encoding {enc}: {e}")
+             continue
+
+     raise ValueError(f"Could not read file {file_path} with any of the attempted encodings")
+
+
+ def safe_file_write(file_path: str, content: str, encoding: str = 'utf-8') -> None:
+     """
+     Safely write content to file with Unicode handling.
+
+     Args:
+         file_path: Path to file to write
+         content: Content to write
+         encoding: Encoding to use for writing
+     """
+     if not isinstance(content, str):
+         content = str(content)
+
+     # Normalize content before writing
+     content = normalize_unicode_text(content)
+
+     try:
+         with codecs.open(file_path, 'w', encoding=encoding, errors='replace') as f:
+             f.write(content)
+     except Exception as e:
+         print(f"Warning: Error writing file {file_path}: {e}")
+         # Fallback: write with error replacement
+         with codecs.open(file_path, 'w', encoding=encoding, errors='replace') as f:
+             f.write(content)
+
+
+ def process_text_robust(text: Union[str, bytes, Any],
+                         normalize: bool = True,
+                         clean_control_chars: bool = True,
+                         fix_mojibake_issues: bool = True,
+                         safe_encoding: bool = True) -> str:
+     """
+     Robustly process text with comprehensive Unicode handling.
+
+     Args:
+         text: Input text to process
+         normalize: Whether to normalize Unicode
+         clean_control_chars: Whether to clean control characters
+         fix_mojibake_issues: Whether to attempt mojibake fixes
+         safe_encoding: Whether to apply safe encoding/decoding
+
+     Returns:
+         Processed text string
+     """
+     # Handle bytes input
+     if isinstance(text, bytes):
+         try:
+             text = text.decode('utf-8', errors='replace')
+         except Exception:
+             text = str(text)
+     elif not isinstance(text, str):
+         text = str(text)
+
+     # Apply processing steps
+     if normalize:
+         text = normalize_unicode_text(text)
+
+     if clean_control_chars:
+         text = clean_unicode_control_chars(text)
+
+     if fix_mojibake_issues:
+         text = fix_mojibake(text)
+
+     if safe_encoding:
+         text = safe_encode_decode(text)
+
+     return text
+
+
+ def validate_unicode_text(text: str) -> Dict[str, Any]:
+     """
+     Validate and analyze Unicode text for potential issues.
+
+     Args:
+         text: Text to validate
+
+     Returns:
+         Dictionary with validation results and statistics
+     """
+     if not isinstance(text, str):
+         text = str(text)
+
+     results = {
+         'length': len(text),
+         'is_ascii': text.isascii(),
+         'encoding_issues': [],
+         'control_chars_count': 0,
+         'non_printable_count': 0,
+         'unicode_categories': {},
+     }
+
+     # Count control characters
+     control_char_pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
+     results['control_chars_count'] = len(control_char_pattern.findall(text))
+
+     # Count non-printable characters
+     results['non_printable_count'] = sum(1 for c in text if not c.isprintable())
+
+     # Analyze Unicode categories
+     for char in text[:1000]:  # Sample first 1000 chars for performance
+         category = unicodedata.category(char)
+         results['unicode_categories'][category] = results['unicode_categories'].get(category, 0) + 1
+
+     # Check for common encoding issues
+     if 'Ã' in text and any(char in text for char in ['¡', '©', '\xad', '³', 'º', '±', '¼']):
+         results['encoding_issues'].append('Possible UTF-8 to Latin-1 mojibake')
+
+     if 'â€' in text:
+         results['encoding_issues'].append('Possible smart quote encoding issues')
+
+     return results
+
+
+ # Example usage and testing functions
+ def test_unicode_utils():
+     """Test function to verify Unicode utilities work correctly."""
+
+     # Test cases
+     test_cases = [
+         "Normal ASCII text",
+         "Unicode: café, naïve, résumé",
+         "Mojibake: caf© na√Øve r©sum©",
+         "Control chars: Hello\x00\x01World",
+         "Smart quotes: \"Hello\" 'World'",
+         "Mixed: café\u00A0with\u2000spaces",
+     ]
+
+     print("Testing Unicode utilities...")
+     for i, test_text in enumerate(test_cases):
+         print(f"\nTest {i+1}: {repr(test_text[:50])}")
+
+         # Process the text
+         processed = process_text_robust(test_text)
+         print(f"Processed: {repr(processed[:50])}")
+
+         # Validate the text
+         validation = validate_unicode_text(test_text)
+         print(f"Issues found: {len(validation['encoding_issues'])}")
+         if validation['encoding_issues']:
+             print(f"Encoding issues: {validation['encoding_issues']}")
+
+
+ if __name__ == "__main__":
+     test_unicode_utils()
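
For orientation, a minimal usage sketch of the helpers added in this file (function names are taken from the diff above; the sample string is invented for illustration):

    from refchecker.utils.unicode_utils import process_text_robust, validate_unicode_text

    # A reference string carrying mojibake, a non-breaking space, and an em-dash artifact
    raw = "CafÃ©\u00a0Proceedings â€” 2021"
    cleaned = process_text_robust(raw)   # normalize, strip control chars, fix mojibake, re-encode
    report = validate_unicode_text(raw)  # length, ASCII flag, suspected encoding issues
    print(cleaned)
    print(report['encoding_issues'])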
refchecker/utils/url_utils.py
@@ -0,0 +1,307 @@
+ #!/usr/bin/env python3
+ """
+ URL Utilities for Reference Checking
+
+ This module provides utilities for URL construction, validation, and manipulation
+ related to academic references.
+ """
+
+ import re
+ from typing import Optional
+ from .doi_utils import normalize_doi
+
+
+ def construct_doi_url(doi: str) -> str:
+     """
+     Construct a proper DOI URL from a DOI string.
+
+     Args:
+         doi: DOI string
+
+     Returns:
+         Full DOI URL
+     """
+     if not doi:
+         return ""
+
+     # Normalize the DOI first
+     normalized_doi = normalize_doi(doi)
+
+     # Construct URL
+     return f"https://doi.org/{normalized_doi}"
+
+
+ def extract_arxiv_id_from_url(url: str) -> Optional[str]:
+     """
+     Extract an ArXiv ID from an ArXiv URL or from text containing an ArXiv reference.
+
+     This is the common function that handles all ArXiv ID extraction patterns:
+     - URLs: https://arxiv.org/abs/1234.5678, https://arxiv.org/pdf/1234.5678.pdf, https://arxiv.org/html/1234.5678
+     - Text references: arXiv:1234.5678, arXiv preprint arXiv:1234.5678
+     - Version handling: removes version numbers (v1, v2, etc.)
+
+     Args:
+         url: ArXiv URL or text containing an ArXiv reference
+
+     Returns:
+         ArXiv ID (without version) if found, None otherwise
+     """
+     if not url or not isinstance(url, str):
+         return None
+
+     # Pattern 1: arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
+     arxiv_text_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
+     if arxiv_text_match:
+         arxiv_id = arxiv_text_match.group(1)
+         # Remove version number if present
+         return re.sub(r'v\d+$', '', arxiv_id)
+
+     # Pattern 2: arxiv.org URLs (abs, pdf, html)
+     # Handle URLs with version numbers and various formats
+     arxiv_url_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?\#]|$)', url, re.IGNORECASE)
+     if arxiv_url_match:
+         arxiv_id = arxiv_url_match.group(1)
+         # Remove version number if present
+         return re.sub(r'v\d+$', '', arxiv_id)
+
+     # Pattern 3: Fallback for simpler URL patterns
+     fallback_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^/?#]+)', url, re.IGNORECASE)
+     if fallback_match:
+         arxiv_id = fallback_match.group(1).replace('.pdf', '')
+         # Remove version number if present
+         return re.sub(r'v\d+$', '', arxiv_id)
+
+     return None
+
+
+ def construct_arxiv_url(arxiv_id: str, url_type: str = "abs") -> str:
+     """
+     Construct an ArXiv URL from an ArXiv ID.
+
+     Args:
+         arxiv_id: ArXiv identifier
+         url_type: Type of URL ('abs' for abstract, 'pdf' for PDF)
+
+     Returns:
+         Full ArXiv URL
+     """
+     if not arxiv_id:
+         return ""
+
+     # Remove any trailing version number (v1, v2, ...) for consistency
+     clean_id = re.sub(r'v\d+$', '', arxiv_id)
+
+     if url_type == "pdf":
+         return f"https://arxiv.org/pdf/{clean_id}.pdf"
+     else:
+         return f"https://arxiv.org/abs/{clean_id}"
+
+
+ def construct_semantic_scholar_url(paper_id: str) -> str:
+     """
+     Construct a Semantic Scholar URL from a paper ID.
+
+     Args:
+         paper_id: Semantic Scholar paper ID (SHA hash, NOT CorpusId)
+             The paperId is the 40-character hex hash that works in web URLs.
+             CorpusId (numeric) does NOT work in web URLs.
+
+     Returns:
+         Full Semantic Scholar URL
+     """
+     if not paper_id:
+         return ""
+
+     return f"https://www.semanticscholar.org/paper/{paper_id}"
+
+
+ def construct_openalex_url(work_id: str) -> str:
+     """
+     Construct an OpenAlex URL from a work ID.
+
+     Args:
+         work_id: OpenAlex work identifier
+
+     Returns:
+         Full OpenAlex URL
+     """
+     if not work_id:
+         return ""
+
+     # Remove prefix if present
+     clean_id = work_id.replace('https://openalex.org/', '')
+
+     return f"https://openalex.org/{clean_id}"
+
+
+ def construct_pubmed_url(pmid: str) -> str:
+     """
+     Construct a PubMed URL from a PMID.
+
+     Args:
+         pmid: PubMed identifier
+
+     Returns:
+         Full PubMed URL
+     """
+     if not pmid:
+         return ""
+
+     # Remove PMID prefix if present
+     clean_pmid = pmid.replace('PMID:', '').strip()
+
+     return f"https://pubmed.ncbi.nlm.nih.gov/{clean_pmid}/"
+
+
+ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None, paper_id: Optional[str] = None) -> Optional[str]:
+     """
+     Get the best available URL from a paper's external IDs and open access information.
+     Priority: Open Access PDF > DOI > ArXiv > Semantic Scholar > OpenAlex > PubMed
+
+     Args:
+         external_ids: Dictionary of external identifiers
+         open_access_pdf: Open access PDF URL if available
+         paper_id: Semantic Scholar paperId (SHA hash) if available
+
+     Returns:
+         Best available URL or None if no valid URL found
+     """
+     # Priority 1: Open access PDF
+     if open_access_pdf:
+         return open_access_pdf
+
+     # Priority 2: DOI URL
+     if external_ids.get('DOI'):
+         return construct_doi_url(external_ids['DOI'])
+
+     # Priority 3: ArXiv URL
+     if external_ids.get('ArXiv'):
+         return construct_arxiv_url(external_ids['ArXiv'])
+
+     # Priority 4: Semantic Scholar URL (using paperId, not CorpusId)
+     if paper_id:
+         return construct_semantic_scholar_url(paper_id)
+
+     # Priority 5: OpenAlex URL
+     if external_ids.get('OpenAlex'):
+         return construct_openalex_url(external_ids['OpenAlex'])
+
+     # Priority 6: PubMed URL
+     if external_ids.get('PubMed'):
+         return construct_pubmed_url(external_ids['PubMed'])
+
+     return None
+
+
+ def validate_url_format(url: str) -> bool:
+     """
+     Basic validation of URL format.
+
+     Args:
+         url: URL to validate
+
+     Returns:
+         True if URL appears to be valid, False otherwise
+     """
+     if not url:
+         return False
+
+     # Basic URL format check
+     return url.startswith(('http://', 'https://')) and '.' in url
+
+
+ def clean_url(url: str) -> str:
+     """
+     Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.
+
+     This function handles:
+     - Whitespace trimming
+     - Malformed LaTeX URL wrappers like \\url{https://...}
+     - Markdown-style links like [text](url)
+     - Trailing punctuation from academic references
+     - DOI URL query parameter cleanup
+
+     Args:
+         url: URL to clean
+
+     Returns:
+         Cleaned URL
+     """
+     if not url:
+         return ""
+
+     # Remove leading/trailing whitespace
+     url = url.strip()
+
+     # Handle malformed URLs that contain \url{} wrappers within the URL text
+     # e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
+     import re
+     url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+     url_match = re.search(url_pattern, url)
+     if url_match:
+         url = url_match.group(1)
+
+     # Handle markdown-style links like [text](url) or [url](url)
+     # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
+     markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
+     markdown_match = re.search(markdown_pattern, url)
+     if markdown_match:
+         # Use the URL from parentheses
+         url = markdown_match.group(2)
+
+     # Remove trailing punctuation that's commonly part of sentence structure
+     # but preserve legitimate URL characters
+     url = url.rstrip('.,;!?)')
+
+     # Note: Preserving query parameters for all URLs now
+     # Previously this function removed query parameters for non-DOI URLs,
+     # but this was causing issues with OpenReview and other URLs that need their parameters
+     # Only remove query parameters for DOI URLs where they're typically not needed
+     if '?' in url and 'doi.org' in url:
+         base_url, params = url.split('?', 1)
+         url = base_url
+
+     return url
+
+
+ def clean_url_punctuation(url: str) -> str:
+     """
+     Clean trailing punctuation from URLs that often gets included during extraction.
+
+     This function removes trailing punctuation that commonly gets extracted with URLs
+     from academic references (periods, commas, semicolons, etc.) while preserving
+     legitimate URL characters including query parameters.
+
+     Args:
+         url: URL string that may have trailing punctuation
+
+     Returns:
+         Cleaned URL with trailing punctuation removed
+     """
+     if not url:
+         return ""
+
+     # Remove leading/trailing whitespace
+     url = url.strip()
+
+     # Handle malformed URLs that contain \\url{} wrappers within the URL text
+     # e.g., "https://\\url{https://www.example.com/}" -> "https://www.example.com/"
+     import re
+     url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+     url_match = re.search(url_pattern, url)
+     if url_match:
+         url = url_match.group(1)
+
+     # Handle markdown-style links like [text](url) or [url](url)
+     # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
+     markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
+     markdown_match = re.search(markdown_pattern, url)
+     if markdown_match:
+         # Use the URL from parentheses
+         url = markdown_match.group(2)
+
+     # Remove trailing punctuation that's commonly part of sentence structure
+     # but preserve legitimate URL characters
+     url = url.rstrip('.,;!?)')
+
+     return url
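
Similarly, a brief sketch of how the URL helpers compose (function names come from the diff above; the DOI and link text are placeholders):

    from refchecker.utils.url_utils import clean_url, extract_arxiv_id_from_url, get_best_available_url

    url = clean_url("[paper](https://arxiv.org/abs/1610.10099v2).")  # -> "https://arxiv.org/abs/1610.10099v2"
    arxiv_id = extract_arxiv_id_from_url(url)                        # -> "1610.10099"
    best = get_best_available_url({'DOI': '10.1000/xyz'})            # DOI URL built via normalize_doi from doi_utils
    print(url, arxiv_id, best)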