academic-refchecker 1.2.42__tar.gz → 1.2.44__tar.gz
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- {academic_refchecker-1.2.42/src/academic_refchecker.egg-info → academic_refchecker-1.2.44}/PKG-INFO +1 -1
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/__version__.py +1 -1
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/refchecker.py +1 -20
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/bibtex_parser.py +104 -27
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/text_utils.py +64 -1
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/url_utils.py +44 -1
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/LICENSE +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/README.md +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/pyproject.toml +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/requirements.txt +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/setup.cfg +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/arxiv_utils.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/unicode_utils.py +0 -0
|
@@ -5181,7 +5181,7 @@ class ArxivReferenceChecker:
|
|
|
5181
5181
|
from utils.text_utils import format_authors_for_display
|
|
5182
5182
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
5183
5183
|
year = reference.get('year', '')
|
|
5184
|
-
venue = reference.get('venue', '')
|
|
5184
|
+
venue = reference.get('venue', '') or reference.get('journal', '')
|
|
5185
5185
|
url = reference.get('url', '')
|
|
5186
5186
|
doi = reference.get('doi', '')
|
|
5187
5187
|
# Extract actual reference number from raw text for accurate display
|
|
@@ -5407,25 +5407,6 @@ class ArxivReferenceChecker:
|
|
|
5407
5407
|
if error_details:
|
|
5408
5408
|
subreason = self._categorize_unverified_reason(error_details)
|
|
5409
5409
|
print(f" Subreason: {subreason}")
|
|
5410
|
-
|
|
5411
|
-
year_str = self._format_year_string(reference.get('year'))
|
|
5412
|
-
|
|
5413
|
-
# Apply LaTeX cleaning and formatting to authors for display
|
|
5414
|
-
authors = reference.get('authors', [])
|
|
5415
|
-
if authors:
|
|
5416
|
-
from utils.text_utils import strip_latex_commands, format_authors_for_display
|
|
5417
|
-
cleaned_authors = [strip_latex_commands(author) for author in authors]
|
|
5418
|
-
authors_display = format_authors_for_display(cleaned_authors)
|
|
5419
|
-
else:
|
|
5420
|
-
authors_display = 'Unknown authors'
|
|
5421
|
-
|
|
5422
|
-
# Only show URL if it exists and is different from reference_url
|
|
5423
|
-
ref_url = reference.get('url', '').strip()
|
|
5424
|
-
if ref_url and ref_url != reference_url:
|
|
5425
|
-
# Clean trailing punctuation from URL display
|
|
5426
|
-
from utils.url_utils import clean_url_punctuation
|
|
5427
|
-
clean_ref_url = clean_url_punctuation(ref_url)
|
|
5428
|
-
print(f" URL: {clean_ref_url}")
|
|
5429
5410
|
|
|
5430
5411
|
def _categorize_unverified_reason(self, error_details):
|
|
5431
5412
|
"""Categorize the unverified error into checker error or not found"""
|
|
@@ -103,37 +103,99 @@ def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) ->
|
|
|
103
103
|
Returns:
|
|
104
104
|
Dictionary with parsed entry data
|
|
105
105
|
"""
|
|
106
|
-
# Extract fields using regex
|
|
107
106
|
fields = {}
|
|
108
107
|
|
|
109
|
-
#
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
108
|
+
# Use a more robust approach with manual parsing
|
|
109
|
+
i = 0
|
|
110
|
+
while i < len(content):
|
|
111
|
+
# Skip whitespace
|
|
112
|
+
while i < len(content) and content[i].isspace():
|
|
113
|
+
i += 1
|
|
114
|
+
|
|
115
|
+
if i >= len(content):
|
|
116
|
+
break
|
|
117
|
+
|
|
118
|
+
# Look for field name
|
|
119
|
+
field_start = i
|
|
120
|
+
while i < len(content) and (content[i].isalnum() or content[i] == '_'):
|
|
121
|
+
i += 1
|
|
122
|
+
|
|
123
|
+
if i == field_start:
|
|
124
|
+
i += 1 # Skip non-alphanumeric character
|
|
125
|
+
continue
|
|
126
|
+
|
|
127
|
+
field_name = content[field_start:i].lower()
|
|
128
|
+
|
|
129
|
+
# Skip whitespace
|
|
130
|
+
while i < len(content) and content[i].isspace():
|
|
131
|
+
i += 1
|
|
132
|
+
|
|
133
|
+
# Look for equals sign
|
|
134
|
+
if i >= len(content) or content[i] != '=':
|
|
135
|
+
continue
|
|
136
|
+
i += 1 # Skip '='
|
|
137
|
+
|
|
138
|
+
# Skip whitespace
|
|
139
|
+
while i < len(content) and content[i].isspace():
|
|
140
|
+
i += 1
|
|
141
|
+
|
|
142
|
+
if i >= len(content):
|
|
143
|
+
break
|
|
144
|
+
|
|
145
|
+
# Parse field value
|
|
146
|
+
field_value = ""
|
|
147
|
+
if content[i] == '"':
|
|
148
|
+
# Handle quoted strings
|
|
149
|
+
i += 1 # Skip opening quote
|
|
150
|
+
value_start = i
|
|
151
|
+
while i < len(content) and content[i] != '"':
|
|
152
|
+
i += 1
|
|
153
|
+
if i < len(content):
|
|
154
|
+
field_value = content[value_start:i]
|
|
155
|
+
i += 1 # Skip closing quote
|
|
156
|
+
elif content[i] == '{':
|
|
157
|
+
# Handle braced strings with proper nesting
|
|
158
|
+
brace_count = 0
|
|
159
|
+
value_start = i + 1 # Skip opening brace
|
|
160
|
+
i += 1
|
|
161
|
+
while i < len(content):
|
|
162
|
+
if content[i] == '{':
|
|
163
|
+
brace_count += 1
|
|
164
|
+
elif content[i] == '}':
|
|
165
|
+
if brace_count == 0:
|
|
166
|
+
break
|
|
167
|
+
brace_count -= 1
|
|
168
|
+
i += 1
|
|
169
|
+
|
|
170
|
+
if i < len(content):
|
|
171
|
+
field_value = content[value_start:i]
|
|
172
|
+
i += 1 # Skip closing brace
|
|
173
|
+
|
|
174
|
+
if field_value:
|
|
175
|
+
field_value = field_value.strip()
|
|
176
|
+
# Strip outer quotes if present (handles cases like title = {"Some Title"})
|
|
177
|
+
if field_value.startswith('"') and field_value.endswith('"'):
|
|
178
|
+
field_value = field_value[1:-1]
|
|
179
|
+
fields[field_name] = field_value
|
|
180
|
+
|
|
181
|
+
# Skip to next field (look for comma)
|
|
182
|
+
while i < len(content) and content[i] not in ',}':
|
|
183
|
+
i += 1
|
|
184
|
+
if i < len(content) and content[i] == ',':
|
|
185
|
+
i += 1
|
|
121
186
|
|
|
122
|
-
#
|
|
187
|
+
# Fallback to regex if manual parsing failed
|
|
123
188
|
if not fields:
|
|
124
|
-
logger.debug("
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
if field_value.startswith('"') and field_value.endswith('"'):
|
|
135
|
-
field_value = field_value[1:-1]
|
|
136
|
-
fields[field_name] = field_value
|
|
189
|
+
logger.debug("Manual parsing failed, trying regex approach")
|
|
190
|
+
field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
|
|
191
|
+
|
|
192
|
+
for match in re.finditer(field_pattern, content, re.DOTALL):
|
|
193
|
+
field_name = match.group(1).lower()
|
|
194
|
+
field_value = match.group(2) or match.group(3) or ""
|
|
195
|
+
field_value = field_value.strip()
|
|
196
|
+
if field_value.startswith('"') and field_value.endswith('"'):
|
|
197
|
+
field_value = field_value[1:-1]
|
|
198
|
+
fields[field_name] = field_value
|
|
137
199
|
|
|
138
200
|
return {
|
|
139
201
|
'type': entry_type,
|
|
@@ -216,6 +278,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
216
278
|
|
|
217
279
|
# Extract journal/venue
|
|
218
280
|
journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
|
|
281
|
+
# Remove braces from journal/venue names
|
|
282
|
+
if journal and journal.startswith('{') and journal.endswith('}'):
|
|
283
|
+
journal = journal[1:-1]
|
|
219
284
|
|
|
220
285
|
# Extract DOI and construct URL
|
|
221
286
|
doi = fields.get('doi', '')
|
|
@@ -225,6 +290,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
225
290
|
|
|
226
291
|
# Extract other URLs
|
|
227
292
|
url = fields.get('url', '')
|
|
293
|
+
if url:
|
|
294
|
+
from utils.url_utils import clean_url
|
|
295
|
+
url = clean_url(url)
|
|
228
296
|
|
|
229
297
|
# Handle special @misc entries with only howpublished field
|
|
230
298
|
if not title and not authors and entry_type == 'misc':
|
|
@@ -249,6 +317,10 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
249
317
|
else:
|
|
250
318
|
url = howpublished
|
|
251
319
|
|
|
320
|
+
# Clean the reconstructed URL
|
|
321
|
+
from utils.url_utils import clean_url
|
|
322
|
+
url = clean_url(url)
|
|
323
|
+
|
|
252
324
|
# Generate title from domain/path
|
|
253
325
|
if 'jailbreakchat.com' in domain:
|
|
254
326
|
title = 'JailbreakChat Website'
|
|
@@ -275,6 +347,11 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
275
347
|
|
|
276
348
|
if url.startswith('\\url{') and url.endswith('}'):
|
|
277
349
|
url = url[5:-1] # Remove \url{...}
|
|
350
|
+
|
|
351
|
+
# Clean any URL we extracted
|
|
352
|
+
if url:
|
|
353
|
+
from utils.url_utils import clean_url
|
|
354
|
+
url = clean_url(url)
|
|
278
355
|
|
|
279
356
|
# Construct ArXiv URL from eprint field if no URL present
|
|
280
357
|
if not url and not doi_url:
|
|
@@ -11,6 +11,69 @@ from typing import List
|
|
|
11
11
|
logger = logging.getLogger(__name__)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
def expand_abbreviations(text: str) -> str:
|
|
15
|
+
"""
|
|
16
|
+
Generic abbreviation expansion using common academic patterns.
|
|
17
|
+
|
|
18
|
+
This function expands common academic abbreviations to their full forms
|
|
19
|
+
to improve venue name matching and comparison.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
text: Text containing potential abbreviations
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
Text with abbreviations expanded
|
|
26
|
+
"""
|
|
27
|
+
if not text:
|
|
28
|
+
return text
|
|
29
|
+
|
|
30
|
+
common_abbrevs = {
|
|
31
|
+
# IEEE specific abbreviations (only expand with periods, not full words)
|
|
32
|
+
'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
|
|
33
|
+
'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
|
|
34
|
+
'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
|
|
35
|
+
'mechatron.': 'mechatronics', 'intell.': 'intelligence',
|
|
36
|
+
'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
|
|
37
|
+
# General academic abbreviations (only expand with periods)
|
|
38
|
+
'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
|
|
39
|
+
'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
|
|
40
|
+
'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
|
|
41
|
+
'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
|
|
42
|
+
'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
|
|
43
|
+
'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
|
|
44
|
+
'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
|
|
45
|
+
'workshop': 'workshop', 'worksh.': 'workshop',
|
|
46
|
+
'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
|
|
47
|
+
# Physics journal abbreviations
|
|
48
|
+
'phys.': 'physics', 'phys. rev.': 'physical review',
|
|
49
|
+
'phys. rev. lett.': 'physical review letters',
|
|
50
|
+
'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
|
|
51
|
+
'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
|
|
52
|
+
'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
|
|
53
|
+
'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
|
|
54
|
+
'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
|
|
55
|
+
'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
|
|
56
|
+
'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
|
|
57
|
+
# Nature journals
|
|
58
|
+
'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
|
|
59
|
+
# Handle specific multi-word patterns and well-known acronyms
|
|
60
|
+
'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
|
|
61
|
+
'pnas': 'proceedings of the national academy of sciences',
|
|
62
|
+
'neurips': 'neural information processing systems',
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
# Sort by length (longest first) to ensure longer matches take precedence
|
|
66
|
+
for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
|
|
67
|
+
# For abbreviations ending in period, use word boundary at start only
|
|
68
|
+
if abbrev.endswith('.'):
|
|
69
|
+
pattern = r'\b' + re.escape(abbrev)
|
|
70
|
+
else:
|
|
71
|
+
pattern = r'\b' + re.escape(abbrev) + r'\b'
|
|
72
|
+
text = re.sub(pattern, expansion, text)
|
|
73
|
+
|
|
74
|
+
return text
|
|
75
|
+
|
|
76
|
+
|
|
14
77
|
def normalize_apostrophes(text):
|
|
15
78
|
"""
|
|
16
79
|
Normalize all apostrophe variants to standard ASCII apostrophe
|
|
@@ -4531,7 +4594,7 @@ def normalize_venue_for_display(venue: str) -> str:
|
|
|
4531
4594
|
prefixes_to_remove = [
|
|
4532
4595
|
r'^\d{4}\s+\d+(st|nd|rd|th)\s+', # "2012 IEEE/RSJ"
|
|
4533
4596
|
r'^\d{4}\s+', # "2024 "
|
|
4534
|
-
r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?
|
|
4597
|
+
r'^proceedings\s+(of\s+)?(the\s+)?((acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(\d+(st|nd|rd|th)\s+)?', # "Proceedings of the [ORG] [ORG] 29th"
|
|
4535
4598
|
r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proc. of the IEEE" (require "of")
|
|
4536
4599
|
r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Procs. of the IEEE" (require "of")
|
|
4537
4600
|
r'^in\s+',
|
|
@@ -209,7 +209,14 @@ def validate_url_format(url: str) -> bool:
|
|
|
209
209
|
|
|
210
210
|
def clean_url(url: str) -> str:
|
|
211
211
|
"""
|
|
212
|
-
Clean a URL by removing common issues like extra spaces, fragments, etc.
|
|
212
|
+
Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.
|
|
213
|
+
|
|
214
|
+
This function handles:
|
|
215
|
+
- Whitespace trimming
|
|
216
|
+
- Malformed LaTeX URL wrappers like \\url{https://...}
|
|
217
|
+
- Markdown-style links like [text](url)
|
|
218
|
+
- Trailing punctuation from academic references
|
|
219
|
+
- DOI URL query parameter cleanup
|
|
213
220
|
|
|
214
221
|
Args:
|
|
215
222
|
url: URL to clean
|
|
@@ -223,6 +230,26 @@ def clean_url(url: str) -> str:
|
|
|
223
230
|
# Remove leading/trailing whitespace
|
|
224
231
|
url = url.strip()
|
|
225
232
|
|
|
233
|
+
# Handle malformed URLs that contain \url{} wrappers within the URL text
|
|
234
|
+
# e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
|
|
235
|
+
import re
|
|
236
|
+
url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
|
|
237
|
+
url_match = re.search(url_pattern, url)
|
|
238
|
+
if url_match:
|
|
239
|
+
url = url_match.group(1)
|
|
240
|
+
|
|
241
|
+
# Handle markdown-style links like [text](url) or [url](url)
|
|
242
|
+
# e.g., "[https://example.com](https://example.com)" -> "https://example.com"
|
|
243
|
+
markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
|
|
244
|
+
markdown_match = re.search(markdown_pattern, url)
|
|
245
|
+
if markdown_match:
|
|
246
|
+
# Use the URL from parentheses
|
|
247
|
+
url = markdown_match.group(2)
|
|
248
|
+
|
|
249
|
+
# Remove trailing punctuation that's commonly part of sentence structure
|
|
250
|
+
# but preserve legitimate URL characters
|
|
251
|
+
url = url.rstrip('.,;!?)')
|
|
252
|
+
|
|
226
253
|
# Note: Preserving query parameters for all URLs now
|
|
227
254
|
# Previously this function removed query parameters for non-DOI URLs,
|
|
228
255
|
# but this was causing issues with OpenReview and other URLs that need their parameters
|
|
@@ -254,6 +281,22 @@ def clean_url_punctuation(url: str) -> str:
|
|
|
254
281
|
# Remove leading/trailing whitespace
|
|
255
282
|
url = url.strip()
|
|
256
283
|
|
|
284
|
+
# Handle malformed URLs that contain \\url{} wrappers within the URL text
|
|
285
|
+
# e.g., "https://\\url{https://www.example.com/}" -> "https://www.example.com/"
|
|
286
|
+
import re
|
|
287
|
+
url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
|
|
288
|
+
url_match = re.search(url_pattern, url)
|
|
289
|
+
if url_match:
|
|
290
|
+
url = url_match.group(1)
|
|
291
|
+
|
|
292
|
+
# Handle markdown-style links like [text](url) or [url](url)
|
|
293
|
+
# e.g., "[https://example.com](https://example.com)" -> "https://example.com"
|
|
294
|
+
markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
|
|
295
|
+
markdown_match = re.search(markdown_pattern, url)
|
|
296
|
+
if markdown_match:
|
|
297
|
+
# Use the URL from parentheses
|
|
298
|
+
url = markdown_match.group(2)
|
|
299
|
+
|
|
257
300
|
# Remove trailing punctuation that's commonly part of sentence structure
|
|
258
301
|
# but preserve legitimate URL characters
|
|
259
302
|
url = url.rstrip('.,;!?)')
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/enhanced_hybrid_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/local_semantic_scholar.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/openreview_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|