academic-refchecker 1.2.41__tar.gz → 1.2.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.41/src/academic_refchecker.egg-info → academic_refchecker-1.2.43}/PKG-INFO +1 -1
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/__version__.py +1 -1
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/openreview_checker.py +2 -1
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/semantic_scholar.py +2 -5
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/core/parallel_processor.py +4 -1
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/core/refchecker.py +0 -19
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/bibtex_parser.py +104 -27
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/error_utils.py +19 -1
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/text_utils.py +286 -9
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/url_utils.py +27 -1
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/LICENSE +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/README.md +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/pyproject.toml +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/requirements.txt +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/setup.cfg +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/arxiv_utils.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/unicode_utils.py +0 -0
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/openreview_checker.py
RENAMED
|
@@ -473,9 +473,10 @@ class OpenReviewReferenceChecker:
|
|
|
473
473
|
|
|
474
474
|
if cited_venue and paper_venue:
|
|
475
475
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
476
|
+
from utils.error_utils import clean_venue_for_comparison
|
|
476
477
|
errors.append({
|
|
477
478
|
"warning_type": "venue",
|
|
478
|
-
"warning_details": f"Venue mismatch: cited as '{cited_venue}' but OpenReview shows '{paper_venue}'"
|
|
479
|
+
"warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
|
|
479
480
|
})
|
|
480
481
|
|
|
481
482
|
# Create verified data structure
|
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/semantic_scholar.py
@@ -544,11 +544,8 @@ class NonArxivReferenceChecker:
|
|
|
544
544
|
if cited_venue and paper_venue:
|
|
545
545
|
# Use the utility function to check if venues are substantially different
|
|
546
546
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
'warning_details': f"Venue mismatch: cited as '{cited_venue}' but actually '{paper_venue}'",
|
|
550
|
-
'ref_venue_correct': paper_venue
|
|
551
|
-
})
|
|
547
|
+
from utils.error_utils import create_venue_warning
|
|
548
|
+
errors.append(create_venue_warning(cited_venue, paper_venue))
|
|
552
549
|
elif not cited_venue and paper_venue:
|
|
553
550
|
# Check if this is an arXiv paper first
|
|
554
551
|
external_ids = paper_data.get('externalIds', {})
|
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/core/parallel_processor.py
@@ -279,8 +279,11 @@ class ParallelReferenceProcessor:
|
|
|
279
279
|
from utils.text_utils import format_authors_for_display
|
|
280
280
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
281
281
|
year = reference.get('year', '')
|
|
282
|
-
# Get venue from either 'venue' or 'journal' field
|
|
282
|
+
# Get venue from either 'venue' or 'journal' field and clean it up
|
|
283
283
|
venue = reference.get('venue', '') or reference.get('journal', '')
|
|
284
|
+
if venue:
|
|
285
|
+
from utils.error_utils import clean_venue_for_comparison
|
|
286
|
+
venue = clean_venue_for_comparison(venue)
|
|
284
287
|
url = reference.get('url', '')
|
|
285
288
|
doi = reference.get('doi', '')
|
|
286
289
|
|
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/core/refchecker.py
@@ -5407,25 +5407,6 @@ class ArxivReferenceChecker:
|
|
|
5407
5407
|
if error_details:
|
|
5408
5408
|
subreason = self._categorize_unverified_reason(error_details)
|
|
5409
5409
|
print(f" Subreason: {subreason}")
|
|
5410
|
-
|
|
5411
|
-
year_str = self._format_year_string(reference.get('year'))
|
|
5412
|
-
|
|
5413
|
-
# Apply LaTeX cleaning and formatting to authors for display
|
|
5414
|
-
authors = reference.get('authors', [])
|
|
5415
|
-
if authors:
|
|
5416
|
-
from utils.text_utils import strip_latex_commands, format_authors_for_display
|
|
5417
|
-
cleaned_authors = [strip_latex_commands(author) for author in authors]
|
|
5418
|
-
authors_display = format_authors_for_display(cleaned_authors)
|
|
5419
|
-
else:
|
|
5420
|
-
authors_display = 'Unknown authors'
|
|
5421
|
-
|
|
5422
|
-
# Only show URL if it exists and is different from reference_url
|
|
5423
|
-
ref_url = reference.get('url', '').strip()
|
|
5424
|
-
if ref_url and ref_url != reference_url:
|
|
5425
|
-
# Clean trailing punctuation from URL display
|
|
5426
|
-
from utils.url_utils import clean_url_punctuation
|
|
5427
|
-
clean_ref_url = clean_url_punctuation(ref_url)
|
|
5428
|
-
print(f" URL: {clean_ref_url}")
|
|
5429
5410
|
|
|
5430
5411
|
def _categorize_unverified_reason(self, error_details):
|
|
5431
5412
|
"""Categorize the unverified error into checker error or not found"""
|
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/bibtex_parser.py
@@ -103,37 +103,99 @@ def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) ->
|
|
|
103
103
|
Returns:
|
|
104
104
|
Dictionary with parsed entry data
|
|
105
105
|
"""
|
|
106
|
-
# Extract fields using regex
|
|
107
106
|
fields = {}
|
|
108
107
|
|
|
109
|
-
#
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
108
|
+
# Use a more robust approach with manual parsing
|
|
109
|
+
i = 0
|
|
110
|
+
while i < len(content):
|
|
111
|
+
# Skip whitespace
|
|
112
|
+
while i < len(content) and content[i].isspace():
|
|
113
|
+
i += 1
|
|
114
|
+
|
|
115
|
+
if i >= len(content):
|
|
116
|
+
break
|
|
117
|
+
|
|
118
|
+
# Look for field name
|
|
119
|
+
field_start = i
|
|
120
|
+
while i < len(content) and (content[i].isalnum() or content[i] == '_'):
|
|
121
|
+
i += 1
|
|
122
|
+
|
|
123
|
+
if i == field_start:
|
|
124
|
+
i += 1 # Skip non-alphanumeric character
|
|
125
|
+
continue
|
|
126
|
+
|
|
127
|
+
field_name = content[field_start:i].lower()
|
|
128
|
+
|
|
129
|
+
# Skip whitespace
|
|
130
|
+
while i < len(content) and content[i].isspace():
|
|
131
|
+
i += 1
|
|
132
|
+
|
|
133
|
+
# Look for equals sign
|
|
134
|
+
if i >= len(content) or content[i] != '=':
|
|
135
|
+
continue
|
|
136
|
+
i += 1 # Skip '='
|
|
137
|
+
|
|
138
|
+
# Skip whitespace
|
|
139
|
+
while i < len(content) and content[i].isspace():
|
|
140
|
+
i += 1
|
|
141
|
+
|
|
142
|
+
if i >= len(content):
|
|
143
|
+
break
|
|
144
|
+
|
|
145
|
+
# Parse field value
|
|
146
|
+
field_value = ""
|
|
147
|
+
if content[i] == '"':
|
|
148
|
+
# Handle quoted strings
|
|
149
|
+
i += 1 # Skip opening quote
|
|
150
|
+
value_start = i
|
|
151
|
+
while i < len(content) and content[i] != '"':
|
|
152
|
+
i += 1
|
|
153
|
+
if i < len(content):
|
|
154
|
+
field_value = content[value_start:i]
|
|
155
|
+
i += 1 # Skip closing quote
|
|
156
|
+
elif content[i] == '{':
|
|
157
|
+
# Handle braced strings with proper nesting
|
|
158
|
+
brace_count = 0
|
|
159
|
+
value_start = i + 1 # Skip opening brace
|
|
160
|
+
i += 1
|
|
161
|
+
while i < len(content):
|
|
162
|
+
if content[i] == '{':
|
|
163
|
+
brace_count += 1
|
|
164
|
+
elif content[i] == '}':
|
|
165
|
+
if brace_count == 0:
|
|
166
|
+
break
|
|
167
|
+
brace_count -= 1
|
|
168
|
+
i += 1
|
|
169
|
+
|
|
170
|
+
if i < len(content):
|
|
171
|
+
field_value = content[value_start:i]
|
|
172
|
+
i += 1 # Skip closing brace
|
|
173
|
+
|
|
174
|
+
if field_value:
|
|
175
|
+
field_value = field_value.strip()
|
|
176
|
+
# Strip outer quotes if present (handles cases like title = {"Some Title"})
|
|
177
|
+
if field_value.startswith('"') and field_value.endswith('"'):
|
|
178
|
+
field_value = field_value[1:-1]
|
|
179
|
+
fields[field_name] = field_value
|
|
180
|
+
|
|
181
|
+
# Skip to next field (look for comma)
|
|
182
|
+
while i < len(content) and content[i] not in ',}':
|
|
183
|
+
i += 1
|
|
184
|
+
if i < len(content) and content[i] == ',':
|
|
185
|
+
i += 1
|
|
121
186
|
|
|
122
|
-
#
|
|
187
|
+
# Fallback to regex if manual parsing failed
|
|
123
188
|
if not fields:
|
|
124
|
-
logger.debug("
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
if field_value.startswith('"') and field_value.endswith('"'):
|
|
135
|
-
field_value = field_value[1:-1]
|
|
136
|
-
fields[field_name] = field_value
|
|
189
|
+
logger.debug("Manual parsing failed, trying regex approach")
|
|
190
|
+
field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
|
|
191
|
+
|
|
192
|
+
for match in re.finditer(field_pattern, content, re.DOTALL):
|
|
193
|
+
field_name = match.group(1).lower()
|
|
194
|
+
field_value = match.group(2) or match.group(3) or ""
|
|
195
|
+
field_value = field_value.strip()
|
|
196
|
+
if field_value.startswith('"') and field_value.endswith('"'):
|
|
197
|
+
field_value = field_value[1:-1]
|
|
198
|
+
fields[field_name] = field_value
|
|
137
199
|
|
|
138
200
|
return {
|
|
139
201
|
'type': entry_type,
|
|
@@ -216,6 +278,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
216
278
|
|
|
217
279
|
# Extract journal/venue
|
|
218
280
|
journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
|
|
281
|
+
# Remove braces from journal/venue names
|
|
282
|
+
if journal and journal.startswith('{') and journal.endswith('}'):
|
|
283
|
+
journal = journal[1:-1]
|
|
219
284
|
|
|
220
285
|
# Extract DOI and construct URL
|
|
221
286
|
doi = fields.get('doi', '')
|
|
@@ -225,6 +290,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
225
290
|
|
|
226
291
|
# Extract other URLs
|
|
227
292
|
url = fields.get('url', '')
|
|
293
|
+
if url:
|
|
294
|
+
from utils.url_utils import clean_url
|
|
295
|
+
url = clean_url(url)
|
|
228
296
|
|
|
229
297
|
# Handle special @misc entries with only howpublished field
|
|
230
298
|
if not title and not authors and entry_type == 'misc':
|
|
@@ -249,6 +317,10 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
249
317
|
else:
|
|
250
318
|
url = howpublished
|
|
251
319
|
|
|
320
|
+
# Clean the reconstructed URL
|
|
321
|
+
from utils.url_utils import clean_url
|
|
322
|
+
url = clean_url(url)
|
|
323
|
+
|
|
252
324
|
# Generate title from domain/path
|
|
253
325
|
if 'jailbreakchat.com' in domain:
|
|
254
326
|
title = 'JailbreakChat Website'
|
|
@@ -275,6 +347,11 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
|
|
|
275
347
|
|
|
276
348
|
if url.startswith('\\url{') and url.endswith('}'):
|
|
277
349
|
url = url[5:-1] # Remove \url{...}
|
|
350
|
+
|
|
351
|
+
# Clean any URL we extracted
|
|
352
|
+
if url:
|
|
353
|
+
from utils.url_utils import clean_url
|
|
354
|
+
url = clean_url(url)
|
|
278
355
|
|
|
279
356
|
# Construct ArXiv URL from eprint field if no URL present
|
|
280
357
|
if not url and not doi_url:
|
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/error_utils.py
@@ -89,6 +89,20 @@ def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]
|
|
|
89
89
|
}
|
|
90
90
|
|
|
91
91
|
|
|
92
|
+
def clean_venue_for_comparison(venue: str) -> str:
|
|
93
|
+
"""
|
|
94
|
+
Clean venue name for display in warnings using the shared normalization logic.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
venue: Raw venue string
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
Cleaned venue name suitable for display
|
|
101
|
+
"""
|
|
102
|
+
from utils.text_utils import normalize_venue_for_display
|
|
103
|
+
return normalize_venue_for_display(venue)
|
|
104
|
+
|
|
105
|
+
|
|
92
106
|
def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
|
|
93
107
|
"""
|
|
94
108
|
Create a standardized venue warning dictionary.
|
|
@@ -100,9 +114,13 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
|
|
|
100
114
|
Returns:
|
|
101
115
|
Standardized warning dictionary
|
|
102
116
|
"""
|
|
117
|
+
# Clean both venues for display in the warning
|
|
118
|
+
clean_cited = clean_venue_for_comparison(cited_venue)
|
|
119
|
+
clean_correct = clean_venue_for_comparison(correct_venue)
|
|
120
|
+
|
|
103
121
|
return {
|
|
104
122
|
'warning_type': 'venue',
|
|
105
|
-
'warning_details': f"Venue mismatch: cited as '{cited_venue}' but actually '{correct_venue}'",
|
|
123
|
+
'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
|
|
106
124
|
'ref_venue_correct': correct_venue
|
|
107
125
|
}
|
|
108
126
|
|
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/utils/text_utils.py
@@ -11,6 +11,69 @@ from typing import List
|
|
|
11
11
|
logger = logging.getLogger(__name__)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
def expand_abbreviations(text: str) -> str:
|
|
15
|
+
"""
|
|
16
|
+
Generic abbreviation expansion using common academic patterns.
|
|
17
|
+
|
|
18
|
+
This function expands common academic abbreviations to their full forms
|
|
19
|
+
to improve venue name matching and comparison.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
text: Text containing potential abbreviations
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
Text with abbreviations expanded
|
|
26
|
+
"""
|
|
27
|
+
if not text:
|
|
28
|
+
return text
|
|
29
|
+
|
|
30
|
+
common_abbrevs = {
|
|
31
|
+
# IEEE specific abbreviations (only expand with periods, not full words)
|
|
32
|
+
'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
|
|
33
|
+
'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
|
|
34
|
+
'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
|
|
35
|
+
'mechatron.': 'mechatronics', 'intell.': 'intelligence',
|
|
36
|
+
'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
|
|
37
|
+
# General academic abbreviations (only expand with periods)
|
|
38
|
+
'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
|
|
39
|
+
'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
|
|
40
|
+
'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
|
|
41
|
+
'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
|
|
42
|
+
'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
|
|
43
|
+
'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
|
|
44
|
+
'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
|
|
45
|
+
'workshop': 'workshop', 'worksh.': 'workshop',
|
|
46
|
+
'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
|
|
47
|
+
# Physics journal abbreviations
|
|
48
|
+
'phys.': 'physics', 'phys. rev.': 'physical review',
|
|
49
|
+
'phys. rev. lett.': 'physical review letters',
|
|
50
|
+
'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
|
|
51
|
+
'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
|
|
52
|
+
'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
|
|
53
|
+
'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
|
|
54
|
+
'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
|
|
55
|
+
'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
|
|
56
|
+
'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
|
|
57
|
+
# Nature journals
|
|
58
|
+
'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
|
|
59
|
+
# Handle specific multi-word patterns and well-known acronyms
|
|
60
|
+
'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
|
|
61
|
+
'pnas': 'proceedings of the national academy of sciences',
|
|
62
|
+
'neurips': 'neural information processing systems',
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
# Sort by length (longest first) to ensure longer matches take precedence
|
|
66
|
+
for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
|
|
67
|
+
# For abbreviations ending in period, use word boundary at start only
|
|
68
|
+
if abbrev.endswith('.'):
|
|
69
|
+
pattern = r'\b' + re.escape(abbrev)
|
|
70
|
+
else:
|
|
71
|
+
pattern = r'\b' + re.escape(abbrev) + r'\b'
|
|
72
|
+
text = re.sub(pattern, expansion, text)
|
|
73
|
+
|
|
74
|
+
return text
|
|
75
|
+
|
|
76
|
+
|
|
14
77
|
def normalize_apostrophes(text):
|
|
15
78
|
"""
|
|
16
79
|
Normalize all apostrophe variants to standard ASCII apostrophe
|
|
@@ -2255,8 +2318,13 @@ def format_author_for_display(author_name):
|
|
|
2255
2318
|
if not author_name:
|
|
2256
2319
|
return author_name
|
|
2257
2320
|
|
|
2321
|
+
# Clean up any stray punctuation that might have been attached during parsing
|
|
2322
|
+
author_name = author_name.strip()
|
|
2323
|
+
# Remove trailing semicolons that sometimes get attached during bibliographic parsing
|
|
2324
|
+
author_name = re.sub(r'[;,]\s*$', '', author_name)
|
|
2325
|
+
|
|
2258
2326
|
# Normalize apostrophes for consistent display
|
|
2259
|
-
author_name = normalize_apostrophes(author_name
|
|
2327
|
+
author_name = normalize_apostrophes(author_name)
|
|
2260
2328
|
|
|
2261
2329
|
# Check if it's in "Lastname, Firstname" format
|
|
2262
2330
|
if ',' in author_name:
|
|
@@ -3667,8 +3735,77 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
3667
3735
|
return bool(venue1 != venue2)
|
|
3668
3736
|
|
|
3669
3737
|
# Clean LaTeX commands from both venues first
|
|
3670
|
-
|
|
3671
|
-
|
|
3738
|
+
venue1_latex_cleaned = strip_latex_commands(venue1)
|
|
3739
|
+
venue2_latex_cleaned = strip_latex_commands(venue2)
|
|
3740
|
+
|
|
3741
|
+
# For comparison, we need lowercase normalized versions
|
|
3742
|
+
def normalize_for_comparison(venue_text):
|
|
3743
|
+
# Get the cleaned display version first
|
|
3744
|
+
cleaned = normalize_venue_for_display(venue_text)
|
|
3745
|
+
# Then normalize for comparison: lowercase, expand abbreviations, remove punctuation
|
|
3746
|
+
venue_lower = cleaned.lower()
|
|
3747
|
+
|
|
3748
|
+
# Handle LaTeX penalty commands before abbreviation expansion
|
|
3749
|
+
venue_lower = re.sub(r'\\penalty\d+\s*', ' ', venue_lower) # Remove \\penalty0 etc
|
|
3750
|
+
venue_lower = re.sub(r'\s+', ' ', venue_lower).strip() # Clean up extra spaces
|
|
3751
|
+
|
|
3752
|
+
# Expand abbreviations for comparison
|
|
3753
|
+
def expand_abbreviations(text):
|
|
3754
|
+
common_abbrevs = {
|
|
3755
|
+
# IEEE specific abbreviations (only expand with periods, not full words)
|
|
3756
|
+
'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
|
|
3757
|
+
'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
|
|
3758
|
+
'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
|
|
3759
|
+
'mechatron.': 'mechatronics', 'intell.': 'intelligence',
|
|
3760
|
+
'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
|
|
3761
|
+
# General academic abbreviations (only expand with periods)
|
|
3762
|
+
'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
|
|
3763
|
+
'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
|
|
3764
|
+
'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
|
|
3765
|
+
'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
|
|
3766
|
+
'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
|
|
3767
|
+
'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
|
|
3768
|
+
'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
|
|
3769
|
+
'workshop': 'workshop', 'worksh.': 'workshop',
|
|
3770
|
+
'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
|
|
3771
|
+
# Physics journal abbreviations
|
|
3772
|
+
'phys.': 'physics', 'phys. rev.': 'physical review',
|
|
3773
|
+
'phys. rev. lett.': 'physical review letters',
|
|
3774
|
+
'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
|
|
3775
|
+
'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
|
|
3776
|
+
'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
|
|
3777
|
+
'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
|
|
3778
|
+
'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
|
|
3779
|
+
'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
|
|
3780
|
+
'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
|
|
3781
|
+
# Nature journals
|
|
3782
|
+
'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
|
|
3783
|
+
# Handle specific multi-word patterns and well-known acronyms
|
|
3784
|
+
'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
|
|
3785
|
+
'pnas': 'proceedings of the national academy of sciences',
|
|
3786
|
+
}
|
|
3787
|
+
# Sort by length (longest first) to ensure longer matches take precedence
|
|
3788
|
+
for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
|
|
3789
|
+
# For abbreviations ending in period, use word boundary at start only
|
|
3790
|
+
if abbrev.endswith('.'):
|
|
3791
|
+
pattern = r'\b' + re.escape(abbrev)
|
|
3792
|
+
else:
|
|
3793
|
+
pattern = r'\b' + re.escape(abbrev) + r'\b'
|
|
3794
|
+
text = re.sub(pattern, expansion, text)
|
|
3795
|
+
return text
|
|
3796
|
+
|
|
3797
|
+
venue_lower = expand_abbreviations(venue_lower)
|
|
3798
|
+
|
|
3799
|
+
# Remove punctuation and normalize spacing for comparison
|
|
3800
|
+
venue_lower = re.sub(r'[.,;:]', '', venue_lower) # Remove punctuation
|
|
3801
|
+
venue_lower = re.sub(r'\\s+on\\s+', ' ', venue_lower) # Remove \"on\" preposition
|
|
3802
|
+
venue_lower = re.sub(r'\\s+for\\s+', ' ', venue_lower) # Remove \"for\" preposition
|
|
3803
|
+
venue_lower = re.sub(r'\\s+', ' ', venue_lower).strip() # Normalize whitespace
|
|
3804
|
+
|
|
3805
|
+
return venue_lower
|
|
3806
|
+
|
|
3807
|
+
normalized_venue1 = normalize_for_comparison(venue1_latex_cleaned)
|
|
3808
|
+
normalized_venue2 = normalize_for_comparison(venue2_latex_cleaned)
|
|
3672
3809
|
|
|
3673
3810
|
def expand_abbreviations(text):
|
|
3674
3811
|
"""Generic abbreviation expansion using common academic patterns"""
|
|
@@ -3985,8 +4122,8 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
3985
4122
|
if not acronym or not full_text:
|
|
3986
4123
|
return False
|
|
3987
4124
|
|
|
3988
|
-
#
|
|
3989
|
-
normalized_full =
|
|
4125
|
+
# Use the internal comparison normalization function
|
|
4126
|
+
normalized_full = normalize_for_comparison(full_text)
|
|
3990
4127
|
|
|
3991
4128
|
# Generate all possible acronyms from the full text
|
|
3992
4129
|
possible_acronyms = []
|
|
@@ -4100,9 +4237,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
4100
4237
|
if (arxiv1 == 'arxiv' and arxiv2.startswith('https://arxiv.org')) or (arxiv2 == 'arxiv' and arxiv1.startswith('https://arxiv.org')):
|
|
4101
4238
|
return False
|
|
4102
4239
|
|
|
4103
|
-
#
|
|
4104
|
-
norm1 =
|
|
4105
|
-
norm2 =
|
|
4240
|
+
# Use normalized venues from shared function
|
|
4241
|
+
norm1 = normalized_venue1
|
|
4242
|
+
norm2 = normalized_venue2
|
|
4106
4243
|
|
|
4107
4244
|
# Direct match after normalization (highest priority)
|
|
4108
4245
|
if norm1 == norm2:
|
|
@@ -4356,4 +4493,144 @@ def is_year_substantially_different(cited_year: int, correct_year: int, context:
|
|
|
4356
4493
|
|
|
4357
4494
|
# Any year difference should be flagged as a warning for manual review
|
|
4358
4495
|
warning_msg = f"Year mismatch: cited as {cited_year} but actually {correct_year}"
|
|
4359
|
-
return True, warning_msg
|
|
4496
|
+
return True, warning_msg
|
|
4497
|
+
|
|
4498
|
+
|
|
4499
|
+
def normalize_venue_for_display(venue: str) -> str:
|
|
4500
|
+
"""
|
|
4501
|
+
Normalize venue names for consistent display and comparison.
|
|
4502
|
+
|
|
4503
|
+
This function is used both for display in warnings and for venue comparison
|
|
4504
|
+
to ensure consistent normalization across the system.
|
|
4505
|
+
|
|
4506
|
+
Args:
|
|
4507
|
+
venue: Raw venue string
|
|
4508
|
+
|
|
4509
|
+
Returns:
|
|
4510
|
+
Normalized venue string with prefixes removed and abbreviations expanded
|
|
4511
|
+
"""
|
|
4512
|
+
if not venue:
|
|
4513
|
+
return ""
|
|
4514
|
+
|
|
4515
|
+
def expand_abbreviations(text):
|
|
4516
|
+
"""Generic abbreviation expansion using common academic patterns"""
|
|
4517
|
+
# Common academic abbreviations mapping
|
|
4518
|
+
common_abbrevs = {
|
|
4519
|
+
# IEEE specific abbreviations (only expand with periods, not full words)
|
|
4520
|
+
'robot.': 'robotics',
|
|
4521
|
+
'autom.': 'automation',
|
|
4522
|
+
'lett.': 'letters',
|
|
4523
|
+
'trans.': 'transactions',
|
|
4524
|
+
'syst.': 'systems',
|
|
4525
|
+
'netw.': 'networks',
|
|
4526
|
+
'learn.': 'learning',
|
|
4527
|
+
'ind.': 'industrial',
|
|
4528
|
+
'electron.': 'electronics',
|
|
4529
|
+
'mechatron.': 'mechatronics',
|
|
4530
|
+
'intell.': 'intelligence',
|
|
4531
|
+
'transp.': 'transportation',
|
|
4532
|
+
'contr.': 'control',
|
|
4533
|
+
'mag.': 'magazine',
|
|
4534
|
+
|
|
4535
|
+
# General academic abbreviations (only expand with periods)
|
|
4536
|
+
'int.': 'international',
|
|
4537
|
+
'intl.': 'international',
|
|
4538
|
+
'conf.': 'conference',
|
|
4539
|
+
'j.': 'journal',
|
|
4540
|
+
'proc.': 'proceedings',
|
|
4541
|
+
'assoc.': 'association',
|
|
4542
|
+
'comput.': 'computing',
|
|
4543
|
+
'sci.': 'science',
|
|
4544
|
+
'eng.': 'engineering',
|
|
4545
|
+
'tech.': 'technology',
|
|
4546
|
+
'artif.': 'artificial',
|
|
4547
|
+
'mach.': 'machine',
|
|
4548
|
+
'stat.': 'statistics',
|
|
4549
|
+
'math.': 'mathematics',
|
|
4550
|
+
'phys.': 'physics',
|
|
4551
|
+
'chem.': 'chemistry',
|
|
4552
|
+
'bio.': 'biology',
|
|
4553
|
+
'med.': 'medicine',
|
|
4554
|
+
'adv.': 'advances',
|
|
4555
|
+
'ann.': 'annual',
|
|
4556
|
+
'symp.': 'symposium',
|
|
4557
|
+
'workshop': 'workshop',
|
|
4558
|
+
'worksh.': 'workshop',
|
|
4559
|
+
}
|
|
4560
|
+
|
|
4561
|
+
text_lower = text.lower()
|
|
4562
|
+
for abbrev, expansion in common_abbrevs.items():
|
|
4563
|
+
# Only replace if it's a word boundary to avoid partial replacements
|
|
4564
|
+
pattern = r'\b' + re.escape(abbrev) + r'\b'
|
|
4565
|
+
text_lower = re.sub(pattern, expansion, text_lower)
|
|
4566
|
+
|
|
4567
|
+
return text_lower
|
|
4568
|
+
|
|
4569
|
+
venue_text = venue.strip()
|
|
4570
|
+
|
|
4571
|
+
# Extract venue from complex editor strings (e.g. "In Smith, J.; and Doe, K., eds., Conference Name, volume 1")
|
|
4572
|
+
# This handles patterns like "In [authors], eds., [venue], [optional metadata]" (case-insensitive)
|
|
4573
|
+
editor_match = re.search(r'in\s+[^,]+(?:,\s*[^,]*)*,\s*eds?\.,\s*(.+?)(?:,\s*volume\s*\d+|,\s*pp?\.|$)', venue_text, re.IGNORECASE)
|
|
4574
|
+
if editor_match:
|
|
4575
|
+
# Extract the venue part from editor string (preserve original case)
|
|
4576
|
+
venue_text = editor_match.group(1).strip()
|
|
4577
|
+
# Clean up any remaining metadata like "volume X of Proceedings..." (case-insensitive)
|
|
4578
|
+
venue_text = re.sub(r',\s*volume\s+\d+.*$', '', venue_text, flags=re.IGNORECASE)
|
|
4579
|
+
venue_text = re.sub(r'\s+of\s+proceedings.*$', '', venue_text, flags=re.IGNORECASE)
|
|
4580
|
+
|
|
4581
|
+
# Remove years, volumes, pages, and other citation metadata
|
|
4582
|
+
# But preserve arXiv IDs (don't remove digits after arXiv:)
|
|
4583
|
+
if not re.match(r'arxiv:', venue_text, re.IGNORECASE):
|
|
4584
|
+
venue_text = re.sub(r',?\s*\d{4}[a-z]?\s*$', '', venue_text) # Years like "2024" or "2024b"
|
|
4585
|
+
venue_text = re.sub(r',?\s*\(\d{4}\)$', '', venue_text) # Years in parentheses
|
|
4586
|
+
venue_text = re.sub(r"'\d{2}$", '', venue_text) # Year suffixes like 'CVPR'16'
|
|
4587
|
+
venue_text = re.sub(r',?\s*(vol\.?\s*|volume\s*)\d+.*$', '', venue_text, flags=re.IGNORECASE) # Volume info
|
|
4588
|
+
venue_text = re.sub(r',?\s*\d+\s*\([^)]*\).*$', '', venue_text) # Issue info with optional spaces
|
|
4589
|
+
venue_text = re.sub(r',?\s*pp?\.\s*\d+.*$', '', venue_text, flags=re.IGNORECASE) # Page info
|
|
4590
|
+
venue_text = re.sub(r'\s*\(print\).*$', '', venue_text, flags=re.IGNORECASE) # Print designation
|
|
4591
|
+
venue_text = re.sub(r'\s*\(\d{4}\.\s*print\).*$', '', venue_text, flags=re.IGNORECASE) # Year.Print
|
|
4592
|
+
|
|
4593
|
+
# Remove procedural prefixes (case-insensitive)
|
|
4594
|
+
prefixes_to_remove = [
|
|
4595
|
+
r'^\d{4}\s+\d+(st|nd|rd|th)\s+', # "2012 25th " (year followed by a numeric ordinal)
|
|
4596
|
+
r'^\d{4}\s+', # "2024 "
|
|
4597
|
+
r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proceedings of the IEEE"
|
|
4598
|
+
r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proc. of the IEEE" (require "of")
|
|
4599
|
+
r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Procs. of the IEEE" (require "of")
|
|
4600
|
+
r'^in\s+',
|
|
4601
|
+
r'^advances\s+in\s+', # "Advances in Neural Information Processing Systems"
|
|
4602
|
+
r'^adv\.\s+', # "Adv. Neural Information Processing Systems"
|
|
4603
|
+
# Handle ordinal prefixes: "The Twelfth", "The Ninth", etc.
|
|
4604
|
+
r'^the\s+(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|thirty-second|thirty-third|thirty-fourth|thirty-fifth|thirty-sixth|thirty-seventh|thirty-eighth|thirty-ninth|fortieth|forty-first|forty-second|forty-third|forty-fourth|forty-fifth|forty-sixth|forty-seventh|forty-eighth|forty-ninth|fiftieth)\s+',
|
|
4605
|
+
# Handle numeric ordinals: "The 41st", "The 12th", etc.
|
|
4606
|
+
r'^the\s+\d+(st|nd|rd|th)\s+',
|
|
4607
|
+
# Handle standalone "The" prefix
|
|
4608
|
+
r'^the\s+',
|
|
4609
|
+
]
|
|
4610
|
+
|
|
4611
|
+
for prefix_pattern in prefixes_to_remove:
|
|
4612
|
+
venue_text = re.sub(prefix_pattern, '', venue_text, flags=re.IGNORECASE)
|
|
4613
|
+
|
|
4614
|
+
# Note: For display purposes, we preserve case and don't expand abbreviations
|
|
4615
|
+
# Only do minimal cleaning needed for proper display
|
|
4616
|
+
|
|
4617
|
+
# Remove organization prefixes/suffixes that don't affect identity (case-insensitive)
|
|
4618
|
+
# But preserve IEEE when it's part of a journal name like "IEEE Transactions"
|
|
4619
|
+
if not re.match(r'ieee\s+transactions', venue_text, re.IGNORECASE):
|
|
4620
|
+
venue_text = re.sub(r'^(ieee|acm|aaai|usenix|sigcomm|sigkdd|sigmod|vldb|osdi|sosp|eurosys)\s+', '', venue_text, flags=re.IGNORECASE) # Remove org prefixes
|
|
4621
|
+
venue_text = re.sub(r'^ieee/\w+\s+', '', venue_text, flags=re.IGNORECASE) # Remove "IEEE/RSJ " etc
|
|
4622
|
+
venue_text = re.sub(r'\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE) # Remove org suffixes
|
|
4623
|
+
venue_text = re.sub(r'/\w+\s+', ' ', venue_text) # Remove "/ACM " style org separators
|
|
4624
|
+
|
|
4625
|
+
# IMPORTANT: Don't remove "Conference on" or "International" - they're needed for display
|
|
4626
|
+
# Only remove specific org-prefixed conference patterns where the org is clear
|
|
4627
|
+
venue_text = re.sub(r'^(ieee|acm|aaai|nips)(/\w+)?\s+conference\s+on\s+', '', venue_text, flags=re.IGNORECASE)
|
|
4628
|
+
|
|
4629
|
+
# Note: Don't remove "Conference on" as it's often part of the actual venue name
|
|
4630
|
+
# Only remove it if it's clearly a procedural prefix (handled in prefixes_to_remove above)
|
|
4631
|
+
|
|
4632
|
+
# Clean up spacing (preserve punctuation and case for display)
|
|
4633
|
+
venue_text = re.sub(r'\s+', ' ', venue_text) # Normalize whitespace
|
|
4634
|
+
venue_text = venue_text.strip()
|
|
4635
|
+
|
|
4636
|
+
return venue_text
|
|
@@ -209,7 +209,13 @@ def validate_url_format(url: str) -> bool:
|
|
|
209
209
|
|
|
210
210
|
def clean_url(url: str) -> str:
|
|
211
211
|
"""
|
|
212
|
-
Clean a URL by removing common issues like extra spaces, fragments, etc.
|
|
212
|
+
Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.
|
|
213
|
+
|
|
214
|
+
This function handles:
|
|
215
|
+
- Whitespace trimming
|
|
216
|
+
- Malformed LaTeX URL wrappers like \\url{https://...}
|
|
217
|
+
- Trailing punctuation from academic references
|
|
218
|
+
- DOI URL query parameter cleanup
|
|
213
219
|
|
|
214
220
|
Args:
|
|
215
221
|
url: URL to clean
|
|
@@ -223,6 +229,18 @@ def clean_url(url: str) -> str:
|
|
|
223
229
|
# Remove leading/trailing whitespace
|
|
224
230
|
url = url.strip()
|
|
225
231
|
|
|
232
|
+
# Handle malformed URLs that contain \url{} wrappers within the URL text
|
|
233
|
+
# e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
|
|
234
|
+
import re
|
|
235
|
+
url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
|
|
236
|
+
url_match = re.search(url_pattern, url)
|
|
237
|
+
if url_match:
|
|
238
|
+
url = url_match.group(1)
|
|
239
|
+
|
|
240
|
+
# Remove trailing punctuation that's commonly part of sentence structure
|
|
241
|
+
# but preserve legitimate URL characters
|
|
242
|
+
url = url.rstrip('.,;!?)')
|
|
243
|
+
|
|
226
244
|
# Note: Preserving query parameters for all URLs now
|
|
227
245
|
# Previously this function removed query parameters for non-DOI URLs,
|
|
228
246
|
# but this was causing issues with OpenReview and other URLs that need their parameters
|
|
@@ -254,6 +272,14 @@ def clean_url_punctuation(url: str) -> str:
|
|
|
254
272
|
# Remove leading/trailing whitespace
|
|
255
273
|
url = url.strip()
|
|
256
274
|
|
|
275
|
+
# Handle malformed URLs that contain \url{} wrappers within the URL text
|
|
276
|
+
# e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
|
|
277
|
+
import re
|
|
278
|
+
url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
|
|
279
|
+
url_match = re.search(url_pattern, url)
|
|
280
|
+
if url_match:
|
|
281
|
+
url = url_match.group(1)
|
|
282
|
+
|
|
257
283
|
# Remove trailing punctuation that's commonly part of sentence structure
|
|
258
284
|
# but preserve legitimate URL characters
|
|
259
285
|
url = url.rstrip('.,;!?)')
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/enhanced_hybrid_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.41 → academic_refchecker-1.2.43}/src/checkers/local_semantic_scholar.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|