academic-refchecker 1.2.41__py3-none-any.whl → 1.2.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.41"
3
+ __version__ = "1.2.43"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.41
3
+ Version: 1.2.43
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,21 +1,21 @@
1
- __version__.py,sha256=tbn_aH6OynXsBIY3A9fiPSRAJGwMoVxGjhXqbRQuZ9g,65
2
- academic_refchecker-1.2.41.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ __version__.py,sha256=JbybFux4Juuafz1jN0cgsedPmzBO8U9DJ874tJu2saA,65
2
+ academic_refchecker-1.2.43.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
3
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
4
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
5
5
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
6
6
  checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
7
7
  checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
8
8
  checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
9
- checkers/openreview_checker.py,sha256=QRQXUk1Ws-e-wETSeLgq06WmHQrjUk17my_Zj4rrwmY,20303
10
- checkers/semantic_scholar.py,sha256=YHR9nWaT7aieyczVMRKCPHr3k_Hl8g1rzd0k4f3bDTs,35022
9
+ checkers/openreview_checker.py,sha256=FLh21F0Zr7Gj3BI0u-gE6IwGNOZiRcViirDBeNvUp94,20432
10
+ checkers/semantic_scholar.py,sha256=BelhyIJ-W8navRdqEGpk12CIXYWmVL2Cq8HHZR7ynJs,34905
11
11
  checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
12
12
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
13
13
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
14
14
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
15
15
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
16
16
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
17
- core/parallel_processor.py,sha256=_CMR27Z3Nh325ZO4syteso93t0x4cGQACVyXM1V8Ucw,16886
18
- core/refchecker.py,sha256=ElXgD1iPI-rDDFZmCPMZpkIP4UeX3nPAJVCfsVPNgcw,274640
17
+ core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
18
+ core/refchecker.py,sha256=sVRg3PUzrs2vLFlEBoi4bxUy-TpO5iQHCkokGas-ygQ,273616
19
19
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
20
20
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
21
21
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,17 +30,17 @@ utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
30
30
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
31
31
  utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
32
32
  utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
33
- utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
33
+ utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
34
34
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
35
35
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
36
36
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
37
- utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
37
+ utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
38
38
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
39
- utils/text_utils.py,sha256=mwF038oE_e3xR8V6FUXf2k8BdiQx-VbIfB-yQ0j1huY,190784
39
+ utils/text_utils.py,sha256=jPgCOBTVboLRJyypoOtL-dg1wBDQrKBux2ImvC6wL58,206296
40
40
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
41
- utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
42
- academic_refchecker-1.2.41.dist-info/METADATA,sha256=WbuXtXUEZIgztArmnM9y4F2VOChXf71-ZRTPQSk2tsI,22298
43
- academic_refchecker-1.2.41.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- academic_refchecker-1.2.41.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
- academic_refchecker-1.2.41.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
- academic_refchecker-1.2.41.dist-info/RECORD,,
41
+ utils/url_utils.py,sha256=aq1hSYEA888bOKuBOGWRclgTFIjw32rpFdsBO_Ja8ZM,8402
42
+ academic_refchecker-1.2.43.dist-info/METADATA,sha256=ZsJhIw1n7Yjoug6mpV4zpAPf-eSW5xSMdd3Dl_WTOlI,22298
43
+ academic_refchecker-1.2.43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ academic_refchecker-1.2.43.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
+ academic_refchecker-1.2.43.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
+ academic_refchecker-1.2.43.dist-info/RECORD,,
@@ -473,9 +473,10 @@ class OpenReviewReferenceChecker:
473
473
 
474
474
  if cited_venue and paper_venue:
475
475
  if are_venues_substantially_different(cited_venue, paper_venue):
476
+ from utils.error_utils import clean_venue_for_comparison
476
477
  errors.append({
477
478
  "warning_type": "venue",
478
- "warning_details": f"Venue mismatch: cited as '{cited_venue}' but OpenReview shows '{paper_venue}'"
479
+ "warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
479
480
  })
480
481
 
481
482
  # Create verified data structure
@@ -544,11 +544,8 @@ class NonArxivReferenceChecker:
544
544
  if cited_venue and paper_venue:
545
545
  # Use the utility function to check if venues are substantially different
546
546
  if are_venues_substantially_different(cited_venue, paper_venue):
547
- errors.append({
548
- 'warning_type': 'venue',
549
- 'warning_details': f"Venue mismatch: cited as '{cited_venue}' but actually '{paper_venue}'",
550
- 'ref_venue_correct': paper_venue
551
- })
547
+ from utils.error_utils import create_venue_warning
548
+ errors.append(create_venue_warning(cited_venue, paper_venue))
552
549
  elif not cited_venue and paper_venue:
553
550
  # Check if this is an arXiv paper first
554
551
  external_ids = paper_data.get('externalIds', {})
@@ -279,8 +279,11 @@ class ParallelReferenceProcessor:
279
279
  from utils.text_utils import format_authors_for_display
280
280
  authors = format_authors_for_display(reference.get('authors', []))
281
281
  year = reference.get('year', '')
282
- # Get venue from either 'venue' or 'journal' field
282
+ # Get venue from either 'venue' or 'journal' field and clean it up
283
283
  venue = reference.get('venue', '') or reference.get('journal', '')
284
+ if venue:
285
+ from utils.error_utils import clean_venue_for_comparison
286
+ venue = clean_venue_for_comparison(venue)
284
287
  url = reference.get('url', '')
285
288
  doi = reference.get('doi', '')
286
289
 
core/refchecker.py CHANGED
@@ -5407,25 +5407,6 @@ class ArxivReferenceChecker:
5407
5407
  if error_details:
5408
5408
  subreason = self._categorize_unverified_reason(error_details)
5409
5409
  print(f" Subreason: {subreason}")
5410
-
5411
- year_str = self._format_year_string(reference.get('year'))
5412
-
5413
- # Apply LaTeX cleaning and formatting to authors for display
5414
- authors = reference.get('authors', [])
5415
- if authors:
5416
- from utils.text_utils import strip_latex_commands, format_authors_for_display
5417
- cleaned_authors = [strip_latex_commands(author) for author in authors]
5418
- authors_display = format_authors_for_display(cleaned_authors)
5419
- else:
5420
- authors_display = 'Unknown authors'
5421
-
5422
- # Only show URL if it exists and is different from reference_url
5423
- ref_url = reference.get('url', '').strip()
5424
- if ref_url and ref_url != reference_url:
5425
- # Clean trailing punctuation from URL display
5426
- from utils.url_utils import clean_url_punctuation
5427
- clean_ref_url = clean_url_punctuation(ref_url)
5428
- print(f" URL: {clean_ref_url}")
5429
5410
 
5430
5411
  def _categorize_unverified_reason(self, error_details):
5431
5412
  """Categorize the unverified error into checker error or not found"""
utils/bibtex_parser.py CHANGED
@@ -103,37 +103,99 @@ def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) ->
103
103
  Returns:
104
104
  Dictionary with parsed entry data
105
105
  """
106
- # Extract fields using regex
107
106
  fields = {}
108
107
 
109
- # Pattern to match field = {value} or field = "value"
110
- # Handle nested braces properly
111
- field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
112
-
113
- for match in re.finditer(field_pattern, content, re.DOTALL):
114
- field_name = match.group(1).lower()
115
- field_value = match.group(2) or match.group(3) or ""
116
- # Strip outer quotes if present (handles cases like title = {"Some Title"})
117
- field_value = field_value.strip()
118
- if field_value.startswith('"') and field_value.endswith('"'):
119
- field_value = field_value[1:-1]
120
- fields[field_name] = field_value
108
+ # Use a more robust approach with manual parsing
109
+ i = 0
110
+ while i < len(content):
111
+ # Skip whitespace
112
+ while i < len(content) and content[i].isspace():
113
+ i += 1
114
+
115
+ if i >= len(content):
116
+ break
117
+
118
+ # Look for field name
119
+ field_start = i
120
+ while i < len(content) and (content[i].isalnum() or content[i] == '_'):
121
+ i += 1
122
+
123
+ if i == field_start:
124
+ i += 1 # Skip non-alphanumeric character
125
+ continue
126
+
127
+ field_name = content[field_start:i].lower()
128
+
129
+ # Skip whitespace
130
+ while i < len(content) and content[i].isspace():
131
+ i += 1
132
+
133
+ # Look for equals sign
134
+ if i >= len(content) or content[i] != '=':
135
+ continue
136
+ i += 1 # Skip '='
137
+
138
+ # Skip whitespace
139
+ while i < len(content) and content[i].isspace():
140
+ i += 1
141
+
142
+ if i >= len(content):
143
+ break
144
+
145
+ # Parse field value
146
+ field_value = ""
147
+ if content[i] == '"':
148
+ # Handle quoted strings
149
+ i += 1 # Skip opening quote
150
+ value_start = i
151
+ while i < len(content) and content[i] != '"':
152
+ i += 1
153
+ if i < len(content):
154
+ field_value = content[value_start:i]
155
+ i += 1 # Skip closing quote
156
+ elif content[i] == '{':
157
+ # Handle braced strings with proper nesting
158
+ brace_count = 0
159
+ value_start = i + 1 # Skip opening brace
160
+ i += 1
161
+ while i < len(content):
162
+ if content[i] == '{':
163
+ brace_count += 1
164
+ elif content[i] == '}':
165
+ if brace_count == 0:
166
+ break
167
+ brace_count -= 1
168
+ i += 1
169
+
170
+ if i < len(content):
171
+ field_value = content[value_start:i]
172
+ i += 1 # Skip closing brace
173
+
174
+ if field_value:
175
+ field_value = field_value.strip()
176
+ # Strip outer quotes if present (handles cases like title = {"Some Title"})
177
+ if field_value.startswith('"') and field_value.endswith('"'):
178
+ field_value = field_value[1:-1]
179
+ fields[field_name] = field_value
180
+
181
+ # Skip to next field (look for comma)
182
+ while i < len(content) and content[i] not in ',}':
183
+ i += 1
184
+ if i < len(content) and content[i] == ',':
185
+ i += 1
121
186
 
122
- # If field extraction failed, try a simpler approach
187
+ # Fallback to regex if manual parsing failed
123
188
  if not fields:
124
- logger.debug("Field extraction failed, trying line-by-line approach")
125
- lines = content.split('\n')
126
- for line in lines:
127
- line = line.strip()
128
- if '=' in line:
129
- field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
130
- if field_match:
131
- field_name = field_match.group(1).lower()
132
- field_value = field_match.group(2).strip()
133
- # Strip outer quotes if present
134
- if field_value.startswith('"') and field_value.endswith('"'):
135
- field_value = field_value[1:-1]
136
- fields[field_name] = field_value
189
+ logger.debug("Manual parsing failed, trying regex approach")
190
+ field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
191
+
192
+ for match in re.finditer(field_pattern, content, re.DOTALL):
193
+ field_name = match.group(1).lower()
194
+ field_value = match.group(2) or match.group(3) or ""
195
+ field_value = field_value.strip()
196
+ if field_value.startswith('"') and field_value.endswith('"'):
197
+ field_value = field_value[1:-1]
198
+ fields[field_name] = field_value
137
199
 
138
200
  return {
139
201
  'type': entry_type,
@@ -216,6 +278,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
216
278
 
217
279
  # Extract journal/venue
218
280
  journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
281
+ # Remove braces from journal/venue names
282
+ if journal and journal.startswith('{') and journal.endswith('}'):
283
+ journal = journal[1:-1]
219
284
 
220
285
  # Extract DOI and construct URL
221
286
  doi = fields.get('doi', '')
@@ -225,6 +290,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
225
290
 
226
291
  # Extract other URLs
227
292
  url = fields.get('url', '')
293
+ if url:
294
+ from utils.url_utils import clean_url
295
+ url = clean_url(url)
228
296
 
229
297
  # Handle special @misc entries with only howpublished field
230
298
  if not title and not authors and entry_type == 'misc':
@@ -249,6 +317,10 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
249
317
  else:
250
318
  url = howpublished
251
319
 
320
+ # Clean the reconstructed URL
321
+ from utils.url_utils import clean_url
322
+ url = clean_url(url)
323
+
252
324
  # Generate title from domain/path
253
325
  if 'jailbreakchat.com' in domain:
254
326
  title = 'JailbreakChat Website'
@@ -275,6 +347,11 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
275
347
 
276
348
  if url.startswith('\\url{') and url.endswith('}'):
277
349
  url = url[5:-1] # Remove \url{...}
350
+
351
+ # Clean any URL we extracted
352
+ if url:
353
+ from utils.url_utils import clean_url
354
+ url = clean_url(url)
278
355
 
279
356
  # Construct ArXiv URL from eprint field if no URL present
280
357
  if not url and not doi_url:
utils/error_utils.py CHANGED
@@ -89,6 +89,20 @@ def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]
89
89
  }
90
90
 
91
91
 
92
def clean_venue_for_comparison(venue: str) -> str:
    """
    Return a cleaned-up venue name for use in warning messages.

    Thin wrapper that forwards to ``normalize_venue_for_display`` from
    ``utils.text_utils`` so that every caller shares one normalization.

    Args:
        venue: Raw venue string

    Returns:
        The value produced by ``normalize_venue_for_display`` for ``venue``
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import with utils.text_utils - confirm).
    from utils.text_utils import normalize_venue_for_display as _normalize_venue

    cleaned_venue = _normalize_venue(venue)
    return cleaned_venue
104
+
105
+
92
106
  def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
93
107
  """
94
108
  Create a standardized venue warning dictionary.
@@ -100,9 +114,13 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
100
114
  Returns:
101
115
  Standardized warning dictionary
102
116
  """
117
+ # Clean both venues for display in the warning
118
+ clean_cited = clean_venue_for_comparison(cited_venue)
119
+ clean_correct = clean_venue_for_comparison(correct_venue)
120
+
103
121
  return {
104
122
  'warning_type': 'venue',
105
- 'warning_details': f"Venue mismatch: cited as '{cited_venue}' but actually '{correct_venue}'",
123
+ 'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
106
124
  'ref_venue_correct': correct_venue
107
125
  }
108
126
 
utils/text_utils.py CHANGED
@@ -11,6 +11,69 @@ from typing import List
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
14
# Lowercase abbreviation -> expansion table used by expand_abbreviations().
# Keys ending in '.' are abbreviations proper; the others are whole words or
# well-known acronyms. Each key appears exactly once.
_ABBREVIATION_EXPANSIONS = {
    # IEEE specific abbreviations (only expand with periods, not full words)
    'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
    'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
    'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
    'mechatron.': 'mechatronics', 'intell.': 'intelligence',
    'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
    # General academic abbreviations (only expand with periods)
    'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
    'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
    'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
    'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
    'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
    'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
    'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
    'worksh.': 'workshop',
    'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
    # Physics journal abbreviations
    'phys. rev.': 'physical review',
    'phys. rev. lett.': 'physical review letters',
    'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
    'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
    'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
    'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
    'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
    'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
    'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
    # Nature journals
    'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
    # Handle specific multi-word patterns and well-known acronyms
    'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
    'pnas': 'proceedings of the national academy of sciences',
    'neurips': 'neural information processing systems',
}


def _compile_abbreviation_rules(table):
    """Turn the abbreviation table into (compiled pattern, expansion) pairs, longest key first."""
    rules = []
    # Longest abbreviation first so that e.g. 'phys. rev. lett.' is replaced
    # before its shorter components 'phys.', 'rev.' and 'lett.'.
    for abbrev, expansion in sorted(table.items(), key=lambda item: len(item[0]), reverse=True):
        pattern = r'\b' + re.escape(abbrev)
        if not abbrev.endswith('.'):
            # Whole words/acronyms need a boundary on both sides; for keys that
            # end in a period the period itself terminates the match.
            pattern += r'\b'
        rules.append((re.compile(pattern), expansion))
    return tuple(rules)


# Built once at import time instead of on every call.
_ABBREVIATION_RULES = _compile_abbreviation_rules(_ABBREVIATION_EXPANSIONS)


def expand_abbreviations(text: str) -> str:
    """
    Generic abbreviation expansion using common academic patterns.

    Expands common academic abbreviations (e.g. "proc." -> "proceedings",
    "phys. rev. lett." -> "physical review letters") to their full forms
    to improve venue name matching and comparison.

    Matching is case-sensitive and every known abbreviation is lowercase, so
    only lowercase occurrences are expanded; mixed-case text such as "Proc."
    is left untouched. Lowercase the text first if full coverage is needed.

    Args:
        text: Text containing potential abbreviations

    Returns:
        Text with abbreviations expanded; falsy input (None, "") is returned
        unchanged
    """
    if not text:
        return text

    # Rules are applied one after another, longest abbreviation first.
    for pattern, expansion in _ABBREVIATION_RULES:
        text = pattern.sub(expansion, text)

    return text
75
+
76
+
14
77
  def normalize_apostrophes(text):
15
78
  """
16
79
  Normalize all apostrophe variants to standard ASCII apostrophe
@@ -2255,8 +2318,13 @@ def format_author_for_display(author_name):
2255
2318
  if not author_name:
2256
2319
  return author_name
2257
2320
 
2321
+ # Clean up any stray punctuation that might have been attached during parsing
2322
+ author_name = author_name.strip()
2323
+ # Remove a trailing semicolon or comma that sometimes gets attached during bibliographic parsing
2324
+ author_name = re.sub(r'[;,]\s*$', '', author_name)
2325
+
2258
2326
  # Normalize apostrophes for consistent display
2259
- author_name = normalize_apostrophes(author_name.strip())
2327
+ author_name = normalize_apostrophes(author_name)
2260
2328
 
2261
2329
  # Check if it's in "Lastname, Firstname" format
2262
2330
  if ',' in author_name:
@@ -3667,8 +3735,77 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
3667
3735
  return bool(venue1 != venue2)
3668
3736
 
3669
3737
  # Clean LaTeX commands from both venues first
3670
- venue1 = strip_latex_commands(venue1)
3671
- venue2 = strip_latex_commands(venue2)
3738
+ venue1_latex_cleaned = strip_latex_commands(venue1)
3739
+ venue2_latex_cleaned = strip_latex_commands(venue2)
3740
+
3741
+ # For comparison, we need lowercase normalized versions
3742
def normalize_for_comparison(venue_text):
    """
    Build the lowercase, punctuation-free form of a venue used for comparison.

    Steps, in order:
      1. clean the venue with ``normalize_venue_for_display``;
      2. lowercase it and drop LaTeX ``\\penaltyN`` commands;
      3. expand lowercase abbreviations (longest abbreviation first);
      4. remove ``. , ; :``, drop the prepositions "on" and "for" when they
         stand between other words, and collapse whitespace.

    Args:
        venue_text: Venue string (the visible callers pass text that already
            went through ``strip_latex_commands``)

    Returns:
        Normalized lowercase venue string
    """
    # Get the cleaned display version first
    cleaned = normalize_venue_for_display(venue_text)
    # Then normalize for comparison: lowercase, expand abbreviations, remove punctuation
    venue_lower = cleaned.lower()

    # Handle LaTeX penalty commands before abbreviation expansion
    venue_lower = re.sub(r'\\penalty\d+\s*', ' ', venue_lower)  # Remove \penalty0 etc
    venue_lower = re.sub(r'\s+', ' ', venue_lower).strip()  # Clean up extra spaces

    # Expand abbreviations for comparison
    def expand_abbreviations(text):
        common_abbrevs = {
            # IEEE specific abbreviations (only expand with periods, not full words)
            'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
            'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
            'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
            'mechatron.': 'mechatronics', 'intell.': 'intelligence',
            'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
            # General academic abbreviations (only expand with periods)
            'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
            'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
            'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
            'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
            'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
            'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
            'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
            'worksh.': 'workshop',
            'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
            # Physics journal abbreviations
            'phys. rev.': 'physical review',
            'phys. rev. lett.': 'physical review letters',
            'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
            'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
            'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
            'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
            'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
            'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
            'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
            # Nature journals
            'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
            # Handle specific multi-word patterns and well-known acronyms
            'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
            'pnas': 'proceedings of the national academy of sciences',
        }
        # Sort by length (longest first) to ensure longer matches take precedence
        for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
            # For abbreviations ending in period, use word boundary at start only
            if abbrev.endswith('.'):
                pattern = r'\b' + re.escape(abbrev)
            else:
                pattern = r'\b' + re.escape(abbrev) + r'\b'
            text = re.sub(pattern, expansion, text)
        return text

    venue_lower = expand_abbreviations(venue_lower)

    # Remove punctuation and normalize spacing for comparison.
    # These are raw strings, so whitespace is written as a single-backslash \s;
    # a doubled backslash would match a literal backslash and never fire.
    venue_lower = re.sub(r'[.,;:]', '', venue_lower)  # Remove punctuation
    venue_lower = re.sub(r'\s+on\s+', ' ', venue_lower)  # Remove "on" preposition
    venue_lower = re.sub(r'\s+for\s+', ' ', venue_lower)  # Remove "for" preposition
    venue_lower = re.sub(r'\s+', ' ', venue_lower).strip()  # Normalize whitespace

    return venue_lower
3806
+
3807
+ normalized_venue1 = normalize_for_comparison(venue1_latex_cleaned)
3808
+ normalized_venue2 = normalize_for_comparison(venue2_latex_cleaned)
3672
3809
 
3673
3810
  def expand_abbreviations(text):
3674
3811
  """Generic abbreviation expansion using common academic patterns"""
@@ -3985,8 +4122,8 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
3985
4122
  if not acronym or not full_text:
3986
4123
  return False
3987
4124
 
3988
- # Normalize the full text
3989
- normalized_full = normalize_venue(full_text)
4125
+ # Use the internal comparison normalization function
4126
+ normalized_full = normalize_for_comparison(full_text)
3990
4127
 
3991
4128
  # Generate all possible acronyms from the full text
3992
4129
  possible_acronyms = []
@@ -4100,9 +4237,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
4100
4237
  if (arxiv1 == 'arxiv' and arxiv2.startswith('https://arxiv.org')) or (arxiv2 == 'arxiv' and arxiv1.startswith('https://arxiv.org')):
4101
4238
  return False
4102
4239
 
4103
- # Normalize both venues first
4104
- norm1 = normalize_venue(venue1)
4105
- norm2 = normalize_venue(venue2)
4240
+ # Use normalized venues from shared function
4241
+ norm1 = normalized_venue1
4242
+ norm2 = normalized_venue2
4106
4243
 
4107
4244
  # Direct match after normalization (highest priority)
4108
4245
  if norm1 == norm2:
@@ -4356,4 +4493,144 @@ def is_year_substantially_different(cited_year: int, correct_year: int, context:
4356
4493
 
4357
4494
  # Any year difference should be flagged as a warning for manual review
4358
4495
  warning_msg = f"Year mismatch: cited as {cited_year} but actually {correct_year}"
4359
- return True, warning_msg
4496
+ return True, warning_msg
4497
+
4498
+
4499
def normalize_venue_for_display(venue: str) -> str:
    """
    Normalize venue names for consistent display and comparison.

    The original case is preserved and abbreviations are NOT expanded.
    The cleanup:
      * extracts the venue from "In <editors>, eds., <venue>, ..." strings;
      * drops trailing citation metadata (year, volume, issue, pages),
        except for venues that start with "arXiv:";
      * strips procedural prefixes ("In", "Proceedings of the", "The 41st",
        a leading year, ...) repeatedly until none applies, so stacked
        prefixes such as "In Proceedings of the 2020 ..." are fully removed;
      * removes organization prefixes/suffixes (IEEE, ACM, ...);
      * collapses runs of whitespace.

    Args:
        venue: Raw venue string

    Returns:
        Cleaned venue string, or "" when ``venue`` is empty or None
    """
    if not venue:
        return ""

    venue_text = venue.strip()

    # Extract venue from complex editor strings (e.g. "In Smith, J.; and Doe, K., eds., Conference Name, volume 1")
    # This handles patterns like "In [authors], eds., [venue], [optional metadata]" (case-insensitive)
    editor_match = re.search(r'in\s+[^,]+(?:,\s*[^,]*)*,\s*eds?\.,\s*(.+?)(?:,\s*volume\s*\d+|,\s*pp?\.|$)', venue_text, re.IGNORECASE)
    if editor_match:
        # Extract the venue part from editor string (preserve original case)
        venue_text = editor_match.group(1).strip()
        # Clean up any remaining metadata like "volume X of Proceedings..." (case-insensitive)
        venue_text = re.sub(r',\s*volume\s+\d+.*$', '', venue_text, flags=re.IGNORECASE)
        venue_text = re.sub(r'\s+of\s+proceedings.*$', '', venue_text, flags=re.IGNORECASE)

    # Remove years, volumes, pages, and other citation metadata
    # But preserve arXiv IDs (don't remove digits after arXiv:)
    # NOTE(review): guard scope reconstructed from an unindented diff view - confirm
    # that all eight substitutions belong under this condition.
    if not re.match(r'arxiv:', venue_text, re.IGNORECASE):
        venue_text = re.sub(r',?\s*\d{4}[a-z]?\s*$', '', venue_text)  # Years like "2024" or "2024b"
        venue_text = re.sub(r',?\s*\(\d{4}\)$', '', venue_text)  # Years in parentheses
        venue_text = re.sub(r"'\d{2}$", '', venue_text)  # Year suffixes like 'CVPR'16'
        venue_text = re.sub(r',?\s*(vol\.?\s*|volume\s*)\d+.*$', '', venue_text, flags=re.IGNORECASE)  # Volume info
        venue_text = re.sub(r',?\s*\d+\s*\([^)]*\).*$', '', venue_text)  # Issue info with optional spaces
        venue_text = re.sub(r',?\s*pp?\.\s*\d+.*$', '', venue_text, flags=re.IGNORECASE)  # Page info
        venue_text = re.sub(r'\s*\(print\).*$', '', venue_text, flags=re.IGNORECASE)  # Print designation
        venue_text = re.sub(r'\s*\(\d{4}\.\s*print\).*$', '', venue_text, flags=re.IGNORECASE)  # Year.Print

    # Remove procedural prefixes (case-insensitive)
    prefixes_to_remove = [
        r'^\d{4}\s+\d+(st|nd|rd|th)\s+',  # "2012 25th "
        r'^\d{4}\s+',  # "2024 "
        r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Proceedings of the IEEE"
        r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Proc. of the IEEE" (require "of")
        r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Procs. of the IEEE" (require "of")
        r'^in\s+',
        r'^advances\s+in\s+',  # "Advances in Neural Information Processing Systems"
        r'^adv\.\s+',  # "Adv. Neural Information Processing Systems"
        # Handle ordinal prefixes: "The Twelfth", "The Ninth", etc.
        r'^the\s+(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|thirty-second|thirty-third|thirty-fourth|thirty-fifth|thirty-sixth|thirty-seventh|thirty-eighth|thirty-ninth|fortieth|forty-first|forty-second|forty-third|forty-fourth|forty-fifth|forty-sixth|forty-seventh|forty-eighth|forty-ninth|fiftieth)\s+',
        # Handle numeric ordinals: "The 41st", "The 12th", etc.
        r'^the\s+\d+(st|nd|rd|th)\s+',
        # Handle standalone "The" prefix
        r'^the\s+',
    ]

    # Prefixes can be stacked in any order ("In Proceedings of the 2020 ..."),
    # so keep sweeping the list until a full pass changes nothing. Every
    # successful substitution shortens the string, so the loop terminates.
    while True:
        before_pass = venue_text
        for prefix_pattern in prefixes_to_remove:
            venue_text = re.sub(prefix_pattern, '', venue_text, flags=re.IGNORECASE)
        if venue_text == before_pass:
            break

    # Note: For display purposes, we preserve case and don't expand abbreviations
    # Only do minimal cleaning needed for proper display

    # Remove organization prefixes/suffixes that don't affect identity (case-insensitive)
    # But preserve IEEE when it's part of a journal name like "IEEE Transactions"
    # NOTE(review): guard scope reconstructed from an unindented diff view - confirm
    # that only the prefix removal is conditional.
    if not re.match(r'ieee\s+transactions', venue_text, re.IGNORECASE):
        venue_text = re.sub(r'^(ieee|acm|aaai|usenix|sigcomm|sigkdd|sigmod|vldb|osdi|sosp|eurosys)\s+', '', venue_text, flags=re.IGNORECASE)  # Remove org prefixes
    venue_text = re.sub(r'^ieee/\w+\s+', '', venue_text, flags=re.IGNORECASE)  # Remove "IEEE/RSJ " etc
    venue_text = re.sub(r'\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE)  # Remove org suffixes
    venue_text = re.sub(r'/\w+\s+', ' ', venue_text)  # Remove "/ACM " style org separators

    # IMPORTANT: Don't remove "Conference on" or "International" - they're needed for display
    # Only remove specific org-prefixed conference patterns where the org is clear
    venue_text = re.sub(r'^(ieee|acm|aaai|nips)(/\w+)?\s+conference\s+on\s+', '', venue_text, flags=re.IGNORECASE)

    # Clean up spacing (preserve punctuation and case for display)
    venue_text = re.sub(r'\s+', ' ', venue_text)  # Normalize whitespace
    venue_text = venue_text.strip()

    return venue_text
utils/url_utils.py CHANGED
@@ -209,7 +209,13 @@ def validate_url_format(url: str) -> bool:
209
209
 
210
210
  def clean_url(url: str) -> str:
211
211
  """
212
- Clean a URL by removing common issues like extra spaces, fragments, etc.
212
+ Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.
213
+
214
+ This function handles:
215
+ - Whitespace trimming
216
+ - Malformed LaTeX URL wrappers like \\url{https://...}
217
+ - Trailing punctuation from academic references
218
+ - DOI URL query parameter cleanup
213
219
 
214
220
  Args:
215
221
  url: URL to clean
@@ -223,6 +229,18 @@ def clean_url(url: str) -> str:
223
229
  # Remove leading/trailing whitespace
224
230
  url = url.strip()
225
231
 
232
+ # Handle malformed URLs that contain \url{} wrappers within the URL text
233
+ # e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
234
+ import re
235
+ url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
236
+ url_match = re.search(url_pattern, url)
237
+ if url_match:
238
+ url = url_match.group(1)
239
+
240
+ # Remove trailing punctuation that's commonly part of sentence structure
241
+ # but preserve legitimate URL characters
242
+ url = url.rstrip('.,;!?)')
243
+
226
244
  # Note: Preserving query parameters for all URLs now
227
245
  # Previously this function removed query parameters for non-DOI URLs,
228
246
  # but this was causing issues with OpenReview and other URLs that need their parameters
@@ -254,6 +272,14 @@ def clean_url_punctuation(url: str) -> str:
254
272
  # Remove leading/trailing whitespace
255
273
  url = url.strip()
256
274
 
275
+ # Handle malformed URLs that contain \url{} wrappers within the URL text
276
+ # e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
277
+ import re
278
+ url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
279
+ url_match = re.search(url_pattern, url)
280
+ if url_match:
281
+ url = url_match.group(1)
282
+
257
283
  # Remove trailing punctuation that's commonly part of sentence structure
258
284
  # but preserve legitimate URL characters
259
285
  url = url.rstrip('.,;!?)')