academic-refchecker 1.2.33-py3-none-any.whl → 1.2.35-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""
  
- __version__ = "1.2.33"
+ __version__ = "1.2.35"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 1.2.33
+ Version: 1.2.35
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
@@ -1,20 +1,21 @@
- __version__.py,sha256=YtdSvC3iqdJ10GwnY3kN7EWY4lINq9u1TnQ2R-XQg1Y,65
- academic_refchecker-1.2.33.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ __version__.py,sha256=uj_o09nsXqyl0HrS9JiFstvRwB4CAFwQuTgnfqbNKdg,65
+ academic_refchecker-1.2.35.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
- checkers/enhanced_hybrid_checker.py,sha256=_OcI_vJgsz3JZD9yrwwIZJ7YABaUH0PxMIlLdZo864M,22329
+ checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
  checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
  checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
  checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
- checkers/semantic_scholar.py,sha256=FZawz0Hkofdi6QqfQpswi2AUgadfwSvZ9H85cx22AOU,34584
+ checkers/openreview_checker.py,sha256=QRQXUk1Ws-e-wETSeLgq06WmHQrjUk17my_Zj4rrwmY,20303
+ checkers/semantic_scholar.py,sha256=YHR9nWaT7aieyczVMRKCPHr3k_Hl8g1rzd0k4f3bDTs,35022
  checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
- core/parallel_processor.py,sha256=hlfjCwc73G51v8L_dx-bKiaVrzAe7oxDYZI35LiRPA0,16714
- core/refchecker.py,sha256=hO0i_wZZkry17EMwOUtXBEZPV1ukr3Uk-mb-JJReIQo,280359
+ core/parallel_processor.py,sha256=2S1cAPhtWH3glvtiJrt9JkZzk2iJkPKXsc-F3lg0X6U,16795
+ core/refchecker.py,sha256=lR5UUd-O8-2z-aijyhtFOGkCqQHSh21bXL3PsDKRSno,283410
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,17 +26,18 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
  services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
  services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
- utils/arxiv_utils.py,sha256=jcs2raExxSqpl5KYcQN-vcWX1bPcwTme0m_AvHrckV4,7621
+ utils/arxiv_utils.py,sha256=OfUB0zR8OxcsnC_N7meSNte68uhfvmNySGL6uj1NmRY,6753
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
  utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
- utils/text_utils.py,sha256=1cSHC2JMhF4R7XNbg4yEx5Bl5JQ-e3w5MDW0nemKbKs,162749
+ utils/text_utils.py,sha256=SbuzUQD8430z7Ll1_4aTilVzwknh1O4N8LeSAx5yF-M,177904
+ utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
  utils/url_utils.py,sha256=qoimCrMFCBGvlmF_t1c6zSOmkWi_rUm-gZM0XZ4rEVE,6291
- academic_refchecker-1.2.33.dist-info/METADATA,sha256=AxxBrvxaEbvV47TiH1xVYu0Ckl1ZIxvbFCnQ5reh2lY,22298
- academic_refchecker-1.2.33.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-1.2.33.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
- academic_refchecker-1.2.33.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
- academic_refchecker-1.2.33.dist-info/RECORD,,
+ academic_refchecker-1.2.35.dist-info/METADATA,sha256=W8YaWup9_0p1c24SOEJP-AHSG9pcGBrXFugNfn5TR0g,22298
+ academic_refchecker-1.2.35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ academic_refchecker-1.2.35.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+ academic_refchecker-1.2.35.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+ academic_refchecker-1.2.35.dist-info/RECORD,,
checkers/enhanced_hybrid_checker.py CHANGED
@@ -98,6 +98,16 @@ class EnhancedHybridReferenceChecker:
          except Exception as e:
              logger.warning(f"Enhanced Hybrid: Failed to initialize CrossRef: {e}")
  
+         # Initialize OpenReview checker
+         self.openreview = None
+         try:
+             from .openreview_checker import OpenReviewReferenceChecker
+             self.openreview = OpenReviewReferenceChecker()
+             logger.debug("Enhanced Hybrid: OpenReview checker initialized")
+         except Exception as e:
+             logger.warning(f"Enhanced Hybrid: Failed to initialize OpenReview: {e}")
+             self.openreview = None
+ 
          # Google Scholar removed - using more reliable APIs only
  
          # Track API performance for adaptive selection
@@ -105,7 +115,8 @@ class EnhancedHybridReferenceChecker:
              'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
              'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
              'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
-             'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
+             'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
+             'openreview': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
          }
  
          # Track failed API calls for retry logic - OPTIMIZED CONFIGURATION
@@ -297,7 +308,17 @@ class EnhancedHybridReferenceChecker:
              if failure_type in ['throttled', 'timeout', 'server_error']:
                  failed_apis.append(('openalex', self.openalex, failure_type))
  
-         # Strategy 5: Try CrossRef if we haven't already (for non-DOI references)
+         # Strategy 5: Try OpenReview if URL suggests it's an OpenReview paper
+         if (self.openreview and
+                 hasattr(self.openreview, 'is_openreview_reference') and
+                 self.openreview.is_openreview_reference(reference)):
+             verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
+             if success:
+                 return verified_data, errors, url
+             if failure_type in ['throttled', 'timeout', 'server_error']:
+                 failed_apis.append(('openreview', self.openreview, failure_type))
+ 
+         # Strategy 6: Try CrossRef if we haven't already (for non-DOI references)
          if not self._should_try_doi_apis_first(reference) and self.crossref:
              verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
              if success:
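The new Strategy 5 only fires when the reference actually points at OpenReview; anything else still falls through to CrossRef as before. A minimal sketch of that gate, using made-up reference dicts and the checker class added below (the `checkers.openreview_checker` import path is an assumption based on the wheel layout):

from checkers.openreview_checker import OpenReviewReferenceChecker

openreview = OpenReviewReferenceChecker()

ref_openreview = {
    'title': 'Some ICLR paper',                              # hypothetical reference
    'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
}
ref_arxiv = {
    'title': 'Some arXiv paper',                             # hypothetical reference
    'url': 'https://arxiv.org/abs/2505.11595',
}

print(openreview.is_openreview_reference(ref_openreview))   # True  -> Strategy 5 runs
print(openreview.is_openreview_reference(ref_arxiv))        # False -> falls through to CrossRef (Strategy 6)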
checkers/openreview_checker.py ADDED
@@ -0,0 +1,512 @@
+ #!/usr/bin/env python3
+ """
+ OpenReview API Client for Reference Verification
+ 
+ This module provides functionality to verify references from OpenReview papers.
+ OpenReview is a platform for open peer review in machine learning conferences
+ like ICLR, NeurIPS, ICML, etc.
+ 
+ Usage:
+     from openreview_checker import OpenReviewReferenceChecker
+ 
+     # Initialize the checker
+     checker = OpenReviewReferenceChecker()
+ 
+     # Verify a reference
+     reference = {
+         'title': 'Title of the paper',
+         'authors': ['Author 1', 'Author 2'],
+         'year': 2024,
+         'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
+         'raw_text': 'Full citation text'
+     }
+ 
+     verified_data, errors, url = checker.verify_reference(reference)
+ """
+ 
+ import requests
+ import time
+ import logging
+ import re
+ import json
+ from typing import Dict, List, Tuple, Optional, Any, Union
+ from urllib.parse import urlparse, parse_qs
+ from bs4 import BeautifulSoup
+ from utils.text_utils import (
+     normalize_text, clean_title_basic, is_name_match,
+     calculate_title_similarity, compare_authors,
+     clean_title_for_search, are_venues_substantially_different,
+     is_year_substantially_different
+ )
+ 
+ # Set up logging
+ logger = logging.getLogger(__name__)
+ 
+ class OpenReviewReferenceChecker:
+     """
+     A class to verify references using OpenReview
+     """
+ 
+     def __init__(self, request_delay: float = 1.0):
+         """
+         Initialize the OpenReview client
+ 
+         Args:
+             request_delay: Delay between requests to be respectful to OpenReview servers
+         """
+         self.base_url = "https://openreview.net"
+         self.api_url = "https://api.openreview.net"
+         self.request_delay = request_delay
+         self.last_request_time = 0
+ 
+         # Session for connection pooling
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'RefChecker/1.0 (Academic Reference Verification)',
+             'Accept': 'application/json, text/html',
+             'Accept-Language': 'en-US,en;q=0.9'
+         })
+ 
+     def is_openreview_url(self, url: str) -> bool:
+         """
+         Check if URL is from OpenReview
+ 
+         Args:
+             url: URL to check
+ 
+         Returns:
+             True if it's an OpenReview URL
+         """
+         return bool(url and 'openreview.net' in url.lower())
+ 
+     def is_openreview_reference(self, reference: Dict[str, Any]) -> bool:
+         """
+         Determine if this reference is from OpenReview based on URL patterns
+ 
+         Args:
+             reference: Reference dictionary to check
+ 
+         Returns:
+             True if reference appears to be from OpenReview
+         """
+         # Check various URL fields for OpenReview URLs
+         url_fields = ['url', 'openreview_url', 'link', 'venue_url']
+         for field in url_fields:
+             url = reference.get(field, '')
+             if url and self.is_openreview_url(url):
+                 return True
+ 
+         # Check raw text for OpenReview URLs
+         raw_text = reference.get('raw_text', '')
+         if raw_text and 'openreview.net' in raw_text.lower():
+             return True
+ 
+         return False
+ 
+     def extract_paper_id(self, url: str) -> Optional[str]:
+         """
+         Extract paper ID from OpenReview URL
+ 
+         Args:
+             url: OpenReview URL
+ 
+         Returns:
+             Paper ID if found, None otherwise
+         """
+         if not self.is_openreview_url(url):
+             return None
+ 
+         # Handle different OpenReview URL formats:
+         # https://openreview.net/forum?id=ZG3RaNIsO8
+         # https://openreview.net/pdf?id=ZG3RaNIsO8
+         # https://openreview.net/forum?id=ZG3RaNIsO8&noteId=...
+ 
+         parsed = urlparse(url)
+         query_params = parse_qs(parsed.query)
+ 
+         if 'id' in query_params:
+             return query_params['id'][0]
+ 
+         # Also check path-based URLs (if they exist)
+         path_match = re.search(r'/(?:forum|pdf|notes)/([A-Za-z0-9_-]+)', parsed.path)
+         if path_match:
+             return path_match.group(1)
+ 
+         return None
+ 
+     def _respectful_request(self, url: str, **kwargs) -> Optional[requests.Response]:
+         """Make a respectful HTTP request with rate limiting"""
+         current_time = time.time()
+         time_since_last = current_time - self.last_request_time
+ 
+         if time_since_last < self.request_delay:
+             time.sleep(self.request_delay - time_since_last)
+ 
+         try:
+             logger.debug(f"Making request to: {url}")
+             response = self.session.get(url, timeout=15, **kwargs)
+             self.last_request_time = time.time()
+             logger.debug(f"Request successful: {response.status_code}")
+             return response
+         except requests.exceptions.RequestException as e:
+             logger.debug(f"Request failed for {url}: {type(e).__name__}: {e}")
+             return None
+ 
+     def get_paper_metadata(self, paper_id: str) -> Optional[Dict[str, Any]]:
+         """
+         Get paper metadata from OpenReview
+ 
+         Args:
+             paper_id: OpenReview paper ID
+ 
+         Returns:
+             Paper metadata dictionary or None if not found
+         """
+         # Try API endpoint first
+         api_url = f"{self.api_url}/notes?id={paper_id}"
+         response = self._respectful_request(api_url)
+ 
+         if response and response.status_code == 200:
+             try:
+                 data = response.json()
+                 if 'notes' in data and data['notes']:
+                     note = data['notes'][0]
+                     return self._parse_api_response(note)
+             except (json.JSONDecodeError, KeyError) as e:
+                 logger.debug(f"Failed to parse API response: {e}")
+ 
+         # Fall back to web scraping
+         forum_url = f"{self.base_url}/forum?id={paper_id}"
+         response = self._respectful_request(forum_url)
+ 
+         if not response or response.status_code != 200:
+             return None
+ 
+         return self._parse_web_page(response.text, forum_url)
+ 
+     def _parse_api_response(self, note: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Parse OpenReview API response to extract metadata
+ 
+         Args:
+             note: Note data from API response
+ 
+         Returns:
+             Parsed metadata dictionary
+         """
+         content = note.get('content', {})
+ 
+         # Extract basic metadata
+         metadata = {
+             'id': note.get('id'),
+             'title': content.get('title', '').strip(),
+             'authors': [],
+             'year': None,
+             'venue': None,
+             'abstract': content.get('abstract', '').strip(),
+             'keywords': content.get('keywords', []),
+             'pdf_url': content.get('pdf'),
+             'forum_url': f"{self.base_url}/forum?id={note.get('id')}",
+             'source': 'openreview_api'
+         }
+ 
+         # Parse authors
+         authors_raw = content.get('authors', [])
+         if isinstance(authors_raw, list):
+             metadata['authors'] = [author.strip() for author in authors_raw if author.strip()]
+         elif isinstance(authors_raw, str):
+             # Sometimes authors are in a single string
+             metadata['authors'] = [author.strip() for author in authors_raw.split(',') if author.strip()]
+ 
+         # Extract year from various sources
+         # Check creation time
+         if 'cdate' in note:
+             try:
+                 import datetime
+                 timestamp = note['cdate'] / 1000.0  # Convert from milliseconds
+                 year = datetime.datetime.fromtimestamp(timestamp).year
+                 metadata['year'] = year
+             except (ValueError, TypeError):
+                 pass
+ 
+         # Check if venue/conference info is available
+         venue_info = content.get('venue', '')
+         if venue_info:
+             metadata['venue'] = venue_info.strip()
+ 
+         # Try to extract venue from forum context or submission info
+         if not metadata['venue']:
+             # Common venues for OpenReview
+             forum_path = note.get('forum', '')
+             if 'ICLR' in str(content) or 'iclr' in forum_path.lower():
+                 metadata['venue'] = 'ICLR'
+             elif 'NeurIPS' in str(content) or 'neurips' in forum_path.lower():
+                 metadata['venue'] = 'NeurIPS'
+             elif 'ICML' in str(content) or 'icml' in forum_path.lower():
+                 metadata['venue'] = 'ICML'
+ 
+         return metadata
+ 
+     def _parse_web_page(self, html: str, url: str) -> Dict[str, Any]:
+         """
+         Parse OpenReview web page to extract metadata
+ 
+         Args:
+             html: HTML content of the page
+             url: Original URL
+ 
+         Returns:
+             Parsed metadata dictionary
+         """
+         soup = BeautifulSoup(html, 'html.parser')
+ 
+         # Extract paper ID from URL
+         paper_id = self.extract_paper_id(url)
+ 
+         metadata = {
+             'id': paper_id,
+             'title': '',
+             'authors': [],
+             'year': None,
+             'venue': None,
+             'abstract': '',
+             'keywords': [],
+             'forum_url': url,
+             'source': 'openreview_web'
+         }
+ 
+         # Extract title
+         title_elem = soup.find('h2', {'class': 'citation_title'}) or soup.find('h1')
+         if title_elem:
+             metadata['title'] = title_elem.get_text().strip()
+ 
+         # Try to find title in meta tags
+         if not metadata['title']:
+             meta_title = soup.find('meta', {'property': 'og:title'}) or soup.find('meta', {'name': 'title'})
+             if meta_title and meta_title.get('content'):
+                 metadata['title'] = meta_title['content'].strip()
+ 
+         # Extract authors from meta tags (most reliable for OpenReview)
+         author_metas = soup.find_all('meta', {'name': 'citation_author'})
+         if author_metas:
+             metadata['authors'] = [meta.get('content', '').strip() for meta in author_metas if meta.get('content', '').strip()]
+ 
+         # Fallback: try to find authors in HTML structure
+         if not metadata['authors']:
+             authors_section = soup.find('div', {'class': 'authors'}) or soup.find('span', {'class': 'authors'})
+             if authors_section:
+                 # Extract author names from links or text
+                 author_links = authors_section.find_all('a')
+                 if author_links:
+                     metadata['authors'] = [link.get_text().strip() for link in author_links]
+                 else:
+                     # Parse comma-separated authors
+                     authors_text = authors_section.get_text().strip()
+                     metadata['authors'] = [author.strip() for author in authors_text.split(',') if author.strip()]
+ 
+         # Extract year from various sources
+         year_pattern = r'\b(20\d{2})\b'
+ 
+         # Check date/year elements
+         date_elem = soup.find('span', {'class': 'date'}) or soup.find('time')
+         if date_elem:
+             year_match = re.search(year_pattern, date_elem.get_text())
+             if year_match:
+                 metadata['year'] = int(year_match.group(1))
+ 
+         # Check meta tags for date
+         if not metadata['year']:
+             meta_date = soup.find('meta', {'name': 'citation_date'}) or soup.find('meta', {'name': 'date'})
+             if meta_date and meta_date.get('content'):
+                 year_match = re.search(year_pattern, meta_date['content'])
+                 if year_match:
+                     metadata['year'] = int(year_match.group(1))
+ 
+         # Extract abstract
+         abstract_elem = soup.find('div', {'class': 'abstract'}) or soup.find('section', {'class': 'abstract'})
+         if abstract_elem:
+             metadata['abstract'] = abstract_elem.get_text().strip()
+ 
+         # Extract venue information from meta tags (most reliable for OpenReview)
+         venue_meta = soup.find('meta', {'name': 'citation_conference_title'})
+         if venue_meta and venue_meta.get('content'):
+             venue_full = venue_meta['content'].strip()
+             # Convert long conference names to common abbreviations
+             if 'International Conference on Learning Representations' in venue_full:
+                 # Extract year if present
+                 year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                 if year_match:
+                     metadata['venue'] = f'ICLR {year_match.group(1)}'
+                 else:
+                     metadata['venue'] = 'ICLR'
+             elif 'Neural Information Processing Systems' in venue_full or 'NeurIPS' in venue_full:
+                 year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                 if year_match:
+                     metadata['venue'] = f'NeurIPS {year_match.group(1)}'
+                 else:
+                     metadata['venue'] = 'NeurIPS'
+             else:
+                 metadata['venue'] = venue_full
+ 
+         # Fallback: try HTML structure
+         if not metadata['venue']:
+             venue_elem = soup.find('div', {'class': 'venue'}) or soup.find('span', {'class': 'venue'})
+             if venue_elem:
+                 metadata['venue'] = venue_elem.get_text().strip()
+ 
+         # Final fallback: try to determine venue from page context or URL
+         if not metadata['venue']:
+             page_text = soup.get_text().lower()
+             if 'iclr' in page_text or 'iclr' in url.lower():
+                 if '2024' in page_text:
+                     metadata['venue'] = 'ICLR 2024'
+                 else:
+                     metadata['venue'] = 'ICLR'
+             elif 'neurips' in page_text or 'neurips' in url.lower():
+                 metadata['venue'] = 'NeurIPS'
+             elif 'icml' in page_text or 'icml' in url.lower():
+                 metadata['venue'] = 'ICML'
+ 
+         # Extract keywords if available
+         keywords_elem = soup.find('div', {'class': 'keywords'})
+         if keywords_elem:
+             keywords_text = keywords_elem.get_text()
+             metadata['keywords'] = [kw.strip() for kw in keywords_text.split(',') if kw.strip()]
+ 
+         return metadata
+ 
+     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a reference against OpenReview
+ 
+         Args:
+             reference: Reference dictionary with title, authors, year, url, etc.
+ 
+         Returns:
+             Tuple of (verified_data, errors, paper_url) where:
+             - verified_data: Dict with verified OpenReview paper data or None
+             - errors: List of error/warning dictionaries
+             - paper_url: The OpenReview URL
+         """
+         logger.debug(f"Verifying OpenReview reference: {reference.get('title', 'Untitled')}")
+ 
+         # Extract OpenReview URL from reference
+         openreview_url = None
+         for url_key in ['url', 'openreview_url', 'link']:
+             if url_key in reference and reference[url_key]:
+                 url = reference[url_key].strip()
+                 if self.is_openreview_url(url):
+                     openreview_url = url
+                     break
+ 
+         if not openreview_url:
+             logger.debug("No OpenReview URL found in reference")
+             return None, [], None
+ 
+         # Extract paper ID
+         paper_id = self.extract_paper_id(openreview_url)
+         if not paper_id:
+             return None, [{"error_type": "unverified", "error_details": "Could not extract paper ID from OpenReview URL"}], openreview_url
+ 
+         # Get paper metadata
+         paper_data = self.get_paper_metadata(paper_id)
+         if not paper_data:
+             return None, [{"error_type": "unverified", "error_details": "Paper not found on OpenReview"}], openreview_url
+ 
+         logger.debug(f"Found OpenReview paper: {paper_data.get('title', 'Untitled')}")
+ 
+         # Verify the reference against the paper data
+         errors = []
+ 
+         # Check title match
+         cited_title = reference.get('title', '').strip()
+         paper_title = paper_data.get('title', '').strip()
+ 
+         if cited_title and paper_title:
+             similarity = calculate_title_similarity(cited_title, paper_title)
+             if similarity < 0.7:  # Using a reasonable threshold
+                 errors.append({
+                     "warning_type": "title",
+                     "warning_details": f"Title mismatch: cited as '{cited_title}' but OpenReview shows '{paper_title}' (similarity: {similarity:.2f})"
+                 })
+ 
+         # Check authors
+         cited_authors = reference.get('authors', [])
+         paper_authors = paper_data.get('authors', [])
+ 
+         if cited_authors and paper_authors:
+             # Convert to list format if needed
+             if isinstance(cited_authors, str):
+                 cited_authors = [author.strip() for author in cited_authors.split(',')]
+             if isinstance(paper_authors, str):
+                 paper_authors = [author.strip() for author in paper_authors.split(',')]
+ 
+             # Use the existing author comparison function
+             match, error_msg = compare_authors(cited_authors, paper_authors)
+             if not match and error_msg:
+                 errors.append({
+                     "warning_type": "author",
+                     "warning_details": error_msg
+                 })
+ 
+         # Check year
+         cited_year = reference.get('year')
+         paper_year = paper_data.get('year')
+ 
+         if cited_year and paper_year:
+             try:
+                 cited_year_int = int(cited_year)
+                 paper_year_int = int(paper_year)
+ 
+                 is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
+                 if is_different and year_message:
+                     errors.append({
+                         "warning_type": "year",
+                         "warning_details": year_message
+                     })
+             except (ValueError, TypeError):
+                 pass  # Skip year validation if conversion fails
+ 
+         # Check venue if provided in reference
+         cited_venue = reference.get('venue', '').strip()
+         paper_venue = paper_data.get('venue', '').strip()
+ 
+         if cited_venue and paper_venue:
+             if are_venues_substantially_different(cited_venue, paper_venue):
+                 errors.append({
+                     "warning_type": "venue",
+                     "warning_details": f"Venue mismatch: cited as '{cited_venue}' but OpenReview shows '{paper_venue}'"
+                 })
+ 
+         # Create verified data structure
+         verified_data = {
+             'title': paper_data.get('title', cited_title),
+             'authors': paper_data.get('authors', cited_authors),
+             'year': paper_data.get('year', cited_year),
+             'venue': paper_data.get('venue', cited_venue),
+             'url': openreview_url,
+             'abstract': paper_data.get('abstract', ''),
+             'keywords': paper_data.get('keywords', []),
+             'openreview_metadata': paper_data,
+             'verification_source': 'OpenReview'
+         }
+ 
+         logger.debug(f"OpenReview verification completed for: {openreview_url}")
+         return verified_data, errors, openreview_url
+ 
+     def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+         """
+         Search for papers on OpenReview by title, authors, and/or year
+ 
+         Args:
+             title: Paper title to search for
+             authors: List of author names (optional)
+             year: Publication year (optional)
+ 
+         Returns:
+             List of matching paper metadata dictionaries
+         """
+         # This would implement search functionality if needed
+         # For now, OpenReview verification is primarily URL-based
+         logger.debug(f"Search functionality not yet implemented for OpenReview")
+         return []
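A short usage sketch of the new checker; the paper ID, titles, and authors below are placeholders, and only verify_reference() (via get_paper_metadata()) actually touches the network:

from checkers.openreview_checker import OpenReviewReferenceChecker

checker = OpenReviewReferenceChecker(request_delay=1.0)

# Both query-style URL forms handled by extract_paper_id() resolve to the same ID
print(checker.extract_paper_id('https://openreview.net/forum?id=ZG3RaNIsO8'))  # 'ZG3RaNIsO8'
print(checker.extract_paper_id('https://openreview.net/pdf?id=ZG3RaNIsO8'))    # 'ZG3RaNIsO8'
print(checker.extract_paper_id('https://example.com/paper'))                   # None (not an OpenReview URL)

# verify_reference() returns (verified_data, errors, url); mismatches come back
# as warning dicts (title/author/year/venue) rather than hard failures.
reference = {
    'title': 'Title of the paper',
    'authors': ['Author 1', 'Author 2'],
    'year': 2024,
    'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
}
verified_data, errors, url = checker.verify_reference(reference)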
@@ -558,9 +558,17 @@ class NonArxivReferenceChecker:
          # For arXiv papers, suggest including the arXiv URL instead of venue
          arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
  
-         # Check if the reference already includes this ArXiv URL
+         # Check if the reference already includes this ArXiv URL or equivalent DOI
          reference_url = reference.get('url', '')
-         if arxiv_url not in reference_url:
+ 
+         # Check for direct arXiv URL match
+         has_arxiv_url = arxiv_url in reference_url
+ 
+         # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
+         arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
+         has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
+ 
+         if not (has_arxiv_url or has_arxiv_doi):
              errors.append({
                  'warning_type': 'venue',
                  'warning_details': f"Reference should include arXiv URL: {arxiv_url}",
core/parallel_processor.py CHANGED
@@ -276,7 +276,8 @@ class ParallelReferenceProcessor:
  
          # Print reference info in the same format as sequential mode
          title = reference.get('title', 'Untitled')
-         authors = ', '.join(reference.get('authors', []))
+         from utils.text_utils import format_authors_for_display
+         authors = format_authors_for_display(reference.get('authors', []))
          year = reference.get('year', '')
          venue = reference.get('venue', '')
          url = reference.get('url', '')
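format_authors_for_display itself lives in utils/text_utils.py, which this diff does not show. The stand-in below is purely hypothetical and only illustrates the kind of normalization a display helper can do that the plain ', '.join() it replaces cannot (non-string author entries, long author lists):

# Hypothetical stand-in, NOT the actual utils.text_utils implementation.
def format_authors_for_display(authors):
    names = []
    for author in authors or []:
        if isinstance(author, dict):            # tolerate {'name': ...}-shaped entries
            author = author.get('name', '')
        if not author:
            continue                             # skip None / empty entries
        names.append(str(author).strip())
    if len(names) > 4:                           # keep console output compact
        return f"{names[0]} et al. ({len(names)} authors)"
    return ', '.join(names)

print(format_authors_for_display(['A. Author', {'name': 'B. Author'}, None]))
# -> 'A. Author, B. Author'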