academic_refchecker-2.0.7-py3-none-any.whl
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/checkers/openreview_checker.py
@@ -0,0 +1,984 @@
#!/usr/bin/env python3
"""
OpenReview API Client for Reference Verification

This module provides functionality to verify references from OpenReview papers.
OpenReview is a platform for open peer review in machine learning conferences
like ICLR, NeurIPS, ICML, etc.

Usage:
    from refchecker.checkers.openreview_checker import OpenReviewReferenceChecker

    # Initialize the checker
    checker = OpenReviewReferenceChecker()

    # Verify a reference
    reference = {
        'title': 'Title of the paper',
        'authors': ['Author 1', 'Author 2'],
        'year': 2024,
        'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
        'raw_text': 'Full citation text'
    }

    verified_data, errors, url = checker.verify_reference(reference)
"""

import requests
import time
import logging
import re
import json
from typing import Dict, List, Tuple, Optional, Any, Union
from urllib.parse import urlparse, parse_qs
from bs4 import BeautifulSoup
from refchecker.utils.text_utils import (
    normalize_text, clean_title_basic, is_name_match,
    calculate_title_similarity, compare_authors,
    clean_title_for_search, are_venues_substantially_different,
    is_year_substantially_different, strip_latex_commands,
    compare_titles_with_latex_cleaning
)

# Set up logging
logger = logging.getLogger(__name__)

class OpenReviewReferenceChecker:
    """
    A class to verify references using OpenReview
    """

    def __init__(self, request_delay: float = 1.0):
        """
        Initialize the OpenReview client

        Args:
            request_delay: Delay between requests to be respectful to OpenReview servers
        """
        self.base_url = "https://openreview.net"
        self.api_url = "https://api.openreview.net"
        self.request_delay = request_delay
        self.last_request_time = 0

        # Session for connection pooling
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'RefChecker/1.0 (Academic Reference Verification)',
            'Accept': 'application/json, text/html',
            'Accept-Language': 'en-US,en;q=0.9'
        })

    def is_openreview_url(self, url: str) -> bool:
        """
        Check if URL is from OpenReview

        Args:
            url: URL to check

        Returns:
            True if it's an OpenReview URL
        """
        return bool(url and 'openreview.net' in url.lower())

    def is_openreview_reference(self, reference: Dict[str, Any]) -> bool:
        """
        Determine if this reference is from OpenReview based on URL patterns

        Args:
            reference: Reference dictionary to check

        Returns:
            True if reference appears to be from OpenReview
        """
        # Check various URL fields for OpenReview URLs
        url_fields = ['url', 'openreview_url', 'link', 'venue_url']
        for field in url_fields:
            url = reference.get(field, '')
            if url and self.is_openreview_url(url):
                return True

        # Check raw text for OpenReview URLs
        raw_text = reference.get('raw_text', '')
        if raw_text and 'openreview.net' in raw_text.lower():
            return True

        return False

    def extract_paper_id(self, url: str) -> Optional[str]:
        """
        Extract paper ID from OpenReview URL

        Args:
            url: OpenReview URL

        Returns:
            Paper ID if found, None otherwise
        """
        if not self.is_openreview_url(url):
            return None

        # Handle different OpenReview URL formats:
        # https://openreview.net/forum?id=ZG3RaNIsO8
        # https://openreview.net/pdf?id=ZG3RaNIsO8
        # https://openreview.net/forum?id=ZG3RaNIsO8&noteId=...

        parsed = urlparse(url)
        query_params = parse_qs(parsed.query)

        if 'id' in query_params:
            return query_params['id'][0]

        # Also check path-based URLs (if they exist)
        path_match = re.search(r'/(?:forum|pdf|notes)/([A-Za-z0-9_-]+)', parsed.path)
        if path_match:
            return path_match.group(1)

        return None

    def _respectful_request(self, url: str, **kwargs) -> Optional[requests.Response]:
        """Make a respectful HTTP request with rate limiting"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < self.request_delay:
            time.sleep(self.request_delay - time_since_last)

        try:
            logger.debug(f"Making request to: {url}")
            response = self.session.get(url, timeout=15, **kwargs)
            self.last_request_time = time.time()
            logger.debug(f"Request successful: {response.status_code}")
            return response
        except requests.exceptions.RequestException as e:
            logger.debug(f"Request failed for {url}: {type(e).__name__}: {e}")
            return None

    def get_paper_metadata(self, paper_id: str) -> Optional[Dict[str, Any]]:
        """
        Get paper metadata from OpenReview

        Args:
            paper_id: OpenReview paper ID

        Returns:
            Paper metadata dictionary or None if not found
        """
        # Try API endpoint first
        api_url = f"{self.api_url}/notes?id={paper_id}"
        response = self._respectful_request(api_url)

        if response and response.status_code == 200:
            try:
                data = response.json()
                if 'notes' in data and data['notes']:
                    note = data['notes'][0]
                    return self._parse_api_response(note)
            except (json.JSONDecodeError, KeyError) as e:
                logger.debug(f"Failed to parse API response: {e}")

        # Fall back to web scraping
        forum_url = f"{self.base_url}/forum?id={paper_id}"
        response = self._respectful_request(forum_url)

        if not response or response.status_code != 200:
            return None

        return self._parse_web_page(response.text, forum_url)

    def _parse_api_response(self, note: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse OpenReview API response to extract metadata

        Args:
            note: Note data from API response

        Returns:
            Parsed metadata dictionary
        """
        content = note.get('content', {})

        # Extract basic metadata
        metadata = {
            'id': note.get('id'),
            'title': content.get('title', '').strip(),
            'authors': [],
            'year': None,
            'venue': None,
            'abstract': content.get('abstract', '').strip(),
            'keywords': content.get('keywords', []),
            'pdf_url': content.get('pdf'),
            'forum_url': f"{self.base_url}/forum?id={note.get('id')}",
            'source': 'openreview_api'
        }

        # Parse authors
        authors_raw = content.get('authors', [])
        if isinstance(authors_raw, list):
            metadata['authors'] = [author.strip() for author in authors_raw if author.strip()]
        elif isinstance(authors_raw, str):
            # Sometimes authors are in a single string
            metadata['authors'] = [author.strip() for author in authors_raw.split(',') if author.strip()]

        # Extract year from various sources
        # Check creation time
        if 'cdate' in note:
            try:
                import datetime
                timestamp = note['cdate'] / 1000.0  # Convert from milliseconds
                year = datetime.datetime.fromtimestamp(timestamp).year
                metadata['year'] = year
            except (ValueError, TypeError):
                pass

        # Check if venue/conference info is available
        venue_info = content.get('venue', '')
        if venue_info:
            metadata['venue'] = venue_info.strip()

        # Try to extract venue from forum context or submission info
        if not metadata['venue']:
            # Common venues for OpenReview
            forum_path = note.get('forum', '')
            if 'ICLR' in str(content) or 'iclr' in forum_path.lower():
                metadata['venue'] = 'ICLR'
            elif 'NeurIPS' in str(content) or 'neurips' in forum_path.lower():
                metadata['venue'] = 'NeurIPS'
            elif 'ICML' in str(content) or 'icml' in forum_path.lower():
                metadata['venue'] = 'ICML'

        return metadata

    def _parse_web_page(self, html: str, url: str) -> Dict[str, Any]:
        """
        Parse OpenReview web page to extract metadata

        Args:
            html: HTML content of the page
            url: Original URL

        Returns:
            Parsed metadata dictionary
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Extract paper ID from URL
        paper_id = self.extract_paper_id(url)

        metadata = {
            'id': paper_id,
            'title': '',
            'authors': [],
            'year': None,
            'venue': None,
            'abstract': '',
            'keywords': [],
            'forum_url': url,
            'source': 'openreview_web'
        }

        # Extract title
        title_elem = soup.find('h2', {'class': 'citation_title'}) or soup.find('h1')
        if title_elem:
            metadata['title'] = title_elem.get_text().strip()

        # Try to find title in meta tags
        if not metadata['title']:
            meta_title = soup.find('meta', {'property': 'og:title'}) or soup.find('meta', {'name': 'title'})
            if meta_title and meta_title.get('content'):
                metadata['title'] = meta_title['content'].strip()

        # Extract authors from meta tags (most reliable for OpenReview)
        author_metas = soup.find_all('meta', {'name': 'citation_author'})
        if author_metas:
            metadata['authors'] = [meta.get('content', '').strip() for meta in author_metas if meta.get('content', '').strip()]

        # Fallback: try to find authors in HTML structure
        if not metadata['authors']:
            authors_section = soup.find('div', {'class': 'authors'}) or soup.find('span', {'class': 'authors'})
            if authors_section:
                # Extract author names from links or text
                author_links = authors_section.find_all('a')
                if author_links:
                    metadata['authors'] = [link.get_text().strip() for link in author_links]
                else:
                    # Parse comma-separated authors
                    authors_text = authors_section.get_text().strip()
                    metadata['authors'] = [author.strip() for author in authors_text.split(',') if author.strip()]

        # Extract year from various sources
        year_pattern = r'\b(20\d{2})\b'

        # Check date/year elements
        date_elem = soup.find('span', {'class': 'date'}) or soup.find('time')
        if date_elem:
            year_match = re.search(year_pattern, date_elem.get_text())
            if year_match:
                metadata['year'] = int(year_match.group(1))

        # Check meta tags for date
        if not metadata['year']:
            meta_date = soup.find('meta', {'name': 'citation_date'}) or soup.find('meta', {'name': 'date'})
            if meta_date and meta_date.get('content'):
                year_match = re.search(year_pattern, meta_date['content'])
                if year_match:
                    metadata['year'] = int(year_match.group(1))

        # Extract abstract
        abstract_elem = soup.find('div', {'class': 'abstract'}) or soup.find('section', {'class': 'abstract'})
        if abstract_elem:
            metadata['abstract'] = abstract_elem.get_text().strip()

        # Extract venue information from meta tags (most reliable for OpenReview)
        venue_meta = soup.find('meta', {'name': 'citation_conference_title'})
        if venue_meta and venue_meta.get('content'):
            venue_full = venue_meta['content'].strip()
            # Convert long conference names to common abbreviations
            if 'International Conference on Learning Representations' in venue_full:
                # Extract year if present
                year_match = re.search(r'\b(20\d{2})\b', venue_full)
                if year_match:
                    metadata['venue'] = f'ICLR {year_match.group(1)}'
                else:
                    metadata['venue'] = 'ICLR'
            elif 'Neural Information Processing Systems' in venue_full or 'NeurIPS' in venue_full:
                year_match = re.search(r'\b(20\d{2})\b', venue_full)
                if year_match:
                    metadata['venue'] = f'NeurIPS {year_match.group(1)}'
                else:
                    metadata['venue'] = 'NeurIPS'
            else:
                metadata['venue'] = venue_full

        # Fallback: try HTML structure
        if not metadata['venue']:
            venue_elem = soup.find('div', {'class': 'venue'}) or soup.find('span', {'class': 'venue'})
            if venue_elem:
                metadata['venue'] = venue_elem.get_text().strip()

        # Final fallback: try to determine venue from page context or URL
        if not metadata['venue']:
            page_text = soup.get_text().lower()
            if 'iclr' in page_text or 'iclr' in url.lower():
                if '2024' in page_text:
                    metadata['venue'] = 'ICLR 2024'
                else:
                    metadata['venue'] = 'ICLR'
            elif 'neurips' in page_text or 'neurips' in url.lower():
                metadata['venue'] = 'NeurIPS'
            elif 'icml' in page_text or 'icml' in url.lower():
                metadata['venue'] = 'ICML'

        # Extract keywords if available
        keywords_elem = soup.find('div', {'class': 'keywords'})
        if keywords_elem:
            keywords_text = keywords_elem.get_text()
            metadata['keywords'] = [kw.strip() for kw in keywords_text.split(',') if kw.strip()]

        return metadata

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a reference against OpenReview

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, paper_url) where:
            - verified_data: Dict with verified OpenReview paper data or None
            - errors: List of error/warning dictionaries
            - paper_url: The OpenReview URL
        """
        logger.debug(f"Verifying OpenReview reference: {reference.get('title', 'Untitled')}")

        # Extract OpenReview URL from reference
        openreview_url = None
        for url_key in ['url', 'openreview_url', 'link']:
            if url_key in reference and reference[url_key]:
                url = reference[url_key].strip()
                if self.is_openreview_url(url):
                    openreview_url = url
                    break

        if not openreview_url:
            logger.debug("No OpenReview URL found in reference")
            return None, [], None

        # Extract paper ID
        paper_id = self.extract_paper_id(openreview_url)
        if not paper_id:
            return None, [{"error_type": "unverified", "error_details": "Could not extract paper ID from OpenReview URL"}], openreview_url

        # Get paper metadata
        paper_data = self.get_paper_metadata(paper_id)
        if not paper_data:
            return None, [{"error_type": "unverified", "error_details": "Paper not found on OpenReview"}], openreview_url

        logger.debug(f"Found OpenReview paper: {paper_data.get('title', 'Untitled')}")

        # Verify the reference against the paper data
        errors = []

        # Check title match
        cited_title = reference.get('title', '').strip()
        paper_title = paper_data.get('title', '').strip()

        if cited_title and paper_title:
            similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
            if similarity < 0.7:  # Using a reasonable threshold
                from refchecker.utils.error_utils import format_title_mismatch
                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
                clean_cited_title = strip_latex_commands(cited_title)
                details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
                errors.append({
                    "warning_type": "title",
                    "warning_details": details
                })

        # Check authors
        cited_authors = reference.get('authors', [])
        paper_authors = paper_data.get('authors', [])

        if cited_authors and paper_authors:
            # Convert to list format if needed
            if isinstance(cited_authors, str):
                cited_authors = [author.strip() for author in cited_authors.split(',')]
            if isinstance(paper_authors, str):
                paper_authors = [author.strip() for author in paper_authors.split(',')]

            # Use the existing author comparison function
            match, error_msg = compare_authors(cited_authors, paper_authors)
            if not match and error_msg:
                errors.append({
                    "warning_type": "author",
                    "warning_details": error_msg
                })

        # Check year
        cited_year = reference.get('year')
        paper_year = paper_data.get('year')

        if cited_year and paper_year:
            try:
                cited_year_int = int(cited_year)
                paper_year_int = int(paper_year)

                is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
                if is_different and year_message:
                    from refchecker.utils.error_utils import format_year_mismatch
                    errors.append({
                        "warning_type": "year",
                        "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
                    })
            except (ValueError, TypeError):
                pass  # Skip year validation if conversion fails

        # Check venue if provided in reference (guard against None, since
        # parsed metadata sets 'venue' to None when no venue was found)
        cited_venue = (reference.get('venue') or '').strip()
        paper_venue = (paper_data.get('venue') or '').strip()

        if cited_venue and paper_venue:
            if are_venues_substantially_different(cited_venue, paper_venue):
                from refchecker.utils.error_utils import format_venue_mismatch
                errors.append({
                    "warning_type": "venue",
                    "warning_details": format_venue_mismatch(cited_venue, paper_venue)
                })

        # Create verified data structure
        verified_data = {
            'title': paper_data.get('title', cited_title),
            'authors': paper_data.get('authors', cited_authors),
            'year': paper_data.get('year', cited_year),
            'venue': paper_data.get('venue', cited_venue),
            'url': openreview_url,
            'abstract': paper_data.get('abstract', ''),
            'keywords': paper_data.get('keywords', []),
            'openreview_metadata': paper_data,
            'verification_source': 'OpenReview'
        }

        logger.debug(f"OpenReview verification completed for: {openreview_url}")
        return verified_data, errors, openreview_url

    def verify_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a reference by searching OpenReview (when no URL is provided)

        Args:
            reference: Reference dictionary with title, authors, year, etc.

        Returns:
            Tuple of (verified_data, errors, paper_url) where:
            - verified_data: Dict with verified OpenReview paper data or None
            - errors: List of error/warning dictionaries
            - paper_url: The OpenReview URL if found
        """
        logger.debug(f"Searching OpenReview for reference: {reference.get('title', 'Untitled')}")

        title = reference.get('title', '').strip()
        authors = reference.get('authors', [])
        year = reference.get('year')
        venue = (reference.get('venue') or '').strip()

        if not title:
            return None, [], None

        # Check if venue suggests this might be on OpenReview
        if not self._is_likely_openreview_venue(venue):
            logger.debug(f"Venue '{venue}' doesn't suggest OpenReview, skipping search")
            return None, [], None

        # Search for matching papers
        search_results = self.search_paper(title, authors, year)

        if not search_results:
            logger.debug("No matching papers found on OpenReview")
            return None, [], None

        # Use the best match (first result, as they're sorted by relevance)
        best_match = search_results[0]
        paper_url = best_match.get('forum_url')

        logger.debug(f"Found OpenReview match: {best_match.get('title', 'Untitled')}")

        # Verify the reference against the found paper
        errors = []

        # Check title match
        cited_title = reference.get('title', '').strip()
        paper_title = best_match.get('title', '').strip()

        if cited_title and paper_title:
            similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
            if similarity < 0.8:  # Slightly higher threshold for search results
                from refchecker.utils.error_utils import format_title_mismatch
                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
                clean_cited_title = strip_latex_commands(cited_title)
                details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
                errors.append({
                    "warning_type": "title",
                    "warning_details": details
                })

        # Check authors
        cited_authors = reference.get('authors', [])
        paper_authors = best_match.get('authors', [])

        if cited_authors and paper_authors:
            # Convert to list format if needed
            if isinstance(cited_authors, str):
                cited_authors = [author.strip() for author in cited_authors.split(',')]
            if isinstance(paper_authors, str):
                paper_authors = [author.strip() for author in paper_authors.split(',')]

            # Use the existing author comparison function
            match, error_msg = compare_authors(cited_authors, paper_authors)
            if not match and error_msg:
                errors.append({
                    "warning_type": "author",
                    "warning_details": error_msg
                })

        # Check year
        cited_year = reference.get('year')
        paper_year = best_match.get('year')

        if cited_year and paper_year:
            try:
                cited_year_int = int(cited_year)
                paper_year_int = int(paper_year)

                is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
                if is_different and year_message:
                    from refchecker.utils.error_utils import format_year_mismatch
                    errors.append({
                        "warning_type": "year",
                        "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
                    })
            except (ValueError, TypeError):
                pass  # Skip year validation if conversion fails

        # Check venue if provided in reference (guard against None values)
        cited_venue = (reference.get('venue') or '').strip()
        paper_venue = (best_match.get('venue') or '').strip()

        if cited_venue and paper_venue:
            if are_venues_substantially_different(cited_venue, paper_venue):
                from refchecker.utils.error_utils import format_venue_mismatch
                errors.append({
                    "warning_type": "venue",
                    "warning_details": format_venue_mismatch(cited_venue, paper_venue)
                })

        # Create verified data structure
        verified_data = {
            'title': best_match.get('title', cited_title),
            'authors': best_match.get('authors', cited_authors),
            'year': best_match.get('year', cited_year),
            'venue': best_match.get('venue', cited_venue),
            'url': paper_url,
            'abstract': best_match.get('abstract', ''),
            'keywords': best_match.get('keywords', []),
            'openreview_metadata': best_match,
            'verification_source': 'OpenReview (search)'
        }

        logger.debug(f"OpenReview search verification completed for: {paper_url}")
        return verified_data, errors, paper_url

    def _is_likely_openreview_venue(self, venue: str) -> bool:
        """
        Check if a venue suggests the paper might be on OpenReview

        Args:
            venue: Venue string from reference

        Returns:
            True if venue suggests OpenReview
        """
        if not venue:
            return False

        venue_lower = venue.lower()

        # Common venues that use OpenReview
        openreview_venues = [
            'iclr', 'international conference on learning representations',
            'neurips', 'neural information processing systems', 'nips',
            'icml', 'international conference on machine learning',
            'iclr workshop', 'neurips workshop', 'icml workshop',
            'aaai', 'ijcai', 'aistats'
        ]

        for or_venue in openreview_venues:
            if or_venue in venue_lower:
                return True

        return False

    def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
        """
        Search for papers on OpenReview by title, authors, and/or year

        Args:
            title: Paper title to search for
            authors: List of author names (optional)
            year: Publication year (optional)

        Returns:
            List of matching paper metadata dictionaries
        """
        if not title or not title.strip():
            return []

        logger.debug(f"Searching OpenReview for: {title}")

        # Clean title for search
        search_title = clean_title_for_search(title)

        # Try API search first
        results = self._search_via_api(search_title, authors, year)
        if results:
            return results

        # If API search fails, try web search as fallback
        return self._search_via_web(search_title, authors, year)

    def _search_via_api(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
        """
        Search using OpenReview API

        Args:
            title: Clean title to search for
            authors: List of author names (optional)
            year: Publication year (optional)

        Returns:
            List of matching paper dictionaries
        """
        try:
            # The OpenReview API requires specific parameters
            # We'll search by content.title or content.venue (for venue-based search)
            search_params = {
                'limit': 20,  # Limit results to avoid overwhelming the API
                'details': 'directReplies'  # Get basic details
            }

            # Try searching by venue first if year suggests recent conferences
            if year and year >= 2017:  # OpenReview started around 2017
                venues_by_year = {
                    2025: ['ICLR 2025'],
                    2024: ['ICLR 2024', 'NeurIPS 2024', 'ICML 2024'],
                    2023: ['ICLR 2023', 'NeurIPS 2023', 'ICML 2023'],
                    2022: ['ICLR 2022', 'NeurIPS 2022', 'ICML 2022'],
                    2021: ['ICLR 2021', 'NeurIPS 2021', 'ICML 2021'],
                    2020: ['ICLR 2020', 'NeurIPS 2020', 'ICML 2020'],
                    2019: ['ICLR 2019', 'NeurIPS 2019', 'ICML 2019'],
                    2018: ['ICLR 2018', 'NeurIPS 2018', 'ICML 2018'],
                    2017: ['ICLR 2017']
                }

                possible_venues = venues_by_year.get(year, [])

                results = []
                for venue in possible_venues:
                    # Search by venue and then filter by title
                    venue_params = search_params.copy()
                    venue_params['content.venue'] = venue

                    api_url = f"{self.api_url}/notes"
                    response = self._respectful_request(api_url, params=venue_params)

                    if response and response.status_code == 200:
                        try:
                            data = response.json()
                            if 'notes' in data and data['notes']:
                                for note in data['notes']:
                                    try:
                                        metadata = self._parse_api_response(note)
                                        if metadata and self._is_good_match(metadata, title, authors, year):
                                            results.append(metadata)
                                            if len(results) >= 5:  # Limit results
                                                break
                                    except Exception as e:
                                        logger.debug(f"Error parsing note: {e}")
                                        continue

                            if results:
                                break  # Found results, no need to search other venues

                        except (json.JSONDecodeError, KeyError) as e:
                            logger.debug(f"Failed to parse venue search response: {e}")
                            continue
                    else:
                        logger.debug(f"Venue search failed for {venue}: {response.status_code if response else 'No response'}")

                if results:
                    logger.debug(f"OpenReview API search found {len(results)} matches via venue search")
                    return results

            # If venue search didn't work, try other approaches
            # OpenReview API is quite restrictive, so we might need to fall back to web scraping
            logger.debug("OpenReview API venue search returned no results, trying web search")
            return []

        except Exception as e:
            logger.debug(f"OpenReview API search error: {e}")
            return []

    def _search_via_web(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
        """
        Search using OpenReview web interface (fallback)

        Args:
            title: Clean title to search for
            authors: List of author names (optional)
            year: Publication year (optional)

        Returns:
            List of matching paper dictionaries
        """
        try:
            # Build search URL
            search_query = title.replace(' ', '+')
            search_url = f"{self.base_url}/search?term={search_query}"

            response = self._respectful_request(search_url)
            if not response or response.status_code != 200:
                return []

            # Parse search results page
            soup = BeautifulSoup(response.text, 'html.parser')

            # Look for paper links in search results
            # OpenReview search results typically contain links to forum pages
            results = []

            # Find links that look like OpenReview paper URLs
            for link in soup.find_all('a', href=True):
                href = link.get('href', '')
                if '/forum?id=' in href:
                    paper_id = self.extract_paper_id(href)
                    if paper_id:
                        # Get full metadata for this paper
                        metadata = self.get_paper_metadata(paper_id)
                        if metadata and self._is_good_match(metadata, title, authors, year):
                            results.append(metadata)
                            if len(results) >= 5:  # Limit results
                                break

            logger.debug(f"OpenReview web search found {len(results)} matches")
            return results

        except Exception as e:
            logger.debug(f"OpenReview web search error: {e}")
            return []

    def _is_good_match(self, metadata: Dict[str, Any], search_title: str, authors: List[str] = None, year: int = None) -> bool:
        """
        Check if the found paper is a good match for the search criteria

        Args:
            metadata: Paper metadata from OpenReview
            search_title: Title we're searching for
            authors: Authors we're looking for (optional)
            year: Year we're looking for (optional)

        Returns:
            True if it's a good match
        """
        paper_title = metadata.get('title', '')
        if not paper_title:
            return False

        # Check title similarity
        title_similarity = calculate_title_similarity(search_title, paper_title)
        if title_similarity < 0.7:  # Require at least 70% similarity
            return False

        # Check year if provided
        if year:
            paper_year = metadata.get('year')
            if paper_year and abs(int(paper_year) - year) > 1:  # Allow 1 year difference
                return False

        # Check authors if provided
        if authors and len(authors) > 0:
            paper_authors = metadata.get('authors', [])
            if paper_authors:
                # Check if at least one author matches
                author_match = False
                for search_author in authors[:2]:  # Check first 2 authors
                    for paper_author in paper_authors[:3]:  # Check first 3 paper authors
                        if is_name_match(search_author, paper_author):
                            author_match = True
                            break
                    if author_match:
                        break

                if not author_match:
                    return False

        return True

    def search_by_title(self, title: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """
        Search OpenReview for papers by title using the working search API.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to return

        Returns:
            List of paper data dictionaries
        """
        try:
            # Use OpenReview's search API with term parameter (this works!)
            params = {
                'term': title,
                'limit': max_results
            }

            response = self._respectful_request(f"{self.api_url}/notes/search", params=params)
            if not response or response.status_code != 200:
                logger.debug(f"OpenReview search API failed with status {response.status_code if response else 'None'}")
                return []

            data = response.json()
            papers = []

            for note in data.get('notes', []):
                # Filter to exact or close title matches
                note_title = note.get('content', {}).get('title', '')
                if self._is_title_match(title, note_title):
                    paper_data = self._parse_api_response(note)
                    if paper_data:
                        papers.append(paper_data)

            logger.debug(f"OpenReview search found {len(papers)} matching papers for '{title}'")
            return papers

        except Exception as e:
            logger.error(f"Error searching OpenReview by title '{title}': {e}")
            return []

    def _is_title_match(self, search_title: str, found_title: str, threshold: float = 0.8) -> bool:
        """
        Check if two titles match closely enough.

        Args:
            search_title: Title we're searching for
            found_title: Title found in search results
            threshold: Similarity threshold (0.0 to 1.0)

        Returns:
            True if titles match closely enough
        """
        if not search_title or not found_title:
            return False

        # Exact match
        if search_title.lower().strip() == found_title.lower().strip():
            return True

        # Check if one contains the other (for cases where one is longer)
        search_clean = search_title.lower().strip()
        found_clean = found_title.lower().strip()

        if search_clean in found_clean or found_clean in search_clean:
            return True

        # Use similarity calculation from text_utils
        try:
            from refchecker.utils.text_utils import calculate_title_similarity
            similarity = calculate_title_similarity(search_title, found_title)
            return similarity >= threshold
        except ImportError:
            # Fallback to simple word matching
            search_words = set(search_clean.split())
            found_words = set(found_clean.split())

            if not search_words or not found_words:
                return False

            intersection = search_words.intersection(found_words)
            union = search_words.union(found_words)

            jaccard_similarity = len(intersection) / len(union) if union else 0
            return jaccard_similarity >= threshold

    def verify_reference_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a reference by searching OpenReview (for papers without URLs).

        Args:
            reference: Reference data dictionary

        Returns:
            Tuple of (verified_data, errors_and_warnings, debug_info)
        """
        title = reference.get('title', '').strip()
        if not title:
            return None, [], "No title provided for search"

        # Search for the paper
        search_results = self.search_by_title(title)

        if not search_results:
            return None, [], f"No papers found on OpenReview for title: {title}"

        # Take the best match (first result, as search is already filtered)
        best_match = search_results[0]

        # Use the existing verify_reference method with the found URL
        forum_url = best_match.get('forum_url')
        if forum_url:
            # Create a reference with the OpenReview URL for verification
            reference_with_url = reference.copy()
            reference_with_url['url'] = forum_url

            return self.verify_reference(reference_with_url)

        # If no URL, return the metadata as verification
        return best_match, [], f"Found on OpenReview: {best_match.get('title')}"
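
For orientation, a minimal usage sketch of the checker added in this release. It is illustrative only: the import path follows the wheel's file layout (refchecker/checkers/openreview_checker.py), and the reference values are hypothetical placeholders, not output from the package.

# Illustrative sketch, not part of the package: exercises the main entry
# points of OpenReviewReferenceChecker from the diff above.
from refchecker.checkers.openreview_checker import OpenReviewReferenceChecker

checker = OpenReviewReferenceChecker(request_delay=1.0)

# URL-based path: the paper ID is pulled from the forum URL, metadata is
# fetched (API first, web scraping as fallback), and title, authors, year,
# and venue are compared field by field.
reference = {
    'title': 'Title of the paper',        # hypothetical placeholder
    'authors': ['Author 1', 'Author 2'],  # hypothetical placeholder
    'year': 2024,
    'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
}
if checker.is_openreview_reference(reference):
    verified_data, errors, url = checker.verify_reference(reference)

# Search-based path for references without an OpenReview URL:
# search_by_title() queries /notes/search, and the best match's forum URL
# is then re-verified through verify_reference().
no_url_reference = {'title': 'Title of the paper', 'year': 2024}
verified_data, warnings, info = checker.verify_reference_by_search(no_url_reference)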