academic_refchecker-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/checkers/openreview_checker.py
@@ -0,0 +1,984 @@
+ #!/usr/bin/env python3
+ """
+ OpenReview API Client for Reference Verification
+
+ This module provides functionality to verify references from OpenReview papers.
+ OpenReview is a platform for open peer review in machine learning conferences
+ such as ICLR, NeurIPS, and ICML.
+
+ Usage:
+     from refchecker.checkers.openreview_checker import OpenReviewReferenceChecker
+
+     # Initialize the checker
+     checker = OpenReviewReferenceChecker()
+
+     # Verify a reference
+     reference = {
+         'title': 'Title of the paper',
+         'authors': ['Author 1', 'Author 2'],
+         'year': 2024,
+         'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
+         'raw_text': 'Full citation text'
+     }
+
+     verified_data, errors, url = checker.verify_reference(reference)
+ """
+
+ import requests
+ import time
+ import logging
+ import re
+ import json
+ from typing import Dict, List, Tuple, Optional, Any, Union
+ from urllib.parse import urlparse, parse_qs
+ from bs4 import BeautifulSoup
+ from refchecker.utils.text_utils import (
+     normalize_text, clean_title_basic, is_name_match,
+     calculate_title_similarity, compare_authors,
+     clean_title_for_search, are_venues_substantially_different,
+     is_year_substantially_different, strip_latex_commands,
+     compare_titles_with_latex_cleaning
+ )
+
+ # Set up logging
+ logger = logging.getLogger(__name__)
+
+ class OpenReviewReferenceChecker:
+     """
+     A class to verify references using OpenReview
+     """
+
+     def __init__(self, request_delay: float = 1.0):
+         """
+         Initialize the OpenReview client
+
+         Args:
+             request_delay: Delay between requests to be respectful to OpenReview servers
+         """
+         self.base_url = "https://openreview.net"
+         self.api_url = "https://api.openreview.net"
+         self.request_delay = request_delay
+         self.last_request_time = 0
+
+         # Session for connection pooling
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'RefChecker/1.0 (Academic Reference Verification)',
+             'Accept': 'application/json, text/html',
+             'Accept-Language': 'en-US,en;q=0.9'
+         })
+
+     def is_openreview_url(self, url: str) -> bool:
+         """
+         Check if URL is from OpenReview
+
+         Args:
+             url: URL to check
+
+         Returns:
+             True if it's an OpenReview URL
+         """
+         return bool(url and 'openreview.net' in url.lower())
+
+     def is_openreview_reference(self, reference: Dict[str, Any]) -> bool:
+         """
+         Determine if this reference is from OpenReview based on URL patterns
+
+         Args:
+             reference: Reference dictionary to check
+
+         Returns:
+             True if reference appears to be from OpenReview
+         """
+         # Check various URL fields for OpenReview URLs
+         url_fields = ['url', 'openreview_url', 'link', 'venue_url']
+         for field in url_fields:
+             url = reference.get(field, '')
+             if url and self.is_openreview_url(url):
+                 return True
+
+         # Check raw text for OpenReview URLs
+         raw_text = reference.get('raw_text', '')
+         if raw_text and 'openreview.net' in raw_text.lower():
+             return True
+
+         return False
+
+     def extract_paper_id(self, url: str) -> Optional[str]:
+         """
+         Extract paper ID from OpenReview URL
+
+         Args:
+             url: OpenReview URL
+
+         Returns:
+             Paper ID if found, None otherwise
+         """
+         if not self.is_openreview_url(url):
+             return None
+
+         # Handle different OpenReview URL formats:
+         # https://openreview.net/forum?id=ZG3RaNIsO8
+         # https://openreview.net/pdf?id=ZG3RaNIsO8
+         # https://openreview.net/forum?id=ZG3RaNIsO8&noteId=...
+
+         parsed = urlparse(url)
+         query_params = parse_qs(parsed.query)
+
+         if 'id' in query_params:
+             return query_params['id'][0]
+
+         # Also check path-based URLs (if they exist)
+         path_match = re.search(r'/(?:forum|pdf|notes)/([A-Za-z0-9_-]+)', parsed.path)
+         if path_match:
+             return path_match.group(1)
+
+         return None
+
+     def _respectful_request(self, url: str, **kwargs) -> Optional[requests.Response]:
+         """Make a respectful HTTP request with rate limiting"""
+         current_time = time.time()
+         time_since_last = current_time - self.last_request_time
+
+         if time_since_last < self.request_delay:
+             time.sleep(self.request_delay - time_since_last)
+
+         try:
+             logger.debug(f"Making request to: {url}")
+             response = self.session.get(url, timeout=15, **kwargs)
+             self.last_request_time = time.time()
+             logger.debug(f"Request successful: {response.status_code}")
+             return response
+         except requests.exceptions.RequestException as e:
+             logger.debug(f"Request failed for {url}: {type(e).__name__}: {e}")
+             return None
+
+     def get_paper_metadata(self, paper_id: str) -> Optional[Dict[str, Any]]:
+         """
+         Get paper metadata from OpenReview
+
+         Args:
+             paper_id: OpenReview paper ID
+
+         Returns:
+             Paper metadata dictionary or None if not found
+         """
+         # Try API endpoint first
+         api_url = f"{self.api_url}/notes?id={paper_id}"
+         response = self._respectful_request(api_url)
+
+         if response and response.status_code == 200:
+             try:
+                 data = response.json()
+                 if 'notes' in data and data['notes']:
+                     note = data['notes'][0]
+                     return self._parse_api_response(note)
+             except (json.JSONDecodeError, KeyError) as e:
+                 logger.debug(f"Failed to parse API response: {e}")
+
+         # Fall back to web scraping
+         forum_url = f"{self.base_url}/forum?id={paper_id}"
+         response = self._respectful_request(forum_url)
+
+         if not response or response.status_code != 200:
+             return None
+
+         return self._parse_web_page(response.text, forum_url)
+
+     def _parse_api_response(self, note: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Parse OpenReview API response to extract metadata
+
+         Args:
+             note: Note data from API response
+
+         Returns:
+             Parsed metadata dictionary
+         """
+         content = note.get('content', {})
+
+         # Extract basic metadata
+         metadata = {
+             'id': note.get('id'),
+             'title': content.get('title', '').strip(),
+             'authors': [],
+             'year': None,
+             'venue': None,
+             'abstract': content.get('abstract', '').strip(),
+             'keywords': content.get('keywords', []),
+             'pdf_url': content.get('pdf'),
+             'forum_url': f"{self.base_url}/forum?id={note.get('id')}",
+             'source': 'openreview_api'
+         }
+
+         # Parse authors
+         authors_raw = content.get('authors', [])
+         if isinstance(authors_raw, list):
+             metadata['authors'] = [author.strip() for author in authors_raw if author.strip()]
+         elif isinstance(authors_raw, str):
+             # Sometimes authors are in a single string
+             metadata['authors'] = [author.strip() for author in authors_raw.split(',') if author.strip()]
+
+         # Extract year from various sources
+         # Check creation time
+         if 'cdate' in note:
+             try:
+                 import datetime
+                 timestamp = note['cdate'] / 1000.0  # Convert from milliseconds
+                 year = datetime.datetime.fromtimestamp(timestamp).year
+                 metadata['year'] = year
+             except (ValueError, TypeError):
+                 pass
+
+         # Check if venue/conference info is available
+         venue_info = content.get('venue', '')
+         if venue_info:
+             metadata['venue'] = venue_info.strip()
+
+         # Try to extract venue from forum context or submission info
+         if not metadata['venue']:
+             # Common venues for OpenReview
+             forum_path = note.get('forum', '')
+             if 'ICLR' in str(content) or 'iclr' in forum_path.lower():
+                 metadata['venue'] = 'ICLR'
+             elif 'NeurIPS' in str(content) or 'neurips' in forum_path.lower():
+                 metadata['venue'] = 'NeurIPS'
+             elif 'ICML' in str(content) or 'icml' in forum_path.lower():
+                 metadata['venue'] = 'ICML'
+
+         return metadata
+
+     def _parse_web_page(self, html: str, url: str) -> Dict[str, Any]:
+         """
+         Parse OpenReview web page to extract metadata
+
+         Args:
+             html: HTML content of the page
+             url: Original URL
+
+         Returns:
+             Parsed metadata dictionary
+         """
+         soup = BeautifulSoup(html, 'html.parser')
+
+         # Extract paper ID from URL
+         paper_id = self.extract_paper_id(url)
+
+         metadata = {
+             'id': paper_id,
+             'title': '',
+             'authors': [],
+             'year': None,
+             'venue': None,
+             'abstract': '',
+             'keywords': [],
+             'forum_url': url,
+             'source': 'openreview_web'
+         }
+
+         # Extract title
+         title_elem = soup.find('h2', {'class': 'citation_title'}) or soup.find('h1')
+         if title_elem:
+             metadata['title'] = title_elem.get_text().strip()
+
+         # Try to find title in meta tags
+         if not metadata['title']:
+             meta_title = soup.find('meta', {'property': 'og:title'}) or soup.find('meta', {'name': 'title'})
+             if meta_title and meta_title.get('content'):
+                 metadata['title'] = meta_title['content'].strip()
+
+         # Extract authors from meta tags (most reliable for OpenReview)
+         author_metas = soup.find_all('meta', {'name': 'citation_author'})
+         if author_metas:
+             metadata['authors'] = [meta.get('content', '').strip() for meta in author_metas if meta.get('content', '').strip()]
+
+         # Fallback: try to find authors in HTML structure
+         if not metadata['authors']:
+             authors_section = soup.find('div', {'class': 'authors'}) or soup.find('span', {'class': 'authors'})
+             if authors_section:
+                 # Extract author names from links or text
+                 author_links = authors_section.find_all('a')
+                 if author_links:
+                     metadata['authors'] = [link.get_text().strip() for link in author_links]
+                 else:
+                     # Parse comma-separated authors
+                     authors_text = authors_section.get_text().strip()
+                     metadata['authors'] = [author.strip() for author in authors_text.split(',') if author.strip()]
+
+         # Extract year from various sources
+         year_pattern = r'\b(20\d{2})\b'
+
+         # Check date/year elements
+         date_elem = soup.find('span', {'class': 'date'}) or soup.find('time')
+         if date_elem:
+             year_match = re.search(year_pattern, date_elem.get_text())
+             if year_match:
+                 metadata['year'] = int(year_match.group(1))
+
+         # Check meta tags for date
+         if not metadata['year']:
+             meta_date = soup.find('meta', {'name': 'citation_date'}) or soup.find('meta', {'name': 'date'})
+             if meta_date and meta_date.get('content'):
+                 year_match = re.search(year_pattern, meta_date['content'])
+                 if year_match:
+                     metadata['year'] = int(year_match.group(1))
+
+         # Extract abstract
+         abstract_elem = soup.find('div', {'class': 'abstract'}) or soup.find('section', {'class': 'abstract'})
+         if abstract_elem:
+             metadata['abstract'] = abstract_elem.get_text().strip()
+
+         # Extract venue information from meta tags (most reliable for OpenReview)
+         venue_meta = soup.find('meta', {'name': 'citation_conference_title'})
+         if venue_meta and venue_meta.get('content'):
+             venue_full = venue_meta['content'].strip()
+             # Convert long conference names to common abbreviations
+             if 'International Conference on Learning Representations' in venue_full:
+                 # Extract year if present
+                 year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                 if year_match:
+                     metadata['venue'] = f'ICLR {year_match.group(1)}'
+                 else:
+                     metadata['venue'] = 'ICLR'
+             elif 'Neural Information Processing Systems' in venue_full or 'NeurIPS' in venue_full:
+                 year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                 if year_match:
+                     metadata['venue'] = f'NeurIPS {year_match.group(1)}'
+                 else:
+                     metadata['venue'] = 'NeurIPS'
+             else:
+                 metadata['venue'] = venue_full
+
+         # Fallback: try HTML structure
+         if not metadata['venue']:
+             venue_elem = soup.find('div', {'class': 'venue'}) or soup.find('span', {'class': 'venue'})
+             if venue_elem:
+                 metadata['venue'] = venue_elem.get_text().strip()
+
+         # Final fallback: try to determine venue from page context or URL
+         if not metadata['venue']:
+             page_text = soup.get_text().lower()
+             if 'iclr' in page_text or 'iclr' in url.lower():
+                 if '2024' in page_text:
+                     metadata['venue'] = 'ICLR 2024'
+                 else:
+                     metadata['venue'] = 'ICLR'
+             elif 'neurips' in page_text or 'neurips' in url.lower():
+                 metadata['venue'] = 'NeurIPS'
+             elif 'icml' in page_text or 'icml' in url.lower():
+                 metadata['venue'] = 'ICML'
+
+         # Extract keywords if available
+         keywords_elem = soup.find('div', {'class': 'keywords'})
+         if keywords_elem:
+             keywords_text = keywords_elem.get_text()
+             metadata['keywords'] = [kw.strip() for kw in keywords_text.split(',') if kw.strip()]
+
+         return metadata
+
+     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a reference against OpenReview
+
+         Args:
+             reference: Reference dictionary with title, authors, year, url, etc.
+
+         Returns:
+             Tuple of (verified_data, errors, paper_url) where:
+             - verified_data: Dict with verified OpenReview paper data or None
+             - errors: List of error/warning dictionaries
+             - paper_url: The OpenReview URL
+         """
+         logger.debug(f"Verifying OpenReview reference: {reference.get('title', 'Untitled')}")
+
+         # Extract OpenReview URL from reference
+         openreview_url = None
+         for url_key in ['url', 'openreview_url', 'link']:
+             if url_key in reference and reference[url_key]:
+                 url = reference[url_key].strip()
+                 if self.is_openreview_url(url):
+                     openreview_url = url
+                     break
+
+         if not openreview_url:
+             logger.debug("No OpenReview URL found in reference")
+             return None, [], None
+
+         # Extract paper ID
+         paper_id = self.extract_paper_id(openreview_url)
+         if not paper_id:
+             return None, [{"error_type": "unverified", "error_details": "Could not extract paper ID from OpenReview URL"}], openreview_url
+
+         # Get paper metadata
+         paper_data = self.get_paper_metadata(paper_id)
+         if not paper_data:
+             return None, [{"error_type": "unverified", "error_details": "Paper not found on OpenReview"}], openreview_url
+
+         logger.debug(f"Found OpenReview paper: {paper_data.get('title', 'Untitled')}")
+
+         # Verify the reference against the paper data
+         errors = []
+
+         # Check title match
+         cited_title = reference.get('title', '').strip()
+         paper_title = paper_data.get('title', '').strip()
+
+         if cited_title and paper_title:
+             similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
+             if similarity < 0.7:  # Using a reasonable threshold
+                 from refchecker.utils.error_utils import format_title_mismatch
+                 # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                 clean_cited_title = strip_latex_commands(cited_title)
+                 details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                 errors.append({
+                     "warning_type": "title",
+                     "warning_details": details
+                 })
+
+         # Check authors
+         cited_authors = reference.get('authors', [])
+         paper_authors = paper_data.get('authors', [])
+
+         if cited_authors and paper_authors:
+             # Convert to list format if needed
+             if isinstance(cited_authors, str):
+                 cited_authors = [author.strip() for author in cited_authors.split(',')]
+             if isinstance(paper_authors, str):
+                 paper_authors = [author.strip() for author in paper_authors.split(',')]
+
+             # Use the existing author comparison function
+             match, error_msg = compare_authors(cited_authors, paper_authors)
+             if not match and error_msg:
+                 errors.append({
+                     "warning_type": "author",
+                     "warning_details": error_msg
+                 })
+
+         # Check year
+         cited_year = reference.get('year')
+         paper_year = paper_data.get('year')
+
+         if cited_year and paper_year:
+             try:
+                 cited_year_int = int(cited_year)
+                 paper_year_int = int(paper_year)
+
+                 is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
+                 if is_different and year_message:
+                     from refchecker.utils.error_utils import format_year_mismatch
+                     errors.append({
+                         "warning_type": "year",
+                         "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
+                     })
+             except (ValueError, TypeError):
+                 pass  # Skip year validation if conversion fails
+
+         # Check venue if provided in reference (guard against None venues)
+         cited_venue = (reference.get('venue') or '').strip()
+         paper_venue = (paper_data.get('venue') or '').strip()
+
+         if cited_venue and paper_venue:
+             if are_venues_substantially_different(cited_venue, paper_venue):
+                 from refchecker.utils.error_utils import format_venue_mismatch
+                 errors.append({
+                     "warning_type": "venue",
+                     "warning_details": format_venue_mismatch(cited_venue, paper_venue)
+                 })
+
+         # Create verified data structure
+         verified_data = {
+             'title': paper_data.get('title', cited_title),
+             'authors': paper_data.get('authors', cited_authors),
+             'year': paper_data.get('year', cited_year),
+             'venue': paper_data.get('venue', cited_venue),
+             'url': openreview_url,
+             'abstract': paper_data.get('abstract', ''),
+             'keywords': paper_data.get('keywords', []),
+             'openreview_metadata': paper_data,
+             'verification_source': 'OpenReview'
+         }
+
+         logger.debug(f"OpenReview verification completed for: {openreview_url}")
+         return verified_data, errors, openreview_url
+
+     def verify_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a reference by searching OpenReview (when no URL is provided)
+
+         Args:
+             reference: Reference dictionary with title, authors, year, etc.
+
+         Returns:
+             Tuple of (verified_data, errors, paper_url) where:
+             - verified_data: Dict with verified OpenReview paper data or None
+             - errors: List of error/warning dictionaries
+             - paper_url: The OpenReview URL if found
+         """
+         logger.debug(f"Searching OpenReview for reference: {reference.get('title', 'Untitled')}")
+
+         title = reference.get('title', '').strip()
+         authors = reference.get('authors', [])
+         year = reference.get('year')
+         venue = (reference.get('venue') or '').strip()
+
+         if not title:
+             return None, [], None
+
+         # Check if venue suggests this might be on OpenReview
+         if not self._is_likely_openreview_venue(venue):
+             logger.debug(f"Venue '{venue}' doesn't suggest OpenReview, skipping search")
+             return None, [], None
+
+         # Search for matching papers
+         search_results = self.search_paper(title, authors, year)
+
+         if not search_results:
+             logger.debug("No matching papers found on OpenReview")
+             return None, [], None
+
+         # Use the best match (first result, as they're sorted by relevance)
+         best_match = search_results[0]
+         paper_url = best_match.get('forum_url')
+
+         logger.debug(f"Found OpenReview match: {best_match.get('title', 'Untitled')}")
+
+         # Verify the reference against the found paper
+         errors = []
+
+         # Check title match
+         cited_title = reference.get('title', '').strip()
+         paper_title = best_match.get('title', '').strip()
+
+         if cited_title and paper_title:
+             similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
+             if similarity < 0.8:  # Slightly higher threshold for search results
+                 from refchecker.utils.error_utils import format_title_mismatch
+                 # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                 clean_cited_title = strip_latex_commands(cited_title)
+                 details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                 errors.append({
+                     "warning_type": "title",
+                     "warning_details": details
+                 })
+
+         # Check authors
+         cited_authors = reference.get('authors', [])
+         paper_authors = best_match.get('authors', [])
+
+         if cited_authors and paper_authors:
+             # Convert to list format if needed
+             if isinstance(cited_authors, str):
+                 cited_authors = [author.strip() for author in cited_authors.split(',')]
+             if isinstance(paper_authors, str):
+                 paper_authors = [author.strip() for author in paper_authors.split(',')]
+
+             # Use the existing author comparison function
+             match, error_msg = compare_authors(cited_authors, paper_authors)
+             if not match and error_msg:
+                 errors.append({
+                     "warning_type": "author",
+                     "warning_details": error_msg
+                 })
+
+         # Check year
+         cited_year = reference.get('year')
+         paper_year = best_match.get('year')
+
+         if cited_year and paper_year:
+             try:
+                 cited_year_int = int(cited_year)
+                 paper_year_int = int(paper_year)
+
+                 is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
+                 if is_different and year_message:
+                     from refchecker.utils.error_utils import format_year_mismatch
+                     errors.append({
+                         "warning_type": "year",
+                         "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
+                     })
+             except (ValueError, TypeError):
+                 pass  # Skip year validation if conversion fails
+
+         # Check venue if provided in reference (guard against None venues)
+         cited_venue = (reference.get('venue') or '').strip()
+         paper_venue = (best_match.get('venue') or '').strip()
+
+         if cited_venue and paper_venue:
+             if are_venues_substantially_different(cited_venue, paper_venue):
+                 from refchecker.utils.error_utils import format_venue_mismatch
+                 errors.append({
+                     "warning_type": "venue",
+                     "warning_details": format_venue_mismatch(cited_venue, paper_venue)
+                 })
+
+         # Create verified data structure
+         verified_data = {
+             'title': best_match.get('title', cited_title),
+             'authors': best_match.get('authors', cited_authors),
+             'year': best_match.get('year', cited_year),
+             'venue': best_match.get('venue', cited_venue),
+             'url': paper_url,
+             'abstract': best_match.get('abstract', ''),
+             'keywords': best_match.get('keywords', []),
+             'openreview_metadata': best_match,
+             'verification_source': 'OpenReview (search)'
+         }
+
+         logger.debug(f"OpenReview search verification completed for: {paper_url}")
+         return verified_data, errors, paper_url
+
+     def _is_likely_openreview_venue(self, venue: str) -> bool:
+         """
+         Check if a venue suggests the paper might be on OpenReview
+
+         Args:
+             venue: Venue string from reference
+
+         Returns:
+             True if venue suggests OpenReview
+         """
+         if not venue:
+             return False
+
+         venue_lower = venue.lower()
+
+         # Common venues that use OpenReview
+         openreview_venues = [
+             'iclr', 'international conference on learning representations',
+             'neurips', 'neural information processing systems', 'nips',
+             'icml', 'international conference on machine learning',
+             'iclr workshop', 'neurips workshop', 'icml workshop',
+             'aaai', 'ijcai', 'aistats'
+         ]
+
+         for or_venue in openreview_venues:
+             if or_venue in venue_lower:
+                 return True
+
+         return False
+
+     def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+         """
+         Search for papers on OpenReview by title, authors, and/or year
+
+         Args:
+             title: Paper title to search for
+             authors: List of author names (optional)
+             year: Publication year (optional)
+
+         Returns:
+             List of matching paper metadata dictionaries
+         """
+         if not title or not title.strip():
+             return []
+
+         logger.debug(f"Searching OpenReview for: {title}")
+
+         # Clean title for search
+         search_title = clean_title_for_search(title)
+
+         # Try API search first
+         results = self._search_via_api(search_title, authors, year)
+         if results:
+             return results
+
+         # If API search fails, try web search as fallback
+         return self._search_via_web(search_title, authors, year)
+
+     def _search_via_api(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+         """
+         Search using OpenReview API
+
+         Args:
+             title: Clean title to search for
+             authors: List of author names (optional)
+             year: Publication year (optional)
+
+         Returns:
+             List of matching paper dictionaries
+         """
+         try:
+             # The OpenReview API requires specific parameters
+             # We'll search by content.title or content.venue (for venue-based search)
+             search_params = {
+                 'limit': 20,  # Limit results to avoid overwhelming the API
+                 'details': 'directReplies'  # Get basic details
+             }
+
+             # Try searching by venue first if year suggests recent conferences
+             if year and year >= 2017:  # OpenReview started around 2017
+                 venues_by_year = {
+                     2025: ['ICLR 2025'],
+                     2024: ['ICLR 2024', 'NeurIPS 2024', 'ICML 2024'],
+                     2023: ['ICLR 2023', 'NeurIPS 2023', 'ICML 2023'],
+                     2022: ['ICLR 2022', 'NeurIPS 2022', 'ICML 2022'],
+                     2021: ['ICLR 2021', 'NeurIPS 2021', 'ICML 2021'],
+                     2020: ['ICLR 2020', 'NeurIPS 2020', 'ICML 2020'],
+                     2019: ['ICLR 2019', 'NeurIPS 2019', 'ICML 2019'],
+                     2018: ['ICLR 2018', 'NeurIPS 2018', 'ICML 2018'],
+                     2017: ['ICLR 2017']
+                 }
+
+                 possible_venues = venues_by_year.get(year, [])
+
+                 results = []
+                 for venue in possible_venues:
+                     # Search by venue and then filter by title
+                     venue_params = search_params.copy()
+                     venue_params['content.venue'] = venue
+
+                     api_url = f"{self.api_url}/notes"
+                     response = self._respectful_request(api_url, params=venue_params)
+
+                     if response and response.status_code == 200:
+                         try:
+                             data = response.json()
+                             if 'notes' in data and data['notes']:
+                                 for note in data['notes']:
+                                     try:
+                                         metadata = self._parse_api_response(note)
+                                         if metadata and self._is_good_match(metadata, title, authors, year):
+                                             results.append(metadata)
+                                             if len(results) >= 5:  # Limit results
+                                                 break
+                                     except Exception as e:
+                                         logger.debug(f"Error parsing note: {e}")
+                                         continue
+
+                             if results:
+                                 break  # Found results, no need to search other venues
+
+                         except (json.JSONDecodeError, KeyError) as e:
+                             logger.debug(f"Failed to parse venue search response: {e}")
+                             continue
+                     else:
+                         logger.debug(f"Venue search failed for {venue}: {response.status_code if response else 'No response'}")
+
+                 if results:
+                     logger.debug(f"OpenReview API search found {len(results)} matches via venue search")
+                     return results
+
+             # If venue search didn't work, try other approaches
+             # OpenReview API is quite restrictive, so we might need to fall back to web scraping
+             logger.debug("OpenReview API venue search returned no results, trying web search")
+             return []
+
+         except Exception as e:
+             logger.debug(f"OpenReview API search error: {e}")
+             return []
+
+     def _search_via_web(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+         """
+         Search using OpenReview web interface (fallback)
+
+         Args:
+             title: Clean title to search for
+             authors: List of author names (optional)
+             year: Publication year (optional)
+
+         Returns:
+             List of matching paper dictionaries
+         """
+         try:
+             # Build search URL
+             search_query = title.replace(' ', '+')
+             search_url = f"{self.base_url}/search?term={search_query}"
+
+             response = self._respectful_request(search_url)
+             if not response or response.status_code != 200:
+                 return []
+
+             # Parse search results page
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Look for paper links in search results
+             # OpenReview search results typically contain links to forum pages
+             results = []
+
+             # Find links that look like OpenReview paper URLs
+             for link in soup.find_all('a', href=True):
+                 href = link.get('href', '')
+                 if '/forum?id=' in href:
+                     # Resolve relative links so extract_paper_id recognizes them
+                     full_url = href if href.startswith('http') else f"{self.base_url}{href}"
+                     paper_id = self.extract_paper_id(full_url)
+                     if paper_id:
+                         # Get full metadata for this paper
+                         metadata = self.get_paper_metadata(paper_id)
+                         if metadata and self._is_good_match(metadata, title, authors, year):
+                             results.append(metadata)
+                             if len(results) >= 5:  # Limit results
+                                 break
+
+             logger.debug(f"OpenReview web search found {len(results)} matches")
+             return results
+
+         except Exception as e:
+             logger.debug(f"OpenReview web search error: {e}")
+             return []
+
+     def _is_good_match(self, metadata: Dict[str, Any], search_title: str, authors: List[str] = None, year: int = None) -> bool:
+         """
+         Check if the found paper is a good match for the search criteria
+
+         Args:
+             metadata: Paper metadata from OpenReview
+             search_title: Title we're searching for
+             authors: Authors we're looking for (optional)
+             year: Year we're looking for (optional)
+
+         Returns:
+             True if it's a good match
+         """
+         paper_title = metadata.get('title', '')
+         if not paper_title:
+             return False
+
+         # Check title similarity
+         title_similarity = calculate_title_similarity(search_title, paper_title)
+         if title_similarity < 0.7:  # Require at least 70% similarity
+             return False
+
+         # Check year if provided
+         if year:
+             paper_year = metadata.get('year')
+             if paper_year and abs(int(paper_year) - year) > 1:  # Allow 1 year difference
+                 return False
+
+         # Check authors if provided
+         if authors and len(authors) > 0:
+             paper_authors = metadata.get('authors', [])
+             if paper_authors:
+                 # Check if at least one author matches
+                 author_match = False
+                 for search_author in authors[:2]:  # Check first 2 authors
+                     for paper_author in paper_authors[:3]:  # Check first 3 paper authors
+                         if is_name_match(search_author, paper_author):
+                             author_match = True
+                             break
+                     if author_match:
+                         break
+
+                 if not author_match:
+                     return False
+
+         return True
+
+     def search_by_title(self, title: str, max_results: int = 5) -> List[Dict[str, Any]]:
+         """
+         Search OpenReview for papers by title using the search API.
+
+         Args:
+             title: Paper title to search for
+             max_results: Maximum number of results to return
+
+         Returns:
+             List of paper data dictionaries
+         """
+         try:
+             # Use OpenReview's search API with a term parameter
+             params = {
+                 'term': title,
+                 'limit': max_results
+             }
+
+             response = self._respectful_request(f"{self.api_url}/notes/search", params=params)
+             if not response or response.status_code != 200:
+                 logger.debug(f"OpenReview search API failed with status {response.status_code if response else 'None'}")
+                 return []
+
+             data = response.json()
+             papers = []
+
+             for note in data.get('notes', []):
+                 # Filter to exact or close title matches
+                 note_title = note.get('content', {}).get('title', '')
+                 if self._is_title_match(title, note_title):
+                     paper_data = self._parse_api_response(note)
+                     if paper_data:
+                         papers.append(paper_data)
+
+             logger.debug(f"OpenReview search found {len(papers)} matching papers for '{title}'")
+             return papers
+
+         except Exception as e:
+             logger.error(f"Error searching OpenReview by title '{title}': {e}")
+             return []
+
+     def _is_title_match(self, search_title: str, found_title: str, threshold: float = 0.8) -> bool:
+         """
+         Check if two titles match closely enough.
+
+         Args:
+             search_title: Title we're searching for
+             found_title: Title found in search results
+             threshold: Similarity threshold (0.0 to 1.0)
+
+         Returns:
+             True if titles match closely enough
+         """
+         if not search_title or not found_title:
+             return False
+
+         # Exact match
+         if search_title.lower().strip() == found_title.lower().strip():
+             return True
+
+         # Check if one contains the other (for cases where one is longer)
+         search_clean = search_title.lower().strip()
+         found_clean = found_title.lower().strip()
+
+         if search_clean in found_clean or found_clean in search_clean:
+             return True
+
+         # Use similarity calculation from text_utils
+         try:
+             from refchecker.utils.text_utils import calculate_title_similarity
+             similarity = calculate_title_similarity(search_title, found_title)
+             return similarity >= threshold
+         except ImportError:
+             # Fallback to simple word matching
+             search_words = set(search_clean.split())
+             found_words = set(found_clean.split())
+
+             if not search_words or not found_words:
+                 return False
+
+             intersection = search_words.intersection(found_words)
+             union = search_words.union(found_words)
+
+             jaccard_similarity = len(intersection) / len(union) if union else 0
+             return jaccard_similarity >= threshold
+
+     def verify_reference_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a reference by searching OpenReview (for papers without URLs).
+
+         Args:
+             reference: Reference data dictionary
+
+         Returns:
+             Tuple of (verified_data, errors_and_warnings, debug_info)
+         """
+         title = reference.get('title', '').strip()
+         if not title:
+             return None, [], "No title provided for search"
+
+         # Search for the paper
+         search_results = self.search_by_title(title)
+
+         if not search_results:
+             return None, [], f"No papers found on OpenReview for title: {title}"
+
+         # Take the best match (first result, as search is already filtered)
+         best_match = search_results[0]
+
+         # Use the existing verify_reference method with the found URL
+         forum_url = best_match.get('forum_url')
+         if forum_url:
+             # Create a reference with the OpenReview URL for verification
+             reference_with_url = reference.copy()
+             reference_with_url['url'] = forum_url
+
+             return self.verify_reference(reference_with_url)
+
+         # If no URL, return the metadata as verification
+         return best_match, [], f"Found on OpenReview: {best_match.get('title')}"
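
A minimal usage sketch (not part of the packaged file above; assumes the academic-refchecker wheel is installed so the import path shown in this diff resolves):

    from refchecker.checkers.openreview_checker import OpenReviewReferenceChecker

    checker = OpenReviewReferenceChecker(request_delay=1.0)

    # Verify a citation that carries an OpenReview URL
    reference = {
        'title': 'Title of the paper',
        'authors': ['Author 1', 'Author 2'],
        'year': 2024,
        'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
    }
    verified_data, errors, url = checker.verify_reference(reference)

    # Without a URL, fall back to a title search on OpenReview
    verified_data, errors, info = checker.verify_reference_by_search(
        {'title': 'Title of the paper'}
    )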