academic-refchecker 1.2.34.tar.gz → 1.2.35.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {academic_refchecker-1.2.34/src/academic_refchecker.egg-info → academic_refchecker-1.2.35}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/academic_refchecker.egg-info/SOURCES.txt +1 -0
  5. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/enhanced_hybrid_checker.py +23 -2
  6. academic_refchecker-1.2.35/src/checkers/openreview_checker.py +512 -0
  7. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/text_utils.py +19 -5
  8. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/LICENSE +0 -0
  9. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/MANIFEST.in +0 -0
  10. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/README.md +0 -0
  11. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/pyproject.toml +0 -0
  12. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/requirements.txt +0 -0
  13. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/scripts/download_db.py +0 -0
  14. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/scripts/run_tests.py +0 -0
  15. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/scripts/start_vllm_server.py +0 -0
  16. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/setup.cfg +0 -0
  17. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/__init__.py +0 -0
  18. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  19. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  20. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/academic_refchecker.egg-info/requires.txt +0 -0
  21. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  22. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/__init__.py +0 -0
  23. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/crossref.py +0 -0
  24. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/github_checker.py +0 -0
  25. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/local_semantic_scholar.py +0 -0
  26. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/openalex.py +0 -0
  27. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/semantic_scholar.py +0 -0
  28. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/checkers/webpage_checker.py +0 -0
  29. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/config/__init__.py +0 -0
  30. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/config/logging.conf +0 -0
  31. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/config/settings.py +0 -0
  32. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/core/__init__.py +0 -0
  33. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/core/db_connection_pool.py +0 -0
  34. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/core/parallel_processor.py +0 -0
  35. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/core/refchecker.py +0 -0
  36. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/database/__init__.py +0 -0
  37. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/database/download_semantic_scholar_db.py +0 -0
  38. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/llm/__init__.py +0 -0
  39. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/llm/base.py +0 -0
  40. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/llm/providers.py +0 -0
  41. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/scripts/__init__.py +0 -0
  42. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/scripts/start_vllm_server.py +0 -0
  43. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/services/__init__.py +0 -0
  44. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/services/pdf_processor.py +0 -0
  45. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/__init__.py +0 -0
  46. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/arxiv_utils.py +0 -0
  47. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/author_utils.py +0 -0
  48. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/config_validator.py +0 -0
  49. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/db_utils.py +0 -0
  50. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/doi_utils.py +0 -0
  51. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/error_utils.py +0 -0
  52. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/mock_objects.py +0 -0
  53. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/unicode_utils.py +0 -0
  54. {academic_refchecker-1.2.34 → academic_refchecker-1.2.35}/src/utils/url_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.34
+Version: 1.2.35
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
src/__version__.py
@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
 
-__version__ = "1.2.34"
+__version__ = "1.2.35"
src/academic_refchecker.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.34
+Version: 1.2.35
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
src/academic_refchecker.egg-info/SOURCES.txt
@@ -20,6 +20,7 @@ src/checkers/enhanced_hybrid_checker.py
 src/checkers/github_checker.py
 src/checkers/local_semantic_scholar.py
 src/checkers/openalex.py
+src/checkers/openreview_checker.py
 src/checkers/semantic_scholar.py
 src/checkers/webpage_checker.py
 src/config/__init__.py
src/checkers/enhanced_hybrid_checker.py
@@ -98,6 +98,16 @@ class EnhancedHybridReferenceChecker:
         except Exception as e:
             logger.warning(f"Enhanced Hybrid: Failed to initialize CrossRef: {e}")
 
+        # Initialize OpenReview checker
+        self.openreview = None
+        try:
+            from .openreview_checker import OpenReviewReferenceChecker
+            self.openreview = OpenReviewReferenceChecker()
+            logger.debug("Enhanced Hybrid: OpenReview checker initialized")
+        except Exception as e:
+            logger.warning(f"Enhanced Hybrid: Failed to initialize OpenReview: {e}")
+            self.openreview = None
+
         # Google Scholar removed - using more reliable APIs only
 
         # Track API performance for adaptive selection
@@ -105,7 +115,8 @@ class EnhancedHybridReferenceChecker:
             'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
-            'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
+            'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
+            'openreview': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
         }
 
         # Track failed API calls for retry logic - OPTIMIZED CONFIGURATION
@@ -297,7 +308,17 @@ class EnhancedHybridReferenceChecker:
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('openalex', self.openalex, failure_type))
 
-        # Strategy 5: Try CrossRef if we haven't already (for non-DOI references)
+        # Strategy 5: Try OpenReview if URL suggests it's an OpenReview paper
+        if (self.openreview and
+                hasattr(self.openreview, 'is_openreview_reference') and
+                self.openreview.is_openreview_reference(reference)):
+            verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
+            if success:
+                return verified_data, errors, url
+            if failure_type in ['throttled', 'timeout', 'server_error']:
+                failed_apis.append(('openreview', self.openreview, failure_type))
+
+        # Strategy 6: Try CrossRef if we haven't already (for non-DOI references)
         if not self._should_try_doi_apis_first(reference) and self.crossref:
            verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
            if success:
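The gating above means Strategy 5 only fires for references that actually carry an openreview.net link; everything else falls through to CrossRef exactly as before. A minimal sketch of the effect, assuming the package's checkers module is importable (the reference dicts are hypothetical):

    from checkers.openreview_checker import OpenReviewReferenceChecker

    checker = OpenReviewReferenceChecker()

    # Carries an OpenReview URL, so Strategy 5 applies
    ref_a = {'title': 'Some ICLR paper',
             'url': 'https://openreview.net/forum?id=ZG3RaNIsO8'}
    # No OpenReview URL, so the chain falls through to Strategy 6 (CrossRef)
    ref_b = {'title': 'Some journal article',
             'url': 'https://doi.org/10.1000/xyz'}

    print(checker.is_openreview_reference(ref_a))  # True
    print(checker.is_openreview_reference(ref_b))  # False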
src/checkers/openreview_checker.py
@@ -0,0 +1,512 @@
+#!/usr/bin/env python3
+"""
+OpenReview API Client for Reference Verification
+
+This module provides functionality to verify references from OpenReview papers.
+OpenReview is a platform for open peer review in machine learning conferences
+like ICLR, NeurIPS, ICML, etc.
+
+Usage:
+    from openreview_checker import OpenReviewReferenceChecker
+
+    # Initialize the checker
+    checker = OpenReviewReferenceChecker()
+
+    # Verify a reference
+    reference = {
+        'title': 'Title of the paper',
+        'authors': ['Author 1', 'Author 2'],
+        'year': 2024,
+        'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
+        'raw_text': 'Full citation text'
+    }
+
+    verified_data, errors, url = checker.verify_reference(reference)
+"""
+
+import requests
+import time
+import logging
+import re
+import json
+from typing import Dict, List, Tuple, Optional, Any, Union
+from urllib.parse import urlparse, parse_qs
+from bs4 import BeautifulSoup
+from utils.text_utils import (
+    normalize_text, clean_title_basic, is_name_match,
+    calculate_title_similarity, compare_authors,
+    clean_title_for_search, are_venues_substantially_different,
+    is_year_substantially_different
+)
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+class OpenReviewReferenceChecker:
+    """
+    A class to verify references using OpenReview
+    """
+
+    def __init__(self, request_delay: float = 1.0):
+        """
+        Initialize the OpenReview client
+
+        Args:
+            request_delay: Delay between requests to be respectful to OpenReview servers
+        """
+        self.base_url = "https://openreview.net"
+        self.api_url = "https://api.openreview.net"
+        self.request_delay = request_delay
+        self.last_request_time = 0
+
+        # Session for connection pooling
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'RefChecker/1.0 (Academic Reference Verification)',
+            'Accept': 'application/json, text/html',
+            'Accept-Language': 'en-US,en;q=0.9'
+        })
+
+    def is_openreview_url(self, url: str) -> bool:
+        """
+        Check if URL is from OpenReview
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if it's an OpenReview URL
+        """
+        return bool(url and 'openreview.net' in url.lower())
+
+    def is_openreview_reference(self, reference: Dict[str, Any]) -> bool:
+        """
+        Determine if this reference is from OpenReview based on URL patterns
+
+        Args:
+            reference: Reference dictionary to check
+
+        Returns:
+            True if reference appears to be from OpenReview
+        """
+        # Check various URL fields for OpenReview URLs
+        url_fields = ['url', 'openreview_url', 'link', 'venue_url']
+        for field in url_fields:
+            url = reference.get(field, '')
+            if url and self.is_openreview_url(url):
+                return True
+
+        # Check raw text for OpenReview URLs
+        raw_text = reference.get('raw_text', '')
+        if raw_text and 'openreview.net' in raw_text.lower():
+            return True
+
+        return False
+
+    def extract_paper_id(self, url: str) -> Optional[str]:
+        """
+        Extract paper ID from OpenReview URL
+
+        Args:
+            url: OpenReview URL
+
+        Returns:
+            Paper ID if found, None otherwise
+        """
+        if not self.is_openreview_url(url):
+            return None
+
+        # Handle different OpenReview URL formats:
+        # https://openreview.net/forum?id=ZG3RaNIsO8
+        # https://openreview.net/pdf?id=ZG3RaNIsO8
+        # https://openreview.net/forum?id=ZG3RaNIsO8&noteId=...
+
+        parsed = urlparse(url)
+        query_params = parse_qs(parsed.query)
+
+        if 'id' in query_params:
+            return query_params['id'][0]
+
+        # Also check path-based URLs (if they exist)
+        path_match = re.search(r'/(?:forum|pdf|notes)/([A-Za-z0-9_-]+)', parsed.path)
+        if path_match:
+            return path_match.group(1)
+
+        return None
+
+    def _respectful_request(self, url: str, **kwargs) -> Optional[requests.Response]:
+        """Make a respectful HTTP request with rate limiting"""
+        current_time = time.time()
+        time_since_last = current_time - self.last_request_time
+
+        if time_since_last < self.request_delay:
+            time.sleep(self.request_delay - time_since_last)
+
+        try:
+            logger.debug(f"Making request to: {url}")
+            response = self.session.get(url, timeout=15, **kwargs)
+            self.last_request_time = time.time()
+            logger.debug(f"Request successful: {response.status_code}")
+            return response
+        except requests.exceptions.RequestException as e:
+            logger.debug(f"Request failed for {url}: {type(e).__name__}: {e}")
+            return None
+
+    def get_paper_metadata(self, paper_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get paper metadata from OpenReview
+
+        Args:
+            paper_id: OpenReview paper ID
+
+        Returns:
+            Paper metadata dictionary or None if not found
+        """
+        # Try API endpoint first
+        api_url = f"{self.api_url}/notes?id={paper_id}"
+        response = self._respectful_request(api_url)
+
+        if response and response.status_code == 200:
+            try:
+                data = response.json()
+                if 'notes' in data and data['notes']:
+                    note = data['notes'][0]
+                    return self._parse_api_response(note)
+            except (json.JSONDecodeError, KeyError) as e:
+                logger.debug(f"Failed to parse API response: {e}")
+
+        # Fall back to web scraping
+        forum_url = f"{self.base_url}/forum?id={paper_id}"
+        response = self._respectful_request(forum_url)
+
+        if not response or response.status_code != 200:
+            return None
+
+        return self._parse_web_page(response.text, forum_url)
+
+    def _parse_api_response(self, note: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Parse OpenReview API response to extract metadata
+
+        Args:
+            note: Note data from API response
+
+        Returns:
+            Parsed metadata dictionary
+        """
+        content = note.get('content', {})
+
+        # Extract basic metadata
+        metadata = {
+            'id': note.get('id'),
+            'title': content.get('title', '').strip(),
+            'authors': [],
+            'year': None,
+            'venue': None,
+            'abstract': content.get('abstract', '').strip(),
+            'keywords': content.get('keywords', []),
+            'pdf_url': content.get('pdf'),
+            'forum_url': f"{self.base_url}/forum?id={note.get('id')}",
+            'source': 'openreview_api'
+        }
+
+        # Parse authors
+        authors_raw = content.get('authors', [])
+        if isinstance(authors_raw, list):
+            metadata['authors'] = [author.strip() for author in authors_raw if author.strip()]
+        elif isinstance(authors_raw, str):
+            # Sometimes authors are in a single string
+            metadata['authors'] = [author.strip() for author in authors_raw.split(',') if author.strip()]
+
+        # Extract year from various sources
+        # Check creation time
+        if 'cdate' in note:
+            try:
+                import datetime
+                timestamp = note['cdate'] / 1000.0  # Convert from milliseconds
+                year = datetime.datetime.fromtimestamp(timestamp).year
+                metadata['year'] = year
+            except (ValueError, TypeError):
+                pass
+
+        # Check if venue/conference info is available
+        venue_info = content.get('venue', '')
+        if venue_info:
+            metadata['venue'] = venue_info.strip()
+
+        # Try to extract venue from forum context or submission info
+        if not metadata['venue']:
+            # Common venues for OpenReview
+            forum_path = note.get('forum', '')
+            if 'ICLR' in str(content) or 'iclr' in forum_path.lower():
+                metadata['venue'] = 'ICLR'
+            elif 'NeurIPS' in str(content) or 'neurips' in forum_path.lower():
+                metadata['venue'] = 'NeurIPS'
+            elif 'ICML' in str(content) or 'icml' in forum_path.lower():
+                metadata['venue'] = 'ICML'
+
+        return metadata
+
+    def _parse_web_page(self, html: str, url: str) -> Dict[str, Any]:
+        """
+        Parse OpenReview web page to extract metadata
+
+        Args:
+            html: HTML content of the page
+            url: Original URL
+
+        Returns:
+            Parsed metadata dictionary
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Extract paper ID from URL
+        paper_id = self.extract_paper_id(url)
+
+        metadata = {
+            'id': paper_id,
+            'title': '',
+            'authors': [],
+            'year': None,
+            'venue': None,
+            'abstract': '',
+            'keywords': [],
+            'forum_url': url,
+            'source': 'openreview_web'
+        }
+
+        # Extract title
+        title_elem = soup.find('h2', {'class': 'citation_title'}) or soup.find('h1')
+        if title_elem:
+            metadata['title'] = title_elem.get_text().strip()
+
+        # Try to find title in meta tags
+        if not metadata['title']:
+            meta_title = soup.find('meta', {'property': 'og:title'}) or soup.find('meta', {'name': 'title'})
+            if meta_title and meta_title.get('content'):
+                metadata['title'] = meta_title['content'].strip()
+
+        # Extract authors from meta tags (most reliable for OpenReview)
+        author_metas = soup.find_all('meta', {'name': 'citation_author'})
+        if author_metas:
+            metadata['authors'] = [meta.get('content', '').strip() for meta in author_metas if meta.get('content', '').strip()]
+
+        # Fallback: try to find authors in HTML structure
+        if not metadata['authors']:
+            authors_section = soup.find('div', {'class': 'authors'}) or soup.find('span', {'class': 'authors'})
+            if authors_section:
+                # Extract author names from links or text
+                author_links = authors_section.find_all('a')
+                if author_links:
+                    metadata['authors'] = [link.get_text().strip() for link in author_links]
+                else:
+                    # Parse comma-separated authors
+                    authors_text = authors_section.get_text().strip()
+                    metadata['authors'] = [author.strip() for author in authors_text.split(',') if author.strip()]
+
+        # Extract year from various sources
+        year_pattern = r'\b(20\d{2})\b'
+
+        # Check date/year elements
+        date_elem = soup.find('span', {'class': 'date'}) or soup.find('time')
+        if date_elem:
+            year_match = re.search(year_pattern, date_elem.get_text())
+            if year_match:
+                metadata['year'] = int(year_match.group(1))
+
+        # Check meta tags for date
+        if not metadata['year']:
+            meta_date = soup.find('meta', {'name': 'citation_date'}) or soup.find('meta', {'name': 'date'})
+            if meta_date and meta_date.get('content'):
+                year_match = re.search(year_pattern, meta_date['content'])
+                if year_match:
+                    metadata['year'] = int(year_match.group(1))
+
+        # Extract abstract
+        abstract_elem = soup.find('div', {'class': 'abstract'}) or soup.find('section', {'class': 'abstract'})
+        if abstract_elem:
+            metadata['abstract'] = abstract_elem.get_text().strip()
+
+        # Extract venue information from meta tags (most reliable for OpenReview)
+        venue_meta = soup.find('meta', {'name': 'citation_conference_title'})
+        if venue_meta and venue_meta.get('content'):
+            venue_full = venue_meta['content'].strip()
+            # Convert long conference names to common abbreviations
+            if 'International Conference on Learning Representations' in venue_full:
+                # Extract year if present
+                year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                if year_match:
+                    metadata['venue'] = f'ICLR {year_match.group(1)}'
+                else:
+                    metadata['venue'] = 'ICLR'
+            elif 'Neural Information Processing Systems' in venue_full or 'NeurIPS' in venue_full:
+                year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                if year_match:
+                    metadata['venue'] = f'NeurIPS {year_match.group(1)}'
+                else:
+                    metadata['venue'] = 'NeurIPS'
+            else:
+                metadata['venue'] = venue_full
+
+        # Fallback: try HTML structure
+        if not metadata['venue']:
+            venue_elem = soup.find('div', {'class': 'venue'}) or soup.find('span', {'class': 'venue'})
+            if venue_elem:
+                metadata['venue'] = venue_elem.get_text().strip()
+
+        # Final fallback: try to determine venue from page context or URL
+        if not metadata['venue']:
+            page_text = soup.get_text().lower()
+            if 'iclr' in page_text or 'iclr' in url.lower():
+                if '2024' in page_text:
+                    metadata['venue'] = 'ICLR 2024'
+                else:
+                    metadata['venue'] = 'ICLR'
+            elif 'neurips' in page_text or 'neurips' in url.lower():
+                metadata['venue'] = 'NeurIPS'
+            elif 'icml' in page_text or 'icml' in url.lower():
+                metadata['venue'] = 'ICML'
+
+        # Extract keywords if available
+        keywords_elem = soup.find('div', {'class': 'keywords'})
+        if keywords_elem:
+            keywords_text = keywords_elem.get_text()
+            metadata['keywords'] = [kw.strip() for kw in keywords_text.split(',') if kw.strip()]
+
+        return metadata
+
+    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference against OpenReview
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, paper_url) where:
+            - verified_data: Dict with verified OpenReview paper data or None
+            - errors: List of error/warning dictionaries
+            - paper_url: The OpenReview URL
+        """
+        logger.debug(f"Verifying OpenReview reference: {reference.get('title', 'Untitled')}")
+
+        # Extract OpenReview URL from reference
+        openreview_url = None
+        for url_key in ['url', 'openreview_url', 'link']:
+            if url_key in reference and reference[url_key]:
+                url = reference[url_key].strip()
+                if self.is_openreview_url(url):
+                    openreview_url = url
+                    break
+
+        if not openreview_url:
+            logger.debug("No OpenReview URL found in reference")
+            return None, [], None
+
+        # Extract paper ID
+        paper_id = self.extract_paper_id(openreview_url)
+        if not paper_id:
+            return None, [{"error_type": "unverified", "error_details": "Could not extract paper ID from OpenReview URL"}], openreview_url
+
+        # Get paper metadata
+        paper_data = self.get_paper_metadata(paper_id)
+        if not paper_data:
+            return None, [{"error_type": "unverified", "error_details": "Paper not found on OpenReview"}], openreview_url
+
+        logger.debug(f"Found OpenReview paper: {paper_data.get('title', 'Untitled')}")
+
+        # Verify the reference against the paper data
+        errors = []
+
+        # Check title match
+        cited_title = reference.get('title', '').strip()
+        paper_title = paper_data.get('title', '').strip()
+
+        if cited_title and paper_title:
+            similarity = calculate_title_similarity(cited_title, paper_title)
+            if similarity < 0.7:  # Using a reasonable threshold
+                errors.append({
+                    "warning_type": "title",
+                    "warning_details": f"Title mismatch: cited as '{cited_title}' but OpenReview shows '{paper_title}' (similarity: {similarity:.2f})"
+                })
+
+        # Check authors
+        cited_authors = reference.get('authors', [])
+        paper_authors = paper_data.get('authors', [])
+
+        if cited_authors and paper_authors:
+            # Convert to list format if needed
+            if isinstance(cited_authors, str):
+                cited_authors = [author.strip() for author in cited_authors.split(',')]
+            if isinstance(paper_authors, str):
+                paper_authors = [author.strip() for author in paper_authors.split(',')]
+
+            # Use the existing author comparison function
+            match, error_msg = compare_authors(cited_authors, paper_authors)
+            if not match and error_msg:
+                errors.append({
+                    "warning_type": "author",
+                    "warning_details": error_msg
+                })
+
+        # Check year
+        cited_year = reference.get('year')
+        paper_year = paper_data.get('year')
+
+        if cited_year and paper_year:
+            try:
+                cited_year_int = int(cited_year)
+                paper_year_int = int(paper_year)
+
+                is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
+                if is_different and year_message:
+                    errors.append({
+                        "warning_type": "year",
+                        "warning_details": year_message
+                    })
+            except (ValueError, TypeError):
+                pass  # Skip year validation if conversion fails
+
+        # Check venue if provided in reference
+        cited_venue = reference.get('venue', '').strip()
+        paper_venue = paper_data.get('venue', '').strip()
+
+        if cited_venue and paper_venue:
+            if are_venues_substantially_different(cited_venue, paper_venue):
+                errors.append({
+                    "warning_type": "venue",
+                    "warning_details": f"Venue mismatch: cited as '{cited_venue}' but OpenReview shows '{paper_venue}'"
+                })
+
+        # Create verified data structure
+        verified_data = {
+            'title': paper_data.get('title', cited_title),
+            'authors': paper_data.get('authors', cited_authors),
+            'year': paper_data.get('year', cited_year),
+            'venue': paper_data.get('venue', cited_venue),
+            'url': openreview_url,
+            'abstract': paper_data.get('abstract', ''),
+            'keywords': paper_data.get('keywords', []),
+            'openreview_metadata': paper_data,
+            'verification_source': 'OpenReview'
+        }
+
+        logger.debug(f"OpenReview verification completed for: {openreview_url}")
+        return verified_data, errors, openreview_url
+
+    def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+        """
+        Search for papers on OpenReview by title, authors, and/or year
+
+        Args:
+            title: Paper title to search for
+            authors: List of author names (optional)
+            year: Publication year (optional)
+
+        Returns:
+            List of matching paper metadata dictionaries
+        """
+        # This would implement search functionality if needed
+        # For now, OpenReview verification is primarily URL-based
+        logger.debug(f"Search functionality not yet implemented for OpenReview")
+        return []
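The new checker is deliberately URL-gated: verify_reference first queries the JSON API (api.openreview.net/notes?id=...) and only falls back to scraping the forum page when the API yields nothing. A short sketch of the ID extraction it relies on, with hypothetical URLs:

    from checkers.openreview_checker import OpenReviewReferenceChecker

    checker = OpenReviewReferenceChecker()
    print(checker.extract_paper_id('https://openreview.net/forum?id=ZG3RaNIsO8'))  # 'ZG3RaNIsO8'
    print(checker.extract_paper_id('https://openreview.net/pdf?id=ZG3RaNIsO8'))    # 'ZG3RaNIsO8'
    print(checker.extract_paper_id('https://example.com/forum?id=abc'))            # None (not OpenReview)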
src/utils/text_utils.py
@@ -81,6 +81,11 @@ def parse_authors_with_initials(authors_text):
     # Import regex at function level to avoid import issues
     import re
 
+    # Handle standalone "others" or "et al" cases that should return empty list
+    stripped_text = authors_text.strip().lower()
+    if stripped_text in ['others', 'and others', 'et al', 'et al.']:
+        return []
+
     # Fix spacing around periods in initials (e.g., "Y . Li" -> "Y. Li") before parsing
     authors_text = re.sub(r'(\w)\s+\.', r'\1.', authors_text)
 
@@ -94,10 +99,16 @@ def parse_authors_with_initials(authors_text):
     valid_names = []
     for part in and_parts:
         part = part.strip()
-        if part and (len(part.split()) >= 2 or re.search(r'[A-Z]\.', part)):
+        # Check for et al indicators first
+        if part.lower() in ['others', 'et al', 'et al.', 'and others']:
+            # Add et al if we have real authors, then stop
+            if valid_names:
+                valid_names.append("et al")
+            break
+        elif part and (len(part.split()) >= 2 or re.search(r'[A-Z]\.', part)):
             valid_names.append(part)
 
-    if len(valid_names) == len(and_parts):  # All parts look like valid names
+    if valid_names:  # Return if we found any valid names (including et al handling)
         return valid_names
 
     # Case 2: "Lastname, Firstname and Lastname, Firstname" format (BibTeX format)
@@ -112,9 +123,11 @@ def parse_authors_with_initials(authors_text):
         # Handle special cases without commas
         if comma_count == 0:
             # Check if this is "others", "et al", or similar
-            if part.lower() in ['others', 'et al', 'et al.']:
-                # Skip these entirely - they're not real author names
-                continue
+            if part.lower() in ['others', 'et al', 'et al.', 'and others']:
+                # Convert to standard "et al" and add it, then stop processing
+                if valid_author_parts:  # Only add if we have real authors
+                    valid_author_parts.append("et al")
+                break  # Stop processing after et al indicator
             else:
                 # This might be a name without lastname, firstname format
                 # For now, skip to be safe unless it's clearly a single name
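Taken together, the three hunks above change how "et al"/"others" markers are handled: an author string that is nothing but such a marker now yields no authors at all, while a marker following real names is normalized to a literal "et al" entry and ends parsing. A sketch of the expected behavior, assuming Case 1 splits the author string on " and " (inputs are hypothetical):

    from utils.text_utils import parse_authors_with_initials

    print(parse_authors_with_initials('et al.'))
    # []  (standalone marker, no real authors)
    print(parse_authors_with_initials('J. Smith and A. Jones and others'))
    # ['J. Smith', 'A. Jones', 'et al']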
@@ -3865,6 +3878,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     word_roots = {
         'robot': 'robotics', 'robotics': 'robot',
         'sci': 'science', 'science': 'sci',
+        'science': 'sciences', 'sciences': 'science',  # Handle singular/plural
         'adv': 'advanced', 'advanced': 'adv',
         'intell': 'intelligent', 'intelligent': 'intell',
         'syst': 'systems', 'systems': 'syst',
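One thing worth noting about this hunk: word_roots is a Python dict literal, and the preceding context line already defines a 'science' key ('science': 'sci'), so the later 'science': 'sciences' entry silently wins. The intended effect is that singular/plural venue spellings compare as equivalent; a hedged example of that intent (the exact return value depends on the rest of the function, which the diff does not show):

    from utils.text_utils import are_venues_substantially_different

    # Expected to be treated as the same venue after this change
    print(are_venues_substantially_different('Journal of Materials Science',
                                             'Journal of Materials Sciences'))  # False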