academic-refchecker 1.2.34__py3-none-any.whl → 1.2.35__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- __version__.py +1 -1
- {academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/RECORD +10 -9
- checkers/enhanced_hybrid_checker.py +23 -2
- checkers/openreview_checker.py +512 -0
- utils/text_utils.py +19 -5
- {academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED

{academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/RECORD
RENAMED
@@ -1,11 +1,12 @@
-__version__.py,sha256=
-academic_refchecker-1.2.
+__version__.py,sha256=uj_o09nsXqyl0HrS9JiFstvRwB4CAFwQuTgnfqbNKdg,65
+academic_refchecker-1.2.35.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
-checkers/enhanced_hybrid_checker.py,sha256=
+checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
 checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
 checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
 checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
+checkers/openreview_checker.py,sha256=QRQXUk1Ws-e-wETSeLgq06WmHQrjUk17my_Zj4rrwmY,20303
 checkers/semantic_scholar.py,sha256=YHR9nWaT7aieyczVMRKCPHr3k_Hl8g1rzd0k4f3bDTs,35022
 checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
 config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
@@ -32,11 +33,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
 utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
 utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=
+utils/text_utils.py,sha256=SbuzUQD8430z7Ll1_4aTilVzwknh1O4N8LeSAx5yF-M,177904
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 utils/url_utils.py,sha256=qoimCrMFCBGvlmF_t1c6zSOmkWi_rUm-gZM0XZ4rEVE,6291
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
+academic_refchecker-1.2.35.dist-info/METADATA,sha256=W8YaWup9_0p1c24SOEJP-AHSG9pcGBrXFugNfn5TR0g,22298
+academic_refchecker-1.2.35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.35.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.35.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.35.dist-info/RECORD,,
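Each RECORD entry has the form path,sha256=<digest>,<size>, where the digest is the file's SHA-256 hash encoded as urlsafe base64 without padding (per PEP 376/427). A minimal sketch for recomputing an entry locally; the record_hash helper is illustrative, not part of the package:

import base64
import hashlib

def record_hash(path: str) -> str:
    """Return a RECORD-style hash string for a file: sha256=<urlsafe-b64, no padding>."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# For an unpacked 1.2.35 wheel, record_hash("checkers/openreview_checker.py")
# should reproduce the QRQXUk1Ws-... digest recorded above.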
checkers/enhanced_hybrid_checker.py
CHANGED
@@ -98,6 +98,16 @@ class EnhancedHybridReferenceChecker:
         except Exception as e:
             logger.warning(f"Enhanced Hybrid: Failed to initialize CrossRef: {e}")

+        # Initialize OpenReview checker
+        self.openreview = None
+        try:
+            from .openreview_checker import OpenReviewReferenceChecker
+            self.openreview = OpenReviewReferenceChecker()
+            logger.debug("Enhanced Hybrid: OpenReview checker initialized")
+        except Exception as e:
+            logger.warning(f"Enhanced Hybrid: Failed to initialize OpenReview: {e}")
+            self.openreview = None
+
         # Google Scholar removed - using more reliable APIs only

         # Track API performance for adaptive selection
@@ -105,7 +115,8 @@ class EnhancedHybridReferenceChecker:
             'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
-            'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
+            'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
+            'openreview': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
         }

         # Track failed API calls for retry logic - OPTIMIZED CONFIGURATION
@@ -297,7 +308,17 @@ class EnhancedHybridReferenceChecker:
         if failure_type in ['throttled', 'timeout', 'server_error']:
             failed_apis.append(('openalex', self.openalex, failure_type))

-        # Strategy 5: Try CrossRef if we haven't already (for non-DOI references)
+        # Strategy 5: Try OpenReview if URL suggests it's an OpenReview paper
+        if (self.openreview and
+            hasattr(self.openreview, 'is_openreview_reference') and
+            self.openreview.is_openreview_reference(reference)):
+            verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
+            if success:
+                return verified_data, errors, url
+            if failure_type in ['throttled', 'timeout', 'server_error']:
+                failed_apis.append(('openreview', self.openreview, failure_type))
+
+        # Strategy 6: Try CrossRef if we haven't already (for non-DOI references)
         if not self._should_try_doi_apis_first(reference) and self.crossref:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
             if success:
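Taken together, these hunks slot OpenReview in as Strategy 5 of the fallback chain, gated on the reference actually carrying an openreview.net URL, and renumber CrossRef to Strategy 6. A sketch of that gate in isolation, assuming the package is importable as checkers (as the RECORD paths suggest); the dict shape follows the new module's docstring, and calling the checker directly like this bypasses the hybrid chain's retry bookkeeping:

from checkers.openreview_checker import OpenReviewReferenceChecker

checker = OpenReviewReferenceChecker()
reference = {
    'title': 'Some ICLR paper',
    'authors': ['A. Author', 'B. Author'],
    'year': 2024,
    'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
}

# Strategy 5 only fires when this URL-based test passes:
if checker.is_openreview_reference(reference):
    verified_data, errors, url = checker.verify_reference(reference)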
checkers/openreview_checker.py
ADDED
@@ -0,0 +1,512 @@
+#!/usr/bin/env python3
+"""
+OpenReview API Client for Reference Verification
+
+This module provides functionality to verify references from OpenReview papers.
+OpenReview is a platform for open peer review in machine learning conferences
+like ICLR, NeurIPS, ICML, etc.
+
+Usage:
+    from openreview_checker import OpenReviewReferenceChecker
+
+    # Initialize the checker
+    checker = OpenReviewReferenceChecker()
+
+    # Verify a reference
+    reference = {
+        'title': 'Title of the paper',
+        'authors': ['Author 1', 'Author 2'],
+        'year': 2024,
+        'url': 'https://openreview.net/forum?id=ZG3RaNIsO8',
+        'raw_text': 'Full citation text'
+    }
+
+    verified_data, errors, url = checker.verify_reference(reference)
+"""
+
+import requests
+import time
+import logging
+import re
+import json
+from typing import Dict, List, Tuple, Optional, Any, Union
+from urllib.parse import urlparse, parse_qs
+from bs4 import BeautifulSoup
+from utils.text_utils import (
+    normalize_text, clean_title_basic, is_name_match,
+    calculate_title_similarity, compare_authors,
+    clean_title_for_search, are_venues_substantially_different,
+    is_year_substantially_different
+)
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+class OpenReviewReferenceChecker:
+    """
+    A class to verify references using OpenReview
+    """
+
+    def __init__(self, request_delay: float = 1.0):
+        """
+        Initialize the OpenReview client
+
+        Args:
+            request_delay: Delay between requests to be respectful to OpenReview servers
+        """
+        self.base_url = "https://openreview.net"
+        self.api_url = "https://api.openreview.net"
+        self.request_delay = request_delay
+        self.last_request_time = 0
+
+        # Session for connection pooling
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'RefChecker/1.0 (Academic Reference Verification)',
+            'Accept': 'application/json, text/html',
+            'Accept-Language': 'en-US,en;q=0.9'
+        })
+
+    def is_openreview_url(self, url: str) -> bool:
+        """
+        Check if URL is from OpenReview
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if it's an OpenReview URL
+        """
+        return bool(url and 'openreview.net' in url.lower())
+
+    def is_openreview_reference(self, reference: Dict[str, Any]) -> bool:
+        """
+        Determine if this reference is from OpenReview based on URL patterns
+
+        Args:
+            reference: Reference dictionary to check
+
+        Returns:
+            True if reference appears to be from OpenReview
+        """
+        # Check various URL fields for OpenReview URLs
+        url_fields = ['url', 'openreview_url', 'link', 'venue_url']
+        for field in url_fields:
+            url = reference.get(field, '')
+            if url and self.is_openreview_url(url):
+                return True
+
+        # Check raw text for OpenReview URLs
+        raw_text = reference.get('raw_text', '')
+        if raw_text and 'openreview.net' in raw_text.lower():
+            return True
+
+        return False
+
+    def extract_paper_id(self, url: str) -> Optional[str]:
+        """
+        Extract paper ID from OpenReview URL
+
+        Args:
+            url: OpenReview URL
+
+        Returns:
+            Paper ID if found, None otherwise
+        """
+        if not self.is_openreview_url(url):
+            return None
+
+        # Handle different OpenReview URL formats:
+        # https://openreview.net/forum?id=ZG3RaNIsO8
+        # https://openreview.net/pdf?id=ZG3RaNIsO8
+        # https://openreview.net/forum?id=ZG3RaNIsO8&noteId=...
+
+        parsed = urlparse(url)
+        query_params = parse_qs(parsed.query)
+
+        if 'id' in query_params:
+            return query_params['id'][0]
+
+        # Also check path-based URLs (if they exist)
+        path_match = re.search(r'/(?:forum|pdf|notes)/([A-Za-z0-9_-]+)', parsed.path)
+        if path_match:
+            return path_match.group(1)
+
+        return None
+
+    def _respectful_request(self, url: str, **kwargs) -> Optional[requests.Response]:
+        """Make a respectful HTTP request with rate limiting"""
+        current_time = time.time()
+        time_since_last = current_time - self.last_request_time
+
+        if time_since_last < self.request_delay:
+            time.sleep(self.request_delay - time_since_last)
+
+        try:
+            logger.debug(f"Making request to: {url}")
+            response = self.session.get(url, timeout=15, **kwargs)
+            self.last_request_time = time.time()
+            logger.debug(f"Request successful: {response.status_code}")
+            return response
+        except requests.exceptions.RequestException as e:
+            logger.debug(f"Request failed for {url}: {type(e).__name__}: {e}")
+            return None
+
+    def get_paper_metadata(self, paper_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get paper metadata from OpenReview
+
+        Args:
+            paper_id: OpenReview paper ID
+
+        Returns:
+            Paper metadata dictionary or None if not found
+        """
+        # Try API endpoint first
+        api_url = f"{self.api_url}/notes?id={paper_id}"
+        response = self._respectful_request(api_url)
+
+        if response and response.status_code == 200:
+            try:
+                data = response.json()
+                if 'notes' in data and data['notes']:
+                    note = data['notes'][0]
+                    return self._parse_api_response(note)
+            except (json.JSONDecodeError, KeyError) as e:
+                logger.debug(f"Failed to parse API response: {e}")
+
+        # Fall back to web scraping
+        forum_url = f"{self.base_url}/forum?id={paper_id}"
+        response = self._respectful_request(forum_url)
+
+        if not response or response.status_code != 200:
+            return None
+
+        return self._parse_web_page(response.text, forum_url)
+
+    def _parse_api_response(self, note: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Parse OpenReview API response to extract metadata
+
+        Args:
+            note: Note data from API response
+
+        Returns:
+            Parsed metadata dictionary
+        """
+        content = note.get('content', {})
+
+        # Extract basic metadata
+        metadata = {
+            'id': note.get('id'),
+            'title': content.get('title', '').strip(),
+            'authors': [],
+            'year': None,
+            'venue': None,
+            'abstract': content.get('abstract', '').strip(),
+            'keywords': content.get('keywords', []),
+            'pdf_url': content.get('pdf'),
+            'forum_url': f"{self.base_url}/forum?id={note.get('id')}",
+            'source': 'openreview_api'
+        }
+
+        # Parse authors
+        authors_raw = content.get('authors', [])
+        if isinstance(authors_raw, list):
+            metadata['authors'] = [author.strip() for author in authors_raw if author.strip()]
+        elif isinstance(authors_raw, str):
+            # Sometimes authors are in a single string
+            metadata['authors'] = [author.strip() for author in authors_raw.split(',') if author.strip()]
+
+        # Extract year from various sources
+        # Check creation time
+        if 'cdate' in note:
+            try:
+                import datetime
+                timestamp = note['cdate'] / 1000.0  # Convert from milliseconds
+                year = datetime.datetime.fromtimestamp(timestamp).year
+                metadata['year'] = year
+            except (ValueError, TypeError):
+                pass
+
+        # Check if venue/conference info is available
+        venue_info = content.get('venue', '')
+        if venue_info:
+            metadata['venue'] = venue_info.strip()
+
+        # Try to extract venue from forum context or submission info
+        if not metadata['venue']:
+            # Common venues for OpenReview
+            forum_path = note.get('forum', '')
+            if 'ICLR' in str(content) or 'iclr' in forum_path.lower():
+                metadata['venue'] = 'ICLR'
+            elif 'NeurIPS' in str(content) or 'neurips' in forum_path.lower():
+                metadata['venue'] = 'NeurIPS'
+            elif 'ICML' in str(content) or 'icml' in forum_path.lower():
+                metadata['venue'] = 'ICML'
+
+        return metadata
+
+    def _parse_web_page(self, html: str, url: str) -> Dict[str, Any]:
+        """
+        Parse OpenReview web page to extract metadata
+
+        Args:
+            html: HTML content of the page
+            url: Original URL
+
+        Returns:
+            Parsed metadata dictionary
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Extract paper ID from URL
+        paper_id = self.extract_paper_id(url)
+
+        metadata = {
+            'id': paper_id,
+            'title': '',
+            'authors': [],
+            'year': None,
+            'venue': None,
+            'abstract': '',
+            'keywords': [],
+            'forum_url': url,
+            'source': 'openreview_web'
+        }
+
+        # Extract title
+        title_elem = soup.find('h2', {'class': 'citation_title'}) or soup.find('h1')
+        if title_elem:
+            metadata['title'] = title_elem.get_text().strip()
+
+        # Try to find title in meta tags
+        if not metadata['title']:
+            meta_title = soup.find('meta', {'property': 'og:title'}) or soup.find('meta', {'name': 'title'})
+            if meta_title and meta_title.get('content'):
+                metadata['title'] = meta_title['content'].strip()
+
+        # Extract authors from meta tags (most reliable for OpenReview)
+        author_metas = soup.find_all('meta', {'name': 'citation_author'})
+        if author_metas:
+            metadata['authors'] = [meta.get('content', '').strip() for meta in author_metas if meta.get('content', '').strip()]
+
+        # Fallback: try to find authors in HTML structure
+        if not metadata['authors']:
+            authors_section = soup.find('div', {'class': 'authors'}) or soup.find('span', {'class': 'authors'})
+            if authors_section:
+                # Extract author names from links or text
+                author_links = authors_section.find_all('a')
+                if author_links:
+                    metadata['authors'] = [link.get_text().strip() for link in author_links]
+                else:
+                    # Parse comma-separated authors
+                    authors_text = authors_section.get_text().strip()
+                    metadata['authors'] = [author.strip() for author in authors_text.split(',') if author.strip()]
+
+        # Extract year from various sources
+        year_pattern = r'\b(20\d{2})\b'
+
+        # Check date/year elements
+        date_elem = soup.find('span', {'class': 'date'}) or soup.find('time')
+        if date_elem:
+            year_match = re.search(year_pattern, date_elem.get_text())
+            if year_match:
+                metadata['year'] = int(year_match.group(1))
+
+        # Check meta tags for date
+        if not metadata['year']:
+            meta_date = soup.find('meta', {'name': 'citation_date'}) or soup.find('meta', {'name': 'date'})
+            if meta_date and meta_date.get('content'):
+                year_match = re.search(year_pattern, meta_date['content'])
+                if year_match:
+                    metadata['year'] = int(year_match.group(1))
+
+        # Extract abstract
+        abstract_elem = soup.find('div', {'class': 'abstract'}) or soup.find('section', {'class': 'abstract'})
+        if abstract_elem:
+            metadata['abstract'] = abstract_elem.get_text().strip()
+
+        # Extract venue information from meta tags (most reliable for OpenReview)
+        venue_meta = soup.find('meta', {'name': 'citation_conference_title'})
+        if venue_meta and venue_meta.get('content'):
+            venue_full = venue_meta['content'].strip()
+            # Convert long conference names to common abbreviations
+            if 'International Conference on Learning Representations' in venue_full:
+                # Extract year if present
+                year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                if year_match:
+                    metadata['venue'] = f'ICLR {year_match.group(1)}'
+                else:
+                    metadata['venue'] = 'ICLR'
+            elif 'Neural Information Processing Systems' in venue_full or 'NeurIPS' in venue_full:
+                year_match = re.search(r'\b(20\d{2})\b', venue_full)
+                if year_match:
+                    metadata['venue'] = f'NeurIPS {year_match.group(1)}'
+                else:
+                    metadata['venue'] = 'NeurIPS'
+            else:
+                metadata['venue'] = venue_full
+
+        # Fallback: try HTML structure
+        if not metadata['venue']:
+            venue_elem = soup.find('div', {'class': 'venue'}) or soup.find('span', {'class': 'venue'})
+            if venue_elem:
+                metadata['venue'] = venue_elem.get_text().strip()
+
+        # Final fallback: try to determine venue from page context or URL
+        if not metadata['venue']:
+            page_text = soup.get_text().lower()
+            if 'iclr' in page_text or 'iclr' in url.lower():
+                if '2024' in page_text:
+                    metadata['venue'] = 'ICLR 2024'
+                else:
+                    metadata['venue'] = 'ICLR'
+            elif 'neurips' in page_text or 'neurips' in url.lower():
+                metadata['venue'] = 'NeurIPS'
+            elif 'icml' in page_text or 'icml' in url.lower():
+                metadata['venue'] = 'ICML'
+
+        # Extract keywords if available
+        keywords_elem = soup.find('div', {'class': 'keywords'})
+        if keywords_elem:
+            keywords_text = keywords_elem.get_text()
+            metadata['keywords'] = [kw.strip() for kw in keywords_text.split(',') if kw.strip()]
+
+        return metadata
+
+    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference against OpenReview
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, paper_url) where:
+            - verified_data: Dict with verified OpenReview paper data or None
+            - errors: List of error/warning dictionaries
+            - paper_url: The OpenReview URL
+        """
+        logger.debug(f"Verifying OpenReview reference: {reference.get('title', 'Untitled')}")
+
+        # Extract OpenReview URL from reference
+        openreview_url = None
+        for url_key in ['url', 'openreview_url', 'link']:
+            if url_key in reference and reference[url_key]:
+                url = reference[url_key].strip()
+                if self.is_openreview_url(url):
+                    openreview_url = url
+                    break
+
+        if not openreview_url:
+            logger.debug("No OpenReview URL found in reference")
+            return None, [], None
+
+        # Extract paper ID
+        paper_id = self.extract_paper_id(openreview_url)
+        if not paper_id:
+            return None, [{"error_type": "unverified", "error_details": "Could not extract paper ID from OpenReview URL"}], openreview_url
+
+        # Get paper metadata
+        paper_data = self.get_paper_metadata(paper_id)
+        if not paper_data:
+            return None, [{"error_type": "unverified", "error_details": "Paper not found on OpenReview"}], openreview_url
+
+        logger.debug(f"Found OpenReview paper: {paper_data.get('title', 'Untitled')}")
+
+        # Verify the reference against the paper data
+        errors = []
+
+        # Check title match
+        cited_title = reference.get('title', '').strip()
+        paper_title = paper_data.get('title', '').strip()
+
+        if cited_title and paper_title:
+            similarity = calculate_title_similarity(cited_title, paper_title)
+            if similarity < 0.7:  # Using a reasonable threshold
+                errors.append({
+                    "warning_type": "title",
+                    "warning_details": f"Title mismatch: cited as '{cited_title}' but OpenReview shows '{paper_title}' (similarity: {similarity:.2f})"
+                })
+
+        # Check authors
+        cited_authors = reference.get('authors', [])
+        paper_authors = paper_data.get('authors', [])
+
+        if cited_authors and paper_authors:
+            # Convert to list format if needed
+            if isinstance(cited_authors, str):
+                cited_authors = [author.strip() for author in cited_authors.split(',')]
+            if isinstance(paper_authors, str):
+                paper_authors = [author.strip() for author in paper_authors.split(',')]
+
+            # Use the existing author comparison function
+            match, error_msg = compare_authors(cited_authors, paper_authors)
+            if not match and error_msg:
+                errors.append({
+                    "warning_type": "author",
+                    "warning_details": error_msg
+                })
+
+        # Check year
+        cited_year = reference.get('year')
+        paper_year = paper_data.get('year')
+
+        if cited_year and paper_year:
+            try:
+                cited_year_int = int(cited_year)
+                paper_year_int = int(paper_year)
+
+                is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
+                if is_different and year_message:
+                    errors.append({
+                        "warning_type": "year",
+                        "warning_details": year_message
+                    })
+            except (ValueError, TypeError):
+                pass  # Skip year validation if conversion fails
+
+        # Check venue if provided in reference
+        cited_venue = reference.get('venue', '').strip()
+        paper_venue = paper_data.get('venue', '').strip()
+
+        if cited_venue and paper_venue:
+            if are_venues_substantially_different(cited_venue, paper_venue):
+                errors.append({
+                    "warning_type": "venue",
+                    "warning_details": f"Venue mismatch: cited as '{cited_venue}' but OpenReview shows '{paper_venue}'"
+                })
+
+        # Create verified data structure
+        verified_data = {
+            'title': paper_data.get('title', cited_title),
+            'authors': paper_data.get('authors', cited_authors),
+            'year': paper_data.get('year', cited_year),
+            'venue': paper_data.get('venue', cited_venue),
+            'url': openreview_url,
+            'abstract': paper_data.get('abstract', ''),
+            'keywords': paper_data.get('keywords', []),
+            'openreview_metadata': paper_data,
+            'verification_source': 'OpenReview'
+        }
+
+        logger.debug(f"OpenReview verification completed for: {openreview_url}")
+        return verified_data, errors, openreview_url
+
+    def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+        """
+        Search for papers on OpenReview by title, authors, and/or year
+
+        Args:
+            title: Paper title to search for
+            authors: List of author names (optional)
+            year: Publication year (optional)
+
+        Returns:
+            List of matching paper metadata dictionaries
+        """
+        # This would implement search functionality if needed
+        # For now, OpenReview verification is primarily URL-based
+        logger.debug(f"Search functionality not yet implemented for OpenReview")
+        return []
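The URL handling above accepts both query-string and path-based forms. A small sanity sketch; the &noteId variant follows the comment in extract_paper_id, the other inputs are illustrative, and the expected return values are our reading of the code:

checker = OpenReviewReferenceChecker(request_delay=1.0)

checker.extract_paper_id('https://openreview.net/forum?id=ZG3RaNIsO8')           # 'ZG3RaNIsO8'
checker.extract_paper_id('https://openreview.net/pdf?id=ZG3RaNIsO8&noteId=abc')  # 'ZG3RaNIsO8'
checker.extract_paper_id('https://openreview.net/about')                         # None (no id)
checker.extract_paper_id('https://arxiv.org/abs/2106.01234')                     # None (not OpenReview)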
utils/text_utils.py
CHANGED
@@ -81,6 +81,11 @@ def parse_authors_with_initials(authors_text):
     # Import regex at function level to avoid import issues
     import re

+    # Handle standalone "others" or "et al" cases that should return empty list
+    stripped_text = authors_text.strip().lower()
+    if stripped_text in ['others', 'and others', 'et al', 'et al.']:
+        return []
+
     # Fix spacing around periods in initials (e.g., "Y . Li" -> "Y. Li") before parsing
     authors_text = re.sub(r'(\w)\s+\.', r'\1.', authors_text)

@@ -94,10 +99,16 @@ def parse_authors_with_initials(authors_text):
         valid_names = []
         for part in and_parts:
             part = part.strip()
-            if part and (len(part.split()) >= 2 or re.search(r'[A-Z]\.', part)):
+            # Check for et al indicators first
+            if part.lower() in ['others', 'et al', 'et al.', 'and others']:
+                # Add et al if we have real authors, then stop
+                if valid_names:
+                    valid_names.append("et al")
+                break
+            elif part and (len(part.split()) >= 2 or re.search(r'[A-Z]\.', part)):
                 valid_names.append(part)

-        if
+        if valid_names:  # Return if we found any valid names (including et al handling)
             return valid_names

         # Case 2: "Lastname, Firstname and Lastname, Firstname" format (BibTeX format)
@@ -112,9 +123,11 @@ def parse_authors_with_initials(authors_text):
         # Handle special cases without commas
         if comma_count == 0:
             # Check if this is "others", "et al", or similar
-            if part.lower() in ['others', 'et al', 'et al.']:
-                #
-
+            if part.lower() in ['others', 'et al', 'et al.', 'and others']:
+                # Convert to standard "et al" and add it, then stop processing
+                if valid_author_parts:  # Only add if we have real authors
+                    valid_author_parts.append("et al")
+                break  # Stop processing after et al indicator
             else:
                 # This might be a name without lastname, firstname format
                 # For now, skip to be safe unless it's clearly a single name
@@ -3865,6 +3878,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     word_roots = {
         'robot': 'robotics', 'robotics': 'robot',
         'sci': 'science', 'science': 'sci',
+        'science': 'sciences', 'sciences': 'science',  # Handle singular/plural
         'adv': 'advanced', 'advanced': 'adv',
         'intell': 'intelligent', 'intelligent': 'intell',
         'syst': 'systems', 'systems': 'syst',
{academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/WHEEL
RENAMED
File without changes

{academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/entry_points.txt
RENAMED
File without changes

{academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/licenses/LICENSE
RENAMED
File without changes

{academic_refchecker-1.2.34.dist-info → academic_refchecker-1.2.35.dist-info}/top_level.txt
RENAMED
File without changes