academic-refchecker 1.2.51__py3-none-any.whl → 1.2.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""
 
- __version__ = "1.2.51"
+ __version__ = "1.2.53"
academic_refchecker-1.2.53.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 1.2.51
+ Version: 1.2.53
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
@@ -65,6 +65,14 @@ Dynamic: license-file
 
  A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.
 
+ ## 🎥 Project Deep Dive
+
+ Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
+
+ **[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
+
+ *This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
+
  ## 📊 Sample Output
 
  ```
@@ -117,6 +125,7 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
 
  ## 📋 Table of Contents
 
+ - [🎥 Project Deep Dive](#-project-deep-dive)
  - [📊 Sample Output](#-sample-output)
  - [🎯 Features](#-features)
  - [🚀 Quick Start](#-quick-start)
academic_refchecker-1.2.53.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
- __version__.py,sha256=QVi0xkCCLhSxaeFBcvepTdxFHgwlpnbC20YfK43fZ0s,65
- academic_refchecker-1.2.51.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ __version__.py,sha256=iH7i3qnj4nR1gSXECRVUGvJH5oBPWtb7Lb8H9ODFTVc,65
+ academic_refchecker-1.2.53.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
  checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
  checkers/enhanced_hybrid_checker.py,sha256=rbXkzpNkd0bn4e2OooX-CcdGTwwYpgmVaFvX_xCAFsA,27777
@@ -7,15 +7,16 @@ checkers/github_checker.py,sha256=BXJaBC3AloKze04j8EcQz0a79EhtVoi9_871ilV7t60,14
  checkers/local_semantic_scholar.py,sha256=D8py8-yMCgN1lvhXCiMUOEA4wBkH7AQvrkM4-3LCDsU,21015
  checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
  checkers/openreview_checker.py,sha256=3ckn6U7TN5nQBjqPacr8W8mm2uMo6aWWB6gsxTDNCPk,40452
+ checkers/pdf_paper_checker.py,sha256=L5HRHd3xpo0xDltZGTAA-Wk_arIS9bQV8ITeuxW0bNc,19893
  checkers/semantic_scholar.py,sha256=wk6e8DkYJM_O2nWsi-6EfJT53PzfL8KCmX1rS562KKc,34962
- checkers/webpage_checker.py,sha256=ZgmnMPxNC7Jn93kDYzD9ORfzaj1Ewb7FLIF1z8o0RfM,22554
+ checkers/webpage_checker.py,sha256=REOotx7Qka86_xbOIMeYj5YVb9D1RVMb4Ye311-28cA,43620
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
  core/parallel_processor.py,sha256=cq_WfzXrF2EI6IKOtJd6_QcwvM1xT3J6a13teg-wSbM,17638
- core/refchecker.py,sha256=X38KjvO51_YZNqhQ6y8tqCsOKMSaVRf9FRODp1VCi2Q,280164
+ core/refchecker.py,sha256=-QIT5eUQaPCuQy7S80sXCvtrmcjdH5lf5wdZvsPQO9w,286416
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -39,8 +40,8 @@ utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
  utils/text_utils.py,sha256=T3PiiG9-BMPTbdCftG2zypyIeZJl6snuMCKQ0nEOQv0,217834
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
  utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
- academic_refchecker-1.2.51.dist-info/METADATA,sha256=QhJtbkhRypQyRHykzXxQIGoXu-c6_mlXx55YkofCmzM,22576
- academic_refchecker-1.2.51.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-1.2.51.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
- academic_refchecker-1.2.51.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
- academic_refchecker-1.2.51.dist-info/RECORD,,
+ academic_refchecker-1.2.53.dist-info/METADATA,sha256=6j1G-R74oa1900hERaRnJFkV5u4zTuVyLC6YamhXxq4,23256
+ academic_refchecker-1.2.53.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ academic_refchecker-1.2.53.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+ academic_refchecker-1.2.53.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+ academic_refchecker-1.2.53.dist-info/RECORD,,
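Each RECORD entry above has the form `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped (the standard wheel RECORD convention). A minimal sketch, assuming an unpacked wheel and a hypothetical local path, of how one of these entries could be recomputed:

```python
import base64
import hashlib

def record_hash(data: bytes) -> str:
    """Wheel RECORD-style hash: urlsafe base64 of the SHA-256 digest, '=' padding stripped."""
    digest = hashlib.sha256(data).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Hypothetical path into an unpacked academic_refchecker wheel.
with open("__version__.py", "rb") as fh:
    data = fh.read()
print(f"__version__.py,{record_hash(data)},{len(data)}")
```

This is why the version bump and the README additions change the `__version__.py` and `METADATA` digests above, while `WHEEL`, `entry_points.txt`, and `top_level.txt` keep the same digests under the renamed dist-info directory.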
checkers/pdf_paper_checker.py ADDED
@@ -0,0 +1,493 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ PDF Paper Checker - Validates citations by extracting and analyzing PDF content
4
+ """
5
+
6
+ import re
7
+ import io
8
+ import logging
9
+ from typing import Dict, List, Any, Optional, Tuple
10
+ from urllib.parse import urlparse
11
+
12
+ import requests
13
+ import pdfplumber
14
+ from pypdf import PdfReader
15
+ from fuzzywuzzy import fuzz
16
+ from bs4 import BeautifulSoup
17
+
18
+ from utils.text_utils import normalize_text, calculate_title_similarity
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ class PDFPaperChecker:
24
+ """
25
+ Checker that downloads and analyzes PDF documents to validate citations
26
+ """
27
+
28
+ def __init__(self):
29
+ self.session = requests.Session()
30
+ self.session.headers.update({
31
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
32
+ })
33
+
34
+ def can_check_reference(self, reference: Dict[str, Any]) -> bool:
35
+ """
36
+ Check if this reference can be validated by PDF analysis
37
+
38
+ Args:
39
+ reference: Reference dictionary containing url and other metadata
40
+
41
+ Returns:
42
+ True if reference has URL that likely points to a PDF
43
+ """
44
+ url = reference.get('url', '').strip()
45
+ if not url:
46
+ return False
47
+
48
+ # Check if URL ends with .pdf
49
+ if url.lower().endswith('.pdf'):
50
+ return True
51
+
52
+ # Check if URL path suggests PDF content
53
+ pdf_indicators = ['/pdf/', '/document/', '/download/', '/file/', '/resource/']
54
+ if any(indicator in url.lower() for indicator in pdf_indicators):
55
+ return True
56
+
57
+ # Check if URL is from domains that commonly serve PDFs directly
58
+ domain = urlparse(url).netloc.lower()
59
+ pdf_domains = [
60
+ '.gov', '.edu', '.org', # Common institutional domains
61
+ 'researchgate.net', 'academia.edu', 'arxiv.org', # Academic platforms
62
+ 'oecd.org', 'who.int', 'unesco.org', # International organizations
63
+ 'aecea.ca' # Specific domain from the user's example
64
+ ]
65
+
66
+ if any(domain.endswith(pdf_domain) or pdf_domain in domain for pdf_domain in pdf_domains):
67
+ return True
68
+
69
+ return False
70
+
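The `can_check_reference` heuristic above accepts a URL when it ends in `.pdf`, contains a PDF-ish path segment, or sits on a domain that commonly serves PDFs directly. A quick sketch of how it classifies some made-up URLs (all URLs hypothetical):

```python
from checkers.pdf_paper_checker import PDFPaperChecker

checker = PDFPaperChecker()

# Ends in .pdf -> accepted
assert checker.can_check_reference({"url": "https://example.gov/reports/policy-brief.pdf"})
# Contains a '/download/' path segment (and aecea.ca is in the domain list) -> accepted
assert checker.can_check_reference({"url": "https://aecea.ca/download/funding-update"})
# Domain ends in '.org', one of the trusted institutional suffixes -> accepted
assert checker.can_check_reference({"url": "https://www.oecd.org/education/overview"})
# Nothing PDF-like about the path or the domain -> rejected
assert not checker.can_check_reference({"url": "https://example.com/posts/announcement"})
```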
71
+ def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
72
+ """
73
+ Verify a reference by downloading and analyzing PDF content
74
+
75
+ Args:
76
+ reference: Reference dictionary with title, authors, year, url, etc.
77
+
78
+ Returns:
79
+ Tuple of (verified_data, errors, url) where:
80
+ - verified_data: Dict with verified data if PDF validates citation, None otherwise
81
+ - errors: List of error dictionaries
82
+ - url: The URL that was checked
83
+ """
84
+ logger.debug(f"Verifying PDF reference: {reference.get('title', 'Untitled')}")
85
+
86
+ url = reference.get('url', '').strip()
87
+ if not url:
88
+ return None, [{"error_type": "unverified", "error_details": "no URL provided"}], None
89
+
90
+ try:
91
+ # First try to download directly as PDF
92
+ pdf_content = self._download_pdf(url)
93
+
94
+ # If direct download fails, try to find PDF links in the page
95
+ if not pdf_content:
96
+ pdf_url = self._find_pdf_url_in_page(url)
97
+ if pdf_url:
98
+ logger.debug(f"Found PDF link in page: {pdf_url}")
99
+ pdf_content = self._download_pdf(pdf_url)
100
+ url = pdf_url # Update URL to the actual PDF URL
101
+
102
+ if not pdf_content:
103
+ return None, [{"error_type": "unverified", "error_details": "could not download PDF content"}], url
104
+
105
+ # Extract text and metadata from PDF
106
+ pdf_data = self._extract_pdf_data(pdf_content)
107
+ if not pdf_data:
108
+ return None, [{"error_type": "unverified", "error_details": "could not extract PDF content"}], url
109
+
110
+ # Validate citation against PDF content
111
+ is_valid, errors = self._validate_citation(reference, pdf_data)
112
+
113
+ if is_valid:
114
+ # Create verified data preserving original venue if provided
115
+ venue = reference.get('journal') or reference.get('venue') or reference.get('booktitle') or 'PDF Document'
116
+
117
+ verified_data = {
118
+ 'title': reference.get('title', ''),
119
+ 'authors': reference.get('authors', []),
120
+ 'year': reference.get('year'),
121
+ 'venue': venue,
122
+ 'url': url,
123
+ 'pdf_metadata': {
124
+ 'extracted_title': pdf_data.get('title'),
125
+ 'extracted_authors': pdf_data.get('authors'),
126
+ 'extracted_text_preview': pdf_data.get('text', '')[:200] + '...' if pdf_data.get('text') else '',
127
+ 'pdf_pages': pdf_data.get('page_count'),
128
+ 'extraction_method': pdf_data.get('extraction_method')
129
+ }
130
+ }
131
+ logger.debug(f"PDF reference verified: {url}")
132
+ return verified_data, errors, url
133
+ else:
134
+ return None, errors, url
135
+
136
+ except Exception as e:
137
+ logger.error(f"Error verifying PDF reference {url}: {e}")
138
+ return None, [{"error_type": "unverified", "error_details": "PDF processing error"}], url
139
+
140
+ def _download_pdf(self, url: str, timeout: int = 30) -> Optional[bytes]:
141
+ """
142
+ Download PDF content from URL
143
+
144
+ Args:
145
+ url: URL to download from
146
+ timeout: Request timeout in seconds
147
+
148
+ Returns:
149
+ PDF content as bytes, or None if download failed
150
+ """
151
+ try:
152
+ logger.debug(f"Downloading PDF from: {url}")
153
+
154
+ response = self.session.get(url, timeout=timeout, stream=True)
155
+ response.raise_for_status()
156
+
157
+ # Check if content is actually a PDF
158
+ content_type = response.headers.get('content-type', '').lower()
159
+ if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
160
+ # Sometimes PDFs are served with generic content types, so we'll try anyway
161
+ logger.debug(f"Content-Type '{content_type}' doesn't indicate PDF, but proceeding anyway")
162
+
163
+ # Download content
164
+ content = response.content
165
+
166
+ # Basic PDF validation - check for PDF header
167
+ if content.startswith(b'%PDF-'):
168
+ logger.debug(f"Successfully downloaded PDF ({len(content)} bytes)")
169
+ return content
170
+ else:
171
+ logger.debug("Downloaded content doesn't appear to be a valid PDF")
172
+ return None
173
+
174
+ except Exception as e:
175
+ logger.error(f"Failed to download PDF from {url}: {e}")
176
+ return None
177
+
178
+ def _find_pdf_url_in_page(self, url: str) -> Optional[str]:
179
+ """
180
+ Look for PDF download links in a web page
181
+
182
+ Args:
183
+ url: URL of the web page to search
184
+
185
+ Returns:
186
+ URL of PDF document if found, None otherwise
187
+ """
188
+ try:
189
+ logger.debug(f"Searching for PDF links in page: {url}")
190
+
191
+ response = self.session.get(url, timeout=15)
192
+ response.raise_for_status()
193
+
194
+ # Check if the response itself is a PDF (after redirects)
195
+ content_type = response.headers.get('content-type', '').lower()
196
+ if 'pdf' in content_type or response.content.startswith(b'%PDF-'):
197
+ logger.debug("Page redirected directly to PDF")
198
+ return response.url
199
+
200
+ # Parse HTML to look for PDF links
201
+ from bs4 import BeautifulSoup
202
+ soup = BeautifulSoup(response.content, 'html.parser')
203
+
204
+ # Look for links that might be PDFs
205
+ pdf_links = []
206
+
207
+ # Find all links
208
+ for link in soup.find_all('a', href=True):
209
+ href = link.get('href')
210
+ link_text = link.get_text().lower().strip()
211
+
212
+ # Check if link ends with .pdf
213
+ if href and href.lower().endswith('.pdf'):
214
+ pdf_links.append(href)
215
+ continue
216
+
217
+ # Check if link text suggests PDF
218
+ if any(indicator in link_text for indicator in ['pdf', 'download', 'document', 'report', 'policy']):
219
+ pdf_links.append(href)
220
+ continue
221
+
222
+ # Check if link has PDF-related attributes
223
+ if link.get('type', '').lower() == 'application/pdf':
224
+ pdf_links.append(href)
225
+ continue
226
+
227
+ # Look for PDF links in other elements
228
+ for element in soup.find_all(attrs={'href': True}):
229
+ href = element.get('href')
230
+ if href and href.lower().endswith('.pdf'):
231
+ pdf_links.append(href)
232
+
233
+ # Convert relative URLs to absolute
234
+ from urllib.parse import urljoin
235
+ absolute_pdf_links = []
236
+ for link in pdf_links:
237
+ if link:
238
+ absolute_url = urljoin(url, link)
239
+ absolute_pdf_links.append(absolute_url)
240
+
241
+ # Remove duplicates
242
+ absolute_pdf_links = list(set(absolute_pdf_links))
243
+
244
+ if absolute_pdf_links:
245
+ logger.debug(f"Found {len(absolute_pdf_links)} potential PDF links")
246
+ # Return the first PDF link found
247
+ return absolute_pdf_links[0]
248
+
249
+ logger.debug("No PDF links found in page")
250
+ return None
251
+
252
+ except Exception as e:
253
+ logger.error(f"Error searching for PDF links in {url}: {e}")
254
+ return None
255
+
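`_find_pdf_url_in_page` collects candidate links and then converts relative hrefs to absolute URLs with `urllib.parse.urljoin`. A quick illustration of that conversion (page URL and paths made up):

```python
from urllib.parse import urljoin

base = "https://example.org/resources/early-learning/"  # hypothetical page URL
print(urljoin(base, "reports/funding-2023.pdf"))
# -> https://example.org/resources/early-learning/reports/funding-2023.pdf
print(urljoin(base, "/files/funding-2023.pdf"))
# -> https://example.org/files/funding-2023.pdf
```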
256
+ def _extract_pdf_data(self, pdf_content: bytes) -> Optional[Dict[str, Any]]:
257
+ """
258
+ Extract text and metadata from PDF content
259
+
260
+ Args:
261
+ pdf_content: PDF file content as bytes
262
+
263
+ Returns:
264
+ Dictionary with extracted data including text, title, authors, etc.
265
+ """
266
+ pdf_data = {
267
+ 'text': '',
268
+ 'title': '',
269
+ 'authors': [],
270
+ 'page_count': 0,
271
+ 'extraction_method': 'none'
272
+ }
273
+
274
+ # Try multiple extraction methods
275
+ try:
276
+ # Method 1: Try pdfplumber (usually better for text extraction)
277
+ pdf_data = self._extract_with_pdfplumber(pdf_content, pdf_data)
278
+ if pdf_data['text']:
279
+ pdf_data['extraction_method'] = 'pdfplumber'
280
+ return pdf_data
281
+ except Exception as e:
282
+ logger.debug(f"pdfplumber extraction failed: {e}")
283
+
284
+ try:
285
+ # Method 2: Try pypdf (fallback)
286
+ pdf_data = self._extract_with_pypdf(pdf_content, pdf_data)
287
+ if pdf_data['text']:
288
+ pdf_data['extraction_method'] = 'pypdf'
289
+ return pdf_data
290
+ except Exception as e:
291
+ logger.debug(f"pypdf extraction failed: {e}")
292
+
293
+ logger.debug("All PDF extraction methods failed")
294
+ return None
295
+
296
+ def _extract_with_pdfplumber(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
297
+ """Extract PDF data using pdfplumber"""
298
+ with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
299
+ pdf_data['page_count'] = len(pdf.pages)
300
+
301
+ # Extract text from first few pages (usually contains title/author info)
302
+ text_parts = []
303
+ for i, page in enumerate(pdf.pages[:5]): # First 5 pages should be enough
304
+ page_text = page.extract_text()
305
+ if page_text:
306
+ text_parts.append(page_text)
307
+
308
+ pdf_data['text'] = '\n'.join(text_parts)
309
+
310
+ # Try to extract title and author from first page
311
+ if pdf.pages:
312
+ first_page_text = pdf.pages[0].extract_text() or ''
313
+ pdf_data['title'], pdf_data['authors'] = self._parse_title_and_authors(first_page_text)
314
+
315
+ return pdf_data
316
+
317
+ def _extract_with_pypdf(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
318
+ """Extract PDF data using pypdf"""
319
+ reader = PdfReader(io.BytesIO(pdf_content))
320
+ pdf_data['page_count'] = len(reader.pages)
321
+
322
+ # Extract metadata
323
+ if reader.metadata:
324
+ if '/Title' in reader.metadata:
325
+ pdf_data['title'] = str(reader.metadata['/Title'])
326
+ if '/Author' in reader.metadata:
327
+ pdf_data['authors'] = [str(reader.metadata['/Author'])]
328
+
329
+ # Extract text from first few pages
330
+ text_parts = []
331
+ for i, page in enumerate(reader.pages[:5]): # First 5 pages
332
+ try:
333
+ page_text = page.extract_text()
334
+ if page_text:
335
+ text_parts.append(page_text)
336
+ except Exception as e:
337
+ logger.debug(f"Failed to extract text from page {i}: {e}")
338
+ continue
339
+
340
+ pdf_data['text'] = '\n'.join(text_parts)
341
+
342
+ # If no metadata title/author, try to parse from text
343
+ if not pdf_data['title'] and text_parts:
344
+ title, authors = self._parse_title_and_authors(text_parts[0])
345
+ if title and not pdf_data['title']:
346
+ pdf_data['title'] = title
347
+ if authors and not pdf_data['authors']:
348
+ pdf_data['authors'] = authors
349
+
350
+ return pdf_data
351
+
352
+ def _parse_title_and_authors(self, text: str) -> Tuple[str, List[str]]:
353
+ """
354
+ Parse title and authors from PDF text
355
+
356
+ Args:
357
+ text: Text from first page of PDF
358
+
359
+ Returns:
360
+ Tuple of (title, authors_list)
361
+ """
362
+ lines = [line.strip() for line in text.split('\n') if line.strip()]
363
+
364
+ if not lines:
365
+ return '', []
366
+
367
+ # The title is often the first meaningful line (after removing headers/footers)
368
+ title = ''
369
+ authors = []
370
+
371
+ # Look for the title - usually first non-header line
372
+ for i, line in enumerate(lines):
373
+ # Skip obvious header/footer content
374
+ if len(line) < 10 or any(skip in line.lower() for skip in ['page', 'doi:', 'http', 'www.', '@']):
375
+ continue
376
+
377
+ # Title is usually longer and on its own line
378
+ if len(line) > 20 and not any(sep in line for sep in [',', ';']) and not line.endswith('.'):
379
+ title = line
380
+
381
+ # Authors often follow the title - look for patterns
382
+ for j in range(i + 1, min(i + 5, len(lines))):
383
+ author_line = lines[j]
384
+
385
+ # Author lines often contain commas, "and", or institutional affiliations
386
+ if any(indicator in author_line.lower() for indicator in [',', ' and ', 'university', 'college', 'institute']):
387
+ # Clean up author line
388
+ author_text = re.sub(r'[0-9*†‡§¶#]', '', author_line) # Remove superscript markers
389
+ if ',' in author_text:
390
+ authors.extend([name.strip() for name in author_text.split(',') if name.strip()])
391
+ else:
392
+ authors.append(author_text.strip())
393
+ break
394
+ break
395
+
396
+ return title, authors
397
+
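The parsing heuristic above is deliberately rough: the first sufficiently long, comma-free line becomes the title, and the next line containing a comma, " and ", or an affiliation keyword is split into author names. A small illustration with a made-up first page:

```python
from checkers.pdf_paper_checker import PDFPaperChecker

first_page = (
    "Alberta Child Care Funding Review\n"
    "Jane Smith, Robert Jones\n"
    "Department of Education, University of Example\n"
    "March 2023"
)

title, authors = PDFPaperChecker()._parse_title_and_authors(first_page)
# With the heuristic above this hypothetical input yields:
#   title   == "Alberta Child Care Funding Review"
#   authors == ["Jane Smith", "Robert Jones"]
```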
398
+ def _validate_citation(self, reference: Dict[str, Any], pdf_data: Dict[str, Any]) -> Tuple[bool, List[Dict[str, Any]]]:
399
+ """
400
+ Validate citation against extracted PDF data
401
+
402
+ Args:
403
+ reference: The citation being checked
404
+ pdf_data: Extracted data from PDF
405
+
406
+ Returns:
407
+ Tuple of (is_valid, errors_list)
408
+ """
409
+ errors = []
410
+
411
+ # Check title match
412
+ cited_title = reference.get('title', '').strip()
413
+ extracted_title = pdf_data.get('title', '').strip()
414
+ pdf_text = pdf_data.get('text', '').lower()
415
+
416
+ title_match = False
417
+
418
+ if cited_title and extracted_title:
419
+ # Compare titles directly
420
+ similarity = calculate_title_similarity(cited_title, extracted_title)
421
+ if similarity > 0.8: # 80% similarity threshold
422
+ title_match = True
423
+
424
+ if not title_match and cited_title and pdf_text:
425
+ # Check if cited title appears in PDF text
426
+ cited_title_normalized = normalize_text(cited_title)
427
+ if cited_title_normalized.lower() in pdf_text:
428
+ title_match = True
429
+
430
+ if not title_match:
431
+ errors.append({
432
+ "error_type": "unverified",
433
+ "error_details": "title not found in PDF content"
434
+ })
435
+
436
+ # Check author match (more lenient since PDF author extraction is difficult)
437
+ cited_authors = reference.get('authors', [])
438
+ extracted_authors = pdf_data.get('authors', [])
439
+
440
+ author_match = False
441
+
442
+ if cited_authors and extracted_authors:
443
+ # Check if any cited author appears in extracted authors
444
+ for cited_author in cited_authors:
445
+ for extracted_author in extracted_authors:
446
+ if self._authors_match(cited_author, extracted_author):
447
+ author_match = True
448
+ break
449
+ if author_match:
450
+ break
451
+
452
+ if not author_match and cited_authors and pdf_text:
453
+ # Check if any cited author appears in PDF text
454
+ for cited_author in cited_authors:
455
+ author_normalized = normalize_text(cited_author)
456
+ if author_normalized.lower() in pdf_text:
457
+ author_match = True
458
+ break
459
+
460
+ # For PDF validation, we're more lenient with author matching since extraction is unreliable
461
+ if not author_match and cited_authors:
462
+ errors.append({
463
+ "warning_type": "author",
464
+ "warning_details": "authors not clearly identified in PDF content"
465
+ })
466
+
467
+ # A reference is valid if we found the title (author matching is optional due to extraction difficulties)
468
+ is_valid = title_match
469
+
470
+ return is_valid, errors
471
+
472
+ def _authors_match(self, author1: str, author2: str) -> bool:
473
+ """Check if two author names likely refer to the same person"""
474
+ author1_norm = normalize_text(author1).lower()
475
+ author2_norm = normalize_text(author2).lower()
476
+
477
+ # Exact match
478
+ if author1_norm == author2_norm:
479
+ return True
480
+
481
+ # Check similarity
482
+ similarity = fuzz.ratio(author1_norm, author2_norm)
483
+ if similarity > 85: # 85% similarity threshold
484
+ return True
485
+
486
+ # Check if one name is contained in the other (handles "J. Smith" vs "John Smith")
487
+ words1 = set(author1_norm.split())
488
+ words2 = set(author2_norm.split())
489
+
490
+ if words1.intersection(words2):
491
+ return True
492
+
493
+ return False
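That completes the new `checkers/pdf_paper_checker.py` module. Its public surface is `can_check_reference` plus `verify_reference`, which returns a `(verified_data, errors, url)` triple. A minimal usage sketch (the reference values are made up):

```python
from checkers.pdf_paper_checker import PDFPaperChecker

reference = {
    "title": "Early Childhood Education Funding Policy",  # hypothetical citation
    "authors": ["J. Smith"],
    "year": 2021,
    "url": "https://example.org/reports/funding-policy.pdf",
}

checker = PDFPaperChecker()
if checker.can_check_reference(reference):
    verified_data, errors, url = checker.verify_reference(reference)
    if verified_data:
        print("verified against PDF content:", url)
    else:
        print("unverified:", [e.get("error_details") for e in errors])
```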
checkers/webpage_checker.py CHANGED
@@ -512,4 +512,427 @@ class WebPageChecker:
512
512
  "warning_details": "Access blocked by site but domain is trusted (likely bot protection)"
513
513
  })
514
514
 
515
- return verified_data, errors, web_url
515
+ return verified_data, errors, web_url
516
+
517
+ def check_unverified_url_reference(self, reference: Dict[str, Any]) -> str:
518
+ """
519
+ Check a URL from an unverified reference to determine the specific unverified reason
520
+
521
+ Args:
522
+ reference: Reference dictionary with title, authors, year, url, etc.
523
+
524
+ Returns:
525
+ String with the specific unverified reason:
526
+ - "non-existent web page" if the page doesn't exist
527
+ - "paper not found and URL doesn't reference it" if page exists but doesn't contain title
528
+ - "paper not verified but URL references paper" if page exists and contains title
529
+ """
530
+ logger.debug(f"Checking unverified URL reference: {reference.get('title', 'Untitled')}")
531
+
532
+ # Extract URL from reference
533
+ web_url = reference.get('url', '').strip()
534
+ if not web_url:
535
+ return "paper not found and URL doesn't reference it" # No URL to check
536
+
537
+ # Make request to check if page exists
538
+ response = self._respectful_request(web_url)
539
+ if response is None:
540
+ return "non-existent web page"
541
+
542
+ if response.status_code == 404:
543
+ return "non-existent web page"
544
+ elif response.status_code == 403:
545
+ # For blocked resources, we can't check content but assume page exists
546
+ return "paper not verified but URL references paper"
547
+ elif response.status_code != 200:
548
+ return "non-existent web page"
549
+
550
+ try:
551
+ # Parse HTML content to search for title
552
+ content_type = response.headers.get('content-type', '').lower()
553
+ if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
554
+ # For PDFs, we can't search content, so assume it's referenced if accessible
555
+ return "paper not verified but URL references paper"
556
+
557
+ # Parse HTML content
558
+ soup = BeautifulSoup(response.content, 'html.parser')
559
+
560
+ # Extract page content for searching
561
+ page_title = self._extract_page_title(soup)
562
+ page_description = self._extract_description(soup)
563
+
564
+ # Get the full page text for comprehensive searching
565
+ page_text = soup.get_text().lower()
566
+
567
+ # Get the reference title to search for
568
+ cited_title = reference.get('title', '').strip()
569
+ if not cited_title:
570
+ return "paper not found and URL doesn't reference it"
571
+
572
+ # Search for the title in various ways
573
+ cited_title_lower = cited_title.lower()
574
+
575
+ # Direct search in page text
576
+ if cited_title_lower in page_text:
577
+ return "paper not verified but URL references paper"
578
+
579
+ # Search for key words from the title
580
+ cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
581
+ if len(word.strip('.,;:()[]{}')) > 3)
582
+
583
+ # Check if significant portion of title words appear in page
584
+ page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
585
+ if len(word.strip('.,;:()[]{}')) > 3)
586
+
587
+ common_words = cited_words.intersection(page_words)
588
+
589
+ # If most of the title words are found, consider it referenced
590
+ if len(common_words) >= max(1, len(cited_words) * 0.6): # At least 60% of words match
591
+ return "paper not verified but URL references paper"
592
+
593
+ # Also check the extracted title and description specifically
594
+ if page_title:
595
+ if self._check_title_match(cited_title, page_title, page_description):
596
+ return "paper not verified but URL references paper"
597
+
598
+ # Title not found in page content
599
+ return "paper not found and URL doesn't reference it"
600
+
601
+ except Exception as e:
602
+ logger.error(f"Error checking unverified URL {web_url}: {e}")
603
+ return "paper not found and URL doesn't reference it"
604
+
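The word-overlap heuristic above (reused in `verify_raw_url_for_unverified_reference` below) treats the page as referencing the paper when at least 60% of the significant title words (length > 3, punctuation stripped) appear in the page text. A worked sketch of that ratio with made-up strings:

```python
def title_word_overlap(cited_title: str, page_text: str) -> float:
    """Fraction of significant (length > 3) cited-title words found in the page text."""
    strip_chars = ".,;:()[]{}"
    cited = {w.strip(strip_chars) for w in cited_title.lower().split() if len(w.strip(strip_chars)) > 3}
    page = {w.strip(strip_chars) for w in page_text.lower().split() if len(w.strip(strip_chars)) > 3}
    return len(cited & page) / max(1, len(cited))

# 4 of the 5 significant title words appear in the page text, so 0.8 >= 0.6
# and the reference would be reported as "paper not verified but URL references paper".
print(title_word_overlap(
    "Funding early childhood education programs",          # hypothetical cited title
    "Our report on funding for early childhood programs",  # hypothetical page text
))
```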
605
+ def verify_raw_url_for_unverified_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
606
+ """
607
+ Verify a raw URL from an unverified reference - can return verified data if appropriate
608
+
609
+ Args:
610
+ reference: Reference dictionary with title, authors, year, url, etc.
611
+
612
+ Returns:
613
+ Tuple of (verified_data, errors, url) where:
614
+ - verified_data: Dict with verified data if URL should be considered verified, None otherwise
615
+ - errors: List of error dictionaries with specific unverified reasons
616
+ - url: The URL that was checked
617
+ """
618
+ logger.debug(f"Verifying raw URL for unverified reference: {reference.get('title', 'Untitled')}")
619
+
620
+ # Extract URL from reference
621
+ web_url = reference.get('url', '').strip()
622
+ if not web_url:
623
+ return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], None
624
+
625
+ # Make request to check if page exists
626
+ response = self._respectful_request(web_url)
627
+ if response is None:
628
+ return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
629
+
630
+ if response.status_code == 404:
631
+ return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
632
+ elif response.status_code == 403:
633
+ # For blocked resources, we can't check content but assume page exists
634
+ # If no venue, treat as verified since URL is accessible
635
+ if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
636
+ verified_data = {
637
+ 'title': reference.get('title', ''),
638
+ 'authors': reference.get('authors', []),
639
+ 'year': reference.get('year'),
640
+ 'venue': 'Web Page',
641
+ 'url': web_url,
642
+ 'web_metadata': {
643
+ 'status_code': 403,
644
+ 'access_blocked': True
645
+ }
646
+ }
647
+ return verified_data, [], web_url
648
+ else:
649
+ return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
650
+ elif response.status_code != 200:
651
+ return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
652
+
653
+ try:
654
+ # Parse HTML content to search for title
655
+ content_type = response.headers.get('content-type', '').lower()
656
+ if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
657
+ # For PDFs, if no venue specified, treat as verified
658
+ if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
659
+ verified_data = {
660
+ 'title': reference.get('title', ''),
661
+ 'authors': reference.get('authors', []),
662
+ 'year': reference.get('year'),
663
+ 'venue': 'PDF Document',
664
+ 'url': web_url,
665
+ 'web_metadata': {
666
+ 'content_type': response.headers.get('content-type', ''),
667
+ 'status_code': response.status_code
668
+ }
669
+ }
670
+ return verified_data, [], web_url
671
+ else:
672
+ return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
673
+
674
+ # Parse HTML content
675
+ soup = BeautifulSoup(response.content, 'html.parser')
676
+
677
+ # Extract page content for searching
678
+ page_title = self._extract_page_title(soup)
679
+ page_description = self._extract_description(soup)
680
+
681
+ # Get the full page text for comprehensive searching
682
+ page_text = soup.get_text().lower()
683
+
684
+ # Get the reference title to search for
685
+ cited_title = reference.get('title', '').strip()
686
+ if not cited_title:
687
+ return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
688
+
689
+ # Search for the title in various ways
690
+ cited_title_lower = cited_title.lower()
691
+ title_found = False
692
+
693
+ # Direct search in page text
694
+ if cited_title_lower in page_text:
695
+ title_found = True
696
+
697
+ # Search for key words from the title
698
+ if not title_found:
699
+ cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
700
+ if len(word.strip('.,;:()[]{}')) > 3)
701
+
702
+ # Check if significant portion of title words appear in page
703
+ page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
704
+ if len(word.strip('.,;:()[]{}')) > 3)
705
+
706
+ common_words = cited_words.intersection(page_words)
707
+
708
+ # If most of the title words are found, consider it referenced
709
+ if len(common_words) >= max(1, len(cited_words) * 0.6): # At least 60% of words match
710
+ title_found = True
711
+
712
+ # Also check the extracted title and description specifically
713
+ if not title_found and page_title:
714
+ if self._check_title_match(cited_title, page_title, page_description):
715
+ title_found = True
716
+
717
+ # Determine if this should be verified or unverified
718
+ if title_found:
719
+ # Check if reference should be verified based on venue type
720
+ venue_field = reference.get('journal') or reference.get('venue') or reference.get('booktitle')
721
+
722
+ if not venue_field:
723
+ # No venue specified - verify with URL as venue
724
+ site_info = self._extract_site_info(soup, web_url)
725
+ venue = site_info.get('organization', 'Web Page') if site_info.get('organization') != site_info.get('domain') else 'Web Page'
726
+
727
+ verified_data = {
728
+ 'title': reference.get('title', ''),
729
+ 'authors': reference.get('authors', []),
730
+ 'year': reference.get('year'),
731
+ 'venue': venue,
732
+ 'url': web_url,
733
+ 'web_metadata': {
734
+ 'page_title': page_title,
735
+ 'description': page_description,
736
+ 'site_info': site_info,
737
+ 'final_url': response.url,
738
+ 'status_code': response.status_code
739
+ }
740
+ }
741
+ logger.debug(f"URL verified as valid source for reference without venue: {web_url}")
742
+ return verified_data, [], web_url
743
+ elif self._is_web_content_venue(venue_field, web_url):
744
+ # Has venue but it's a web content venue (news, blog, etc.) - verify it
745
+ verified_data = {
746
+ 'title': reference.get('title', ''),
747
+ 'authors': reference.get('authors', []),
748
+ 'year': reference.get('year'),
749
+ 'venue': venue_field, # Keep the original venue
750
+ 'url': web_url,
751
+ 'web_metadata': {
752
+ 'page_title': page_title,
753
+ 'description': page_description,
754
+ 'site_info': self._extract_site_info(soup, web_url),
755
+ 'final_url': response.url,
756
+ 'status_code': response.status_code
757
+ }
758
+ }
759
+ logger.debug(f"URL verified as valid web content source: {web_url}")
760
+ return verified_data, [], web_url
761
+ else:
762
+ # Has academic venue but URL references paper - still unverified (needs proper paper verification)
763
+ return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
764
+ else:
765
+ # Title not found in page content
766
+ return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
767
+
768
+ except Exception as e:
769
+ logger.error(f"Error checking raw URL {web_url}: {e}")
770
+ return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
771
+
772
+ def _is_web_content_venue(self, venue: str, url: str) -> bool:
773
+ """
774
+ Determine if a venue represents web content rather than academic publication
775
+
776
+ Args:
777
+ venue: The venue string (journal, venue, or booktitle)
778
+ url: The URL being checked (for additional context)
779
+
780
+ Returns:
781
+ True if this represents web content that can be verified via URL
782
+ """
783
+ if not venue:
784
+ return False
785
+
786
+ venue_lower = venue.lower().strip()
787
+
788
+ # News organizations and media outlets
789
+ news_indicators = [
790
+ 'news', 'cbc', 'bbc', 'cnn', 'reuters', 'associated press', 'ap news',
791
+ 'npr', 'pbs', 'abc news', 'nbc news', 'fox news', 'guardian', 'times',
792
+ 'post', 'herald', 'tribune', 'gazette', 'chronicle', 'observer',
793
+ 'magazine', 'weekly', 'daily', 'today', 'report', 'wire', 'press'
794
+ ]
795
+
796
+ # Special case for Wall Street Journal
797
+ if any(word in venue_lower for word in ['wall street', 'wsj']):
798
+ news_indicators.append('journal')
799
+
800
+ # Technology and industry publications
801
+ tech_publications = [
802
+ 'techcrunch', 'wired', 'ars technica', 'the verge', 'engadget',
803
+ 'zdnet', 'cnet', 'computerworld', 'infoworld', 'pcmag', 'pcworld',
804
+ 'ieee spectrum', 'mit technology review', 'scientific american'
805
+ ]
806
+
807
+ # Blogs and web platforms
808
+ blog_platforms = [
809
+ 'blog', 'medium', 'substack', 'wordpress', 'blogspot', 'tumblr',
810
+ 'linkedin', 'facebook', 'twitter', 'reddit', 'stack overflow',
811
+ 'github pages', 'personal website', 'company blog'
812
+ ]
813
+
814
+ # Government and organizational websites
815
+ org_indicators = [
816
+ 'government', 'gov', '.org', 'agency', 'department', 'ministry',
817
+ 'commission', 'bureau', 'office', 'administration', 'institute',
818
+ 'foundation', 'association', 'society', 'center', 'centre',
819
+ 'council', 'committee', 'board', 'union', 'federation', 'alliance',
820
+ 'coalition', 'consortium', 'network', 'group', 'organization',
821
+ 'organisation', 'corp', 'corporation', 'company', 'ltd', 'inc'
822
+ ]
823
+
824
+ # Documentation and technical resources
825
+ tech_resources = [
826
+ 'documentation', 'docs', 'api', 'reference', 'guide', 'tutorial',
827
+ 'manual', 'readme', 'wiki', 'help', 'support', 'developer',
828
+ 'technical', 'white paper', 'whitepaper', 'brief', 'overview',
829
+ 'policy', 'strategy', 'report', 'study', 'analysis', 'research'
830
+ ]
831
+
832
+ # Check URL domain for additional context
833
+ url_lower = url.lower() if url else ''
834
+
835
+ # Known web content domains in URL
836
+ web_domains = [
837
+ 'cbc.ca', 'bbc.com', 'cnn.com', 'reuters.com', 'npr.org', 'pbs.org',
838
+ 'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'wsj.com',
839
+ 'techcrunch.com', 'wired.com', 'theverge.com', 'arstechnica.com',
840
+ 'medium.com', 'substack.com', 'linkedin.com', 'github.io',
841
+ 'readthedocs.io', 'stackoverflow.com', 'reddit.com'
842
+ ]
843
+
844
+ # Combine all indicators
845
+ all_indicators = news_indicators + tech_publications + blog_platforms + org_indicators + tech_resources
846
+
847
+ # Academic venue indicators that should NOT be considered web content
848
+ academic_indicators = [
849
+ 'proceedings', 'conference', 'symposium', 'workshop', 'transactions',
850
+ 'journal of', 'international journal', 'acm', 'ieee', 'springer',
851
+ 'nature', 'science', 'cell', 'lancet', 'plos', 'arxiv', 'pubmed',
852
+ 'artificial intelligence', 'machine learning', 'computer vision',
853
+ 'neural', 'computing', 'robotics', 'bioinformatics'
854
+ ]
855
+
856
+ # Check if venue is clearly academic (should not be treated as web content)
857
+ is_academic = any(indicator in venue_lower for indicator in academic_indicators)
858
+ if is_academic:
859
+ return False
860
+
861
+ # Check if venue matches any web content indicators
862
+ venue_matches = any(indicator and indicator in venue_lower for indicator in all_indicators)
863
+
864
+ # Check if URL domain suggests web content
865
+ url_matches = any(domain in url_lower for domain in web_domains)
866
+
867
+ # Special case: if URL contains news/blog/docs indicators, lean towards web content
868
+ url_content_indicators = ['news', 'blog', 'post', 'article', 'docs', 'help', 'guide', 'resources', 'policy', 'strategy']
869
+ url_has_content_indicators = any(indicator in url_lower for indicator in url_content_indicators)
870
+
871
+ # Special case: Check if venue is an organizational acronym/name that matches the URL domain
872
+ # This handles cases like "AECEA" on aecea.ca domain
873
+ organizational_match = self._check_organizational_venue_match(venue, url_lower)
874
+
875
+ return venue_matches or url_matches or url_has_content_indicators or organizational_match
876
+
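In practice `_is_web_content_venue` separates venues that can be verified by fetching the URL from venues that still need a proper paper lookup. A couple of illustrative calls (venues and URLs made up):

```python
from checkers.webpage_checker import WebPageChecker

checker = WebPageChecker()

# News outlet -> web content, so the URL itself can verify the reference.
assert checker._is_web_content_venue("CBC News", "https://www.cbc.ca/news/child-care-funding")
# Conference proceedings -> academic venue, so a URL match alone is not enough.
assert not checker._is_web_content_venue("Proceedings of NeurIPS 2023", "https://example.com/paper")
```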
877
+ def _check_organizational_venue_match(self, venue: str, url_lower: str) -> bool:
878
+ """
879
+ Check if the venue represents an organization that matches the URL domain
880
+
881
+ Args:
882
+ venue: The venue string
883
+ url_lower: The lowercased URL
884
+
885
+ Returns:
886
+ True if venue appears to be the organization publishing on their own domain
887
+ """
888
+ if not venue or not url_lower:
889
+ return False
890
+
891
+ venue_lower = venue.lower().strip()
892
+
893
+ # Extract domain from URL
894
+ from urllib.parse import urlparse
895
+ try:
896
+ parsed_url = urlparse(url_lower)
897
+ domain = parsed_url.netloc.lower()
898
+
899
+ # Remove common prefixes
900
+ domain = domain.replace('www.', '')
901
+
902
+ # Check if venue is likely an acronym (short, all caps or mixed case)
903
+ is_likely_acronym = (len(venue) <= 10 and
904
+ (venue.isupper() or
905
+ any(c.isupper() for c in venue) and len(venue.split()) == 1))
906
+
907
+ # Check if venue appears in domain
908
+ venue_clean = ''.join(c for c in venue_lower if c.isalnum())
909
+
910
+ if venue_clean and venue_clean in domain:
911
+ return True
912
+
913
+ # For acronyms, check if the acronym could match the domain
914
+ if is_likely_acronym:
915
+ # Split venue into words and check if initials match domain
916
+ venue_words = venue_lower.replace('.', ' ').split()
917
+ if len(venue_words) == 1 and len(venue_words[0]) <= 6:
918
+ # Single word acronym - check if it's in the domain
919
+ if venue_words[0] in domain:
920
+ return True
921
+
922
+ # Check for educational/professional associations with .ca, .org, .edu domains
923
+ if any(domain.endswith(tld) for tld in ['.ca', '.org', '.edu', '.gov']):
924
+ # These domains often host organizational content
925
+ if any(org_word in venue_lower for org_word in [
926
+ 'association', 'society', 'institute', 'foundation', 'center',
927
+ 'centre', 'council', 'committee', 'board', 'agency', 'department'
928
+ ]):
929
+ return True
930
+
931
+ # Check if venue is a short organizational name/acronym
932
+ if is_likely_acronym:
933
+ return True
934
+
935
+ return False
936
+
937
+ except Exception:
938
+ return False
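Together, these additions let WebPageChecker either upgrade an otherwise-unverified reference (no venue, a web-content venue, or an organizational venue that matches the URL's own domain) or return one of three specific unverified reasons. A short usage sketch of the new entry point (reference values made up):

```python
from checkers.webpage_checker import WebPageChecker

reference = {
    "title": "Child Care Funding Update",  # hypothetical reference
    "authors": [],
    "year": 2023,
    "venue": "AECEA",                      # organizational acronym matching the domain
    "url": "https://aecea.ca/news/child-care-funding-update",
}

verified_data, errors, url = WebPageChecker().verify_raw_url_for_unverified_reference(reference)
if verified_data:
    print("verified as web content; venue kept as:", verified_data["venue"])
else:
    # error_details is one of: "non-existent web page",
    # "paper not found and URL doesn't reference it",
    # "paper not verified but URL references paper"
    print("still unverified:", errors[0]["error_details"])
```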
core/refchecker.py CHANGED
@@ -2021,8 +2021,20 @@ class ArxivReferenceChecker:
2021
2021
  logger.debug(f"Database mode: Initial paper_url from database checker: {paper_url}")
2022
2022
 
2023
2023
  if not verified_data:
2024
- # Mark as unverified but keep the URL if found
2025
- return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None
2024
+ # Mark as unverified but check URL for more specific reason or verification
2025
+ if reference.get('url', '').strip():
2026
+ # Use raw URL verifier to check if it can be verified or get specific reason
2027
+ url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
2028
+ if url_verified_data:
2029
+ # URL verification succeeded - return as verified
2030
+ logger.debug(f"Database mode: URL verification succeeded for unverified reference")
2031
+ return None, url_checked, url_verified_data
2032
+ else:
2033
+ # URL verification failed - use specific error reason
2034
+ url_error_details = url_errors[0].get('error_details', 'Reference could not be verified in database') if url_errors else 'Reference could not be verified in database'
2035
+ return [{"error_type": "unverified", "error_details": url_error_details}], paper_url, None
2036
+ else:
2037
+ return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None
2026
2038
 
2027
2039
  # Convert database errors to our format
2028
2040
  formatted_errors = []
@@ -2118,7 +2130,29 @@ class ArxivReferenceChecker:
2118
2130
  return [{"error_type": "unverified", "error_details": "Database connection not available"}], None, None
2119
2131
 
2120
2132
  # For non-database mode, use the standard reference verification
2121
- return self.verify_reference_standard(source_paper, reference)
2133
+ errors, paper_url, verified_data = self.verify_reference_standard(source_paper, reference)
2134
+
2135
+ # If standard verification failed and the reference has a URL, try raw URL verification
2136
+ if errors and verified_data is None:
2137
+ # Check if there's an unverified error
2138
+ unverified_errors = [e for e in errors if e.get('error_type') == 'unverified']
2139
+ if unverified_errors and reference.get('url', '').strip():
2140
+ # Use raw URL verifier to check if it can be verified or get specific reason
2141
+ url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
2142
+ if url_verified_data:
2143
+ # URL verification succeeded - return as verified
2144
+ logger.debug(f"Non-database mode: URL verification succeeded for unverified reference")
2145
+ return None, url_checked, url_verified_data
2146
+ else:
2147
+ # URL verification failed - use specific error reason
2148
+ url_error_details = url_errors[0].get('error_details', 'Reference could not be verified') if url_errors else 'Reference could not be verified'
2149
+ # Update the unverified error with the specific reason
2150
+ for error in errors:
2151
+ if error.get('error_type') == 'unverified':
2152
+ error['error_details'] = url_error_details
2153
+ break
2154
+
2155
+ return errors, paper_url, verified_data
2122
2156
 
2123
2157
 
2124
2158
  def verify_github_reference(self, reference):
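Both hunks above follow the same fallback contract: verification returns an `(errors, paper_url, verified_data)` triple, and when the primary checkers leave a reference unverified but it carries a URL, `verify_raw_url_reference` (added below) either supplies verified data or a more specific `error_details` string. A condensed sketch of that flow, simplified from the code above rather than copied from it:

```python
def verify_with_url_fallback(checker, source_paper, reference):
    """Condensed view of the URL fallback introduced in this release (illustrative only)."""
    errors, paper_url, verified_data = checker.verify_reference_standard(source_paper, reference)

    unverified = [e for e in (errors or []) if e.get("error_type") == "unverified"]
    if verified_data is None and unverified and reference.get("url", "").strip():
        url_verified, url_errors, url_checked = checker.verify_raw_url_reference(reference)
        if url_verified:
            # The raw URL was enough to verify the reference.
            return None, url_checked, url_verified
        if url_errors:
            # Keep the reference unverified, but with the more specific reason.
            unverified[0]["error_details"] = url_errors[0].get(
                "error_details", "Reference could not be verified")

    return errors, paper_url, verified_data
```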
@@ -2253,6 +2287,72 @@ class ArxivReferenceChecker:
2253
2287
  formatted_errors.append(formatted_error)
2254
2288
  return formatted_errors if formatted_errors else [{"error_type": "unverified", "error_details": "Web page could not be verified"}], page_url, None
2255
2289
 
2290
+ def verify_raw_url_reference(self, reference):
2291
+ """
2292
+ Verify a raw URL from an unverified reference - can return verified data if appropriate
2293
+
2294
+ Args:
2295
+ reference: The reference to verify (already determined to be unverified by paper validators)
2296
+
2297
+ Returns:
2298
+ Tuple of (verified_data, errors, url) where:
2299
+ - verified_data: Dict with verified data if URL should be considered verified, None otherwise
2300
+ - errors: List of error dictionaries
2301
+ - url: The URL that was checked
2302
+ """
2303
+ logger.debug(f"Checking raw URL for unverified reference: {reference.get('title', 'Untitled')}")
2304
+
2305
+ # Extract URL from reference
2306
+ web_url = reference.get('url', '').strip()
2307
+ if not web_url:
2308
+ return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None
2309
+
2310
+ # First try PDF paper checker if URL appears to be a PDF
2311
+ from checkers.pdf_paper_checker import PDFPaperChecker
2312
+ pdf_checker = PDFPaperChecker()
2313
+
2314
+ if pdf_checker.can_check_reference(reference):
2315
+ logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
2316
+ try:
2317
+ verified_data, errors, url = pdf_checker.verify_reference(reference)
2318
+ if verified_data:
2319
+ logger.debug(f"PDF verification successful for: {reference.get('title', 'Untitled')}")
2320
+ return verified_data, errors, url
2321
+ else:
2322
+ logger.debug(f"PDF verification failed, falling back to web page verification")
2323
+ except Exception as e:
2324
+ logger.error(f"Error in PDF verification: {e}")
2325
+ logger.debug(f"PDF verification error, falling back to web page verification")
2326
+
2327
+ # Fall back to web page checker
2328
+ from checkers.pdf_paper_checker import PDFPaperChecker
2329
+ pdf_checker = PDFPaperChecker()
2330
+
2331
+ if pdf_checker.can_check_reference(reference):
2332
+ logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
2333
+ try:
2334
+ verified_data, errors, url = pdf_checker.verify_reference(reference)
2335
+ if verified_data:
2336
+ logger.debug(f"PDF verification successful for: {reference.get('title', 'Untitled')}")
2337
+ return verified_data, errors, url
2338
+ else:
2339
+ logger.debug(f"PDF verification failed, falling back to web page verification")
2340
+ except Exception as e:
2341
+ logger.error(f"Error in PDF verification: {e}")
2342
+ logger.debug(f"PDF verification error, falling back to web page verification")
2343
+
2344
+ # Fall back to web page checker
2345
+ from checkers.webpage_checker import WebPageChecker
2346
+ webpage_checker = WebPageChecker()
2347
+
2348
+ try:
2349
+ verified_data, errors, url = webpage_checker.verify_raw_url_for_unverified_reference(reference)
2350
+ logger.debug(f"Raw URL verification result: verified_data={verified_data is not None}, errors={len(errors)}, url={url}")
2351
+ return verified_data, errors, url
2352
+ except Exception as e:
2353
+ logger.error(f"Error checking raw URL: {e}")
2354
+ return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], web_url
2355
+
2256
2356
  def verify_reference_standard(self, source_paper, reference):
2257
2357
  """
2258
2358
  Verify if a reference is accurate using GitHub, Semantic Scholar, or other checkers
@@ -2274,11 +2374,6 @@ class ArxivReferenceChecker:
2274
2374
  if github_result:
2275
2375
  return github_result
2276
2376
 
2277
- # Next, check if this is a web page reference
2278
- webpage_result = self.verify_webpage_reference(reference)
2279
- if webpage_result:
2280
- return webpage_result
2281
-
2282
2377
  # Use the Semantic Scholar client to verify the reference
2283
2378
  verified_data, errors, paper_url = self.non_arxiv_checker.verify_reference(reference)
2284
2379
 
@@ -5515,6 +5610,14 @@ class ArxivReferenceChecker:
5515
5610
  """Categorize the unverified error into checker error or not found"""
5516
5611
  error_details_lower = error_details.lower()
5517
5612
 
5613
+ # New specific URL-based unverified reasons
5614
+ if error_details_lower == "non-existent web page":
5615
+ return "Non-existent web page"
5616
+ elif error_details_lower == "paper not found and url doesn't reference it":
5617
+ return "Paper not found and URL doesn't reference it"
5618
+ elif error_details_lower == "paper not verified but url references paper":
5619
+ return "Paper not verified but URL references paper"
5620
+
5518
5621
  # Checker/API errors
5519
5622
  api_error_patterns = [
5520
5623
  'api error', 'rate limit', 'http error', 'network error',