academic-refchecker 1.2.51__py3-none-any.whl → 1.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/METADATA +10 -1
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/RECORD +10 -9
- checkers/pdf_paper_checker.py +493 -0
- checkers/webpage_checker.py +424 -1
- core/refchecker.py +94 -8
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED

(version string bumped from 1.2.51 to 1.2.52)

{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.51
+Version: 1.2.52
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -65,6 +65,14 @@ Dynamic: license-file

 A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.

+## 🎥 Project Deep Dive
+
+Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
+
+**[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
+
+*This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
+
 ## 📊 Sample Output

 ```
@@ -117,6 +125,7 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu

 ## 📋 Table of Contents

+- [🎥 Project Deep Dive](#-project-deep-dive)
 - [📊 Sample Output](#-sample-output)
 - [🎯 Features](#-features)
 - [🚀 Quick Start](#-quick-start)
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
-__version__.py,sha256=
-academic_refchecker-1.2.
+__version__.py,sha256=3kX5OAukU7mOMMYEni5E3TW6cnip3XwxplWJP4qANhU,65
+academic_refchecker-1.2.52.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
 checkers/enhanced_hybrid_checker.py,sha256=rbXkzpNkd0bn4e2OooX-CcdGTwwYpgmVaFvX_xCAFsA,27777
@@ -7,15 +7,16 @@ checkers/github_checker.py,sha256=BXJaBC3AloKze04j8EcQz0a79EhtVoi9_871ilV7t60,14
 checkers/local_semantic_scholar.py,sha256=D8py8-yMCgN1lvhXCiMUOEA4wBkH7AQvrkM4-3LCDsU,21015
 checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
 checkers/openreview_checker.py,sha256=3ckn6U7TN5nQBjqPacr8W8mm2uMo6aWWB6gsxTDNCPk,40452
+checkers/pdf_paper_checker.py,sha256=L5HRHd3xpo0xDltZGTAA-Wk_arIS9bQV8ITeuxW0bNc,19893
 checkers/semantic_scholar.py,sha256=wk6e8DkYJM_O2nWsi-6EfJT53PzfL8KCmX1rS562KKc,34962
-checkers/webpage_checker.py,sha256=
+checkers/webpage_checker.py,sha256=REOotx7Qka86_xbOIMeYj5YVb9D1RVMb4Ye311-28cA,43620
 config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
 config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
 config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
 core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
 core/parallel_processor.py,sha256=cq_WfzXrF2EI6IKOtJd6_QcwvM1xT3J6a13teg-wSbM,17638
-core/refchecker.py,sha256=
+core/refchecker.py,sha256=rJ-CbCqN3dxzxCLr4DERq5UxWtVbErwCMyS3YUxdtuo,285500
 database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -39,8 +40,8 @@ utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
 utils/text_utils.py,sha256=T3PiiG9-BMPTbdCftG2zypyIeZJl6snuMCKQ0nEOQv0,217834
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
+academic_refchecker-1.2.52.dist-info/METADATA,sha256=PKCXz09omWTvIVLZGCgP3kt9yO_V-FjXDu-HHfedqUU,23256
+academic_refchecker-1.2.52.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.52.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.52.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.52.dist-info/RECORD,,
checkers/pdf_paper_checker.py
ADDED

@@ -0,0 +1,493 @@
#!/usr/bin/env python3
"""
PDF Paper Checker - Validates citations by extracting and analyzing PDF content
"""

import re
import io
import logging
from typing import Dict, List, Any, Optional, Tuple
from urllib.parse import urlparse

import requests
import pdfplumber
from pypdf import PdfReader
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup

from utils.text_utils import normalize_text, calculate_title_similarity

logger = logging.getLogger(__name__)


class PDFPaperChecker:
    """
    Checker that downloads and analyzes PDF documents to validate citations
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def can_check_reference(self, reference: Dict[str, Any]) -> bool:
        """
        Check if this reference can be validated by PDF analysis

        Args:
            reference: Reference dictionary containing url and other metadata

        Returns:
            True if reference has URL that likely points to a PDF
        """
        url = reference.get('url', '').strip()
        if not url:
            return False

        # Check if URL ends with .pdf
        if url.lower().endswith('.pdf'):
            return True

        # Check if URL path suggests PDF content
        pdf_indicators = ['/pdf/', '/document/', '/download/', '/file/', '/resource/']
        if any(indicator in url.lower() for indicator in pdf_indicators):
            return True

        # Check if URL is from domains that commonly serve PDFs directly
        domain = urlparse(url).netloc.lower()
        pdf_domains = [
            '.gov', '.edu', '.org',  # Common institutional domains
            'researchgate.net', 'academia.edu', 'arxiv.org',  # Academic platforms
            'oecd.org', 'who.int', 'unesco.org',  # International organizations
            'aecea.ca'  # Specific domain from the user's example
        ]

        if any(domain.endswith(pdf_domain) or pdf_domain in domain for pdf_domain in pdf_domains):
            return True

        return False

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a reference by downloading and analyzing PDF content

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, url) where:
            - verified_data: Dict with verified data if PDF validates citation, None otherwise
            - errors: List of error dictionaries
            - url: The URL that was checked
        """
        logger.debug(f"Verifying PDF reference: {reference.get('title', 'Untitled')}")

        url = reference.get('url', '').strip()
        if not url:
            return None, [{"error_type": "unverified", "error_details": "no URL provided"}], None

        try:
            # First try to download directly as PDF
            pdf_content = self._download_pdf(url)

            # If direct download fails, try to find PDF links in the page
            if not pdf_content:
                pdf_url = self._find_pdf_url_in_page(url)
                if pdf_url:
                    logger.debug(f"Found PDF link in page: {pdf_url}")
                    pdf_content = self._download_pdf(pdf_url)
                    url = pdf_url  # Update URL to the actual PDF URL

            if not pdf_content:
                return None, [{"error_type": "unverified", "error_details": "could not download PDF content"}], url

            # Extract text and metadata from PDF
            pdf_data = self._extract_pdf_data(pdf_content)
            if not pdf_data:
                return None, [{"error_type": "unverified", "error_details": "could not extract PDF content"}], url

            # Validate citation against PDF content
            is_valid, errors = self._validate_citation(reference, pdf_data)

            if is_valid:
                # Create verified data preserving original venue if provided
                venue = reference.get('journal') or reference.get('venue') or reference.get('booktitle') or 'PDF Document'

                verified_data = {
                    'title': reference.get('title', ''),
                    'authors': reference.get('authors', []),
                    'year': reference.get('year'),
                    'venue': venue,
                    'url': url,
                    'pdf_metadata': {
                        'extracted_title': pdf_data.get('title'),
                        'extracted_authors': pdf_data.get('authors'),
                        'extracted_text_preview': pdf_data.get('text', '')[:200] + '...' if pdf_data.get('text') else '',
                        'pdf_pages': pdf_data.get('page_count'),
                        'extraction_method': pdf_data.get('extraction_method')
                    }
                }
                logger.debug(f"PDF reference verified: {url}")
                return verified_data, errors, url
            else:
                return None, errors, url

        except Exception as e:
            logger.error(f"Error verifying PDF reference {url}: {e}")
            return None, [{"error_type": "unverified", "error_details": "PDF processing error"}], url

    def _download_pdf(self, url: str, timeout: int = 30) -> Optional[bytes]:
        """
        Download PDF content from URL

        Args:
            url: URL to download from
            timeout: Request timeout in seconds

        Returns:
            PDF content as bytes, or None if download failed
        """
        try:
            logger.debug(f"Downloading PDF from: {url}")

            response = self.session.get(url, timeout=timeout, stream=True)
            response.raise_for_status()

            # Check if content is actually a PDF
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
                # Sometimes PDFs are served with generic content types, so we'll try anyway
                logger.debug(f"Content-Type '{content_type}' doesn't indicate PDF, but proceeding anyway")

            # Download content
            content = response.content

            # Basic PDF validation - check for PDF header
            if content.startswith(b'%PDF-'):
                logger.debug(f"Successfully downloaded PDF ({len(content)} bytes)")
                return content
            else:
                logger.debug("Downloaded content doesn't appear to be a valid PDF")
                return None

        except Exception as e:
            logger.error(f"Failed to download PDF from {url}: {e}")
            return None

    def _find_pdf_url_in_page(self, url: str) -> Optional[str]:
        """
        Look for PDF download links in a web page

        Args:
            url: URL of the web page to search

        Returns:
            URL of PDF document if found, None otherwise
        """
        try:
            logger.debug(f"Searching for PDF links in page: {url}")

            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Check if the response itself is a PDF (after redirects)
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' in content_type or response.content.startswith(b'%PDF-'):
                logger.debug("Page redirected directly to PDF")
                return response.url

            # Parse HTML to look for PDF links
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for links that might be PDFs
            pdf_links = []

            # Find all links
            for link in soup.find_all('a', href=True):
                href = link.get('href')
                link_text = link.get_text().lower().strip()

                # Check if link ends with .pdf
                if href and href.lower().endswith('.pdf'):
                    pdf_links.append(href)
                    continue

                # Check if link text suggests PDF
                if any(indicator in link_text for indicator in ['pdf', 'download', 'document', 'report', 'policy']):
                    pdf_links.append(href)
                    continue

                # Check if link has PDF-related attributes
                if link.get('type', '').lower() == 'application/pdf':
                    pdf_links.append(href)
                    continue

            # Look for PDF links in other elements
            for element in soup.find_all(attrs={'href': True}):
                href = element.get('href')
                if href and href.lower().endswith('.pdf'):
                    pdf_links.append(href)

            # Convert relative URLs to absolute
            from urllib.parse import urljoin
            absolute_pdf_links = []
            for link in pdf_links:
                if link:
                    absolute_url = urljoin(url, link)
                    absolute_pdf_links.append(absolute_url)

            # Remove duplicates
            absolute_pdf_links = list(set(absolute_pdf_links))

            if absolute_pdf_links:
                logger.debug(f"Found {len(absolute_pdf_links)} potential PDF links")
                # Return the first PDF link found
                return absolute_pdf_links[0]

            logger.debug("No PDF links found in page")
            return None

        except Exception as e:
            logger.error(f"Error searching for PDF links in {url}: {e}")
            return None

    def _extract_pdf_data(self, pdf_content: bytes) -> Optional[Dict[str, Any]]:
        """
        Extract text and metadata from PDF content

        Args:
            pdf_content: PDF file content as bytes

        Returns:
            Dictionary with extracted data including text, title, authors, etc.
        """
        pdf_data = {
            'text': '',
            'title': '',
            'authors': [],
            'page_count': 0,
            'extraction_method': 'none'
        }

        # Try multiple extraction methods
        try:
            # Method 1: Try pdfplumber (usually better for text extraction)
            pdf_data = self._extract_with_pdfplumber(pdf_content, pdf_data)
            if pdf_data['text']:
                pdf_data['extraction_method'] = 'pdfplumber'
                return pdf_data
        except Exception as e:
            logger.debug(f"pdfplumber extraction failed: {e}")

        try:
            # Method 2: Try pypdf (fallback)
            pdf_data = self._extract_with_pypdf(pdf_content, pdf_data)
            if pdf_data['text']:
                pdf_data['extraction_method'] = 'pypdf'
                return pdf_data
        except Exception as e:
            logger.debug(f"pypdf extraction failed: {e}")

        logger.debug("All PDF extraction methods failed")
        return None

    def _extract_with_pdfplumber(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract PDF data using pdfplumber"""
        with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
            pdf_data['page_count'] = len(pdf.pages)

            # Extract text from first few pages (usually contains title/author info)
            text_parts = []
            for i, page in enumerate(pdf.pages[:5]):  # First 5 pages should be enough
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)

            pdf_data['text'] = '\n'.join(text_parts)

            # Try to extract title and author from first page
            if pdf.pages:
                first_page_text = pdf.pages[0].extract_text() or ''
                pdf_data['title'], pdf_data['authors'] = self._parse_title_and_authors(first_page_text)

        return pdf_data

    def _extract_with_pypdf(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract PDF data using pypdf"""
        reader = PdfReader(io.BytesIO(pdf_content))
        pdf_data['page_count'] = len(reader.pages)

        # Extract metadata
        if reader.metadata:
            if '/Title' in reader.metadata:
                pdf_data['title'] = str(reader.metadata['/Title'])
            if '/Author' in reader.metadata:
                pdf_data['authors'] = [str(reader.metadata['/Author'])]

        # Extract text from first few pages
        text_parts = []
        for i, page in enumerate(reader.pages[:5]):  # First 5 pages
            try:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
            except Exception as e:
                logger.debug(f"Failed to extract text from page {i}: {e}")
                continue

        pdf_data['text'] = '\n'.join(text_parts)

        # If no metadata title/author, try to parse from text
        if not pdf_data['title'] and text_parts:
            title, authors = self._parse_title_and_authors(text_parts[0])
            if title and not pdf_data['title']:
                pdf_data['title'] = title
            if authors and not pdf_data['authors']:
                pdf_data['authors'] = authors

        return pdf_data

    def _parse_title_and_authors(self, text: str) -> Tuple[str, List[str]]:
        """
        Parse title and authors from PDF text

        Args:
            text: Text from first page of PDF

        Returns:
            Tuple of (title, authors_list)
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        if not lines:
            return '', []

        # The title is often the first meaningful line (after removing headers/footers)
        title = ''
        authors = []

        # Look for the title - usually first non-header line
        for i, line in enumerate(lines):
            # Skip obvious header/footer content
            if len(line) < 10 or any(skip in line.lower() for skip in ['page', 'doi:', 'http', 'www.', '@']):
                continue

            # Title is usually longer and on its own line
            if len(line) > 20 and not any(sep in line for sep in [',', ';']) and not line.endswith('.'):
                title = line

                # Authors often follow the title - look for patterns
                for j in range(i + 1, min(i + 5, len(lines))):
                    author_line = lines[j]

                    # Author lines often contain commas, "and", or institutional affiliations
                    if any(indicator in author_line.lower() for indicator in [',', ' and ', 'university', 'college', 'institute']):
                        # Clean up author line
                        author_text = re.sub(r'[0-9*†‡§¶#]', '', author_line)  # Remove superscript markers
                        if ',' in author_text:
                            authors.extend([name.strip() for name in author_text.split(',') if name.strip()])
                        else:
                            authors.append(author_text.strip())
                        break
                break

        return title, authors

    def _validate_citation(self, reference: Dict[str, Any], pdf_data: Dict[str, Any]) -> Tuple[bool, List[Dict[str, Any]]]:
        """
        Validate citation against extracted PDF data

        Args:
            reference: The citation being checked
            pdf_data: Extracted data from PDF

        Returns:
            Tuple of (is_valid, errors_list)
        """
        errors = []

        # Check title match
        cited_title = reference.get('title', '').strip()
        extracted_title = pdf_data.get('title', '').strip()
        pdf_text = pdf_data.get('text', '').lower()

        title_match = False

        if cited_title and extracted_title:
            # Compare titles directly
            similarity = calculate_title_similarity(cited_title, extracted_title)
            if similarity > 0.8:  # 80% similarity threshold
                title_match = True

        if not title_match and cited_title and pdf_text:
            # Check if cited title appears in PDF text
            cited_title_normalized = normalize_text(cited_title)
            if cited_title_normalized.lower() in pdf_text:
                title_match = True

        if not title_match:
            errors.append({
                "error_type": "unverified",
                "error_details": "title not found in PDF content"
            })

        # Check author match (more lenient since PDF author extraction is difficult)
        cited_authors = reference.get('authors', [])
        extracted_authors = pdf_data.get('authors', [])

        author_match = False

        if cited_authors and extracted_authors:
            # Check if any cited author appears in extracted authors
            for cited_author in cited_authors:
                for extracted_author in extracted_authors:
                    if self._authors_match(cited_author, extracted_author):
                        author_match = True
                        break
                if author_match:
                    break

        if not author_match and cited_authors and pdf_text:
            # Check if any cited author appears in PDF text
            for cited_author in cited_authors:
                author_normalized = normalize_text(cited_author)
                if author_normalized.lower() in pdf_text:
                    author_match = True
                    break

        # For PDF validation, we're more lenient with author matching since extraction is unreliable
        if not author_match and cited_authors:
            errors.append({
                "warning_type": "author",
                "warning_details": "authors not clearly identified in PDF content"
            })

        # A reference is valid if we found the title (author matching is optional due to extraction difficulties)
        is_valid = title_match

        return is_valid, errors

    def _authors_match(self, author1: str, author2: str) -> bool:
        """Check if two author names likely refer to the same person"""
        author1_norm = normalize_text(author1).lower()
        author2_norm = normalize_text(author2).lower()

        # Exact match
        if author1_norm == author2_norm:
            return True

        # Check similarity
        similarity = fuzz.ratio(author1_norm, author2_norm)
        if similarity > 85:  # 85% similarity threshold
            return True

        # Check if one name is contained in the other (handles "J. Smith" vs "John Smith")
        words1 = set(author1_norm.split())
        words2 = set(author2_norm.split())

        if words1.intersection(words2):
            return True

        return False
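For readers who want to try the new checker in isolation, here is a minimal, hypothetical usage sketch. It assumes the 1.2.52 wheel is installed so that the `checkers` and `utils` top-level packages from the RECORD above are importable, and that the PDF/HTML dependencies the module imports (pdfplumber, pypdf, fuzzywuzzy, beautifulsoup4, requests) are available; the reference fields and the URL are made-up placeholders, not data from the package.

```python
# Hypothetical exercise of the new PDFPaperChecker (sketch, not shipped code).
from checkers.pdf_paper_checker import PDFPaperChecker

reference = {
    'title': 'Example Policy Report on Early Learning',   # placeholder citation
    'authors': ['Jane Doe', 'John Smith'],                 # placeholder authors
    'year': 2023,
    'url': 'https://example.org/files/report.pdf',         # hypothetical PDF URL
}

checker = PDFPaperChecker()
if checker.can_check_reference(reference):
    verified_data, errors, checked_url = checker.verify_reference(reference)
    if verified_data:
        print(f"Verified via PDF at {checked_url}, venue: {verified_data['venue']}")
    else:
        # errors carries "unverified" entries and/or author warnings
        print(f"Not verified: {errors}")
```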
checkers/webpage_checker.py
CHANGED
@@ -512,4 +512,427 @@ class WebPageChecker:
                 "warning_details": "Access blocked by site but domain is trusted (likely bot protection)"
             })

-        return verified_data, errors, web_url
+        return verified_data, errors, web_url
+
+    def check_unverified_url_reference(self, reference: Dict[str, Any]) -> str:
+        """
+        Check a URL from an unverified reference to determine the specific unverified reason
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            String with the specific unverified reason:
+            - "non-existent web page" if the page doesn't exist
+            - "paper not found and URL doesn't reference it" if page exists but doesn't contain title
+            - "paper not verified but URL references paper" if page exists and contains title
+        """
+        logger.debug(f"Checking unverified URL reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return "paper not found and URL doesn't reference it"  # No URL to check
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return "non-existent web page"
+
+        if response.status_code == 404:
+            return "non-existent web page"
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            return "paper not verified but URL references paper"
+        elif response.status_code != 200:
+            return "non-existent web page"
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, we can't search content, so assume it's referenced if accessible
+                return "paper not verified but URL references paper"
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return "paper not found and URL doesn't reference it"
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                return "paper not verified but URL references paper"
+
+            # Search for key words from the title
+            cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                              if len(word.strip('.,;:()[]{}')) > 3)
+
+            # Check if significant portion of title words appear in page
+            page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                             if len(word.strip('.,;:()[]{}')) > 3)
+
+            common_words = cited_words.intersection(page_words)
+
+            # If most of the title words are found, consider it referenced
+            if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                return "paper not verified but URL references paper"
+
+            # Also check the extracted title and description specifically
+            if page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    return "paper not verified but URL references paper"
+
+            # Title not found in page content
+            return "paper not found and URL doesn't reference it"
+
+        except Exception as e:
+            logger.error(f"Error checking unverified URL {web_url}: {e}")
+            return "paper not found and URL doesn't reference it"
+
+    def verify_raw_url_for_unverified_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries with specific unverified reasons
+            - url: The URL that was checked
+        """
+        logger.debug(f"Verifying raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], None
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        if response.status_code == 404:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            # If no venue, treat as verified since URL is accessible
+            if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                verified_data = {
+                    'title': reference.get('title', ''),
+                    'authors': reference.get('authors', []),
+                    'year': reference.get('year'),
+                    'venue': 'Web Page',
+                    'url': web_url,
+                    'web_metadata': {
+                        'status_code': 403,
+                        'access_blocked': True
+                    }
+                }
+                return verified_data, [], web_url
+            else:
+                return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+        elif response.status_code != 200:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, if no venue specified, treat as verified
+                if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': 'PDF Document',
+                        'url': web_url,
+                        'web_metadata': {
+                            'content_type': response.headers.get('content-type', ''),
+                            'status_code': response.status_code
+                        }
+                    }
+                    return verified_data, [], web_url
+                else:
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+            title_found = False
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                title_found = True
+
+            # Search for key words from the title
+            if not title_found:
+                cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                                  if len(word.strip('.,;:()[]{}')) > 3)
+
+                # Check if significant portion of title words appear in page
+                page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                                 if len(word.strip('.,;:()[]{}')) > 3)
+
+                common_words = cited_words.intersection(page_words)
+
+                # If most of the title words are found, consider it referenced
+                if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                    title_found = True
+
+            # Also check the extracted title and description specifically
+            if not title_found and page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    title_found = True
+
+            # Determine if this should be verified or unverified
+            if title_found:
+                # Check if reference should be verified based on venue type
+                venue_field = reference.get('journal') or reference.get('venue') or reference.get('booktitle')
+
+                if not venue_field:
+                    # No venue specified - verify with URL as venue
+                    site_info = self._extract_site_info(soup, web_url)
+                    venue = site_info.get('organization', 'Web Page') if site_info.get('organization') != site_info.get('domain') else 'Web Page'
+
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue,
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': site_info,
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid source for reference without venue: {web_url}")
+                    return verified_data, [], web_url
+                elif self._is_web_content_venue(venue_field, web_url):
+                    # Has venue but it's a web content venue (news, blog, etc.) - verify it
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue_field,  # Keep the original venue
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': self._extract_site_info(soup, web_url),
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid web content source: {web_url}")
+                    return verified_data, [], web_url
+                else:
+                    # Has academic venue but URL references paper - still unverified (needs proper paper verification)
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+            else:
+                # Title not found in page content
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+        except Exception as e:
+            logger.error(f"Error checking raw URL {web_url}: {e}")
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+    def _is_web_content_venue(self, venue: str, url: str) -> bool:
+        """
+        Determine if a venue represents web content rather than academic publication
+
+        Args:
+            venue: The venue string (journal, venue, or booktitle)
+            url: The URL being checked (for additional context)
+
+        Returns:
+            True if this represents web content that can be verified via URL
+        """
+        if not venue:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # News organizations and media outlets
+        news_indicators = [
+            'news', 'cbc', 'bbc', 'cnn', 'reuters', 'associated press', 'ap news',
+            'npr', 'pbs', 'abc news', 'nbc news', 'fox news', 'guardian', 'times',
+            'post', 'herald', 'tribune', 'gazette', 'chronicle', 'observer',
+            'magazine', 'weekly', 'daily', 'today', 'report', 'wire', 'press'
+        ]
+
+        # Special case for Wall Street Journal
+        if any(word in venue_lower for word in ['wall street', 'wsj']):
+            news_indicators.append('journal')
+
+        # Technology and industry publications
+        tech_publications = [
+            'techcrunch', 'wired', 'ars technica', 'the verge', 'engadget',
+            'zdnet', 'cnet', 'computerworld', 'infoworld', 'pcmag', 'pcworld',
+            'ieee spectrum', 'mit technology review', 'scientific american'
+        ]
+
+        # Blogs and web platforms
+        blog_platforms = [
+            'blog', 'medium', 'substack', 'wordpress', 'blogspot', 'tumblr',
+            'linkedin', 'facebook', 'twitter', 'reddit', 'stack overflow',
+            'github pages', 'personal website', 'company blog'
+        ]
+
+        # Government and organizational websites
+        org_indicators = [
+            'government', 'gov', '.org', 'agency', 'department', 'ministry',
+            'commission', 'bureau', 'office', 'administration', 'institute',
+            'foundation', 'association', 'society', 'center', 'centre',
+            'council', 'committee', 'board', 'union', 'federation', 'alliance',
+            'coalition', 'consortium', 'network', 'group', 'organization',
+            'organisation', 'corp', 'corporation', 'company', 'ltd', 'inc'
+        ]
+
+        # Documentation and technical resources
+        tech_resources = [
+            'documentation', 'docs', 'api', 'reference', 'guide', 'tutorial',
+            'manual', 'readme', 'wiki', 'help', 'support', 'developer',
+            'technical', 'white paper', 'whitepaper', 'brief', 'overview',
+            'policy', 'strategy', 'report', 'study', 'analysis', 'research'
+        ]
+
+        # Check URL domain for additional context
+        url_lower = url.lower() if url else ''
+
+        # Known web content domains in URL
+        web_domains = [
+            'cbc.ca', 'bbc.com', 'cnn.com', 'reuters.com', 'npr.org', 'pbs.org',
+            'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'wsj.com',
+            'techcrunch.com', 'wired.com', 'theverge.com', 'arstechnica.com',
+            'medium.com', 'substack.com', 'linkedin.com', 'github.io',
+            'readthedocs.io', 'stackoverflow.com', 'reddit.com'
+        ]
+
+        # Combine all indicators
+        all_indicators = news_indicators + tech_publications + blog_platforms + org_indicators + tech_resources
+
+        # Academic venue indicators that should NOT be considered web content
+        academic_indicators = [
+            'proceedings', 'conference', 'symposium', 'workshop', 'transactions',
+            'journal of', 'international journal', 'acm', 'ieee', 'springer',
+            'nature', 'science', 'cell', 'lancet', 'plos', 'arxiv', 'pubmed',
+            'artificial intelligence', 'machine learning', 'computer vision',
+            'neural', 'computing', 'robotics', 'bioinformatics'
+        ]
+
+        # Check if venue is clearly academic (should not be treated as web content)
+        is_academic = any(indicator in venue_lower for indicator in academic_indicators)
+        if is_academic:
+            return False
+
+        # Check if venue matches any web content indicators
+        venue_matches = any(indicator and indicator in venue_lower for indicator in all_indicators)
+
+        # Check if URL domain suggests web content
+        url_matches = any(domain in url_lower for domain in web_domains)
+
+        # Special case: if URL contains news/blog/docs indicators, lean towards web content
+        url_content_indicators = ['news', 'blog', 'post', 'article', 'docs', 'help', 'guide', 'resources', 'policy', 'strategy']
+        url_has_content_indicators = any(indicator in url_lower for indicator in url_content_indicators)
+
+        # Special case: Check if venue is an organizational acronym/name that matches the URL domain
+        # This handles cases like "AECEA" on aecea.ca domain
+        organizational_match = self._check_organizational_venue_match(venue, url_lower)
+
+        return venue_matches or url_matches or url_has_content_indicators or organizational_match
+
+    def _check_organizational_venue_match(self, venue: str, url_lower: str) -> bool:
+        """
+        Check if the venue represents an organization that matches the URL domain
+
+        Args:
+            venue: The venue string
+            url_lower: The lowercased URL
+
+        Returns:
+            True if venue appears to be the organization publishing on their own domain
+        """
+        if not venue or not url_lower:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # Extract domain from URL
+        from urllib.parse import urlparse
+        try:
+            parsed_url = urlparse(url_lower)
+            domain = parsed_url.netloc.lower()
+
+            # Remove common prefixes
+            domain = domain.replace('www.', '')
+
+            # Check if venue is likely an acronym (short, all caps or mixed case)
+            is_likely_acronym = (len(venue) <= 10 and
+                                 (venue.isupper() or
+                                  any(c.isupper() for c in venue) and len(venue.split()) == 1))
+
+            # Check if venue appears in domain
+            venue_clean = ''.join(c for c in venue_lower if c.isalnum())
+
+            if venue_clean and venue_clean in domain:
+                return True
+
+            # For acronyms, check if the acronym could match the domain
+            if is_likely_acronym:
+                # Split venue into words and check if initials match domain
+                venue_words = venue_lower.replace('.', ' ').split()
+                if len(venue_words) == 1 and len(venue_words[0]) <= 6:
+                    # Single word acronym - check if it's in the domain
+                    if venue_words[0] in domain:
+                        return True
+
+            # Check for educational/professional associations with .ca, .org, .edu domains
+            if any(domain.endswith(tld) for tld in ['.ca', '.org', '.edu', '.gov']):
+                # These domains often host organizational content
+                if any(org_word in venue_lower for org_word in [
+                    'association', 'society', 'institute', 'foundation', 'center',
+                    'centre', 'council', 'committee', 'board', 'agency', 'department'
+                ]):
+                    return True
+
+                # Check if venue is a short organizational name/acronym
+                if is_likely_acronym:
+                    return True
+
+            return False
+
+        except Exception:
+            return False
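A small sketch of how the two new WebPageChecker entry points might be driven directly; the reference values and URL are invented for illustration, and the bare `WebPageChecker()` construction simply mirrors how `core/refchecker.py` instantiates it in the hunks below.

```python
# Sketch: probing the raw-URL helpers added to WebPageChecker in 1.2.52.
from checkers.webpage_checker import WebPageChecker

reference = {
    'title': 'Funding Update for Early Childhood Programs',   # placeholder title
    'authors': ['AECEA'],                                      # placeholder author
    'year': 2022,
    'venue': 'AECEA',                        # organizational venue on its own domain
    'url': 'https://aecea.ca/resources/funding-update',        # hypothetical URL
}

checker = WebPageChecker()

# 1) Classify why an unverified reference stayed unverified.
reason = checker.check_unverified_url_reference(reference)
print(reason)  # one of the three reason strings documented above

# 2) Try to promote the reference: returns verified_data when the page exists,
#    mentions the title, and the venue looks like web content.
verified_data, errors, url = checker.verify_raw_url_for_unverified_reference(reference)
print(verified_data is not None, errors)
```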
core/refchecker.py
CHANGED
@@ -2021,8 +2021,20 @@ class ArxivReferenceChecker:
             logger.debug(f"Database mode: Initial paper_url from database checker: {paper_url}")

             if not verified_data:
-                # Mark as unverified but
-
+                # Mark as unverified but check URL for more specific reason or verification
+                if reference.get('url', '').strip():
+                    # Use raw URL verifier to check if it can be verified or get specific reason
+                    url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                    if url_verified_data:
+                        # URL verification succeeded - return as verified
+                        logger.debug(f"Database mode: URL verification succeeded for unverified reference")
+                        return None, url_checked, url_verified_data
+                    else:
+                        # URL verification failed - use specific error reason
+                        url_error_details = url_errors[0].get('error_details', 'Reference could not be verified in database') if url_errors else 'Reference could not be verified in database'
+                        return [{"error_type": "unverified", "error_details": url_error_details}], paper_url, None
+                else:
+                    return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None

             # Convert database errors to our format
             formatted_errors = []
@@ -2118,7 +2130,29 @@ class ArxivReferenceChecker:
             return [{"error_type": "unverified", "error_details": "Database connection not available"}], None, None

         # For non-database mode, use the standard reference verification
-
+        errors, paper_url, verified_data = self.verify_reference_standard(source_paper, reference)
+
+        # If standard verification failed and the reference has a URL, try raw URL verification
+        if errors and verified_data is None:
+            # Check if there's an unverified error
+            unverified_errors = [e for e in errors if e.get('error_type') == 'unverified']
+            if unverified_errors and reference.get('url', '').strip():
+                # Use raw URL verifier to check if it can be verified or get specific reason
+                url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                if url_verified_data:
+                    # URL verification succeeded - return as verified
+                    logger.debug(f"Non-database mode: URL verification succeeded for unverified reference")
+                    return None, url_checked, url_verified_data
+                else:
+                    # URL verification failed - use specific error reason
+                    url_error_details = url_errors[0].get('error_details', 'Reference could not be verified') if url_errors else 'Reference could not be verified'
+                    # Update the unverified error with the specific reason
+                    for error in errors:
+                        if error.get('error_type') == 'unverified':
+                            error['error_details'] = url_error_details
+                            break
+
+        return errors, paper_url, verified_data


     def verify_github_reference(self, reference):
@@ -2253,6 +2287,55 @@ class ArxivReferenceChecker:
             formatted_errors.append(formatted_error)
         return formatted_errors if formatted_errors else [{"error_type": "unverified", "error_details": "Web page could not be verified"}], page_url, None

+    def verify_raw_url_reference(self, reference):
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: The reference to verify (already determined to be unverified by paper validators)
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries
+            - url: The URL that was checked
+        """
+        logger.debug(f"Checking raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None
+
+        # First try PDF paper checker if URL appears to be a PDF
+        from checkers.pdf_paper_checker import PDFPaperChecker
+        pdf_checker = PDFPaperChecker()
+
+        if pdf_checker.can_check_reference(reference):
+            logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
+            try:
+                verified_data, errors, url = pdf_checker.verify_reference(reference)
+                if verified_data:
+                    logger.debug(f"PDF verification successful for: {reference.get('title', 'Untitled')}")
+                    return verified_data, errors, url
+                else:
+                    logger.debug(f"PDF verification failed, falling back to web page verification")
+            except Exception as e:
+                logger.error(f"Error in PDF verification: {e}")
+                logger.debug(f"PDF verification error, falling back to web page verification")
+
+        # Fall back to web page checker
+        from checkers.webpage_checker import WebPageChecker
+        webpage_checker = WebPageChecker()
+
+        try:
+            verified_data, errors, url = webpage_checker.verify_raw_url_for_unverified_reference(reference)
+            logger.debug(f"Raw URL verification result: verified_data={verified_data is not None}, errors={len(errors)}, url={url}")
+            return verified_data, errors, url
+        except Exception as e:
+            logger.error(f"Error checking raw URL: {e}")
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], web_url
+
     def verify_reference_standard(self, source_paper, reference):
         """
         Verify if a reference is accurate using GitHub, Semantic Scholar, or other checkers
@@ -2274,11 +2357,6 @@ class ArxivReferenceChecker:
         if github_result:
             return github_result

-        # Next, check if this is a web page reference
-        webpage_result = self.verify_webpage_reference(reference)
-        if webpage_result:
-            return webpage_result
-
         # Use the Semantic Scholar client to verify the reference
         verified_data, errors, paper_url = self.non_arxiv_checker.verify_reference(reference)

@@ -5515,6 +5593,14 @@ class ArxivReferenceChecker:
         """Categorize the unverified error into checker error or not found"""
         error_details_lower = error_details.lower()

+        # New specific URL-based unverified reasons
+        if error_details_lower == "non-existent web page":
+            return "Non-existent web page"
+        elif error_details_lower == "paper not found and url doesn't reference it":
+            return "Paper not found and URL doesn't reference it"
+        elif error_details_lower == "paper not verified but url references paper":
+            return "Paper not verified but URL references paper"
+
         # Checker/API errors
         api_error_patterns = [
             'api error', 'rate limit', 'http error', 'network error',
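Read together, the refchecker.py hunks wire the two new checkers into a single URL fallback for references that the database or standard checkers could not confirm. The outline below is an illustrative condensation of that flow, not the shipped code; it reuses only names that appear in the diff (verify_reference_standard, verify_raw_url_reference), while `verify_with_url_fallback` and the `checker` argument are hypothetical names introduced for this sketch.

```python
# Illustrative condensation of the new fallback path (not the shipped code).
# `checker` is assumed to be an ArxivReferenceChecker instance exposing the
# two methods shown in the hunks above.
def verify_with_url_fallback(checker, source_paper, reference):
    errors, paper_url, verified_data = checker.verify_reference_standard(source_paper, reference)

    still_unverified = verified_data is None and errors and any(
        e.get('error_type') == 'unverified' for e in errors)

    if still_unverified and reference.get('url', '').strip():
        # verify_raw_url_reference tries PDFPaperChecker first, then falls back
        # to WebPageChecker.verify_raw_url_for_unverified_reference.
        url_verified, url_errors, url_checked = checker.verify_raw_url_reference(reference)
        if url_verified:
            return None, url_checked, url_verified
        if url_errors:
            # Surface the more specific reason, e.g. "non-existent web page".
            for e in errors:
                if e.get('error_type') == 'unverified':
                    e['error_details'] = url_errors[0].get('error_details', e['error_details'])
                    break

    return errors, paper_url, verified_data
```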
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/WHEEL
RENAMED
File without changes
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/top_level.txt
RENAMED
File without changes