academic-refchecker 1.2.51__tar.gz → 1.2.53__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.51/src/academic_refchecker.egg-info → academic_refchecker-1.2.53}/PKG-INFO +10 -1
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/README.md +9 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/__version__.py +1 -1
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53/src/academic_refchecker.egg-info}/PKG-INFO +10 -1
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/academic_refchecker.egg-info/SOURCES.txt +1 -0
- academic_refchecker-1.2.53/src/checkers/pdf_paper_checker.py +493 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/webpage_checker.py +424 -1
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/core/refchecker.py +111 -8
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/LICENSE +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/pyproject.toml +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/requirements.txt +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/setup.cfg +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/arxiv_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/text_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/utils/url_utils.py +0 -0
{academic_refchecker-1.2.51/src/academic_refchecker.egg-info → academic_refchecker-1.2.53}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.51
+Version: 1.2.53
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -65,6 +65,14 @@ Dynamic: license-file
 
 A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.
 
+## 🎥 Project Deep Dive
+
+Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
+
+**[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
+
+*This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
+
 ## 📊 Sample Output
 
 ```
@@ -117,6 +125,7 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
 
 ## 📋 Table of Contents
 
+- [🎥 Project Deep Dive](#-project-deep-dive)
 - [📊 Sample Output](#-sample-output)
 - [🎯 Features](#-features)
 - [🚀 Quick Start](#-quick-start)
{academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/README.md
@@ -4,6 +4,14 @@
 
 A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.
 
+## 🎥 Project Deep Dive
+
+Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
+
+**[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
+
+*This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
+
 ## 📊 Sample Output
 
 ```
@@ -56,6 +64,7 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
 
 ## 📋 Table of Contents
 
+- [🎥 Project Deep Dive](#-project-deep-dive)
 - [📊 Sample Output](#-sample-output)
 - [🎯 Features](#-features)
 - [🚀 Quick Start](#-quick-start)
{academic_refchecker-1.2.51 → academic_refchecker-1.2.53/src/academic_refchecker.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.51
+Version: 1.2.53
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -65,6 +65,14 @@ Dynamic: license-file
 
 A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.
 
+## 🎥 Project Deep Dive
+
+Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
+
+**[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
+
+*This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
+
 ## 📊 Sample Output
 
 ```
@@ -117,6 +125,7 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
 
 ## 📋 Table of Contents
 
+- [🎥 Project Deep Dive](#-project-deep-dive)
 - [📊 Sample Output](#-sample-output)
 - [🎯 Features](#-features)
 - [🚀 Quick Start](#-quick-start)
{academic_refchecker-1.2.51 → academic_refchecker-1.2.53}/src/academic_refchecker.egg-info/SOURCES.txt
@@ -21,6 +21,7 @@ src/checkers/github_checker.py
 src/checkers/local_semantic_scholar.py
 src/checkers/openalex.py
 src/checkers/openreview_checker.py
+src/checkers/pdf_paper_checker.py
 src/checkers/semantic_scholar.py
 src/checkers/webpage_checker.py
 src/config/__init__.py
academic_refchecker-1.2.53/src/checkers/pdf_paper_checker.py
ADDED
@@ -0,0 +1,493 @@
#!/usr/bin/env python3
"""
PDF Paper Checker - Validates citations by extracting and analyzing PDF content
"""

import re
import io
import logging
from typing import Dict, List, Any, Optional, Tuple
from urllib.parse import urlparse

import requests
import pdfplumber
from pypdf import PdfReader
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup

from utils.text_utils import normalize_text, calculate_title_similarity

logger = logging.getLogger(__name__)


class PDFPaperChecker:
    """
    Checker that downloads and analyzes PDF documents to validate citations
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def can_check_reference(self, reference: Dict[str, Any]) -> bool:
        """
        Check if this reference can be validated by PDF analysis

        Args:
            reference: Reference dictionary containing url and other metadata

        Returns:
            True if reference has URL that likely points to a PDF
        """
        url = reference.get('url', '').strip()
        if not url:
            return False

        # Check if URL ends with .pdf
        if url.lower().endswith('.pdf'):
            return True

        # Check if URL path suggests PDF content
        pdf_indicators = ['/pdf/', '/document/', '/download/', '/file/', '/resource/']
        if any(indicator in url.lower() for indicator in pdf_indicators):
            return True

        # Check if URL is from domains that commonly serve PDFs directly
        domain = urlparse(url).netloc.lower()
        pdf_domains = [
            '.gov', '.edu', '.org',  # Common institutional domains
            'researchgate.net', 'academia.edu', 'arxiv.org',  # Academic platforms
            'oecd.org', 'who.int', 'unesco.org',  # International organizations
            'aecea.ca'  # Specific domain from the user's example
        ]

        if any(domain.endswith(pdf_domain) or pdf_domain in domain for pdf_domain in pdf_domains):
            return True

        return False

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a reference by downloading and analyzing PDF content

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, url) where:
            - verified_data: Dict with verified data if PDF validates citation, None otherwise
            - errors: List of error dictionaries
            - url: The URL that was checked
        """
        logger.debug(f"Verifying PDF reference: {reference.get('title', 'Untitled')}")

        url = reference.get('url', '').strip()
        if not url:
            return None, [{"error_type": "unverified", "error_details": "no URL provided"}], None

        try:
            # First try to download directly as PDF
            pdf_content = self._download_pdf(url)

            # If direct download fails, try to find PDF links in the page
            if not pdf_content:
                pdf_url = self._find_pdf_url_in_page(url)
                if pdf_url:
                    logger.debug(f"Found PDF link in page: {pdf_url}")
                    pdf_content = self._download_pdf(pdf_url)
                    url = pdf_url  # Update URL to the actual PDF URL

            if not pdf_content:
                return None, [{"error_type": "unverified", "error_details": "could not download PDF content"}], url

            # Extract text and metadata from PDF
            pdf_data = self._extract_pdf_data(pdf_content)
            if not pdf_data:
                return None, [{"error_type": "unverified", "error_details": "could not extract PDF content"}], url

            # Validate citation against PDF content
            is_valid, errors = self._validate_citation(reference, pdf_data)

            if is_valid:
                # Create verified data preserving original venue if provided
                venue = reference.get('journal') or reference.get('venue') or reference.get('booktitle') or 'PDF Document'

                verified_data = {
                    'title': reference.get('title', ''),
                    'authors': reference.get('authors', []),
                    'year': reference.get('year'),
                    'venue': venue,
                    'url': url,
                    'pdf_metadata': {
                        'extracted_title': pdf_data.get('title'),
                        'extracted_authors': pdf_data.get('authors'),
                        'extracted_text_preview': pdf_data.get('text', '')[:200] + '...' if pdf_data.get('text') else '',
                        'pdf_pages': pdf_data.get('page_count'),
                        'extraction_method': pdf_data.get('extraction_method')
                    }
                }
                logger.debug(f"PDF reference verified: {url}")
                return verified_data, errors, url
            else:
                return None, errors, url

        except Exception as e:
            logger.error(f"Error verifying PDF reference {url}: {e}")
            return None, [{"error_type": "unverified", "error_details": "PDF processing error"}], url

    def _download_pdf(self, url: str, timeout: int = 30) -> Optional[bytes]:
        """
        Download PDF content from URL

        Args:
            url: URL to download from
            timeout: Request timeout in seconds

        Returns:
            PDF content as bytes, or None if download failed
        """
        try:
            logger.debug(f"Downloading PDF from: {url}")

            response = self.session.get(url, timeout=timeout, stream=True)
            response.raise_for_status()

            # Check if content is actually a PDF
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
                # Sometimes PDFs are served with generic content types, so we'll try anyway
                logger.debug(f"Content-Type '{content_type}' doesn't indicate PDF, but proceeding anyway")

            # Download content
            content = response.content

            # Basic PDF validation - check for PDF header
            if content.startswith(b'%PDF-'):
                logger.debug(f"Successfully downloaded PDF ({len(content)} bytes)")
                return content
            else:
                logger.debug("Downloaded content doesn't appear to be a valid PDF")
                return None

        except Exception as e:
            logger.error(f"Failed to download PDF from {url}: {e}")
            return None

    def _find_pdf_url_in_page(self, url: str) -> Optional[str]:
        """
        Look for PDF download links in a web page

        Args:
            url: URL of the web page to search

        Returns:
            URL of PDF document if found, None otherwise
        """
        try:
            logger.debug(f"Searching for PDF links in page: {url}")

            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Check if the response itself is a PDF (after redirects)
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' in content_type or response.content.startswith(b'%PDF-'):
                logger.debug("Page redirected directly to PDF")
                return response.url

            # Parse HTML to look for PDF links
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for links that might be PDFs
            pdf_links = []

            # Find all links
            for link in soup.find_all('a', href=True):
                href = link.get('href')
                link_text = link.get_text().lower().strip()

                # Check if link ends with .pdf
                if href and href.lower().endswith('.pdf'):
                    pdf_links.append(href)
                    continue

                # Check if link text suggests PDF
                if any(indicator in link_text for indicator in ['pdf', 'download', 'document', 'report', 'policy']):
                    pdf_links.append(href)
                    continue

                # Check if link has PDF-related attributes
                if link.get('type', '').lower() == 'application/pdf':
                    pdf_links.append(href)
                    continue

            # Look for PDF links in other elements
            for element in soup.find_all(attrs={'href': True}):
                href = element.get('href')
                if href and href.lower().endswith('.pdf'):
                    pdf_links.append(href)

            # Convert relative URLs to absolute
            from urllib.parse import urljoin
            absolute_pdf_links = []
            for link in pdf_links:
                if link:
                    absolute_url = urljoin(url, link)
                    absolute_pdf_links.append(absolute_url)

            # Remove duplicates
            absolute_pdf_links = list(set(absolute_pdf_links))

            if absolute_pdf_links:
                logger.debug(f"Found {len(absolute_pdf_links)} potential PDF links")
                # Return the first PDF link found
                return absolute_pdf_links[0]

            logger.debug("No PDF links found in page")
            return None

        except Exception as e:
            logger.error(f"Error searching for PDF links in {url}: {e}")
            return None

    def _extract_pdf_data(self, pdf_content: bytes) -> Optional[Dict[str, Any]]:
        """
        Extract text and metadata from PDF content

        Args:
            pdf_content: PDF file content as bytes

        Returns:
            Dictionary with extracted data including text, title, authors, etc.
        """
        pdf_data = {
            'text': '',
            'title': '',
            'authors': [],
            'page_count': 0,
            'extraction_method': 'none'
        }

        # Try multiple extraction methods
        try:
            # Method 1: Try pdfplumber (usually better for text extraction)
            pdf_data = self._extract_with_pdfplumber(pdf_content, pdf_data)
            if pdf_data['text']:
                pdf_data['extraction_method'] = 'pdfplumber'
                return pdf_data
        except Exception as e:
            logger.debug(f"pdfplumber extraction failed: {e}")

        try:
            # Method 2: Try pypdf (fallback)
            pdf_data = self._extract_with_pypdf(pdf_content, pdf_data)
            if pdf_data['text']:
                pdf_data['extraction_method'] = 'pypdf'
                return pdf_data
        except Exception as e:
            logger.debug(f"pypdf extraction failed: {e}")

        logger.debug("All PDF extraction methods failed")
        return None

    def _extract_with_pdfplumber(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract PDF data using pdfplumber"""
        with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
            pdf_data['page_count'] = len(pdf.pages)

            # Extract text from first few pages (usually contains title/author info)
            text_parts = []
            for i, page in enumerate(pdf.pages[:5]):  # First 5 pages should be enough
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)

            pdf_data['text'] = '\n'.join(text_parts)

            # Try to extract title and author from first page
            if pdf.pages:
                first_page_text = pdf.pages[0].extract_text() or ''
                pdf_data['title'], pdf_data['authors'] = self._parse_title_and_authors(first_page_text)

        return pdf_data

    def _extract_with_pypdf(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract PDF data using pypdf"""
        reader = PdfReader(io.BytesIO(pdf_content))
        pdf_data['page_count'] = len(reader.pages)

        # Extract metadata
        if reader.metadata:
            if '/Title' in reader.metadata:
                pdf_data['title'] = str(reader.metadata['/Title'])
            if '/Author' in reader.metadata:
                pdf_data['authors'] = [str(reader.metadata['/Author'])]

        # Extract text from first few pages
        text_parts = []
        for i, page in enumerate(reader.pages[:5]):  # First 5 pages
            try:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
            except Exception as e:
                logger.debug(f"Failed to extract text from page {i}: {e}")
                continue

        pdf_data['text'] = '\n'.join(text_parts)

        # If no metadata title/author, try to parse from text
        if not pdf_data['title'] and text_parts:
            title, authors = self._parse_title_and_authors(text_parts[0])
            if title and not pdf_data['title']:
                pdf_data['title'] = title
            if authors and not pdf_data['authors']:
                pdf_data['authors'] = authors

        return pdf_data

    def _parse_title_and_authors(self, text: str) -> Tuple[str, List[str]]:
        """
        Parse title and authors from PDF text

        Args:
            text: Text from first page of PDF

        Returns:
            Tuple of (title, authors_list)
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        if not lines:
            return '', []

        # The title is often the first meaningful line (after removing headers/footers)
        title = ''
        authors = []

        # Look for the title - usually first non-header line
        for i, line in enumerate(lines):
            # Skip obvious header/footer content
            if len(line) < 10 or any(skip in line.lower() for skip in ['page', 'doi:', 'http', 'www.', '@']):
                continue

            # Title is usually longer and on its own line
            if len(line) > 20 and not any(sep in line for sep in [',', ';']) and not line.endswith('.'):
                title = line

                # Authors often follow the title - look for patterns
                for j in range(i + 1, min(i + 5, len(lines))):
                    author_line = lines[j]

                    # Author lines often contain commas, "and", or institutional affiliations
                    if any(indicator in author_line.lower() for indicator in [',', ' and ', 'university', 'college', 'institute']):
                        # Clean up author line
                        author_text = re.sub(r'[0-9*†‡§¶#]', '', author_line)  # Remove superscript markers
                        if ',' in author_text:
                            authors.extend([name.strip() for name in author_text.split(',') if name.strip()])
                        else:
                            authors.append(author_text.strip())
                        break
                break

        return title, authors

    def _validate_citation(self, reference: Dict[str, Any], pdf_data: Dict[str, Any]) -> Tuple[bool, List[Dict[str, Any]]]:
        """
        Validate citation against extracted PDF data

        Args:
            reference: The citation being checked
            pdf_data: Extracted data from PDF

        Returns:
            Tuple of (is_valid, errors_list)
        """
        errors = []

        # Check title match
        cited_title = reference.get('title', '').strip()
        extracted_title = pdf_data.get('title', '').strip()
        pdf_text = pdf_data.get('text', '').lower()

        title_match = False

        if cited_title and extracted_title:
            # Compare titles directly
            similarity = calculate_title_similarity(cited_title, extracted_title)
            if similarity > 0.8:  # 80% similarity threshold
                title_match = True

        if not title_match and cited_title and pdf_text:
            # Check if cited title appears in PDF text
            cited_title_normalized = normalize_text(cited_title)
            if cited_title_normalized.lower() in pdf_text:
                title_match = True

        if not title_match:
            errors.append({
                "error_type": "unverified",
                "error_details": "title not found in PDF content"
            })

        # Check author match (more lenient since PDF author extraction is difficult)
        cited_authors = reference.get('authors', [])
        extracted_authors = pdf_data.get('authors', [])

        author_match = False

        if cited_authors and extracted_authors:
            # Check if any cited author appears in extracted authors
            for cited_author in cited_authors:
                for extracted_author in extracted_authors:
                    if self._authors_match(cited_author, extracted_author):
                        author_match = True
                        break
                if author_match:
                    break

        if not author_match and cited_authors and pdf_text:
            # Check if any cited author appears in PDF text
            for cited_author in cited_authors:
                author_normalized = normalize_text(cited_author)
                if author_normalized.lower() in pdf_text:
                    author_match = True
                    break

        # For PDF validation, we're more lenient with author matching since extraction is unreliable
        if not author_match and cited_authors:
            errors.append({
                "warning_type": "author",
                "warning_details": "authors not clearly identified in PDF content"
            })

        # A reference is valid if we found the title (author matching is optional due to extraction difficulties)
        is_valid = title_match

        return is_valid, errors

    def _authors_match(self, author1: str, author2: str) -> bool:
        """Check if two author names likely refer to the same person"""
        author1_norm = normalize_text(author1).lower()
        author2_norm = normalize_text(author2).lower()

        # Exact match
        if author1_norm == author2_norm:
            return True

        # Check similarity
        similarity = fuzz.ratio(author1_norm, author2_norm)
        if similarity > 85:  # 85% similarity threshold
            return True

        # Check if one name is contained in the other (handles "J. Smith" vs "John Smith")
        words1 = set(author1_norm.split())
        words2 = set(author2_norm.split())

        if words1.intersection(words2):
            return True

        return False
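
For orientation, here is a minimal usage sketch of the new checker, written only against the public surface shown above (can_check_reference and the (verified_data, errors, url) tuple returned by verify_reference). The import path and every field value below are illustrative assumptions, not taken from this release, and the actual wiring into core/refchecker.py happens in the refchecker.py change whose body is not shown in this diff.

# Hypothetical usage sketch; assumes src/ is on sys.path, matching the package's
# own relative imports (e.g. "from utils.text_utils import ..."). Field values
# are stand-ins for illustration, not real citations.
from checkers.pdf_paper_checker import PDFPaperChecker

reference = {
    'title': 'An Example Policy Report on Early Learning',     # cited title (illustrative)
    'authors': ['J. Smith'],                                    # cited authors (illustrative)
    'year': 2021,
    'url': 'https://example.org/reports/early-learning.pdf',   # URL the checker downloads and inspects
}

checker = PDFPaperChecker()
if checker.can_check_reference(reference):
    verified_data, errors, checked_url = checker.verify_reference(reference)
    if verified_data:
        print(f"Verified via {checked_url} "
              f"(extraction: {verified_data['pdf_metadata']['extraction_method']})")
    else:
        print(f"Unverified: {errors}")

As the _validate_citation code above shows, a reference passes when the cited title matches the extracted title (similarity above 0.8) or appears verbatim in the PDF text; a missing author match only adds a warning entry rather than failing verification.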