academic-refchecker 1.2.50-py3-none-any.whl → 1.2.52-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""
 
- __version__ = "1.2.50"
+ __version__ = "1.2.52"
METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 1.2.50
+ Version: 1.2.52
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
@@ -65,6 +65,14 @@ Dynamic: license-file
 
  A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.
 
+ ## 🎥 Project Deep Dive
+
+ Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
+
+ **[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
+
+ *This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
+
  ## 📊 Sample Output
 
  ```
@@ -117,6 +125,7 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
 
  ## 📋 Table of Contents
 
+ - [🎥 Project Deep Dive](#-project-deep-dive)
  - [📊 Sample Output](#-sample-output)
  - [🎯 Features](#-features)
  - [🚀 Quick Start](#-quick-start)
RECORD CHANGED
@@ -1,21 +1,22 @@
- __version__.py,sha256=ZQ6vcRuuZpexVshgiVwj1EkuR3vzgsRUj6ll7aoa8Dw,65
- academic_refchecker-1.2.50.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ __version__.py,sha256=3kX5OAukU7mOMMYEni5E3TW6cnip3XwxplWJP4qANhU,65
+ academic_refchecker-1.2.52.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
  checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
  checkers/enhanced_hybrid_checker.py,sha256=rbXkzpNkd0bn4e2OooX-CcdGTwwYpgmVaFvX_xCAFsA,27777
- checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14003
+ checkers/github_checker.py,sha256=BXJaBC3AloKze04j8EcQz0a79EhtVoi9_871ilV7t60,14233
  checkers/local_semantic_scholar.py,sha256=D8py8-yMCgN1lvhXCiMUOEA4wBkH7AQvrkM4-3LCDsU,21015
  checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
- checkers/openreview_checker.py,sha256=mu33gytnIEond5A2gAZtxgLkKrirmmUSIUc4frL1GsA,40030
- checkers/semantic_scholar.py,sha256=dDDOxURwr-Kx7fIiJTAh_4_9V8VxGWYabQJiQ1VdSbM,34762
- checkers/webpage_checker.py,sha256=Ivzhu0xcpeWZcCTlAt9C06Lfsz5gKEiSko9gJ7EyQEw,22324
+ checkers/openreview_checker.py,sha256=3ckn6U7TN5nQBjqPacr8W8mm2uMo6aWWB6gsxTDNCPk,40452
+ checkers/pdf_paper_checker.py,sha256=L5HRHd3xpo0xDltZGTAA-Wk_arIS9bQV8ITeuxW0bNc,19893
+ checkers/semantic_scholar.py,sha256=wk6e8DkYJM_O2nWsi-6EfJT53PzfL8KCmX1rS562KKc,34962
+ checkers/webpage_checker.py,sha256=REOotx7Qka86_xbOIMeYj5YVb9D1RVMb4Ye311-28cA,43620
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
- core/parallel_processor.py,sha256=VHjsHc_wCKumeF__fXh8RjMpM4dYE8ua5amgotd4PTg,17474
- core/refchecker.py,sha256=sOenr6DgXqReiOCKcPVVFaaAAYJXEwONdw9gjur4KYE,279937
+ core/parallel_processor.py,sha256=cq_WfzXrF2EI6IKOtJd6_QcwvM1xT3J6a13teg-wSbM,17638
+ core/refchecker.py,sha256=rJ-CbCqN3dxzxCLr4DERq5UxWtVbErwCMyS3YUxdtuo,285500
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,11 +37,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
  utils/doi_utils.py,sha256=ezUiRnYRpoO0U_Rqgxv1FxqmeTwPh6X8gLgSDbqg5sY,4874
  utils/error_utils.py,sha256=UJOH7Bp-rPV2JDY_XN38I2pSkqqPdnQoviKa4s4nK_A,12501
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
- utils/text_utils.py,sha256=g2r0QT6RGNi_8K5MD_EE-GT3cbffhk8cQyQaL6HSYtA,211955
+ utils/text_utils.py,sha256=T3PiiG9-BMPTbdCftG2zypyIeZJl6snuMCKQ0nEOQv0,217834
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
  utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
- academic_refchecker-1.2.50.dist-info/METADATA,sha256=uOjNDL9zgwSxgFmaScRw635mQn0K4r2UfEYeQodcOy8,22576
- academic_refchecker-1.2.50.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-1.2.50.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
- academic_refchecker-1.2.50.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
- academic_refchecker-1.2.50.dist-info/RECORD,,
+ academic_refchecker-1.2.52.dist-info/METADATA,sha256=PKCXz09omWTvIVLZGCgP3kt9yO_V-FjXDu-HHfedqUU,23256
+ academic_refchecker-1.2.52.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ academic_refchecker-1.2.52.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+ academic_refchecker-1.2.52.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+ academic_refchecker-1.2.52.dist-info/RECORD,,
checkers/github_checker.py CHANGED
@@ -5,6 +5,7 @@ import re
  import logging
  from urllib.parse import urlparse
  from typing import Dict, Optional, Tuple, List, Any
+ from utils.text_utils import strip_latex_commands
 
  logger = logging.getLogger(__name__)
 
@@ -170,7 +171,9 @@ class GitHubChecker:
          title_match = self._check_title_match(cited_title, actual_name, actual_description)
          if not title_match:
              from utils.error_utils import format_title_mismatch
-             details = format_title_mismatch(cited_title, actual_name)
+             # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+             clean_cited_title = strip_latex_commands(cited_title)
+             details = format_title_mismatch(clean_cited_title, actual_name)
              if actual_description:
                  snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
                  details += f" ({snippet})"
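
The `strip_latex_commands` helper imported above, and the `compare_titles_with_latex_cleaning` helper used in the openreview_checker.py hunks below, live in utils/text_utils.py, whose updated source is not included in this diff. A minimal sketch of the behavior the new comments describe (e.g. turning "{LLM}s" into "LLMs" before a title mismatch is reported) might look like the following; the `_sketch` names and the difflib scoring are illustrative stand-ins, not the package's actual implementation:

```python
import re
from difflib import SequenceMatcher

def strip_latex_commands_sketch(title: str) -> str:
    """Illustrative stand-in for utils.text_utils.strip_latex_commands."""
    # Replace simple commands such as \emph{X} with their argument X
    title = re.sub(r'\\[a-zA-Z]+\{([^{}]*)\}', r'\1', title)
    # Drop protective braces, e.g. "{LLM}s" -> "LLMs"
    return title.replace('{', '').replace('}', '')

def compare_titles_with_latex_cleaning_sketch(cited: str, actual: str) -> float:
    """Illustrative stand-in: clean LaTeX from both titles, then score similarity."""
    cited_clean = strip_latex_commands_sketch(cited).lower()
    actual_clean = strip_latex_commands_sketch(actual).lower()
    # The real checker uses calculate_title_similarity; difflib stands in here.
    return SequenceMatcher(None, cited_clean, actual_clean).ratio()

print(strip_latex_commands_sketch("Evaluating {LLM}s with \\emph{RefChecker}"))
# Evaluating LLMs with RefChecker
```

In the hunk above, the cleaned title is only used in the warning text, so mismatch messages read naturally instead of echoing raw BibTeX markup; the openreview_checker.py hunks below additionally use LaTeX-cleaned titles when computing the similarity score itself.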
checkers/openreview_checker.py CHANGED
@@ -36,7 +36,8 @@ from utils.text_utils import (
      normalize_text, clean_title_basic, is_name_match,
      calculate_title_similarity, compare_authors,
      clean_title_for_search, are_venues_substantially_different,
-     is_year_substantially_different
+     is_year_substantially_different, strip_latex_commands,
+     compare_titles_with_latex_cleaning
  )
 
  # Set up logging
@@ -423,10 +424,12 @@ class OpenReviewReferenceChecker:
          paper_title = paper_data.get('title', '').strip()
 
          if cited_title and paper_title:
-             similarity = calculate_title_similarity(cited_title, paper_title)
+             similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
              if similarity < 0.7: # Using a reasonable threshold
                  from utils.error_utils import format_title_mismatch
-                 details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                 # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                 clean_cited_title = strip_latex_commands(cited_title)
+                 details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
                  errors.append({
                      "warning_type": "title",
                      "warning_details": details
@@ -547,10 +550,12 @@ class OpenReviewReferenceChecker:
          paper_title = best_match.get('title', '').strip()
 
          if cited_title and paper_title:
-             similarity = calculate_title_similarity(cited_title, paper_title)
+             similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
              if similarity < 0.8: # Slightly higher threshold for search results
                  from utils.error_utils import format_title_mismatch
-                 details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                 # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                 clean_cited_title = strip_latex_commands(cited_title)
+                 details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
                  errors.append({
                      "warning_type": "title",
                      "warning_details": details
checkers/pdf_paper_checker.py ADDED
@@ -0,0 +1,493 @@
+ #!/usr/bin/env python3
+ """
+ PDF Paper Checker - Validates citations by extracting and analyzing PDF content
+ """
+
+ import re
+ import io
+ import logging
+ from typing import Dict, List, Any, Optional, Tuple
+ from urllib.parse import urlparse
+
+ import requests
+ import pdfplumber
+ from pypdf import PdfReader
+ from fuzzywuzzy import fuzz
+ from bs4 import BeautifulSoup
+
+ from utils.text_utils import normalize_text, calculate_title_similarity
+
+ logger = logging.getLogger(__name__)
+
+
+ class PDFPaperChecker:
+     """
+     Checker that downloads and analyzes PDF documents to validate citations
+     """
+
+     def __init__(self):
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         })
+
+     def can_check_reference(self, reference: Dict[str, Any]) -> bool:
+         """
+         Check if this reference can be validated by PDF analysis
+
+         Args:
+             reference: Reference dictionary containing url and other metadata
+
+         Returns:
+             True if reference has URL that likely points to a PDF
+         """
+         url = reference.get('url', '').strip()
+         if not url:
+             return False
+
+         # Check if URL ends with .pdf
+         if url.lower().endswith('.pdf'):
+             return True
+
+         # Check if URL path suggests PDF content
+         pdf_indicators = ['/pdf/', '/document/', '/download/', '/file/', '/resource/']
+         if any(indicator in url.lower() for indicator in pdf_indicators):
+             return True
+
+         # Check if URL is from domains that commonly serve PDFs directly
+         domain = urlparse(url).netloc.lower()
+         pdf_domains = [
+             '.gov', '.edu', '.org', # Common institutional domains
+             'researchgate.net', 'academia.edu', 'arxiv.org', # Academic platforms
+             'oecd.org', 'who.int', 'unesco.org', # International organizations
+             'aecea.ca' # Specific domain from the user's example
+         ]
+
+         if any(domain.endswith(pdf_domain) or pdf_domain in domain for pdf_domain in pdf_domains):
+             return True
+
+         return False
+
+     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a reference by downloading and analyzing PDF content
+
+         Args:
+             reference: Reference dictionary with title, authors, year, url, etc.
+
+         Returns:
+             Tuple of (verified_data, errors, url) where:
+             - verified_data: Dict with verified data if PDF validates citation, None otherwise
+             - errors: List of error dictionaries
+             - url: The URL that was checked
+         """
+         logger.debug(f"Verifying PDF reference: {reference.get('title', 'Untitled')}")
+
+         url = reference.get('url', '').strip()
+         if not url:
+             return None, [{"error_type": "unverified", "error_details": "no URL provided"}], None
+
+         try:
+             # First try to download directly as PDF
+             pdf_content = self._download_pdf(url)
+
+             # If direct download fails, try to find PDF links in the page
+             if not pdf_content:
+                 pdf_url = self._find_pdf_url_in_page(url)
+                 if pdf_url:
+                     logger.debug(f"Found PDF link in page: {pdf_url}")
+                     pdf_content = self._download_pdf(pdf_url)
+                     url = pdf_url # Update URL to the actual PDF URL
+
+             if not pdf_content:
+                 return None, [{"error_type": "unverified", "error_details": "could not download PDF content"}], url
+
+             # Extract text and metadata from PDF
+             pdf_data = self._extract_pdf_data(pdf_content)
+             if not pdf_data:
+                 return None, [{"error_type": "unverified", "error_details": "could not extract PDF content"}], url
+
+             # Validate citation against PDF content
+             is_valid, errors = self._validate_citation(reference, pdf_data)
+
+             if is_valid:
+                 # Create verified data preserving original venue if provided
+                 venue = reference.get('journal') or reference.get('venue') or reference.get('booktitle') or 'PDF Document'
+
+                 verified_data = {
+                     'title': reference.get('title', ''),
+                     'authors': reference.get('authors', []),
+                     'year': reference.get('year'),
+                     'venue': venue,
+                     'url': url,
+                     'pdf_metadata': {
+                         'extracted_title': pdf_data.get('title'),
+                         'extracted_authors': pdf_data.get('authors'),
+                         'extracted_text_preview': pdf_data.get('text', '')[:200] + '...' if pdf_data.get('text') else '',
+                         'pdf_pages': pdf_data.get('page_count'),
+                         'extraction_method': pdf_data.get('extraction_method')
+                     }
+                 }
+                 logger.debug(f"PDF reference verified: {url}")
+                 return verified_data, errors, url
+             else:
+                 return None, errors, url
+
+         except Exception as e:
+             logger.error(f"Error verifying PDF reference {url}: {e}")
+             return None, [{"error_type": "unverified", "error_details": "PDF processing error"}], url
+
+     def _download_pdf(self, url: str, timeout: int = 30) -> Optional[bytes]:
+         """
+         Download PDF content from URL
+
+         Args:
+             url: URL to download from
+             timeout: Request timeout in seconds
+
+         Returns:
+             PDF content as bytes, or None if download failed
+         """
+         try:
+             logger.debug(f"Downloading PDF from: {url}")
+
+             response = self.session.get(url, timeout=timeout, stream=True)
+             response.raise_for_status()
+
+             # Check if content is actually a PDF
+             content_type = response.headers.get('content-type', '').lower()
+             if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
+                 # Sometimes PDFs are served with generic content types, so we'll try anyway
+                 logger.debug(f"Content-Type '{content_type}' doesn't indicate PDF, but proceeding anyway")
+
+             # Download content
+             content = response.content
+
+             # Basic PDF validation - check for PDF header
+             if content.startswith(b'%PDF-'):
+                 logger.debug(f"Successfully downloaded PDF ({len(content)} bytes)")
+                 return content
+             else:
+                 logger.debug("Downloaded content doesn't appear to be a valid PDF")
+                 return None
+
+         except Exception as e:
+             logger.error(f"Failed to download PDF from {url}: {e}")
+             return None
+
+     def _find_pdf_url_in_page(self, url: str) -> Optional[str]:
+         """
+         Look for PDF download links in a web page
+
+         Args:
+             url: URL of the web page to search
+
+         Returns:
+             URL of PDF document if found, None otherwise
+         """
+         try:
+             logger.debug(f"Searching for PDF links in page: {url}")
+
+             response = self.session.get(url, timeout=15)
+             response.raise_for_status()
+
+             # Check if the response itself is a PDF (after redirects)
+             content_type = response.headers.get('content-type', '').lower()
+             if 'pdf' in content_type or response.content.startswith(b'%PDF-'):
+                 logger.debug("Page redirected directly to PDF")
+                 return response.url
+
+             # Parse HTML to look for PDF links
+             from bs4 import BeautifulSoup
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Look for links that might be PDFs
+             pdf_links = []
+
+             # Find all links
+             for link in soup.find_all('a', href=True):
+                 href = link.get('href')
+                 link_text = link.get_text().lower().strip()
+
+                 # Check if link ends with .pdf
+                 if href and href.lower().endswith('.pdf'):
+                     pdf_links.append(href)
+                     continue
+
+                 # Check if link text suggests PDF
+                 if any(indicator in link_text for indicator in ['pdf', 'download', 'document', 'report', 'policy']):
+                     pdf_links.append(href)
+                     continue
+
+                 # Check if link has PDF-related attributes
+                 if link.get('type', '').lower() == 'application/pdf':
+                     pdf_links.append(href)
+                     continue
+
+             # Look for PDF links in other elements
+             for element in soup.find_all(attrs={'href': True}):
+                 href = element.get('href')
+                 if href and href.lower().endswith('.pdf'):
+                     pdf_links.append(href)
+
+             # Convert relative URLs to absolute
+             from urllib.parse import urljoin
+             absolute_pdf_links = []
+             for link in pdf_links:
+                 if link:
+                     absolute_url = urljoin(url, link)
+                     absolute_pdf_links.append(absolute_url)
+
+             # Remove duplicates
+             absolute_pdf_links = list(set(absolute_pdf_links))
+
+             if absolute_pdf_links:
+                 logger.debug(f"Found {len(absolute_pdf_links)} potential PDF links")
+                 # Return the first PDF link found
+                 return absolute_pdf_links[0]
+
+             logger.debug("No PDF links found in page")
+             return None
+
+         except Exception as e:
+             logger.error(f"Error searching for PDF links in {url}: {e}")
+             return None
+
+     def _extract_pdf_data(self, pdf_content: bytes) -> Optional[Dict[str, Any]]:
+         """
+         Extract text and metadata from PDF content
+
+         Args:
+             pdf_content: PDF file content as bytes
+
+         Returns:
+             Dictionary with extracted data including text, title, authors, etc.
+         """
+         pdf_data = {
+             'text': '',
+             'title': '',
+             'authors': [],
+             'page_count': 0,
+             'extraction_method': 'none'
+         }
+
+         # Try multiple extraction methods
+         try:
+             # Method 1: Try pdfplumber (usually better for text extraction)
+             pdf_data = self._extract_with_pdfplumber(pdf_content, pdf_data)
+             if pdf_data['text']:
+                 pdf_data['extraction_method'] = 'pdfplumber'
+                 return pdf_data
+         except Exception as e:
+             logger.debug(f"pdfplumber extraction failed: {e}")
+
+         try:
+             # Method 2: Try pypdf (fallback)
+             pdf_data = self._extract_with_pypdf(pdf_content, pdf_data)
+             if pdf_data['text']:
+                 pdf_data['extraction_method'] = 'pypdf'
+                 return pdf_data
+         except Exception as e:
+             logger.debug(f"pypdf extraction failed: {e}")
+
+         logger.debug("All PDF extraction methods failed")
+         return None
+
+     def _extract_with_pdfplumber(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
+         """Extract PDF data using pdfplumber"""
+         with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
+             pdf_data['page_count'] = len(pdf.pages)
+
+             # Extract text from first few pages (usually contains title/author info)
+             text_parts = []
+             for i, page in enumerate(pdf.pages[:5]): # First 5 pages should be enough
+                 page_text = page.extract_text()
+                 if page_text:
+                     text_parts.append(page_text)
+
+             pdf_data['text'] = '\n'.join(text_parts)
+
+             # Try to extract title and author from first page
+             if pdf.pages:
+                 first_page_text = pdf.pages[0].extract_text() or ''
+                 pdf_data['title'], pdf_data['authors'] = self._parse_title_and_authors(first_page_text)
+
+         return pdf_data
+
+     def _extract_with_pypdf(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
+         """Extract PDF data using pypdf"""
+         reader = PdfReader(io.BytesIO(pdf_content))
+         pdf_data['page_count'] = len(reader.pages)
+
+         # Extract metadata
+         if reader.metadata:
+             if '/Title' in reader.metadata:
+                 pdf_data['title'] = str(reader.metadata['/Title'])
+             if '/Author' in reader.metadata:
+                 pdf_data['authors'] = [str(reader.metadata['/Author'])]
+
+         # Extract text from first few pages
+         text_parts = []
+         for i, page in enumerate(reader.pages[:5]): # First 5 pages
+             try:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text_parts.append(page_text)
+             except Exception as e:
+                 logger.debug(f"Failed to extract text from page {i}: {e}")
+                 continue
+
+         pdf_data['text'] = '\n'.join(text_parts)
+
+         # If no metadata title/author, try to parse from text
+         if not pdf_data['title'] and text_parts:
+             title, authors = self._parse_title_and_authors(text_parts[0])
+             if title and not pdf_data['title']:
+                 pdf_data['title'] = title
+             if authors and not pdf_data['authors']:
+                 pdf_data['authors'] = authors
+
+         return pdf_data
+
+     def _parse_title_and_authors(self, text: str) -> Tuple[str, List[str]]:
+         """
+         Parse title and authors from PDF text
+
+         Args:
+             text: Text from first page of PDF
+
+         Returns:
+             Tuple of (title, authors_list)
+         """
+         lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+         if not lines:
+             return '', []
+
+         # The title is often the first meaningful line (after removing headers/footers)
+         title = ''
+         authors = []
+
+         # Look for the title - usually first non-header line
+         for i, line in enumerate(lines):
+             # Skip obvious header/footer content
+             if len(line) < 10 or any(skip in line.lower() for skip in ['page', 'doi:', 'http', 'www.', '@']):
+                 continue
+
+             # Title is usually longer and on its own line
+             if len(line) > 20 and not any(sep in line for sep in [',', ';']) and not line.endswith('.'):
+                 title = line
+
+                 # Authors often follow the title - look for patterns
+                 for j in range(i + 1, min(i + 5, len(lines))):
+                     author_line = lines[j]
+
+                     # Author lines often contain commas, "and", or institutional affiliations
+                     if any(indicator in author_line.lower() for indicator in [',', ' and ', 'university', 'college', 'institute']):
+                         # Clean up author line
+                         author_text = re.sub(r'[0-9*†‡§¶#]', '', author_line) # Remove superscript markers
+                         if ',' in author_text:
+                             authors.extend([name.strip() for name in author_text.split(',') if name.strip()])
+                         else:
+                             authors.append(author_text.strip())
+                         break
+                 break
+
+         return title, authors
+
+     def _validate_citation(self, reference: Dict[str, Any], pdf_data: Dict[str, Any]) -> Tuple[bool, List[Dict[str, Any]]]:
+         """
+         Validate citation against extracted PDF data
+
+         Args:
+             reference: The citation being checked
+             pdf_data: Extracted data from PDF
+
+         Returns:
+             Tuple of (is_valid, errors_list)
+         """
+         errors = []
+
+         # Check title match
+         cited_title = reference.get('title', '').strip()
+         extracted_title = pdf_data.get('title', '').strip()
+         pdf_text = pdf_data.get('text', '').lower()
+
+         title_match = False
+
+         if cited_title and extracted_title:
+             # Compare titles directly
+             similarity = calculate_title_similarity(cited_title, extracted_title)
+             if similarity > 0.8: # 80% similarity threshold
+                 title_match = True
+
+         if not title_match and cited_title and pdf_text:
+             # Check if cited title appears in PDF text
+             cited_title_normalized = normalize_text(cited_title)
+             if cited_title_normalized.lower() in pdf_text:
+                 title_match = True
+
+         if not title_match:
+             errors.append({
+                 "error_type": "unverified",
+                 "error_details": "title not found in PDF content"
+             })
+
+         # Check author match (more lenient since PDF author extraction is difficult)
+         cited_authors = reference.get('authors', [])
+         extracted_authors = pdf_data.get('authors', [])
+
+         author_match = False
+
+         if cited_authors and extracted_authors:
+             # Check if any cited author appears in extracted authors
+             for cited_author in cited_authors:
+                 for extracted_author in extracted_authors:
+                     if self._authors_match(cited_author, extracted_author):
+                         author_match = True
+                         break
+                 if author_match:
+                     break
+
+         if not author_match and cited_authors and pdf_text:
+             # Check if any cited author appears in PDF text
+             for cited_author in cited_authors:
+                 author_normalized = normalize_text(cited_author)
+                 if author_normalized.lower() in pdf_text:
+                     author_match = True
+                     break
+
+         # For PDF validation, we're more lenient with author matching since extraction is unreliable
+         if not author_match and cited_authors:
+             errors.append({
+                 "warning_type": "author",
+                 "warning_details": "authors not clearly identified in PDF content"
+             })
+
+         # A reference is valid if we found the title (author matching is optional due to extraction difficulties)
+         is_valid = title_match
+
+         return is_valid, errors
+
+     def _authors_match(self, author1: str, author2: str) -> bool:
+         """Check if two author names likely refer to the same person"""
+         author1_norm = normalize_text(author1).lower()
+         author2_norm = normalize_text(author2).lower()
+
+         # Exact match
+         if author1_norm == author2_norm:
+             return True
+
+         # Check similarity
+         similarity = fuzz.ratio(author1_norm, author2_norm)
+         if similarity > 85: # 85% similarity threshold
+             return True
+
+         # Check if one name is contained in the other (handles "J. Smith" vs "John Smith")
+         words1 = set(author1_norm.split())
+         words2 = set(author2_norm.split())
+
+         if words1.intersection(words2):
+             return True
+
+         return False
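
A hedged usage sketch of the new checker, based only on the signatures added above; the reference fields and the URL are illustrative placeholders, not taken from this diff:

```python
from checkers.pdf_paper_checker import PDFPaperChecker

# Illustrative reference dict; the field names mirror those the checker reads
# (title, authors, year, url), and the URL is a placeholder.
reference = {
    'title': 'Example Policy Report',
    'authors': ['J. Smith'],
    'year': 2021,
    'url': 'https://example.org/files/example-policy-report.pdf',
}

checker = PDFPaperChecker()
if checker.can_check_reference(reference):
    verified_data, errors, checked_url = checker.verify_reference(reference)
    if verified_data:
        print(f"Verified against {checked_url} "
              f"(extraction: {verified_data['pdf_metadata']['extraction_method']})")
    else:
        print(f"Could not verify: {errors}")
```

Per `_validate_citation`, verification requires a title match (similarity above 0.8 against the extracted title, or the cited title appearing verbatim in the extracted text), while a missing author match only produces a warning, since author extraction from PDFs is unreliable.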