academic-refchecker 2.0.7 (academic_refchecker-2.0.7-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/checkers/webpage_checker.py
@@ -0,0 +1,938 @@
+ #!/usr/bin/env python3
+
+ import requests
+ import re
+ import logging
+ from urllib.parse import urlparse, urljoin
+ from typing import Dict, Optional, Tuple, List, Any
+ from bs4 import BeautifulSoup
+ import time
+ from refchecker.utils.text_utils import strip_latex_commands
+
+ logger = logging.getLogger(__name__)
+
+ class WebPageChecker:
+     """
+     Checker for verifying web page references (documentation, tutorials, etc.)
+     """
+
+     def __init__(self, request_delay: float = 1.0):
+         """
+         Initialize web page checker
+
+         Args:
+             request_delay: Delay between requests to be respectful to servers
+         """
+         self.request_delay = request_delay
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive',
+             'Referer': 'https://www.google.com/',
+         })
+         self.last_request_time = 0
+
+     def is_web_page_url(self, url: str) -> bool:
+         """
+         Check if URL is a web page that should be verified
+
+         Args:
+             url: URL to check
+
+         Returns:
+             True if it's a verifiable web page URL
+         """
+         if not url or not url.startswith(('http://', 'https://')):
+             return False
+
+         # Skip GitHub URLs (handled by GitHubChecker)
+         if 'github.com' in url:
+             return False
+
+         # Skip Semantic Scholar CorpusID URLs (handled by Semantic Scholar API)
+         if 'api.semanticscholar.org/CorpusID:' in url:
+             return False
+
+         # Skip direct file downloads, but allow PDFs that are likely web-viewable
+         file_extensions = ['.doc', '.docx', '.zip', '.tar.gz', '.exe', '.dmg']
+         if any(url.lower().endswith(ext) for ext in file_extensions):
+             return False
+
+         # For PDFs, only skip if they're clearly downloadable files, not web-viewable documents
+         if url.lower().endswith('.pdf'):
+             # Allow PDFs from known documentation/content sites
+             pdf_allowed_domains = ['intel.com', 'nvidia.com', 'microsoft.com', 'google.com', 'openai.com']
+             if not any(domain in url.lower() for domain in pdf_allowed_domains):
+                 return False
+
+         # Include documentation and web content
+         doc_indicators = [
+             'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
+             'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
+             'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
+             'posts'  # For blog posts and forum posts like LessWrong
+         ]
+
+         return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
+
+     def _is_likely_webpage(self, url: str) -> bool:
+         """Check if URL pattern suggests it's a webpage"""
+         parsed = urlparse(url)
+
+         # Known documentation domains
+         doc_domains = [
+             'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
+             'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
+             'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
+             'lesswrong.com'  # LessWrong rationality and AI safety blog platform
+         ]
+
+         return any(domain in parsed.netloc for domain in doc_domains)
+
+     def _respectful_request(self, url: str, timeout: int = 15) -> Optional[requests.Response]:
+         """Make a respectful HTTP request with rate limiting"""
+         current_time = time.time()
+         time_since_last = current_time - self.last_request_time
+
+         if time_since_last < self.request_delay:
+             time.sleep(self.request_delay - time_since_last)
+
+         try:
+             logger.debug(f"Making request to: {url}")
+             response = self.session.get(url, timeout=timeout, allow_redirects=True)
+             self.last_request_time = time.time()
+             logger.debug(f"Request successful: {response.status_code}, content-type: {response.headers.get('content-type', 'unknown')}")
+             return response
+         except requests.exceptions.RequestException as e:
+             logger.debug(f"Request failed for {url}: {type(e).__name__}: {e}")
+             return None
+
+     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a web page reference
+
+         Args:
+             reference: Reference dictionary with title, authors, year, url, etc.
+
+         Returns:
+             Tuple of (verified_data, errors, paper_url) where:
+             - verified_data: Dict with verified web page information or None
+             - errors: List of error/warning dictionaries
+             - paper_url: The web page URL
+         """
+         logger.debug(f"Verifying web page reference: {reference.get('title', 'Untitled')}")
+
+         # Extract web URL from reference
+         web_url = reference.get('url', '').strip()
+         if not web_url or not self.is_web_page_url(web_url):
+             logger.debug("No verifiable web URL found in reference")
+             return None, [], None
+
+         # Fetch the web page
+         response = self._respectful_request(web_url)
+         if response is None:
+             return None, [{"error_type": "unverified", "error_details": "Could not fetch web page"}], web_url
+
+         if response.status_code == 404:
+             return None, [{"error_type": "unverified", "error_details": "Web page not found (404)"}], web_url
+         elif response.status_code == 403:
+             # For 403, assume the resource exists but blocks automated access
+             # This is common for PDFs and some corporate sites
+             return self._handle_blocked_resource(reference, web_url)
+         elif response.status_code != 200:
+             return None, [{"error_type": "unverified", "error_details": f"HTTP error {response.status_code}"}], web_url
+
+         try:
+             # Handle PDF content differently
+             content_type = response.headers.get('content-type', '').lower()
+             if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                 return self._handle_pdf_reference(reference, response, web_url)
+
+             # Parse HTML content
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Extract page metadata
+             page_title = self._extract_page_title(soup)
+             page_description = self._extract_description(soup)
+             site_info = self._extract_site_info(soup, web_url)
+
+             logger.debug(f"Extracted page title: {page_title}")
+             logger.debug(f"Extracted description: {page_description[:100] if page_description else 'None'}...")
+
+             # Create verified data structure
+             verified_data = {
+                 'title': page_title or reference.get('title', ''),
+                 'authors': self._determine_authors(reference.get('authors', []), site_info, web_url),
+                 'year': reference.get('year'),
+                 'venue': 'Web Page',
+                 'url': web_url,
+                 'web_metadata': {
+                     'page_title': page_title,
+                     'description': page_description,
+                     'site_info': site_info,
+                     'final_url': response.url,  # In case of redirects
+                     'status_code': response.status_code
+                 }
+             }
+
+             # Verify content
+             errors = []
+             cited_title = reference.get('title', '').strip()
+
+             # Check title match
+             if cited_title and page_title:
+                 if not self._check_title_match(cited_title, page_title, page_description):
+                     from refchecker.utils.error_utils import format_title_mismatch
+                     # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                     clean_cited_title = strip_latex_commands(cited_title)
+                     errors.append({
+                         "warning_type": "title",
+                         "warning_details": format_title_mismatch(clean_cited_title, page_title)
+                     })
+
+             # Check if this is a documentation page for the cited topic
+             if cited_title:
+                 topic_match = self._check_topic_relevance(cited_title, page_title, page_description, soup)
+                 if not topic_match:
+                     errors.append({
+                         "warning_type": "content",
+                         "warning_details": f"Page content may not match cited topic '{cited_title}'"
+                     })
+
+             # Check authors/organization
+             cited_authors = reference.get('authors', [])
+             if cited_authors:
+                 author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
+                 if not self._check_author_match(author_str, site_info, web_url):
+                     from refchecker.utils.error_utils import format_three_line_mismatch
+                     left = author_str
+                     right = site_info.get('organization', 'unknown')
+                     details = format_three_line_mismatch("Author/organization mismatch", left, right)
+                     errors.append({
+                         "warning_type": "author",
+                         "warning_details": details
+                     })
+
+             logger.debug(f"Web page verification completed for: {web_url}")
+             return verified_data, errors, web_url
+
+         except Exception as e:
+             logger.error(f"Error parsing web page {web_url}: {e}")
+             return None, [{"error_type": "unverified", "error_details": f"Error parsing page: {str(e)}"}], web_url
+
+     def _extract_page_title(self, soup: BeautifulSoup) -> Optional[str]:
+         """Extract the page title"""
+         # Try <title> tag
+         title_tag = soup.find('title')
+         if title_tag and title_tag.text.strip():
+             return title_tag.text.strip()
+
+         # Try <h1> tag
+         h1_tag = soup.find('h1')
+         if h1_tag and h1_tag.text.strip():
+             return h1_tag.text.strip()
+
+         # Try meta property title
+         meta_title = soup.find('meta', {'property': 'og:title'})
+         if meta_title and meta_title.get('content'):
+             return meta_title['content'].strip()
+
+         return None
+
+     def _extract_description(self, soup: BeautifulSoup) -> Optional[str]:
+         """Extract page description"""
+         # Try meta description
+         meta_desc = soup.find('meta', {'name': 'description'})
+         if meta_desc and meta_desc.get('content'):
+             return meta_desc['content'].strip()
+
+         # Try OpenGraph description
+         og_desc = soup.find('meta', {'property': 'og:description'})
+         if og_desc and og_desc.get('content'):
+             return og_desc['content'].strip()
+
+         # Try first paragraph
+         first_p = soup.find('p')
+         if first_p and first_p.text.strip():
+             return first_p.text.strip()[:500]  # Limit length
+
+         return None
+
+     def _extract_site_info(self, soup: BeautifulSoup, url: str) -> Dict[str, str]:
+         """Extract information about the website/organization"""
+         parsed_url = urlparse(url)
+         domain = parsed_url.netloc.lower()
+
+         site_info = {
+             'domain': domain,
+             'organization': self._determine_organization(domain),
+             'site_type': self._determine_site_type(domain, url)
+         }
+
+         # Try to extract more specific site info
+         generator = soup.find('meta', {'name': 'generator'})
+         if generator and generator.get('content'):
+             site_info['generator'] = generator['content']
+
+         return site_info
+
+     def _determine_organization(self, domain: str) -> str:
+         """Determine the organization from domain"""
+         org_map = {
+             'onnxruntime.ai': 'ONNX Runtime',
+             'readthedocs.io': 'ReadTheDocs',
+             'pytorch.org': 'PyTorch',
+             'tensorflow.org': 'TensorFlow',
+             'huggingface.co': 'Hugging Face',
+             'openai.com': 'OpenAI',
+             'microsoft.com': 'Microsoft',
+             'google.com': 'Google',
+             'nvidia.com': 'NVIDIA',
+             'intel.com': 'Intel',
+             'deepspeed.ai': 'DeepSpeed',
+             'langchain.com': 'LangChain'
+         }
+
+         for domain_key, org in org_map.items():
+             if domain_key in domain:
+                 return org
+
+         # Extract organization from domain
+         if 'readthedocs.io' in domain:
+             # Extract project name from readthedocs URL
+             parts = domain.split('.')
+             if len(parts) >= 3 and parts[-2] == 'readthedocs':
+                 return parts[0].title()
+
+         # Generic extraction
+         domain_parts = domain.replace('www.', '').split('.')
+         if domain_parts:
+             return domain_parts[0].title()
+
+         return domain
+
+     def _determine_site_type(self, domain: str, url: str) -> str:
+         """Determine the type of website"""
+         if 'readthedocs.io' in domain:
+             return 'documentation'
+         elif any(indicator in url.lower() for indicator in ['docs', 'documentation']):
+             return 'documentation'
+         elif any(indicator in url.lower() for indicator in ['api', 'reference']):
+             return 'api_documentation'
+         elif any(indicator in url.lower() for indicator in ['tutorial', 'guide', 'help']):
+             return 'tutorial'
+         elif any(indicator in url.lower() for indicator in ['blog', 'post']):
+             return 'blog'
+         else:
+             return 'website'
+
+     def _check_title_match(self, cited_title: str, page_title: str, page_description: str = None) -> bool:
+         """Check if cited title matches page content"""
+         cited_lower = cited_title.lower().strip()
+         page_title_lower = page_title.lower().strip() if page_title else ''
+
+         # Direct substring match
+         if cited_lower in page_title_lower or page_title_lower in cited_lower:
+             return True
+
+         # Check key terms
+         cited_words = set(word.strip('.,;:()[]') for word in cited_lower.split() if len(word.strip('.,;:()[]')) > 3)
+         page_words = set(word.strip('.,;:()[]') for word in page_title_lower.split() if len(word.strip('.,;:()[]')) > 3)
+
+         # If description is available, include it
+         if page_description:
+             desc_words = set(word.strip('.,;:()[]') for word in page_description.lower().split() if len(word.strip('.,;:()[]')) > 3)
+             page_words.update(desc_words)
+
+         # Check for significant overlap
+         common_words = cited_words.intersection(page_words)
+         if len(common_words) >= min(2, len(cited_words) // 2):
+             return True
+
+         # Check for technical terms that indicate same topic
+         tech_terms = {'api', 'documentation', 'guide', 'tutorial', 'reference', 'docs'}
+         if cited_words.intersection(tech_terms) and page_words.intersection(tech_terms):
+             # If both mention technical terms, be more lenient
+             return len(common_words) >= 1
+
+         return False
+
+     def _check_topic_relevance(self, cited_title: str, page_title: str, page_description: str, soup: BeautifulSoup) -> bool:
+         """Check if the page content is relevant to the cited topic"""
+         cited_lower = cited_title.lower()
+
+         # Extract main content text for analysis
+         content_text = ""
+         if page_title:
+             content_text += page_title.lower() + " "
+         if page_description:
+             content_text += page_description.lower() + " "
+
+         # Get some body text
+         main_content = soup.find('main') or soup.find('div', {'class': re.compile(r'content|main|body')}) or soup.find('body')
+         if main_content:
+             # Get first few paragraphs
+             paragraphs = main_content.find_all('p')[:5]
+             for p in paragraphs:
+                 content_text += p.text.lower() + " "
+
+         # Extract key terms from cited title
+         cited_terms = [word.strip('.,;:()[]') for word in cited_lower.split() if len(word.strip('.,;:()[]')) > 3]
+
+         # Check if most key terms appear in content
+         matches = sum(1 for term in cited_terms if term in content_text)
+         return matches >= len(cited_terms) // 2  # At least half the terms should match
+
+     def _determine_authors(self, cited_authors: List[str], site_info: Dict[str, str], url: str) -> List[str]:
+         """Determine appropriate authors based on site info"""
+         if not cited_authors:
+             return [site_info.get('organization', 'Unknown')]
+
+         # For web pages, often the organization is the "author"
+         return cited_authors
+
+     def _check_author_match(self, cited_authors: str, site_info: Dict[str, str], url: str) -> bool:
+         """Check if cited authors match the website organization"""
+         cited_lower = cited_authors.lower().strip()
+         organization = site_info.get('organization', '').lower()
+         domain = site_info.get('domain', '').lower()
+
+         # Accept generic web resource terms - these are valid for any web URL
+         generic_web_terms = [
+             'web resource', 'web site', 'website', 'online resource',
+             'online', 'web', 'internet resource', 'web page', 'webpage'
+         ]
+         if cited_lower in generic_web_terms:
+             return True
+
+         # Direct matches
+         if cited_lower in organization or organization in cited_lower:
+             return True
+
+         # Handle common abbreviations and variations
+         author_patterns = {
+             'o. runtime': ['onnx', 'runtime', 'onnxruntime'],
+             'deepspeed': ['deepspeed', 'microsoft'],
+             'openai': ['openai', 'open ai'],
+             'hugging face': ['huggingface', 'hf', 'h.f.'],
+             'google': ['google', 'alphabet'],
+             'microsoft': ['microsoft', 'ms', 'msft'],
+         }
+
+         for pattern, variants in author_patterns.items():
+             if any(variant in cited_lower for variant in variants):
+                 if any(variant in organization or variant in domain for variant in variants):
+                     return True
+
+         # Check if domain contains author info
+         if any(word in domain for word in cited_lower.split() if len(word) > 3):
+             return True
+
+         # For documentation sites, be more lenient
+         if site_info.get('site_type') in ['documentation', 'api_documentation']:
+             return True  # Documentation authorship is often ambiguous
+
+         return False
+
+     def _handle_pdf_reference(self, reference, response, web_url):
+         """Handle PDF document references"""
+         logger.debug(f"Handling PDF reference: {web_url}")
+
+         # For PDFs, we can't extract much content, so we do basic verification
+         verified_data = {
+             'title': reference.get('title', ''),
+             'authors': reference.get('authors', []),
+             'year': reference.get('year'),
+             'venue': 'PDF Document',
+             'url': web_url,
+             'web_metadata': {
+                 'content_type': response.headers.get('content-type', ''),
+                 'content_length': response.headers.get('content-length', ''),
+                 'final_url': response.url,
+                 'status_code': response.status_code
+             }
+         }
+
+         # For PDFs, we can't do much content verification, so just check if it's accessible
+         errors = []
+
+         # Check if the URL is from a reputable source
+         domain = urlparse(web_url).netloc.lower()
+         if not any(trusted in domain for trusted in ['intel.com', 'nvidia.com', 'microsoft.com', 'google.com', 'openai.com']):
+             errors.append({
+                 "warning_type": "source",
+                 "warning_details": f"PDF from unverified domain: {domain}"
+             })
+
+         return verified_data, errors, web_url
+
+     def _handle_blocked_resource(self, reference, web_url):
+         """Handle resources that return 403 (blocked by bot detection)"""
+         logger.debug(f"Handling blocked resource: {web_url}")
+
+         # For blocked resources, we can still do basic verification based on URL patterns
+         domain = urlparse(web_url).netloc.lower()
+
+         # Determine if this is a trusted domain
+         trusted_domains = [
+             'intel.com', 'nvidia.com', 'microsoft.com', 'google.com', 'openai.com',
+             'adobe.com', 'apple.com', 'arxiv.org', 'ieee.org', 'acm.org',
+             'arxiv.org', 'semanticscholar.org'
+         ]
+
+         is_trusted = any(trusted in domain for trusted in trusted_domains)
+
+         verified_data = {
+             'title': reference.get('title', ''),
+             'authors': reference.get('authors', []),
+             'year': reference.get('year'),
+             'venue': 'PDF Document' if web_url.lower().endswith('.pdf') else 'Web Page',
+             'url': web_url,
+             'web_metadata': {
+                 'status_code': 403,
+                 'domain': domain,
+                 'trusted_domain': is_trusted,
+                 'access_blocked': True
+             }
+         }
+
+         errors = []
+         if not is_trusted:
+             errors.append({
+                 "warning_type": "access",
+                 "warning_details": f"Access blocked by site (403) and domain not in trusted list: {domain}"
+             })
+         else:
+             # For trusted domains that block access, we assume the resource exists
+             errors.append({
+                 "warning_type": "access",
+                 "warning_details": "Access blocked by site but domain is trusted (likely bot protection)"
+             })
+
+         return verified_data, errors, web_url
+
+     def check_unverified_url_reference(self, reference: Dict[str, Any]) -> str:
+         """
+         Check a URL from an unverified reference to determine the specific unverified reason
+
+         Args:
+             reference: Reference dictionary with title, authors, year, url, etc.
+
+         Returns:
+             String with the specific unverified reason:
+             - "non-existent web page" if the page doesn't exist
+             - "paper not found and URL doesn't reference it" if page exists but doesn't contain title
+             - "paper not verified but URL references paper" if page exists and contains title
+         """
+         logger.debug(f"Checking unverified URL reference: {reference.get('title', 'Untitled')}")
+
+         # Extract URL from reference
+         web_url = reference.get('url', '').strip()
+         if not web_url:
+             return "paper not found and URL doesn't reference it"  # No URL to check
+
+         # Make request to check if page exists
+         response = self._respectful_request(web_url)
+         if response is None:
+             return "non-existent web page"
+
+         if response.status_code == 404:
+             return "non-existent web page"
+         elif response.status_code == 403:
+             # For blocked resources, we can't check content but assume page exists
+             return "paper not verified but URL references paper"
+         elif response.status_code != 200:
+             return "non-existent web page"
+
+         try:
+             # Parse HTML content to search for title
+             content_type = response.headers.get('content-type', '').lower()
+             if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                 # For PDFs, we can't search content, so assume it's referenced if accessible
+                 return "paper not verified but URL references paper"
+
+             # Parse HTML content
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Extract page content for searching
+             page_title = self._extract_page_title(soup)
+             page_description = self._extract_description(soup)
+
+             # Get the full page text for comprehensive searching
+             page_text = soup.get_text().lower()
+
+             # Get the reference title to search for
+             cited_title = reference.get('title', '').strip()
+             if not cited_title:
+                 return "paper not found and URL doesn't reference it"
+
+             # Search for the title in various ways
+             cited_title_lower = cited_title.lower()
+
+             # Direct search in page text
+             if cited_title_lower in page_text:
+                 return "paper not verified but URL references paper"
+
+             # Search for key words from the title
+             cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                               if len(word.strip('.,;:()[]{}')) > 3)
+
+             # Check if significant portion of title words appear in page
+             page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                              if len(word.strip('.,;:()[]{}')) > 3)
+
+             common_words = cited_words.intersection(page_words)
+
+             # If most of the title words are found, consider it referenced
+             if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                 return "paper not verified but URL references paper"
+
+             # Also check the extracted title and description specifically
+             if page_title:
+                 if self._check_title_match(cited_title, page_title, page_description):
+                     return "paper not verified but URL references paper"
+
+             # Title not found in page content
+             return "paper not found and URL doesn't reference it"
+
+         except Exception as e:
+             logger.error(f"Error checking unverified URL {web_url}: {e}")
+             return "paper not found and URL doesn't reference it"
+
+     def verify_raw_url_for_unverified_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+         Args:
+             reference: Reference dictionary with title, authors, year, url, etc.
+
+         Returns:
+             Tuple of (verified_data, errors, url) where:
+             - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+             - errors: List of error dictionaries with specific unverified reasons
+             - url: The URL that was checked
+         """
+         logger.debug(f"Verifying raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+         # Extract URL from reference
+         web_url = reference.get('url', '').strip()
+         if not web_url:
+             return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], None
+
+         # Make request to check if page exists
+         response = self._respectful_request(web_url)
+         if response is None:
+             return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+         if response.status_code == 404:
+             return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+         elif response.status_code == 403:
+             # For blocked resources, we can't check content but assume page exists
+             # If no venue, treat as verified since URL is accessible
+             if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                 verified_data = {
+                     'title': reference.get('title', ''),
+                     'authors': reference.get('authors', []),
+                     'year': reference.get('year'),
+                     'venue': 'Web Page',
+                     'url': web_url,
+                     'web_metadata': {
+                         'status_code': 403,
+                         'access_blocked': True
+                     }
+                 }
+                 return verified_data, [], web_url
+             else:
+                 return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+         elif response.status_code != 200:
+             return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+         try:
+             # Parse HTML content to search for title
+             content_type = response.headers.get('content-type', '').lower()
+             if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                 # For PDFs, if no venue specified, treat as verified
+                 if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                     verified_data = {
+                         'title': reference.get('title', ''),
+                         'authors': reference.get('authors', []),
+                         'year': reference.get('year'),
+                         'venue': 'PDF Document',
+                         'url': web_url,
+                         'web_metadata': {
+                             'content_type': response.headers.get('content-type', ''),
+                             'status_code': response.status_code
+                         }
+                     }
+                     return verified_data, [], web_url
+                 else:
+                     return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+
+             # Parse HTML content
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Extract page content for searching
+             page_title = self._extract_page_title(soup)
+             page_description = self._extract_description(soup)
+
+             # Get the full page text for comprehensive searching
+             page_text = soup.get_text().lower()
+
+             # Get the reference title to search for
+             cited_title = reference.get('title', '').strip()
+             if not cited_title:
+                 return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+             # Search for the title in various ways
+             cited_title_lower = cited_title.lower()
+             title_found = False
+
+             # Direct search in page text
+             if cited_title_lower in page_text:
+                 title_found = True
+
+             # Search for key words from the title
+             if not title_found:
+                 cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                                   if len(word.strip('.,;:()[]{}')) > 3)
+
+                 # Check if significant portion of title words appear in page
+                 page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                                  if len(word.strip('.,;:()[]{}')) > 3)
+
+                 common_words = cited_words.intersection(page_words)
+
+                 # If most of the title words are found, consider it referenced
+                 if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                     title_found = True
+
+             # Also check the extracted title and description specifically
+             if not title_found and page_title:
+                 if self._check_title_match(cited_title, page_title, page_description):
+                     title_found = True
+
+             # Determine if this should be verified or unverified
+             if title_found:
+                 # Check if reference should be verified based on venue type
+                 venue_field = reference.get('journal') or reference.get('venue') or reference.get('booktitle')
+
+                 if not venue_field:
+                     # No venue specified - verify with URL as venue
+                     site_info = self._extract_site_info(soup, web_url)
+                     venue = site_info.get('organization', 'Web Page') if site_info.get('organization') != site_info.get('domain') else 'Web Page'
+
+                     verified_data = {
+                         'title': reference.get('title', ''),
+                         'authors': reference.get('authors', []),
+                         'year': reference.get('year'),
+                         'venue': venue,
+                         'url': web_url,
+                         'web_metadata': {
+                             'page_title': page_title,
+                             'description': page_description,
+                             'site_info': site_info,
+                             'final_url': response.url,
+                             'status_code': response.status_code
+                         }
+                     }
+                     logger.debug(f"URL verified as valid source for reference without venue: {web_url}")
+                     return verified_data, [], web_url
+                 elif self._is_web_content_venue(venue_field, web_url):
+                     # Has venue but it's a web content venue (news, blog, etc.) - verify it
+                     verified_data = {
+                         'title': reference.get('title', ''),
+                         'authors': reference.get('authors', []),
+                         'year': reference.get('year'),
+                         'venue': venue_field,  # Keep the original venue
+                         'url': web_url,
+                         'web_metadata': {
+                             'page_title': page_title,
+                             'description': page_description,
+                             'site_info': self._extract_site_info(soup, web_url),
+                             'final_url': response.url,
+                             'status_code': response.status_code
+                         }
+                     }
+                     logger.debug(f"URL verified as valid web content source: {web_url}")
+                     return verified_data, [], web_url
+                 else:
+                     # Has academic venue but URL references paper - still unverified (needs proper paper verification)
+                     return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+             else:
+                 # Title not found in page content
+                 return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+         except Exception as e:
+             logger.error(f"Error checking raw URL {web_url}: {e}")
+             return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+     def _is_web_content_venue(self, venue: str, url: str) -> bool:
+         """
+         Determine if a venue represents web content rather than academic publication
+
+         Args:
+             venue: The venue string (journal, venue, or booktitle)
+             url: The URL being checked (for additional context)
+
+         Returns:
+             True if this represents web content that can be verified via URL
+         """
+         if not venue:
+             return False
+
+         venue_lower = venue.lower().strip()
+
+         # News organizations and media outlets
+         news_indicators = [
+             'news', 'cbc', 'bbc', 'cnn', 'reuters', 'associated press', 'ap news',
+             'npr', 'pbs', 'abc news', 'nbc news', 'fox news', 'guardian', 'times',
+             'post', 'herald', 'tribune', 'gazette', 'chronicle', 'observer',
+             'magazine', 'weekly', 'daily', 'today', 'report', 'wire', 'press'
+         ]
+
+         # Special case for Wall Street Journal
+         if any(word in venue_lower for word in ['wall street', 'wsj']):
+             news_indicators.append('journal')
+
+         # Technology and industry publications
+         tech_publications = [
+             'techcrunch', 'wired', 'ars technica', 'the verge', 'engadget',
+             'zdnet', 'cnet', 'computerworld', 'infoworld', 'pcmag', 'pcworld',
+             'ieee spectrum', 'mit technology review', 'scientific american'
+         ]
+
+         # Blogs and web platforms
+         blog_platforms = [
+             'blog', 'medium', 'substack', 'wordpress', 'blogspot', 'tumblr',
+             'linkedin', 'facebook', 'twitter', 'reddit', 'stack overflow',
+             'github pages', 'personal website', 'company blog'
+         ]
+
+         # Government and organizational websites
+         org_indicators = [
+             'government', 'gov', '.org', 'agency', 'department', 'ministry',
+             'commission', 'bureau', 'office', 'administration', 'institute',
+             'foundation', 'association', 'society', 'center', 'centre',
+             'council', 'committee', 'board', 'union', 'federation', 'alliance',
+             'coalition', 'consortium', 'network', 'group', 'organization',
+             'organisation', 'corp', 'corporation', 'company', 'ltd', 'inc'
+         ]
+
+         # Documentation and technical resources
+         tech_resources = [
+             'documentation', 'docs', 'api', 'reference', 'guide', 'tutorial',
+             'manual', 'readme', 'wiki', 'help', 'support', 'developer',
+             'technical', 'white paper', 'whitepaper', 'brief', 'overview',
+             'policy', 'strategy', 'report', 'study', 'analysis', 'research'
+         ]
+
+         # Check URL domain for additional context
+         url_lower = url.lower() if url else ''
+
+         # Known web content domains in URL
+         web_domains = [
+             'cbc.ca', 'bbc.com', 'cnn.com', 'reuters.com', 'npr.org', 'pbs.org',
+             'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'wsj.com',
+             'techcrunch.com', 'wired.com', 'theverge.com', 'arstechnica.com',
+             'medium.com', 'substack.com', 'linkedin.com', 'github.io',
+             'readthedocs.io', 'stackoverflow.com', 'reddit.com'
+         ]
+
+         # Combine all indicators
+         all_indicators = news_indicators + tech_publications + blog_platforms + org_indicators + tech_resources
+
+         # Academic venue indicators that should NOT be considered web content
+         academic_indicators = [
+             'proceedings', 'conference', 'symposium', 'workshop', 'transactions',
+             'journal of', 'international journal', 'acm', 'ieee', 'springer',
+             'nature', 'science', 'cell', 'lancet', 'plos', 'arxiv', 'pubmed',
+             'artificial intelligence', 'machine learning', 'computer vision',
+             'neural', 'computing', 'robotics', 'bioinformatics'
+         ]
+
+         # Check if venue is clearly academic (should not be treated as web content)
+         is_academic = any(indicator in venue_lower for indicator in academic_indicators)
+         if is_academic:
+             return False
+
+         # Check if venue matches any web content indicators
+         venue_matches = any(indicator and indicator in venue_lower for indicator in all_indicators)
+
+         # Check if URL domain suggests web content
+         url_matches = any(domain in url_lower for domain in web_domains)
+
+         # Special case: if URL contains news/blog/docs indicators, lean towards web content
+         url_content_indicators = ['news', 'blog', 'post', 'article', 'docs', 'help', 'guide', 'resources', 'policy', 'strategy']
+         url_has_content_indicators = any(indicator in url_lower for indicator in url_content_indicators)
+
+         # Special case: Check if venue is an organizational acronym/name that matches the URL domain
+         # This handles cases like "AECEA" on aecea.ca domain
+         organizational_match = self._check_organizational_venue_match(venue, url_lower)
+
+         return venue_matches or url_matches or url_has_content_indicators or organizational_match
+
+     def _check_organizational_venue_match(self, venue: str, url_lower: str) -> bool:
+         """
+         Check if the venue represents an organization that matches the URL domain
+
+         Args:
+             venue: The venue string
+             url_lower: The lowercased URL
+
+         Returns:
+             True if venue appears to be the organization publishing on their own domain
+         """
+         if not venue or not url_lower:
+             return False
+
+         venue_lower = venue.lower().strip()
+
+         # Extract domain from URL
+         from urllib.parse import urlparse
+         try:
+             parsed_url = urlparse(url_lower)
+             domain = parsed_url.netloc.lower()
+
+             # Remove common prefixes
+             domain = domain.replace('www.', '')
+
+             # Check if venue is likely an acronym (short, all caps or mixed case)
+             is_likely_acronym = (len(venue) <= 10 and
+                                  (venue.isupper() or
+                                   any(c.isupper() for c in venue) and len(venue.split()) == 1))
+
+             # Check if venue appears in domain
+             venue_clean = ''.join(c for c in venue_lower if c.isalnum())
+
+             if venue_clean and venue_clean in domain:
+                 return True
+
+             # For acronyms, check if the acronym could match the domain
+             if is_likely_acronym:
+                 # Split venue into words and check if initials match domain
+                 venue_words = venue_lower.replace('.', ' ').split()
+                 if len(venue_words) == 1 and len(venue_words[0]) <= 6:
+                     # Single word acronym - check if it's in the domain
+                     if venue_words[0] in domain:
+                         return True
+
+             # Check for educational/professional associations with .ca, .org, .edu domains
+             if any(domain.endswith(tld) for tld in ['.ca', '.org', '.edu', '.gov']):
+                 # These domains often host organizational content
+                 if any(org_word in venue_lower for org_word in [
+                     'association', 'society', 'institute', 'foundation', 'center',
+                     'centre', 'council', 'committee', 'board', 'agency', 'department'
+                 ]):
+                     return True
+
+             # Check if venue is a short organizational name/acronym
+             if is_likely_acronym:
+                 return True
+
+             return False
+
+         except Exception:
+             return False
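For orientation, here is a minimal sketch of how the checker in this file might be driven once the wheel is installed. The import path follows the RECORD listing above; the reference fields and the URL are illustrative assumptions, not values taken from the package.

from refchecker.checkers.webpage_checker import WebPageChecker

# Hypothetical reference dict; the keys mirror those read by verify_reference()
reference = {
    "title": "ONNX Runtime documentation",
    "authors": ["ONNX Runtime"],
    "year": 2024,
    "url": "https://onnxruntime.ai/docs/",
}

checker = WebPageChecker(request_delay=1.0)
if checker.is_web_page_url(reference["url"]):
    # verify_reference returns (verified_data, errors/warnings, checked URL)
    verified_data, issues, url = checker.verify_reference(reference)
    if verified_data:
        print(f"Verified '{verified_data['title']}' via {url}")
    for issue in issues:
        # Each entry carries warning_*/error_* keys, as constructed in the methods above
        print(issue)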