academic_refchecker-2.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/checkers/webpage_checker.py
@@ -0,0 +1,938 @@
#!/usr/bin/env python3

import requests
import re
import logging
from urllib.parse import urlparse, urljoin
from typing import Dict, Optional, Tuple, List, Any
from bs4 import BeautifulSoup
import time
from refchecker.utils.text_utils import strip_latex_commands

logger = logging.getLogger(__name__)

class WebPageChecker:
    """
    Checker for verifying web page references (documentation, tutorials, etc.)
    """

    def __init__(self, request_delay: float = 1.0):
        """
        Initialize web page checker

        Args:
            request_delay: Delay between requests to be respectful to servers
        """
        self.request_delay = request_delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/',
        })
        self.last_request_time = 0

    def is_web_page_url(self, url: str) -> bool:
        """
        Check if URL is a web page that should be verified

        Args:
            url: URL to check

        Returns:
            True if it's a verifiable web page URL
        """
        if not url or not url.startswith(('http://', 'https://')):
            return False

        # Skip GitHub URLs (handled by GitHubChecker)
        if 'github.com' in url:
            return False

        # Skip Semantic Scholar CorpusID URLs (handled by Semantic Scholar API)
        if 'api.semanticscholar.org/CorpusID:' in url:
            return False

        # Skip direct file downloads, but allow PDFs that are likely web-viewable
        file_extensions = ['.doc', '.docx', '.zip', '.tar.gz', '.exe', '.dmg']
        if any(url.lower().endswith(ext) for ext in file_extensions):
            return False

        # For PDFs, only skip if they're clearly downloadable files, not web-viewable documents
        if url.lower().endswith('.pdf'):
            # Allow PDFs from known documentation/content sites
            pdf_allowed_domains = ['intel.com', 'nvidia.com', 'microsoft.com', 'google.com', 'openai.com']
            if not any(domain in url.lower() for domain in pdf_allowed_domains):
                return False

        # Include documentation and web content
        doc_indicators = [
            'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
            'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
            'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
            'posts'  # For blog posts and forum posts like LessWrong
        ]

        return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)

    def _is_likely_webpage(self, url: str) -> bool:
        """Check if URL pattern suggests it's a webpage"""
        parsed = urlparse(url)

        # Known documentation domains
        doc_domains = [
            'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
            'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
            'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
            'lesswrong.com'  # LessWrong rationality and AI safety blog platform
        ]

        return any(domain in parsed.netloc for domain in doc_domains)

    def _respectful_request(self, url: str, timeout: int = 15) -> Optional[requests.Response]:
        """Make a respectful HTTP request with rate limiting"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < self.request_delay:
            time.sleep(self.request_delay - time_since_last)

        try:
            logger.debug(f"Making request to: {url}")
            response = self.session.get(url, timeout=timeout, allow_redirects=True)
            self.last_request_time = time.time()
            logger.debug(f"Request successful: {response.status_code}, content-type: {response.headers.get('content-type', 'unknown')}")
            return response
        except requests.exceptions.RequestException as e:
            logger.debug(f"Request failed for {url}: {type(e).__name__}: {e}")
            return None

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a web page reference

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, paper_url) where:
            - verified_data: Dict with verified web page information or None
            - errors: List of error/warning dictionaries
            - paper_url: The web page URL
        """
        logger.debug(f"Verifying web page reference: {reference.get('title', 'Untitled')}")

        # Extract web URL from reference
        web_url = reference.get('url', '').strip()
        if not web_url or not self.is_web_page_url(web_url):
            logger.debug("No verifiable web URL found in reference")
            return None, [], None

        # Fetch the web page
        response = self._respectful_request(web_url)
        if response is None:
            return None, [{"error_type": "unverified", "error_details": "Could not fetch web page"}], web_url

        if response.status_code == 404:
            return None, [{"error_type": "unverified", "error_details": "Web page not found (404)"}], web_url
        elif response.status_code == 403:
            # For 403, assume the resource exists but blocks automated access
            # This is common for PDFs and some corporate sites
            return self._handle_blocked_resource(reference, web_url)
        elif response.status_code != 200:
            return None, [{"error_type": "unverified", "error_details": f"HTTP error {response.status_code}"}], web_url

        try:
            # Handle PDF content differently
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
                return self._handle_pdf_reference(reference, response, web_url)

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract page metadata
            page_title = self._extract_page_title(soup)
            page_description = self._extract_description(soup)
            site_info = self._extract_site_info(soup, web_url)

            logger.debug(f"Extracted page title: {page_title}")
            logger.debug(f"Extracted description: {page_description[:100] if page_description else 'None'}...")

            # Create verified data structure
            verified_data = {
                'title': page_title or reference.get('title', ''),
                'authors': self._determine_authors(reference.get('authors', []), site_info, web_url),
                'year': reference.get('year'),
                'venue': 'Web Page',
                'url': web_url,
                'web_metadata': {
                    'page_title': page_title,
                    'description': page_description,
                    'site_info': site_info,
                    'final_url': response.url,  # In case of redirects
                    'status_code': response.status_code
                }
            }

            # Verify content
            errors = []
            cited_title = reference.get('title', '').strip()

            # Check title match
            if cited_title and page_title:
                if not self._check_title_match(cited_title, page_title, page_description):
                    from refchecker.utils.error_utils import format_title_mismatch
                    # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
                    clean_cited_title = strip_latex_commands(cited_title)
                    errors.append({
                        "warning_type": "title",
                        "warning_details": format_title_mismatch(clean_cited_title, page_title)
                    })

            # Check if this is a documentation page for the cited topic
            if cited_title:
                topic_match = self._check_topic_relevance(cited_title, page_title, page_description, soup)
                if not topic_match:
                    errors.append({
                        "warning_type": "content",
                        "warning_details": f"Page content may not match cited topic '{cited_title}'"
                    })

            # Check authors/organization
            cited_authors = reference.get('authors', [])
            if cited_authors:
                author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
                if not self._check_author_match(author_str, site_info, web_url):
                    from refchecker.utils.error_utils import format_three_line_mismatch
                    left = author_str
                    right = site_info.get('organization', 'unknown')
                    details = format_three_line_mismatch("Author/organization mismatch", left, right)
                    errors.append({
                        "warning_type": "author",
                        "warning_details": details
                    })

            logger.debug(f"Web page verification completed for: {web_url}")
            return verified_data, errors, web_url

        except Exception as e:
            logger.error(f"Error parsing web page {web_url}: {e}")
            return None, [{"error_type": "unverified", "error_details": f"Error parsing page: {str(e)}"}], web_url

    def _extract_page_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the page title"""
        # Try <title> tag
        title_tag = soup.find('title')
        if title_tag and title_tag.text.strip():
            return title_tag.text.strip()

        # Try <h1> tag
        h1_tag = soup.find('h1')
        if h1_tag and h1_tag.text.strip():
            return h1_tag.text.strip()

        # Try meta property title
        meta_title = soup.find('meta', {'property': 'og:title'})
        if meta_title and meta_title.get('content'):
            return meta_title['content'].strip()

        return None

    def _extract_description(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract page description"""
        # Try meta description
        meta_desc = soup.find('meta', {'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        # Try OpenGraph description
        og_desc = soup.find('meta', {'property': 'og:description'})
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        # Try first paragraph
        first_p = soup.find('p')
        if first_p and first_p.text.strip():
            return first_p.text.strip()[:500]  # Limit length

        return None

    def _extract_site_info(self, soup: BeautifulSoup, url: str) -> Dict[str, str]:
        """Extract information about the website/organization"""
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()

        site_info = {
            'domain': domain,
            'organization': self._determine_organization(domain),
            'site_type': self._determine_site_type(domain, url)
        }

        # Try to extract more specific site info
        generator = soup.find('meta', {'name': 'generator'})
        if generator and generator.get('content'):
            site_info['generator'] = generator['content']

        return site_info

    def _determine_organization(self, domain: str) -> str:
        """Determine the organization from domain"""
        org_map = {
            'onnxruntime.ai': 'ONNX Runtime',
            'readthedocs.io': 'ReadTheDocs',
            'pytorch.org': 'PyTorch',
            'tensorflow.org': 'TensorFlow',
            'huggingface.co': 'Hugging Face',
            'openai.com': 'OpenAI',
            'microsoft.com': 'Microsoft',
            'google.com': 'Google',
            'nvidia.com': 'NVIDIA',
            'intel.com': 'Intel',
            'deepspeed.ai': 'DeepSpeed',
            'langchain.com': 'LangChain'
        }

        for domain_key, org in org_map.items():
            if domain_key in domain:
                return org

        # Extract organization from domain
        if 'readthedocs.io' in domain:
            # Extract project name from readthedocs URL
            parts = domain.split('.')
            if len(parts) >= 3 and parts[-2] == 'readthedocs':
                return parts[0].title()

        # Generic extraction
        domain_parts = domain.replace('www.', '').split('.')
        if domain_parts:
            return domain_parts[0].title()

        return domain

    def _determine_site_type(self, domain: str, url: str) -> str:
        """Determine the type of website"""
        if 'readthedocs.io' in domain:
            return 'documentation'
        elif any(indicator in url.lower() for indicator in ['docs', 'documentation']):
            return 'documentation'
        elif any(indicator in url.lower() for indicator in ['api', 'reference']):
            return 'api_documentation'
        elif any(indicator in url.lower() for indicator in ['tutorial', 'guide', 'help']):
            return 'tutorial'
        elif any(indicator in url.lower() for indicator in ['blog', 'post']):
            return 'blog'
        else:
            return 'website'

    def _check_title_match(self, cited_title: str, page_title: str, page_description: str = None) -> bool:
        """Check if cited title matches page content"""
        cited_lower = cited_title.lower().strip()
        page_title_lower = page_title.lower().strip() if page_title else ''

        # Direct substring match
        if cited_lower in page_title_lower or page_title_lower in cited_lower:
            return True

        # Check key terms
        cited_words = set(word.strip('.,;:()[]') for word in cited_lower.split() if len(word.strip('.,;:()[]')) > 3)
        page_words = set(word.strip('.,;:()[]') for word in page_title_lower.split() if len(word.strip('.,;:()[]')) > 3)

        # If description is available, include it
        if page_description:
            desc_words = set(word.strip('.,;:()[]') for word in page_description.lower().split() if len(word.strip('.,;:()[]')) > 3)
            page_words.update(desc_words)

        # Check for significant overlap
        common_words = cited_words.intersection(page_words)
        if len(common_words) >= min(2, len(cited_words) // 2):
            return True

        # Check for technical terms that indicate same topic
        tech_terms = {'api', 'documentation', 'guide', 'tutorial', 'reference', 'docs'}
        if cited_words.intersection(tech_terms) and page_words.intersection(tech_terms):
            # If both mention technical terms, be more lenient
            return len(common_words) >= 1

        return False

    def _check_topic_relevance(self, cited_title: str, page_title: str, page_description: str, soup: BeautifulSoup) -> bool:
        """Check if the page content is relevant to the cited topic"""
        cited_lower = cited_title.lower()

        # Extract main content text for analysis
        content_text = ""
        if page_title:
            content_text += page_title.lower() + " "
        if page_description:
            content_text += page_description.lower() + " "

        # Get some body text
        main_content = soup.find('main') or soup.find('div', {'class': re.compile(r'content|main|body')}) or soup.find('body')
        if main_content:
            # Get first few paragraphs
            paragraphs = main_content.find_all('p')[:5]
            for p in paragraphs:
                content_text += p.text.lower() + " "

        # Extract key terms from cited title
        cited_terms = [word.strip('.,;:()[]') for word in cited_lower.split() if len(word.strip('.,;:()[]')) > 3]

        # Check if most key terms appear in content
        matches = sum(1 for term in cited_terms if term in content_text)
        return matches >= len(cited_terms) // 2  # At least half the terms should match

    def _determine_authors(self, cited_authors: List[str], site_info: Dict[str, str], url: str) -> List[str]:
        """Determine appropriate authors based on site info"""
        if not cited_authors:
            return [site_info.get('organization', 'Unknown')]

        # For web pages, often the organization is the "author"
        return cited_authors

    def _check_author_match(self, cited_authors: str, site_info: Dict[str, str], url: str) -> bool:
        """Check if cited authors match the website organization"""
        cited_lower = cited_authors.lower().strip()
        organization = site_info.get('organization', '').lower()
        domain = site_info.get('domain', '').lower()

        # Accept generic web resource terms - these are valid for any web URL
        generic_web_terms = [
            'web resource', 'web site', 'website', 'online resource',
            'online', 'web', 'internet resource', 'web page', 'webpage'
        ]
        if cited_lower in generic_web_terms:
            return True

        # Direct matches
        if cited_lower in organization or organization in cited_lower:
            return True

        # Handle common abbreviations and variations
        author_patterns = {
            'o. runtime': ['onnx', 'runtime', 'onnxruntime'],
            'deepspeed': ['deepspeed', 'microsoft'],
            'openai': ['openai', 'open ai'],
            'hugging face': ['huggingface', 'hf', 'h.f.'],
            'google': ['google', 'alphabet'],
            'microsoft': ['microsoft', 'ms', 'msft'],
        }

        for pattern, variants in author_patterns.items():
            if any(variant in cited_lower for variant in variants):
                if any(variant in organization or variant in domain for variant in variants):
                    return True

        # Check if domain contains author info
        if any(word in domain for word in cited_lower.split() if len(word) > 3):
            return True

        # For documentation sites, be more lenient
        if site_info.get('site_type') in ['documentation', 'api_documentation']:
            return True  # Documentation authorship is often ambiguous

        return False

    def _handle_pdf_reference(self, reference, response, web_url):
        """Handle PDF document references"""
        logger.debug(f"Handling PDF reference: {web_url}")

        # For PDFs, we can't extract much content, so we do basic verification
        verified_data = {
            'title': reference.get('title', ''),
            'authors': reference.get('authors', []),
            'year': reference.get('year'),
            'venue': 'PDF Document',
            'url': web_url,
            'web_metadata': {
                'content_type': response.headers.get('content-type', ''),
                'content_length': response.headers.get('content-length', ''),
                'final_url': response.url,
                'status_code': response.status_code
            }
        }

        # For PDFs, we can't do much content verification, so just check if it's accessible
        errors = []

        # Check if the URL is from a reputable source
        domain = urlparse(web_url).netloc.lower()
        if not any(trusted in domain for trusted in ['intel.com', 'nvidia.com', 'microsoft.com', 'google.com', 'openai.com']):
            errors.append({
                "warning_type": "source",
                "warning_details": f"PDF from unverified domain: {domain}"
            })

        return verified_data, errors, web_url

    def _handle_blocked_resource(self, reference, web_url):
        """Handle resources that return 403 (blocked by bot detection)"""
        logger.debug(f"Handling blocked resource: {web_url}")

        # For blocked resources, we can still do basic verification based on URL patterns
        domain = urlparse(web_url).netloc.lower()

        # Determine if this is a trusted domain
        trusted_domains = [
            'intel.com', 'nvidia.com', 'microsoft.com', 'google.com', 'openai.com',
            'adobe.com', 'apple.com', 'arxiv.org', 'ieee.org', 'acm.org',
            'arxiv.org', 'semanticscholar.org'
        ]

        is_trusted = any(trusted in domain for trusted in trusted_domains)

        verified_data = {
            'title': reference.get('title', ''),
            'authors': reference.get('authors', []),
            'year': reference.get('year'),
            'venue': 'PDF Document' if web_url.lower().endswith('.pdf') else 'Web Page',
            'url': web_url,
            'web_metadata': {
                'status_code': 403,
                'domain': domain,
                'trusted_domain': is_trusted,
                'access_blocked': True
            }
        }

        errors = []
        if not is_trusted:
            errors.append({
                "warning_type": "access",
                "warning_details": f"Access blocked by site (403) and domain not in trusted list: {domain}"
            })
        else:
            # For trusted domains that block access, we assume the resource exists
            errors.append({
                "warning_type": "access",
                "warning_details": "Access blocked by site but domain is trusted (likely bot protection)"
            })

        return verified_data, errors, web_url

    def check_unverified_url_reference(self, reference: Dict[str, Any]) -> str:
        """
        Check a URL from an unverified reference to determine the specific unverified reason

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            String with the specific unverified reason:
            - "non-existent web page" if the page doesn't exist
            - "paper not found and URL doesn't reference it" if page exists but doesn't contain title
            - "paper not verified but URL references paper" if page exists and contains title
        """
        logger.debug(f"Checking unverified URL reference: {reference.get('title', 'Untitled')}")

        # Extract URL from reference
        web_url = reference.get('url', '').strip()
        if not web_url:
            return "paper not found and URL doesn't reference it"  # No URL to check

        # Make request to check if page exists
        response = self._respectful_request(web_url)
        if response is None:
            return "non-existent web page"

        if response.status_code == 404:
            return "non-existent web page"
        elif response.status_code == 403:
            # For blocked resources, we can't check content but assume page exists
            return "paper not verified but URL references paper"
        elif response.status_code != 200:
            return "non-existent web page"

        try:
            # Parse HTML content to search for title
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
                # For PDFs, we can't search content, so assume it's referenced if accessible
                return "paper not verified but URL references paper"

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract page content for searching
            page_title = self._extract_page_title(soup)
            page_description = self._extract_description(soup)

            # Get the full page text for comprehensive searching
            page_text = soup.get_text().lower()

            # Get the reference title to search for
            cited_title = reference.get('title', '').strip()
            if not cited_title:
                return "paper not found and URL doesn't reference it"

            # Search for the title in various ways
            cited_title_lower = cited_title.lower()

            # Direct search in page text
            if cited_title_lower in page_text:
                return "paper not verified but URL references paper"

            # Search for key words from the title
            cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
                              if len(word.strip('.,;:()[]{}')) > 3)

            # Check if significant portion of title words appear in page
            page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
                             if len(word.strip('.,;:()[]{}')) > 3)

            common_words = cited_words.intersection(page_words)

            # If most of the title words are found, consider it referenced
            if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
                return "paper not verified but URL references paper"

            # Also check the extracted title and description specifically
            if page_title:
                if self._check_title_match(cited_title, page_title, page_description):
                    return "paper not verified but URL references paper"

            # Title not found in page content
            return "paper not found and URL doesn't reference it"

        except Exception as e:
            logger.error(f"Error checking unverified URL {web_url}: {e}")
            return "paper not found and URL doesn't reference it"

    def verify_raw_url_for_unverified_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a raw URL from an unverified reference - can return verified data if appropriate

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, url) where:
            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
            - errors: List of error dictionaries with specific unverified reasons
            - url: The URL that was checked
        """
        logger.debug(f"Verifying raw URL for unverified reference: {reference.get('title', 'Untitled')}")

        # Extract URL from reference
        web_url = reference.get('url', '').strip()
        if not web_url:
            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], None

        # Make request to check if page exists
        response = self._respectful_request(web_url)
        if response is None:
            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url

        if response.status_code == 404:
            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
        elif response.status_code == 403:
            # For blocked resources, we can't check content but assume page exists
            # If no venue, treat as verified since URL is accessible
            if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
                verified_data = {
                    'title': reference.get('title', ''),
                    'authors': reference.get('authors', []),
                    'year': reference.get('year'),
                    'venue': 'Web Page',
                    'url': web_url,
                    'web_metadata': {
                        'status_code': 403,
                        'access_blocked': True
                    }
                }
                return verified_data, [], web_url
            else:
                return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
        elif response.status_code != 200:
            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url

        try:
            # Parse HTML content to search for title
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
                # For PDFs, if no venue specified, treat as verified
                if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
                    verified_data = {
                        'title': reference.get('title', ''),
                        'authors': reference.get('authors', []),
                        'year': reference.get('year'),
                        'venue': 'PDF Document',
                        'url': web_url,
                        'web_metadata': {
                            'content_type': response.headers.get('content-type', ''),
                            'status_code': response.status_code
                        }
                    }
                    return verified_data, [], web_url
                else:
                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract page content for searching
            page_title = self._extract_page_title(soup)
            page_description = self._extract_description(soup)

            # Get the full page text for comprehensive searching
            page_text = soup.get_text().lower()

            # Get the reference title to search for
            cited_title = reference.get('title', '').strip()
            if not cited_title:
                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url

            # Search for the title in various ways
            cited_title_lower = cited_title.lower()
            title_found = False

            # Direct search in page text
            if cited_title_lower in page_text:
                title_found = True

            # Search for key words from the title
            if not title_found:
                cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
                                  if len(word.strip('.,;:()[]{}')) > 3)

                # Check if significant portion of title words appear in page
                page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
                                 if len(word.strip('.,;:()[]{}')) > 3)

                common_words = cited_words.intersection(page_words)

                # If most of the title words are found, consider it referenced
                if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
                    title_found = True

            # Also check the extracted title and description specifically
            if not title_found and page_title:
                if self._check_title_match(cited_title, page_title, page_description):
                    title_found = True

            # Determine if this should be verified or unverified
            if title_found:
                # Check if reference should be verified based on venue type
                venue_field = reference.get('journal') or reference.get('venue') or reference.get('booktitle')

                if not venue_field:
                    # No venue specified - verify with URL as venue
                    site_info = self._extract_site_info(soup, web_url)
                    venue = site_info.get('organization', 'Web Page') if site_info.get('organization') != site_info.get('domain') else 'Web Page'

                    verified_data = {
                        'title': reference.get('title', ''),
                        'authors': reference.get('authors', []),
                        'year': reference.get('year'),
                        'venue': venue,
                        'url': web_url,
                        'web_metadata': {
                            'page_title': page_title,
                            'description': page_description,
                            'site_info': site_info,
                            'final_url': response.url,
                            'status_code': response.status_code
                        }
                    }
                    logger.debug(f"URL verified as valid source for reference without venue: {web_url}")
                    return verified_data, [], web_url
                elif self._is_web_content_venue(venue_field, web_url):
                    # Has venue but it's a web content venue (news, blog, etc.) - verify it
                    verified_data = {
                        'title': reference.get('title', ''),
                        'authors': reference.get('authors', []),
                        'year': reference.get('year'),
                        'venue': venue_field,  # Keep the original venue
                        'url': web_url,
                        'web_metadata': {
                            'page_title': page_title,
                            'description': page_description,
                            'site_info': self._extract_site_info(soup, web_url),
                            'final_url': response.url,
                            'status_code': response.status_code
                        }
                    }
                    logger.debug(f"URL verified as valid web content source: {web_url}")
                    return verified_data, [], web_url
                else:
                    # Has academic venue but URL references paper - still unverified (needs proper paper verification)
                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
            else:
                # Title not found in page content
                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url

        except Exception as e:
            logger.error(f"Error checking raw URL {web_url}: {e}")
            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url

    def _is_web_content_venue(self, venue: str, url: str) -> bool:
        """
        Determine if a venue represents web content rather than academic publication

        Args:
            venue: The venue string (journal, venue, or booktitle)
            url: The URL being checked (for additional context)

        Returns:
            True if this represents web content that can be verified via URL
        """
        if not venue:
            return False

        venue_lower = venue.lower().strip()

        # News organizations and media outlets
        news_indicators = [
            'news', 'cbc', 'bbc', 'cnn', 'reuters', 'associated press', 'ap news',
            'npr', 'pbs', 'abc news', 'nbc news', 'fox news', 'guardian', 'times',
            'post', 'herald', 'tribune', 'gazette', 'chronicle', 'observer',
            'magazine', 'weekly', 'daily', 'today', 'report', 'wire', 'press'
        ]

        # Special case for Wall Street Journal
        if any(word in venue_lower for word in ['wall street', 'wsj']):
            news_indicators.append('journal')

        # Technology and industry publications
        tech_publications = [
            'techcrunch', 'wired', 'ars technica', 'the verge', 'engadget',
            'zdnet', 'cnet', 'computerworld', 'infoworld', 'pcmag', 'pcworld',
            'ieee spectrum', 'mit technology review', 'scientific american'
        ]

        # Blogs and web platforms
        blog_platforms = [
            'blog', 'medium', 'substack', 'wordpress', 'blogspot', 'tumblr',
            'linkedin', 'facebook', 'twitter', 'reddit', 'stack overflow',
            'github pages', 'personal website', 'company blog'
        ]

        # Government and organizational websites
        org_indicators = [
            'government', 'gov', '.org', 'agency', 'department', 'ministry',
            'commission', 'bureau', 'office', 'administration', 'institute',
            'foundation', 'association', 'society', 'center', 'centre',
            'council', 'committee', 'board', 'union', 'federation', 'alliance',
            'coalition', 'consortium', 'network', 'group', 'organization',
            'organisation', 'corp', 'corporation', 'company', 'ltd', 'inc'
        ]

        # Documentation and technical resources
        tech_resources = [
            'documentation', 'docs', 'api', 'reference', 'guide', 'tutorial',
            'manual', 'readme', 'wiki', 'help', 'support', 'developer',
            'technical', 'white paper', 'whitepaper', 'brief', 'overview',
            'policy', 'strategy', 'report', 'study', 'analysis', 'research'
        ]

        # Check URL domain for additional context
        url_lower = url.lower() if url else ''

        # Known web content domains in URL
        web_domains = [
            'cbc.ca', 'bbc.com', 'cnn.com', 'reuters.com', 'npr.org', 'pbs.org',
            'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'wsj.com',
            'techcrunch.com', 'wired.com', 'theverge.com', 'arstechnica.com',
            'medium.com', 'substack.com', 'linkedin.com', 'github.io',
            'readthedocs.io', 'stackoverflow.com', 'reddit.com'
        ]

        # Combine all indicators
        all_indicators = news_indicators + tech_publications + blog_platforms + org_indicators + tech_resources

        # Academic venue indicators that should NOT be considered web content
        academic_indicators = [
            'proceedings', 'conference', 'symposium', 'workshop', 'transactions',
            'journal of', 'international journal', 'acm', 'ieee', 'springer',
            'nature', 'science', 'cell', 'lancet', 'plos', 'arxiv', 'pubmed',
            'artificial intelligence', 'machine learning', 'computer vision',
            'neural', 'computing', 'robotics', 'bioinformatics'
        ]

        # Check if venue is clearly academic (should not be treated as web content)
        is_academic = any(indicator in venue_lower for indicator in academic_indicators)
        if is_academic:
            return False

        # Check if venue matches any web content indicators
        venue_matches = any(indicator and indicator in venue_lower for indicator in all_indicators)

        # Check if URL domain suggests web content
        url_matches = any(domain in url_lower for domain in web_domains)

        # Special case: if URL contains news/blog/docs indicators, lean towards web content
        url_content_indicators = ['news', 'blog', 'post', 'article', 'docs', 'help', 'guide', 'resources', 'policy', 'strategy']
        url_has_content_indicators = any(indicator in url_lower for indicator in url_content_indicators)

        # Special case: Check if venue is an organizational acronym/name that matches the URL domain
        # This handles cases like "AECEA" on aecea.ca domain
        organizational_match = self._check_organizational_venue_match(venue, url_lower)

        return venue_matches or url_matches or url_has_content_indicators or organizational_match

    def _check_organizational_venue_match(self, venue: str, url_lower: str) -> bool:
        """
        Check if the venue represents an organization that matches the URL domain

        Args:
            venue: The venue string
            url_lower: The lowercased URL

        Returns:
            True if venue appears to be the organization publishing on their own domain
        """
        if not venue or not url_lower:
            return False

        venue_lower = venue.lower().strip()

        # Extract domain from URL
        from urllib.parse import urlparse
        try:
            parsed_url = urlparse(url_lower)
            domain = parsed_url.netloc.lower()

            # Remove common prefixes
            domain = domain.replace('www.', '')

            # Check if venue is likely an acronym (short, all caps or mixed case)
            is_likely_acronym = (len(venue) <= 10 and
                                 (venue.isupper() or
                                  any(c.isupper() for c in venue) and len(venue.split()) == 1))

            # Check if venue appears in domain
            venue_clean = ''.join(c for c in venue_lower if c.isalnum())

            if venue_clean and venue_clean in domain:
                return True

            # For acronyms, check if the acronym could match the domain
            if is_likely_acronym:
                # Split venue into words and check if initials match domain
                venue_words = venue_lower.replace('.', ' ').split()
                if len(venue_words) == 1 and len(venue_words[0]) <= 6:
                    # Single word acronym - check if it's in the domain
                    if venue_words[0] in domain:
                        return True

            # Check for educational/professional associations with .ca, .org, .edu domains
            if any(domain.endswith(tld) for tld in ['.ca', '.org', '.edu', '.gov']):
                # These domains often host organizational content
                if any(org_word in venue_lower for org_word in [
                    'association', 'society', 'institute', 'foundation', 'center',
                    'centre', 'council', 'committee', 'board', 'agency', 'department'
                ]):
                    return True

                # Check if venue is a short organizational name/acronym
                if is_likely_acronym:
                    return True

            return False

        except Exception:
            return False
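
For orientation, a minimal usage sketch of the WebPageChecker API added above; the reference fields (title, authors, year, url) follow the docstrings in the diff, and the example reference and URL are only illustrative, not taken from the package.

    from refchecker.checkers.webpage_checker import WebPageChecker

    # Instantiate with a 1-second delay between requests (the default).
    checker = WebPageChecker(request_delay=1.0)

    # A hypothetical documentation reference; any of these fields may be absent.
    reference = {
        'title': 'PyTorch documentation',
        'authors': ['PyTorch'],
        'year': 2024,
        'url': 'https://pytorch.org/docs/stable/index.html',
    }

    # Only verify URLs the checker considers web pages (GitHub and CorpusID URLs are skipped).
    if checker.is_web_page_url(reference['url']):
        verified_data, errors, url = checker.verify_reference(reference)
        print(verified_data, errors, url)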