syscred 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syscred/__init__.py +41 -0
- syscred/api_clients.py +560 -0
- syscred/backend_app.py +363 -0
- syscred/config.py +275 -0
- syscred/database.py +54 -0
- syscred/debug_factcheck.py +43 -0
- syscred/debug_graph_json.py +58 -0
- syscred/debug_init.py +33 -0
- syscred/debug_local_server.py +25 -0
- syscred/diagnose_imports.py +37 -0
- syscred/eval_metrics.py +349 -0
- syscred/graph_rag.py +171 -0
- syscred/ir_engine.py +410 -0
- syscred/ontology_manager.py +509 -0
- syscred/run_benchmark.py +135 -0
- syscred/seo_analyzer.py +610 -0
- syscred/setup.py +65 -0
- syscred/test_graphrag.py +87 -0
- syscred/test_phase1.py +28 -0
- syscred/test_phase2.py +55 -0
- syscred/test_suite.py +64 -0
- syscred/verification_system.py +765 -0
- syscred-2.2.0.dist-info/METADATA +259 -0
- syscred-2.2.0.dist-info/RECORD +28 -0
- syscred-2.2.0.dist-info/WHEEL +5 -0
- syscred-2.2.0.dist-info/entry_points.txt +3 -0
- syscred-2.2.0.dist-info/licenses/LICENSE +21 -0
- syscred-2.2.0.dist-info/top_level.txt +1 -0
syscred/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""
SysCRED - Système Neuro-Symbolique de Vérification de Crédibilité
===================================================================

PhD Thesis Prototype - (c) Dominique S. Loyer
Citation Key: loyerModelingHybridSystem2025

Modules:
- api_clients: Web scraping, WHOIS, Fact Check APIs
- ir_engine: BM25, QLD, TF-IDF, PRF (from TREC)
- seo_analyzer: SEO analysis, PageRank estimation
- eval_metrics: MAP, NDCG, P@K, Recall, MRR
- ontology_manager: RDFLib integration
- verification_system: Main credibility pipeline
"""

# Kept in sync with the distributed package version (syscred 2.2.0);
# this previously lagged behind at "2.0.0".
__version__ = "2.2.0"
__author__ = "Dominique S. Loyer"
__citation__ = "loyerModelingHybridSystem2025"

# Core classes re-exported at package level so callers can write
# `from syscred import SysCRED` instead of importing submodules.
from syscred.verification_system import CredibilityVerificationSystem
from syscred.api_clients import ExternalAPIClients
from syscred.ontology_manager import OntologyManager
from syscred.seo_analyzer import SEOAnalyzer
from syscred.ir_engine import IREngine
from syscred.eval_metrics import EvaluationMetrics

# Convenience alias
SysCRED = CredibilityVerificationSystem

__all__ = [
    'CredibilityVerificationSystem',
    'SysCRED',
    'ExternalAPIClients',
    'OntologyManager',
    'SEOAnalyzer',
    'IREngine',
    'EvaluationMetrics',
]
|
syscred/api_clients.py
ADDED
|
@@ -0,0 +1,560 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
API Clients Module - SysCRED
|
|
4
|
+
============================
|
|
5
|
+
Handles all external API calls for the credibility verification system.
|
|
6
|
+
|
|
7
|
+
APIs intégrées:
|
|
8
|
+
- Web content fetching (requests + BeautifulSoup)
|
|
9
|
+
- WHOIS lookup for domain age
|
|
10
|
+
- Google Fact Check Tools API
|
|
11
|
+
- Backlinks estimation via CommonCrawl
|
|
12
|
+
|
|
13
|
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
|
14
|
+
Citation Key: loyerModelingHybridSystem2025
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import requests
|
|
18
|
+
from urllib.parse import urlparse
|
|
19
|
+
from datetime import datetime, timedelta
|
|
20
|
+
from typing import Optional, List, Dict, Any
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
import re
|
|
23
|
+
import json
|
|
24
|
+
from functools import lru_cache
|
|
25
|
+
|
|
26
|
+
# Optional imports with fallbacks
|
|
27
|
+
try:
|
|
28
|
+
from bs4 import BeautifulSoup
|
|
29
|
+
HAS_BS4 = True
|
|
30
|
+
except ImportError:
|
|
31
|
+
HAS_BS4 = False
|
|
32
|
+
print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")
|
|
33
|
+
|
|
34
|
+
try:
|
|
35
|
+
import whois
|
|
36
|
+
HAS_WHOIS = True
|
|
37
|
+
except ImportError:
|
|
38
|
+
HAS_WHOIS = False
|
|
39
|
+
print("Warning: python-whois not installed. Run: pip install python-whois")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# --- Data Classes for Structured Results ---
|
|
43
|
+
|
|
44
|
+
@dataclass
class WebContent:
    """Represents fetched web content.

    Produced by ExternalAPIClients.fetch_web_content(). On failure,
    ``success`` is False, ``error`` carries the reason, and the content
    fields hold empty defaults.
    """
    url: str                         # the URL that was requested
    title: Optional[str]             # <title> text, if present
    text_content: str                # visible page text (truncated by the fetcher)
    meta_description: Optional[str]  # content of <meta name="description">, if any
    meta_keywords: List[str]         # parsed <meta name="keywords"> entries
    links: List[str]                 # absolute (http...) hrefs found on the page
    fetch_timestamp: str             # ISO-8601 timestamp of when the fetch started
    success: bool                    # whether fetch + parse succeeded
    error: Optional[str] = None      # error description when success is False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class DomainInfo:
    """Represents domain WHOIS information.

    Produced by ExternalAPIClients.whois_lookup(). On failure, ``success``
    is False, ``error`` holds the reason, and the other fields are None.
    """
    domain: str                          # domain name (scheme and 'www.' stripped)
    creation_date: Optional[datetime]    # registration date from WHOIS, if known
    expiration_date: Optional[datetime]  # expiry date from WHOIS, if known
    registrar: Optional[str]             # registrar name from WHOIS
    age_days: Optional[int]              # days since creation_date, when computable
    success: bool                        # whether the WHOIS lookup succeeded
    error: Optional[str] = None          # error description when success is False
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class FactCheckResult:
    """Represents a single fact-check claim review.

    One instance per claimReview entry returned by the Google Fact Check
    Tools API, or by the local heuristic fallback.
    """
    claim: str                  # the claim text that was reviewed
    claimant: Optional[str]     # who made the claim, if reported
    rating: str                 # textual rating (e.g. 'False', 'Needs Verification')
    publisher: str              # name of the fact-checking organization
    url: str                    # link to the published review ('' when none)
    review_date: Optional[str]  # review date string as reported by the API
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
class ExternalData:
    """Combined external data for credibility analysis.

    Aggregated by ExternalAPIClients.fetch_external_data().
    """
    fact_checks: List[FactCheckResult]      # matching fact-check reviews (possibly empty)
    source_reputation: str                  # 'High' / 'Medium' / 'Low' / 'Unknown'
    domain_age_days: Optional[int]          # domain age in days, when WHOIS succeeded
    domain_info: Optional[DomainInfo]       # full WHOIS result (None for non-URL input)
    related_articles: List[Dict[str, str]]  # reserved; currently always empty
    backlinks_count: int                    # heuristic backlink estimate
    backlinks_sample: List[Dict[str, str]]  # sample backlinks (empty for heuristic method)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ExternalAPIClients:
    """
    Central class for all external API integrations.
    Replaces simulated functions with real API calls.

    Services covered: web page fetching/parsing, WHOIS domain lookups,
    Google Fact Check Tools queries (with a heuristic fallback), source
    reputation lookup, and a heuristic backlink estimator.
    """

    def __init__(self, google_api_key: Optional[str] = None):
        """
        Initialize API clients.

        Args:
            google_api_key: API key for Google Fact Check Tools API (optional)
        """
        self.google_api_key = google_api_key
        self.session = requests.Session()
        # Browser-like headers: some sites reject the default
        # python-requests user agent.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,fr;q=0.8',
            'Referer': 'https://www.google.com/',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1'
        })

        # Per-instance WHOIS memoization (see whois_lookup). Replaces the
        # previous @lru_cache on a bound method, which keyed on `self` and
        # kept every instance alive for the lifetime of the cache.
        self._whois_cache = {}

        # Reputation database (can be extended or loaded from file)
        self.known_reputations = {
            # High credibility sources
            'lemonde.fr': 'High',
            'nytimes.com': 'High',
            'reuters.com': 'High',
            'bbc.com': 'High',
            'theguardian.com': 'High',
            'apnews.com': 'High',
            'nature.com': 'High',
            'sciencedirect.com': 'High',
            'scholar.google.com': 'High',
            'factcheck.org': 'High',
            'snopes.com': 'High',
            'politifact.com': 'High',
            # Medium credibility
            'wikipedia.org': 'Medium',
            'medium.com': 'Medium',
            'huffpost.com': 'Medium',
            # Low credibility (known misinformation spreaders)
            'infowars.com': 'Low',
            'naturalnews.com': 'Low',
        }

    @staticmethod
    def _normalize_domain(url_or_domain: str) -> str:
        """Return the bare domain: strip URL scheme/path and a 'www.' prefix."""
        if url_or_domain.startswith('http'):
            domain = urlparse(url_or_domain).netloc
        else:
            domain = url_or_domain
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain

    def _failed_fetch(self, url: str, timestamp: str, error: str) -> "WebContent":
        """Build an empty WebContent describing a failed fetch."""
        return WebContent(
            url=url, title=None, text_content="",
            meta_description=None, meta_keywords=[], links=[],
            fetch_timestamp=timestamp, success=False, error=error
        )

    def fetch_web_content(self, url: str, timeout: int = 10) -> "WebContent":
        """
        Fetch and parse web content from a URL.

        Args:
            url: The URL to fetch
            timeout: Request timeout in seconds

        Returns:
            WebContent dataclass with extracted information; on any
            network or parsing error a failed WebContent is returned
            instead of raising.
        """
        timestamp = datetime.now().isoformat()

        if not HAS_BS4:
            return self._failed_fetch(url, timestamp, "BeautifulSoup not installed")

        try:
            try:
                response = self.session.get(url, timeout=timeout, allow_redirects=True)
                response.raise_for_status()
            except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
                # SECURITY NOTE: the retry below disables TLS certificate
                # verification (verify=False). Acceptable for best-effort
                # scraping of public pages, never for sensitive traffic.
                print(f"[SysCRED] SSL/Connection error for {url}. Retrying without verification...")
                # Suppress warnings for unverified HTTPS request
                import urllib3
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
                response = self.session.get(url, timeout=timeout, allow_redirects=True, verify=False)
                response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title. Guard against a <title> tag with no text:
            # soup.title.string is None there and .strip() would raise.
            title = None
            if soup.title and soup.title.string:
                title = soup.title.string.strip()

            # Extract meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            meta_description = meta_desc.get('content', '') if meta_desc else None

            # Extract meta keywords
            meta_kw = soup.find('meta', attrs={'name': 'keywords'})
            meta_keywords = []
            if meta_kw and meta_kw.get('content'):
                meta_keywords = [k.strip() for k in meta_kw.get('content', '').split(',')]

            # Remove boilerplate elements before extracting the main text
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            # Extract main text content and collapse excessive whitespace
            text_content = soup.get_text(separator=' ', strip=True)
            text_content = re.sub(r'\s+', ' ', text_content)

            # Extract absolute links only
            links = []
            for a_tag in soup.find_all('a', href=True)[:50]:  # Limit to 50 links
                href = a_tag['href']
                if href.startswith('http'):
                    links.append(href)

            return WebContent(
                url=url,
                title=title,
                text_content=text_content[:10000],  # Limit text size
                meta_description=meta_description,
                meta_keywords=meta_keywords,
                links=links,
                fetch_timestamp=timestamp,
                success=True
            )

        except requests.exceptions.Timeout:
            return self._failed_fetch(url, timestamp, f"Timeout after {timeout}s")
        except requests.exceptions.RequestException as e:
            return self._failed_fetch(url, timestamp, str(e))
        except Exception as e:
            return self._failed_fetch(url, timestamp, f"Parsing error: {str(e)}")

    def whois_lookup(self, url_or_domain: str) -> "DomainInfo":
        """
        Perform WHOIS lookup to get domain registration information.

        Results are memoized per instance, so repeated lookups of the same
        domain hit the network only once.

        Args:
            url_or_domain: URL or domain name

        Returns:
            DomainInfo dataclass with domain details
        """
        domain = self._normalize_domain(url_or_domain)

        # Lazy cache init so even instances built without __init__
        # (e.g. in tests) behave correctly.
        cache = getattr(self, '_whois_cache', None)
        if cache is None:
            cache = self._whois_cache = {}
        if domain in cache:
            return cache[domain]

        info = self._do_whois(domain)
        cache[domain] = info
        return info

    def _do_whois(self, domain: str) -> "DomainInfo":
        """Run the actual (uncached) WHOIS query for a bare domain name."""
        if not HAS_WHOIS:
            return DomainInfo(
                domain=domain,
                creation_date=None, expiration_date=None,
                registrar=None, age_days=None,
                success=False, error="python-whois not installed"
            )

        try:
            w = whois.whois(domain)

            # WHOIS servers may return a list of dates; take the first.
            creation_date = w.creation_date
            if isinstance(creation_date, list):
                creation_date = creation_date[0]

            expiration_date = w.expiration_date
            if isinstance(expiration_date, list):
                expiration_date = expiration_date[0]

            # Age is only computable when we got a real datetime back
            # (some registries return bare strings).
            age_days = None
            if creation_date and isinstance(creation_date, datetime):
                age_days = (datetime.now() - creation_date).days

            return DomainInfo(
                domain=domain,
                creation_date=creation_date,
                expiration_date=expiration_date,
                registrar=w.registrar,
                age_days=age_days,
                success=True
            )

        except Exception as e:
            return DomainInfo(
                domain=domain,
                creation_date=None, expiration_date=None,
                registrar=None, age_days=None,
                success=False, error=str(e)
            )

    def google_fact_check(self, query: str, language: str = "fr") -> List["FactCheckResult"]:
        """
        Query Google Fact Check Tools API.

        Falls back to a local heuristic when no API key is configured or
        the API call fails.

        Args:
            query: The claim or text to check
            language: Language code (default: French; currently unused, see below)

        Returns:
            List of FactCheckResult objects (at most 5 claims)
        """
        results = []

        if not self.google_api_key:
            print("[Info] Google Fact Check API key not configured. Using simulation.")
            return self._simulate_fact_check(query)

        try:
            api_url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
            params = {
                'key': self.google_api_key,
                'query': query[:200],  # API has character limit
                # 'languageCode': language # Removed to allow all languages (e.g. English queries)
            }

            response = self.session.get(api_url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            claims = data.get('claims', [])
            for claim in claims[:5]:  # Limit to 5 results
                text = claim.get('text', '')
                claimant = claim.get('claimant')

                # Each claim can carry several independent reviews.
                for review in claim.get('claimReview', []):
                    results.append(FactCheckResult(
                        claim=text,
                        claimant=claimant,
                        rating=review.get('textualRating', 'Unknown'),
                        publisher=review.get('publisher', {}).get('name', 'Unknown'),
                        url=review.get('url', ''),
                        review_date=review.get('reviewDate')
                    ))

            return results

        except Exception as e:
            print(f"[Warning] Google Fact Check API error: {e}")
            return self._simulate_fact_check(query)

    def _simulate_fact_check(self, query: str) -> List["FactCheckResult"]:
        """Fallback simulation when API is not available.

        Flags the query when it contains a known misinformation marker;
        otherwise returns an empty list.
        """
        misinformation_keywords = [
            'conspiracy', 'hoax', 'fake', 'miracle cure', 'they don\'t want you to know',
            'mainstream media lies', 'deep state', 'plandemic'
        ]

        query_lower = query.lower()
        for keyword in misinformation_keywords:
            if keyword in query_lower:
                return [FactCheckResult(
                    claim=f"Text contains potential misinformation marker: '{keyword}'",
                    claimant=None,
                    rating="Needs Verification",
                    publisher="SysCRED Heuristic",
                    url="",
                    review_date=datetime.now().isoformat()
                )]

        return []  # No fact checks found

    def get_source_reputation(self, url: str) -> str:
        """
        Get reputation score for a source/domain.

        Args:
            url: URL or domain to check

        Returns:
            Reputation level: 'High', 'Medium', 'Low', or 'Unknown'
        """
        domain = self._normalize_domain(url)

        # Exact or subdomain match against the curated list. A plain
        # endswith/substring test would let 'notnytimes.com' spoof
        # 'nytimes.com'.
        for known_domain, reputation in self.known_reputations.items():
            if domain == known_domain or domain.endswith('.' + known_domain):
                return reputation

        # Heuristics for unknown domains:
        # academic/government domains tend to be more credible
        if domain.endswith('.edu') or domain.endswith('.gov') or domain.endswith('.ac.uk'):
            return 'High'

        # Personal sites and free hosting are less credible
        if any(x in domain for x in ['.blogspot.', '.wordpress.', '.wix.', '.weebly.']):
            return 'Low'

        return 'Unknown'

    def estimate_backlinks(self, url: str) -> Dict[str, Any]:
        """
        Estimate relative authority/backlinks based on available signals.

        Since real backlink databases (Ahrefs, Moz) are paid/proprietary,
        we use a composite heuristic based on:
        1. Known reputation (High reputation sources imply high backlinks)
        2. Domain age (older domains tend to have more backlinks)
        3. TLD bonus for .edu/.gov domains

        Returns:
            Dict with 'estimated_count', 'sample_backlinks', 'method', 'note'
        """
        domain = self._normalize_domain(url)

        # 1. Base Score from Reputation
        reputation = self.get_source_reputation(domain)
        if reputation == 'High':
            base_count = 10000  # High authority
        elif reputation == 'Medium':
            base_count = 1000   # Medium authority
        elif reputation == 'Low':
            base_count = 50     # Low authority
        else:
            base_count = 100    # Unknown

        # 2. Multiplier from Domain Age: +10% per year of age, capped at 5x
        age_multiplier = 1.0
        domain_info = self.whois_lookup(domain)
        if domain_info.success and domain_info.age_days:
            years = domain_info.age_days / 365
            age_multiplier = min(5.0, 1.0 + (years * 0.1))

        estimated_count = int(base_count * age_multiplier)

        # 3. Adjust for specific TLDs
        if domain.endswith('.edu') or domain.endswith('.gov'):
            estimated_count *= 2

        return {
            'estimated_count': estimated_count,
            'sample_backlinks': [],  # Real sample requires SERP API
            'method': 'heuristic_v2.1',
            'note': 'Estimated from domain age and reputation (Proxy)'
        }

    def fetch_external_data(self, input_data: str, fc_query: Optional[str] = None) -> "ExternalData":
        """
        Main method to fetch all external data for credibility analysis.
        This replaces the simulated fetch_external_data function.

        Args:
            input_data: URL or text to analyze
            fc_query: optional explicit query for the fact-check lookup;
                defaults to input_data itself

        Returns:
            ExternalData with all gathered information
        """
        # Determine if input is URL (scheme + netloc both present)
        is_url = False
        try:
            parsed = urlparse(input_data)
            is_url = all([parsed.scheme, parsed.netloc])
        except ValueError:
            # urlparse raises for a few malformed inputs (e.g. bad
            # IPv6 brackets); treat those as plain text.
            pass

        # Initialize results
        domain_age_days = None
        domain_info = None
        source_reputation = 'Unknown'
        fact_checks = []
        backlinks_data = {'estimated_count': 0, 'sample_backlinks': []}

        if is_url:
            # Get domain information
            domain_info = self.whois_lookup(input_data)
            if domain_info.success:
                domain_age_days = domain_info.age_days

            # Get source reputation
            source_reputation = self.get_source_reputation(input_data)

            # Get backlink estimation
            backlinks_data = self.estimate_backlinks(input_data)

        # Perform fact check on the content/URL.
        # Use provided query or fall back to input_data.
        query_to_use = fc_query if fc_query else input_data
        fact_checks = self.google_fact_check(query_to_use)

        return ExternalData(
            fact_checks=fact_checks,
            source_reputation=source_reputation,
            domain_age_days=domain_age_days,
            domain_info=domain_info,
            related_articles=[],  # TODO: Implement related article search
            backlinks_count=backlinks_data.get('estimated_count', 0),
            backlinks_sample=backlinks_data.get('sample_backlinks', [])
        )
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
# --- Testing ---
|
|
518
|
+
if __name__ == "__main__":
    # Manual smoke tests: exercise each public method against live endpoints.
    print("=== Testing ExternalAPIClients ===\n")

    api = ExternalAPIClients()

    # 1) Raw page fetch and parse
    print("Test 1: Fetching web content from Le Monde...")
    page = api.fetch_web_content("https://www.lemonde.fr")
    print(f" Success: {page.success}")
    print(f" Title: {page.title}")
    print(f" Text length: {len(page.text_content)} chars")
    print(f" Links found: {len(page.links)}")
    print()

    # 2) WHOIS domain lookup
    print("Test 2: WHOIS lookup for lemonde.fr...")
    dinfo = api.whois_lookup("https://www.lemonde.fr")
    print(f" Success: {dinfo.success}")
    print(f" Domain: {dinfo.domain}")
    print(f" Age: {dinfo.age_days} days")
    print(f" Registrar: {dinfo.registrar}")
    print()

    # 3) Reputation lookups across the credibility spectrum
    print("Test 3: Source reputation checks...")
    sample_urls = [
        "https://www.nytimes.com/article",
        "https://www.infowars.com/post",
        "https://random-blog.wordpress.com"
    ]
    for sample in sample_urls:
        reputation = api.get_source_reputation(sample)
        print(f" {sample}: {reputation}")
    print()

    # 4) End-to-end aggregation of all external signals
    print("Test 4: Full external data fetch...")
    combined = api.fetch_external_data("https://www.bbc.com/news")
    print(f" Source reputation: {combined.source_reputation}")
    print(f" Domain age: {combined.domain_age_days} days")
    print(f" Fact checks found: {len(combined.fact_checks)}")

    print("\n=== Tests Complete ===")
|