syscred 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syscred/__init__.py ADDED
@@ -0,0 +1,41 @@
1
# -*- coding: utf-8 -*-
"""
SysCRED - Neuro-Symbolic Credibility Verification System
=========================================================

PhD Thesis Prototype - (c) Dominique S. Loyer
Citation Key: loyerModelingHybridSystem2025

Modules:
- api_clients: Web scraping, WHOIS, Fact Check APIs
- ir_engine: BM25, QLD, TF-IDF, PRF (from TREC)
- seo_analyzer: SEO analysis, PageRank estimation
- eval_metrics: MAP, NDCG, P@K, Recall, MRR
- ontology_manager: RDFLib integration
- verification_system: Main credibility pipeline
"""

# Kept in sync with the distribution version (the wheel ships as 2.2.0;
# the previous "2.0.0" here was stale).
__version__ = "2.2.0"
__author__ = "Dominique S. Loyer"
__citation__ = "loyerModelingHybridSystem2025"

# Core classes re-exported at package level for convenient imports.
from syscred.verification_system import CredibilityVerificationSystem
from syscred.api_clients import ExternalAPIClients
from syscred.ontology_manager import OntologyManager
from syscred.seo_analyzer import SEOAnalyzer
from syscred.ir_engine import IREngine
from syscred.eval_metrics import EvaluationMetrics

# Convenience alias: `syscred.SysCRED` is the main pipeline entry point.
SysCRED = CredibilityVerificationSystem

__all__ = [
    'CredibilityVerificationSystem',
    'SysCRED',
    'ExternalAPIClients',
    'OntologyManager',
    'SEOAnalyzer',
    'IREngine',
    'EvaluationMetrics',
]
syscred/api_clients.py ADDED
@@ -0,0 +1,560 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ API Clients Module - SysCRED
4
+ ============================
5
+ Handles all external API calls for the credibility verification system.
6
+
7
+ APIs intégrées:
8
+ - Web content fetching (requests + BeautifulSoup)
9
+ - WHOIS lookup for domain age
10
+ - Google Fact Check Tools API
11
+ - Backlinks estimation via CommonCrawl
12
+
13
+ (c) Dominique S. Loyer - PhD Thesis Prototype
14
+ Citation Key: loyerModelingHybridSystem2025
15
+ """
16
+
17
+ import requests
18
+ from urllib.parse import urlparse
19
+ from datetime import datetime, timedelta
20
+ from typing import Optional, List, Dict, Any
21
+ from dataclasses import dataclass
22
+ import re
23
+ import json
24
+ from functools import lru_cache
25
+
26
# Optional third-party dependencies: probe each one and record its
# availability in a module-level flag so callers can degrade gracefully.
try:
    from bs4 import BeautifulSoup
except ImportError:
    HAS_BS4 = False
    print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")
else:
    HAS_BS4 = True

try:
    import whois
except ImportError:
    HAS_WHOIS = False
    print("Warning: python-whois not installed. Run: pip install python-whois")
else:
    HAS_WHOIS = True
40
+
41
+
42
+ # --- Data Classes for Structured Results ---
43
+
44
@dataclass
class WebContent:
    """Structured result of fetching and parsing one web page.

    Produced by ExternalAPIClients.fetch_web_content(); on failure
    `success` is False, `error` holds a human-readable reason, and the
    content fields are left empty.
    """
    url: str                         # the URL that was requested
    title: Optional[str]             # <title> text, if present
    text_content: str                # visible page text (truncated by the fetcher)
    meta_description: Optional[str]  # content of <meta name="description">, if any
    meta_keywords: List[str]         # parsed <meta name="keywords"> entries
    links: List[str]                 # absolute outbound links found on the page
    fetch_timestamp: str             # ISO-8601 timestamp taken at fetch start
    success: bool                    # True when fetch and parse both completed
    error: Optional[str] = None      # failure reason when success is False
56
+
57
+
58
@dataclass
class DomainInfo:
    """WHOIS registration details for a domain (see whois_lookup())."""
    domain: str                          # normalized domain (scheme and www. removed)
    creation_date: Optional[datetime]    # registration date, if resolvable
    expiration_date: Optional[datetime]  # expiry date, if resolvable
    registrar: Optional[str]             # registrar name reported by WHOIS
    age_days: Optional[int]              # days since creation_date, when known
    success: bool                        # False when the lookup failed
    error: Optional[str] = None          # failure reason when success is False
68
+
69
+
70
@dataclass
class FactCheckResult:
    """A single claim review, from the Google Fact Check Tools API or the
    built-in heuristic fallback (_simulate_fact_check)."""
    claim: str                  # the claim text that was reviewed
    claimant: Optional[str]     # who made the claim, when reported
    rating: str                 # textual verdict (e.g. "Needs Verification")
    publisher: str              # organization that published the review
    url: str                    # link to the full review (may be empty)
    review_date: Optional[str]  # review date string, if known
79
+
80
+
81
@dataclass
class ExternalData:
    """Aggregate of all external signals gathered for one input
    (see ExternalAPIClients.fetch_external_data())."""
    fact_checks: List[FactCheckResult]      # claim reviews matching the input
    source_reputation: str                  # 'High' / 'Medium' / 'Low' / 'Unknown'
    domain_age_days: Optional[int]          # WHOIS-derived age; URL inputs only
    domain_info: Optional[DomainInfo]       # full WHOIS record; URL inputs only
    related_articles: List[Dict[str, str]]  # reserved; currently always empty
    backlinks_count: int                    # heuristic backlink estimate
    backlinks_sample: List[Dict[str, str]]  # sample backlinks (currently empty)
91
+
92
+
93
+ class ExternalAPIClients:
94
+ """
95
+ Central class for all external API integrations.
96
+ Replaces simulated functions with real API calls.
97
+ """
98
+
99
    def __init__(self, google_api_key: Optional[str] = None):
        """
        Initialize API clients.

        Args:
            google_api_key: API key for Google Fact Check Tools API (optional).
                When omitted, fact checking falls back to a local heuristic.
        """
        self.google_api_key = google_api_key
        # One shared session so TCP connections are pooled across requests.
        self.session = requests.Session()
        # Browser-like headers: some sites reject the default python UA.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,fr;q=0.8',
            'Referer': 'https://www.google.com/',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1'
        })

        # Reputation database (can be extended or loaded from file).
        # Consulted by get_source_reputation() via suffix matching.
        self.known_reputations = {
            # High credibility sources
            'lemonde.fr': 'High',
            'nytimes.com': 'High',
            'reuters.com': 'High',
            'bbc.com': 'High',
            'theguardian.com': 'High',
            'apnews.com': 'High',
            'nature.com': 'High',
            'sciencedirect.com': 'High',
            'scholar.google.com': 'High',
            'factcheck.org': 'High',
            'snopes.com': 'High',
            'politifact.com': 'High',
            # Medium credibility
            'wikipedia.org': 'Medium',
            'medium.com': 'Medium',
            'huffpost.com': 'Medium',
            # Low credibility (known misinformation spreaders)
            'infowars.com': 'Low',
            'naturalnews.com': 'Low',
        }
143
+
144
+ def fetch_web_content(self, url: str, timeout: int = 10) -> WebContent:
145
+ """
146
+ Fetch and parse web content from a URL.
147
+
148
+ Args:
149
+ url: The URL to fetch
150
+ timeout: Request timeout in seconds
151
+
152
+ Returns:
153
+ WebContent dataclass with extracted information
154
+ """
155
+ timestamp = datetime.now().isoformat()
156
+
157
+ if not HAS_BS4:
158
+ return WebContent(
159
+ url=url, title=None, text_content="",
160
+ meta_description=None, meta_keywords=[],
161
+ links=[], fetch_timestamp=timestamp,
162
+ success=False, error="BeautifulSoup not installed"
163
+ )
164
+
165
+ try:
166
+ try:
167
+ response = self.session.get(url, timeout=timeout, allow_redirects=True)
168
+ response.raise_for_status()
169
+ except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
170
+ print(f"[SysCRED] SSL/Connection error for {url}. Retrying without verification...")
171
+ # Suppress warnings for unverified HTTPS request
172
+ import urllib3
173
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
174
+ response = self.session.get(url, timeout=timeout, allow_redirects=True, verify=False)
175
+ response.raise_for_status()
176
+
177
+ soup = BeautifulSoup(response.text, 'html.parser')
178
+
179
+ # Extract title
180
+ title = soup.title.string.strip() if soup.title else None
181
+
182
+ # Extract meta description
183
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
184
+ meta_description = meta_desc.get('content', '') if meta_desc else None
185
+
186
+ # Extract meta keywords
187
+ meta_kw = soup.find('meta', attrs={'name': 'keywords'})
188
+ meta_keywords = []
189
+ if meta_kw and meta_kw.get('content'):
190
+ meta_keywords = [k.strip() for k in meta_kw.get('content', '').split(',')]
191
+
192
+ # Remove script and style elements
193
+ for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
194
+ element.decompose()
195
+
196
+ # Extract main text content
197
+ text_content = soup.get_text(separator=' ', strip=True)
198
+ # Clean up excessive whitespace
199
+ text_content = re.sub(r'\s+', ' ', text_content)
200
+
201
+ # Extract links
202
+ links = []
203
+ for a_tag in soup.find_all('a', href=True)[:50]: # Limit to 50 links
204
+ href = a_tag['href']
205
+ if href.startswith('http'):
206
+ links.append(href)
207
+
208
+ return WebContent(
209
+ url=url,
210
+ title=title,
211
+ text_content=text_content[:10000], # Limit text size
212
+ meta_description=meta_description,
213
+ meta_keywords=meta_keywords,
214
+ links=links,
215
+ fetch_timestamp=timestamp,
216
+ success=True
217
+ )
218
+
219
+ except requests.exceptions.Timeout:
220
+ return WebContent(
221
+ url=url, title=None, text_content="",
222
+ meta_description=None, meta_keywords=[], links=[],
223
+ fetch_timestamp=timestamp, success=False,
224
+ error=f"Timeout after {timeout}s"
225
+ )
226
+ except requests.exceptions.RequestException as e:
227
+ return WebContent(
228
+ url=url, title=None, text_content="",
229
+ meta_description=None, meta_keywords=[], links=[],
230
+ fetch_timestamp=timestamp, success=False,
231
+ error=str(e)
232
+ )
233
+ except Exception as e:
234
+ return WebContent(
235
+ url=url, title=None, text_content="",
236
+ meta_description=None, meta_keywords=[], links=[],
237
+ fetch_timestamp=timestamp, success=False,
238
+ error=f"Parsing error: {str(e)}"
239
+ )
240
+
241
+ @lru_cache(maxsize=128)
242
+ def whois_lookup(self, url_or_domain: str) -> DomainInfo:
243
+ """
244
+ Perform WHOIS lookup to get domain registration information.
245
+
246
+ Args:
247
+ url_or_domain: URL or domain name
248
+
249
+ Returns:
250
+ DomainInfo dataclass with domain details
251
+ """
252
+ # Extract domain from URL if needed
253
+ if url_or_domain.startswith('http'):
254
+ domain = urlparse(url_or_domain).netloc
255
+ else:
256
+ domain = url_or_domain
257
+
258
+ # Remove 'www.' prefix
259
+ if domain.startswith('www.'):
260
+ domain = domain[4:]
261
+
262
+ if not HAS_WHOIS:
263
+ return DomainInfo(
264
+ domain=domain,
265
+ creation_date=None, expiration_date=None,
266
+ registrar=None, age_days=None,
267
+ success=False, error="python-whois not installed"
268
+ )
269
+
270
+ try:
271
+ w = whois.whois(domain)
272
+
273
+ # Handle creation_date (can be a list or single value)
274
+ creation_date = w.creation_date
275
+ if isinstance(creation_date, list):
276
+ creation_date = creation_date[0]
277
+
278
+ # Handle expiration_date
279
+ expiration_date = w.expiration_date
280
+ if isinstance(expiration_date, list):
281
+ expiration_date = expiration_date[0]
282
+
283
+ # Calculate age in days
284
+ age_days = None
285
+ if creation_date:
286
+ if isinstance(creation_date, datetime):
287
+ age_days = (datetime.now() - creation_date).days
288
+
289
+ return DomainInfo(
290
+ domain=domain,
291
+ creation_date=creation_date,
292
+ expiration_date=expiration_date,
293
+ registrar=w.registrar,
294
+ age_days=age_days,
295
+ success=True
296
+ )
297
+
298
+ except Exception as e:
299
+ return DomainInfo(
300
+ domain=domain,
301
+ creation_date=None, expiration_date=None,
302
+ registrar=None, age_days=None,
303
+ success=False, error=str(e)
304
+ )
305
+
306
+ def google_fact_check(self, query: str, language: str = "fr") -> List[FactCheckResult]:
307
+ """
308
+ Query Google Fact Check Tools API.
309
+
310
+ Args:
311
+ query: The claim or text to check
312
+ language: Language code (default: French)
313
+
314
+ Returns:
315
+ List of FactCheckResult objects
316
+ """
317
+ results = []
318
+
319
+ if not self.google_api_key:
320
+ print("[Info] Google Fact Check API key not configured. Using simulation.")
321
+ return self._simulate_fact_check(query)
322
+
323
+ try:
324
+ api_url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
325
+ params = {
326
+ 'key': self.google_api_key,
327
+ 'query': query[:200], # API has character limit
328
+ # 'languageCode': language # Removed to allow all languages (e.g. English queries)
329
+ }
330
+
331
+ response = self.session.get(api_url, params=params, timeout=10)
332
+ response.raise_for_status()
333
+ data = response.json()
334
+
335
+ claims = data.get('claims', [])
336
+ for claim in claims[:5]: # Limit to 5 results
337
+ text = claim.get('text', '')
338
+ claimant = claim.get('claimant')
339
+
340
+ for review in claim.get('claimReview', []):
341
+ results.append(FactCheckResult(
342
+ claim=text,
343
+ claimant=claimant,
344
+ rating=review.get('textualRating', 'Unknown'),
345
+ publisher=review.get('publisher', {}).get('name', 'Unknown'),
346
+ url=review.get('url', ''),
347
+ review_date=review.get('reviewDate')
348
+ ))
349
+
350
+ return results
351
+
352
+ except Exception as e:
353
+ print(f"[Warning] Google Fact Check API error: {e}")
354
+ return self._simulate_fact_check(query)
355
+
356
+ def _simulate_fact_check(self, query: str) -> List[FactCheckResult]:
357
+ """Fallback simulation when API is not available."""
358
+ # Check for known misinformation patterns
359
+ misinformation_keywords = [
360
+ 'conspiracy', 'hoax', 'fake', 'miracle cure', 'they don\'t want you to know',
361
+ 'mainstream media lies', 'deep state', 'plandemic'
362
+ ]
363
+
364
+ query_lower = query.lower()
365
+ for keyword in misinformation_keywords:
366
+ if keyword in query_lower:
367
+ return [FactCheckResult(
368
+ claim=f"Text contains potential misinformation marker: '{keyword}'",
369
+ claimant=None,
370
+ rating="Needs Verification",
371
+ publisher="SysCRED Heuristic",
372
+ url="",
373
+ review_date=datetime.now().isoformat()
374
+ )]
375
+
376
+ return [] # No fact checks found
377
+
378
+ @lru_cache(maxsize=128)
379
+ def get_source_reputation(self, url: str) -> str:
380
+ """
381
+ Get reputation score for a source/domain.
382
+
383
+ Args:
384
+ url: URL or domain to check
385
+
386
+ Returns:
387
+ Reputation level: 'High', 'Medium', 'Low', or 'Unknown'
388
+ """
389
+ if url.startswith('http'):
390
+ domain = urlparse(url).netloc
391
+ else:
392
+ domain = url
393
+
394
+ # Remove www prefix
395
+ if domain.startswith('www.'):
396
+ domain = domain[4:]
397
+
398
+ # Check known reputations
399
+ for known_domain, reputation in self.known_reputations.items():
400
+ if domain.endswith(known_domain) or known_domain in domain:
401
+ return reputation
402
+
403
+ # Heuristics for unknown domains
404
+ # Academic domains tend to be more credible
405
+ if domain.endswith('.edu') or domain.endswith('.gov') or domain.endswith('.ac.uk'):
406
+ return 'High'
407
+
408
+ # Personal sites and free hosting are less credible
409
+ if any(x in domain for x in ['.blogspot.', '.wordpress.', '.wix.', '.weebly.']):
410
+ return 'Low'
411
+
412
+ return 'Unknown'
413
+
414
+ def estimate_backlinks(self, url: str) -> Dict[str, Any]:
415
+ """
416
+ Estimate relative authority/backlinks based on available signals.
417
+
418
+ Since real backlink databases (Ahrefs, Moz) are paid/proprietary,
419
+ we use a composite heuristic based on:
420
+ 1. Domain age (older domains tend to have more backlinks)
421
+ 2. Known reputation (High reputation sources imply high backlinks)
422
+ 3. Google Fact Check mentions (as a proxy for visibility in fact-checks)
423
+ """
424
+ domain = urlparse(url).netloc
425
+ if domain.startswith('www.'):
426
+ domain = domain[4:]
427
+
428
+ # 1. Base Score from Reputation
429
+ reputation = self.get_source_reputation(domain)
430
+ base_count = 0
431
+ if reputation == 'High':
432
+ base_count = 10000 # High authority
433
+ elif reputation == 'Medium':
434
+ base_count = 1000 # Medium authority
435
+ elif reputation == 'Low':
436
+ base_count = 50 # Low authority
437
+ else:
438
+ base_count = 100 # Unknown
439
+
440
+ # 2. Multiplier from Domain Age
441
+ age_multiplier = 1.0
442
+ domain_info = self.whois_lookup(domain)
443
+ if domain_info.success and domain_info.age_days:
444
+ # Add 10% for every year of age, max 5x
445
+ years = domain_info.age_days / 365
446
+ age_multiplier = min(5.0, 1.0 + (years * 0.1))
447
+
448
+ estimated_count = int(base_count * age_multiplier)
449
+
450
+ # 3. Adjust for specific TLDs
451
+ if domain.endswith('.edu') or domain.endswith('.gov'):
452
+ estimated_count *= 2
453
+
454
+ return {
455
+ 'estimated_count': estimated_count,
456
+ 'sample_backlinks': [], # Real sample requires SERP API
457
+ 'method': 'heuristic_v2.1',
458
+ 'note': 'Estimated from domain age and reputation (Proxy)'
459
+ }
460
+
461
+ def fetch_external_data(self, input_data: str, fc_query: str = None) -> ExternalData:
462
+ """
463
+ Main method to fetch all external data for credibility analysis.
464
+ This replaces the simulated fetch_external_data function.
465
+
466
+ Args:
467
+ input_data: URL or text to analyze
468
+
469
+ Returns:
470
+ ExternalData with all gathered information
471
+ """
472
+ from urllib.parse import urlparse
473
+
474
+ # Determine if input is URL
475
+ is_url = False
476
+ try:
477
+ result = urlparse(input_data)
478
+ is_url = all([result.scheme, result.netloc])
479
+ except:
480
+ pass
481
+
482
+ # Initialize results
483
+ domain_age_days = None
484
+ domain_info = None
485
+ source_reputation = 'Unknown'
486
+ fact_checks = []
487
+ backlinks_data = {'estimated_count': 0, 'sample_backlinks': []}
488
+
489
+ if is_url:
490
+ # Get domain information
491
+ domain_info = self.whois_lookup(input_data)
492
+ if domain_info.success:
493
+ domain_age_days = domain_info.age_days
494
+
495
+ # Get source reputation
496
+ source_reputation = self.get_source_reputation(input_data)
497
+
498
+ # Get backlink estimation
499
+ backlinks_data = self.estimate_backlinks(input_data)
500
+
501
+ # Perform fact check on the content/URL
502
+ # Use provided query or fall back to input_data
503
+ query_to_use = fc_query if fc_query else input_data
504
+ fact_checks = self.google_fact_check(query_to_use)
505
+
506
+ return ExternalData(
507
+ fact_checks=fact_checks,
508
+ source_reputation=source_reputation,
509
+ domain_age_days=domain_age_days,
510
+ domain_info=domain_info,
511
+ related_articles=[], # TODO: Implement related article search
512
+ backlinks_count=backlinks_data.get('estimated_count', 0),
513
+ backlinks_sample=backlinks_data.get('sample_backlinks', [])
514
+ )
515
+
516
+
517
# --- Testing ---
if __name__ == "__main__":
    # Manual smoke tests: each exercises one client capability against
    # live endpoints, so network access is required.
    print("=== Testing ExternalAPIClients ===\n")

    client = ExternalAPIClients()

    # Test 1: Web content fetching
    print("Test 1: Fetching web content from Le Monde...")
    page = client.fetch_web_content("https://www.lemonde.fr")
    print(f" Success: {page.success}")
    print(f" Title: {page.title}")
    print(f" Text length: {len(page.text_content)} chars")
    print(f" Links found: {len(page.links)}")
    print()

    # Test 2: WHOIS lookup
    print("Test 2: WHOIS lookup for lemonde.fr...")
    whois_info = client.whois_lookup("https://www.lemonde.fr")
    print(f" Success: {whois_info.success}")
    print(f" Domain: {whois_info.domain}")
    print(f" Age: {whois_info.age_days} days")
    print(f" Registrar: {whois_info.registrar}")
    print()

    # Test 3: Source reputation
    print("Test 3: Source reputation checks...")
    sample_urls = [
        "https://www.nytimes.com/article",
        "https://www.infowars.com/post",
        "https://random-blog.wordpress.com"
    ]
    for target in sample_urls:
        level = client.get_source_reputation(target)
        print(f" {target}: {level}")
    print()

    # Test 4: Full external data
    print("Test 4: Full external data fetch...")
    combined = client.fetch_external_data("https://www.bbc.com/news")
    print(f" Source reputation: {combined.source_reputation}")
    print(f" Domain age: {combined.domain_age_days} days")
    print(f" Fact checks found: {len(combined.fact_checks)}")

    print("\n=== Tests Complete ===")