syscred 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syscred/__init__.py +41 -0
- syscred/api_clients.py +560 -0
- syscred/backend_app.py +363 -0
- syscred/config.py +275 -0
- syscred/database.py +54 -0
- syscred/debug_factcheck.py +43 -0
- syscred/debug_graph_json.py +58 -0
- syscred/debug_init.py +33 -0
- syscred/debug_local_server.py +25 -0
- syscred/diagnose_imports.py +37 -0
- syscred/eval_metrics.py +349 -0
- syscred/graph_rag.py +171 -0
- syscred/ir_engine.py +410 -0
- syscred/ontology_manager.py +509 -0
- syscred/run_benchmark.py +135 -0
- syscred/seo_analyzer.py +610 -0
- syscred/setup.py +65 -0
- syscred/test_graphrag.py +87 -0
- syscred/test_phase1.py +28 -0
- syscred/test_phase2.py +55 -0
- syscred/test_suite.py +64 -0
- syscred/verification_system.py +765 -0
- syscred-2.2.0.dist-info/METADATA +259 -0
- syscred-2.2.0.dist-info/RECORD +28 -0
- syscred-2.2.0.dist-info/WHEEL +5 -0
- syscred-2.2.0.dist-info/entry_points.txt +3 -0
- syscred-2.2.0.dist-info/licenses/LICENSE +21 -0
- syscred-2.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,765 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Verification System Module - SysCRED v2.0
|
|
4
|
+
==========================================
|
|
5
|
+
Main credibility verification system with real API integration.
|
|
6
|
+
Refactored from sys-cred-Python-27avril2025.py
|
|
7
|
+
|
|
8
|
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
|
9
|
+
Citation Key: loyerModelingHybridSystem2025
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
import json
|
|
14
|
+
import datetime
|
|
15
|
+
from typing import Optional, Dict, Any, List
|
|
16
|
+
from urllib.parse import urlparse
|
|
17
|
+
|
|
18
|
+
# Transformers and ML
# Optional heavy ML dependencies. The system degrades gracefully: when any of
# these fail to import, HAS_ML is False and NLP analysis paths are skipped.
try:
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    import numpy as np
    import torch
    from lime.lime_text import LimeTextExplainer
    HAS_ML = True
except ImportError:
    HAS_ML = False
    print("Warning: ML libraries not fully installed. Run: pip install transformers torch lime numpy")

# sentence-transformers is optional as well; without it, semantic coherence
# falls back to a sentence-length heuristic (see _calculate_coherence).
try:
    from sentence_transformers import SentenceTransformer, util
    HAS_SBERT = True
except ImportError:
    HAS_SBERT = False
    print("Warning: sentence-transformers not installed. Semantic coherence will use heuristics.")
|
|
35
|
+
|
|
36
|
+
# Local imports
|
|
37
|
+
from syscred.api_clients import ExternalAPIClients, WebContent, ExternalData
|
|
38
|
+
from syscred.ontology_manager import OntologyManager
|
|
39
|
+
from syscred.seo_analyzer import SEOAnalyzer
|
|
40
|
+
from syscred.graph_rag import GraphRAG # [NEW] GraphRAG
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class CredibilityVerificationSystem:
    """
    Neuro-symbolic credibility verification system.

    Combines:
    - Rule-based analysis (symbolic, transparent)
    - NLP/ML analysis (machine learning)
    - OWL ontology for traceability
    - External APIs for real-world data
    """

    def __init__(
        self,
        google_api_key: Optional[str] = None,
        ontology_base_path: Optional[str] = None,
        ontology_data_path: Optional[str] = None,
        load_ml_models: bool = True
    ):
        """
        Initialize the credibility verification system.

        Args:
            google_api_key: API key for Google Fact Check (optional)
            ontology_base_path: Path to base ontology TTL file
            ontology_data_path: Path to store accumulated data
            load_ml_models: Whether to load ML models (disable for testing)
        """
        print("[SysCRED] Initializing Credibility Verification System v2.0...")

        # Initialize API clients (WHOIS, fact-check, web scraping).
        self.api_clients = ExternalAPIClients(google_api_key=google_api_key)
        print("[SysCRED] API clients initialized")

        # Initialize ontology manager + GraphRAG; both are optional and the
        # system keeps working (with graph_rag=None) if setup fails.
        self.ontology_manager = None
        if ontology_base_path or ontology_data_path:
            try:
                self.ontology_manager = OntologyManager(
                    base_ontology_path=ontology_base_path,
                    data_path=ontology_data_path
                )
                self.graph_rag = GraphRAG(self.ontology_manager)  # [NEW] Init GraphRAG
                print("[SysCRED] Ontology manager & GraphRAG initialized")
            except Exception as e:
                print(f"[SysCRED] Ontology manager disabled: {e}")
                self.graph_rag = None
        else:
            self.graph_rag = None

        # ML model handles; populated by _load_ml_models() when enabled.
        self.sentiment_pipeline = None
        self.ner_pipeline = None
        self.bias_tokenizer = None
        self.bias_model = None
        self.coherence_model = None
        self.explainer = None

        if load_ml_models and HAS_ML:
            self._load_ml_models()

        # Weights for score calculation, loaded from the central config.
        # [FIX] The original referenced `config.Config.SCORE_WEIGHTS` but the
        # module never imported `config`, which raised NameError on startup.
        # Import it locally and fall back to the same defaults that
        # calculate_overall_score() uses via .get(..., default).
        try:
            from syscred import config
            self.weights = config.Config.SCORE_WEIGHTS
        except Exception as e:
            print(f"[SysCRED] Config weights unavailable ({e}); using defaults.")
            self.weights = {
                'source_reputation': 0.25,
                'domain_age': 0.10,
                'fact_check': 0.20,
                'sentiment_neutrality': 0.15,
                'entity_presence': 0.15,
                'coherence': 0.15,
            }
        print(f"[SysCRED] Using weights: {self.weights}")

        print("[SysCRED] System ready!")
|
|
109
|
+
|
|
110
|
+
    def _load_ml_models(self):
        """Load ML models for NLP analysis.

        Each model is loaded in its own try/except so that a single failure
        (missing weights, no network, incompatible version) degrades that one
        capability instead of aborting initialization.
        """
        print("[SysCRED] Loading ML models (this may take a moment)...")

        try:
            # Sentiment analysis (binary POSITIVE/NEGATIVE classifier).
            self.sentiment_pipeline = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english"
            )
            print("[SysCRED] ✓ Sentiment model loaded")
        except Exception as e:
            print(f"[SysCRED] ✗ Sentiment model failed: {e}")

        try:
            # NER pipeline; grouped_entities merges sub-word tokens into spans.
            self.ner_pipeline = pipeline("ner", grouped_entities=True)
            print("[SysCRED] ✓ NER model loaded")
        except Exception as e:
            print(f"[SysCRED] ✗ NER model failed: {e}")

        try:
            # Bias detection - Specialized model
            # Using 'd4data/bias-detection-model' or fallback to generic
            bias_model_name = "d4data/bias-detection-model"
            self.bias_tokenizer = AutoTokenizer.from_pretrained(bias_model_name)
            self.bias_model = AutoModelForSequenceClassification.from_pretrained(bias_model_name)
            print("[SysCRED] ✓ Bias model loaded (d4data)")
        except Exception as e:
            # _analyze_bias() falls back to word-list heuristics in this case.
            print(f"[SysCRED] ✗ Bias model failed: {e}. Using heuristics.")

        try:
            # Semantic coherence via sentence embeddings (only if SBERT present).
            if HAS_SBERT:
                self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
                print("[SysCRED] ✓ Coherence model loaded (SBERT)")
        except Exception as e:
            print(f"[SysCRED] ✗ Coherence model failed: {e}")

        try:
            # LIME explainer for per-word sentiment attributions.
            self.explainer = LimeTextExplainer(class_names=['NEGATIVE', 'POSITIVE'])
            print("[SysCRED] ✓ LIME explainer loaded")
        except Exception as e:
            print(f"[SysCRED] ✗ LIME explainer failed: {e}")
|
|
155
|
+
|
|
156
|
+
def is_url(self, text: str) -> bool:
|
|
157
|
+
"""Check if a string is a valid URL."""
|
|
158
|
+
try:
|
|
159
|
+
result = urlparse(text)
|
|
160
|
+
return all([result.scheme, result.netloc])
|
|
161
|
+
except ValueError:
|
|
162
|
+
return False
|
|
163
|
+
|
|
164
|
+
def preprocess(self, text: str) -> str:
|
|
165
|
+
"""Clean and normalize text for analysis."""
|
|
166
|
+
if not isinstance(text, str):
|
|
167
|
+
return ""
|
|
168
|
+
|
|
169
|
+
# Remove URLs
|
|
170
|
+
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
|
|
171
|
+
# Normalize whitespace
|
|
172
|
+
text = re.sub(r'\s+', ' ', text)
|
|
173
|
+
# Keep basic punctuation
|
|
174
|
+
text = re.sub(r'[^\w\s\.\?,!]', '', text)
|
|
175
|
+
|
|
176
|
+
return text.lower().strip()
|
|
177
|
+
|
|
178
|
+
def rule_based_analysis(self, text: str, external_data: ExternalData) -> Dict[str, Any]:
|
|
179
|
+
"""
|
|
180
|
+
Perform rule-based analysis using symbolic reasoning.
|
|
181
|
+
|
|
182
|
+
Args:
|
|
183
|
+
text: Preprocessed text to analyze
|
|
184
|
+
external_data: Data from external APIs
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
Dictionary with rule-based analysis results
|
|
188
|
+
"""
|
|
189
|
+
results = {
|
|
190
|
+
'linguistic_markers': {},
|
|
191
|
+
'source_analysis': {},
|
|
192
|
+
'timeliness_flags': [],
|
|
193
|
+
'fact_checking': []
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
# 1. Linguistic markers
|
|
197
|
+
sensational_words = [
|
|
198
|
+
'shocking', 'revealed', 'conspiracy', 'amazing', 'secret',
|
|
199
|
+
'breakthrough', 'miracle', 'unbelievable', 'exclusive', 'urgent'
|
|
200
|
+
]
|
|
201
|
+
certainty_words = [
|
|
202
|
+
'verified', 'authentic', 'credible', 'proven', 'fact',
|
|
203
|
+
'confirmed', 'official', 'legitimate', 'established'
|
|
204
|
+
]
|
|
205
|
+
doubt_words = [
|
|
206
|
+
'hoax', 'false', 'fake', 'unproven', 'rumor', 'allegedly',
|
|
207
|
+
'claim', 'debunked', 'misleading', 'disputed'
|
|
208
|
+
]
|
|
209
|
+
|
|
210
|
+
text_lower = text.lower()
|
|
211
|
+
results['linguistic_markers']['sensationalism'] = sum(
|
|
212
|
+
1 for word in sensational_words if word in text_lower
|
|
213
|
+
)
|
|
214
|
+
results['linguistic_markers']['certainty'] = sum(
|
|
215
|
+
1 for word in certainty_words if word in text_lower
|
|
216
|
+
)
|
|
217
|
+
results['linguistic_markers']['doubt'] = sum(
|
|
218
|
+
1 for word in doubt_words if word in text_lower
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
# 2. Source analysis from external data
|
|
222
|
+
results['source_analysis']['reputation'] = external_data.source_reputation
|
|
223
|
+
results['source_analysis']['domain_age_days'] = external_data.domain_age_days
|
|
224
|
+
|
|
225
|
+
if external_data.domain_info:
|
|
226
|
+
results['source_analysis']['registrar'] = external_data.domain_info.registrar
|
|
227
|
+
results['source_analysis']['domain'] = external_data.domain_info.domain
|
|
228
|
+
|
|
229
|
+
# 3. Timeliness flags
|
|
230
|
+
if external_data.domain_age_days is not None:
|
|
231
|
+
if external_data.domain_age_days < 180:
|
|
232
|
+
results['timeliness_flags'].append('Source domain is relatively new (<6 months)')
|
|
233
|
+
elif external_data.domain_age_days < 365:
|
|
234
|
+
results['timeliness_flags'].append('Source domain is less than 1 year old')
|
|
235
|
+
|
|
236
|
+
# 4. Fact checking results
|
|
237
|
+
for fc in external_data.fact_checks:
|
|
238
|
+
results['fact_checking'].append({
|
|
239
|
+
'claim': fc.claim,
|
|
240
|
+
'rating': fc.rating,
|
|
241
|
+
'publisher': fc.publisher,
|
|
242
|
+
'url': fc.url
|
|
243
|
+
})
|
|
244
|
+
|
|
245
|
+
return results
|
|
246
|
+
|
|
247
|
+
    def nlp_analysis(self, text: str) -> Dict[str, Any]:
        """
        Perform NLP-based analysis using ML models.

        Each sub-analysis is optional: if the corresponding model handle is
        None (not loaded) or raises, the default value in `results` is kept.

        Args:
            text: Preprocessed text to analyze

        Returns:
            Dictionary with NLP analysis results
        """
        results = {
            'sentiment': None,
            'sentiment_explanation': None,
            'bias_analysis': {'score': None, 'label': 'Unavailable'},
            'named_entities': [],
            'coherence_score': None
        }

        # Empty text: report neutral sentiment and skip all model calls.
        if not text:
            results['sentiment'] = {'label': 'Neutral', 'score': 0.5}
            return results

        # 1. Sentiment analysis with LIME explanation
        if self.sentiment_pipeline:
            try:
                # Models are capped at 512 chars to respect input limits.
                main_pred = self.sentiment_pipeline(text[:512])[0]
                results['sentiment'] = main_pred

                if self.explainer:
                    # LIME needs a probability function over [NEGATIVE, POSITIVE];
                    # adapt the pipeline's single-label output accordingly.
                    def predict_proba(texts):
                        if isinstance(texts, str):
                            texts = [texts]
                        predictions = self.sentiment_pipeline(list(texts))
                        probs = []
                        for pred in predictions:
                            if pred['label'] == 'POSITIVE':
                                probs.append([1 - pred['score'], pred['score']])
                            else:
                                probs.append([pred['score'], 1 - pred['score']])
                        return np.array(probs)

                    explanation = self.explainer.explain_instance(
                        text[:512], predict_proba, num_features=6
                    )
                    results['sentiment_explanation'] = explanation.as_list()
            except Exception as e:
                print(f"[NLP] Sentiment error: {e}")
                results['sentiment'] = {'label': 'Error', 'score': 0.0}

        # 2. Bias analysis (ML model or word-list heuristic fallback).
        results['bias_analysis'] = self._analyze_bias(text)

        # 3. Named Entity Recognition
        if self.ner_pipeline:
            try:
                entities = self.ner_pipeline(text[:512])
                results['named_entities'] = entities
            except Exception as e:
                print(f"[NLP] NER error: {e}")

        # 4. Semantic Coherence (SBERT similarity or length heuristic).
        results['coherence_score'] = self._calculate_coherence(text)

        return results
|
|
311
|
+
|
|
312
|
+
def _analyze_bias(self, text: str) -> Dict[str, Any]:
|
|
313
|
+
"""Analyze text for bias using ML or heuristics."""
|
|
314
|
+
# Method 1: ML Model
|
|
315
|
+
if self.bias_model and self.bias_tokenizer:
|
|
316
|
+
try:
|
|
317
|
+
inputs = self.bias_tokenizer(
|
|
318
|
+
text[:512], return_tensors="pt",
|
|
319
|
+
truncation=True, max_length=512, padding=True
|
|
320
|
+
)
|
|
321
|
+
with torch.no_grad():
|
|
322
|
+
logits = self.bias_model(**inputs).logits
|
|
323
|
+
probs = torch.softmax(logits, dim=1)[0]
|
|
324
|
+
# Label mapping depends on model, usually [Non-biased, Biased]
|
|
325
|
+
bias_score = probs[1].item()
|
|
326
|
+
|
|
327
|
+
label = " biased" if bias_score > 0.5 else "Non-biased"
|
|
328
|
+
return {'score': bias_score, 'label': label, 'method': 'ML (d4data)'}
|
|
329
|
+
except Exception as e:
|
|
330
|
+
print(f"[NLP] ML Bias error: {e}")
|
|
331
|
+
|
|
332
|
+
# Method 2: Heuristics
|
|
333
|
+
biased_words = [
|
|
334
|
+
'radical', 'extremist', 'disgraceful', 'shameful', 'corrupt',
|
|
335
|
+
'insane', 'idiot', 'disaster', 'propaganda', 'dictator',
|
|
336
|
+
'puppet', 'regime', 'tyrant', 'treason', 'traitor'
|
|
337
|
+
]
|
|
338
|
+
text_lower = text.lower()
|
|
339
|
+
count = sum(1 for w in biased_words if w in text_lower)
|
|
340
|
+
score = min(1.0, count * 0.15)
|
|
341
|
+
label = "Potentially Biased" if score > 0.3 else "Neutral"
|
|
342
|
+
return {'score': score, 'label': label, 'method': 'Heuristic'}
|
|
343
|
+
|
|
344
|
+
def _calculate_coherence(self, text: str) -> float:
|
|
345
|
+
"""Calculate semantic coherence score."""
|
|
346
|
+
sentences = re.split(r'[.!?]+', text)
|
|
347
|
+
sentences = [s.strip() for s in sentences if len(s.split()) > 3]
|
|
348
|
+
|
|
349
|
+
if len(sentences) < 2:
|
|
350
|
+
return 0.7 # Default to neutral/good for short text, not perfect 1.0
|
|
351
|
+
|
|
352
|
+
# Method 1: SBERT Semantic Similarity
|
|
353
|
+
if self.coherence_model and HAS_SBERT:
|
|
354
|
+
try:
|
|
355
|
+
embeddings = self.coherence_model.encode(sentences[:10]) # Limit to 10
|
|
356
|
+
sims = []
|
|
357
|
+
for i in range(len(embeddings) - 1):
|
|
358
|
+
sim = util.pytorch_cos_sim(embeddings[i], embeddings[i+1])
|
|
359
|
+
sims.append(sim.item())
|
|
360
|
+
return sum(sims) / len(sims) if sims else 0.5
|
|
361
|
+
except Exception as e:
|
|
362
|
+
print(f"[NLP] SBERT error: {e}")
|
|
363
|
+
|
|
364
|
+
# Method 2: Heuristic (Sentence Length Variance & Repetition)
|
|
365
|
+
lengths = [len(s.split()) for s in sentences]
|
|
366
|
+
avg_len = sum(lengths) / len(lengths)
|
|
367
|
+
variance = sum((l - avg_len) ** 2 for l in lengths) / len(lengths)
|
|
368
|
+
|
|
369
|
+
# High variance suggests simpler/choppier writing usually
|
|
370
|
+
score = 0.8
|
|
371
|
+
if variance > 100: score -= 0.2
|
|
372
|
+
if avg_len < 5: score -= 0.2
|
|
373
|
+
|
|
374
|
+
return max(0.0, score)
|
|
375
|
+
|
|
376
|
+
def calculate_overall_score(
|
|
377
|
+
self,
|
|
378
|
+
rule_results: Dict,
|
|
379
|
+
nlp_results: Dict
|
|
380
|
+
) -> float:
|
|
381
|
+
"""
|
|
382
|
+
Calculate overall credibility score based on User-Defined Metrics.
|
|
383
|
+
"""
|
|
384
|
+
score = 0.5 # Start neutral
|
|
385
|
+
adjustments = 0.0
|
|
386
|
+
total_weight_used = 0.0
|
|
387
|
+
|
|
388
|
+
# 1. Source Reputation (25%)
|
|
389
|
+
w_rep = self.weights.get('source_reputation', 0.25)
|
|
390
|
+
reputation = rule_results['source_analysis'].get('reputation', 'Unknown')
|
|
391
|
+
if reputation != 'Unknown' and "N/A" not in reputation:
|
|
392
|
+
if reputation == 'High':
|
|
393
|
+
adjustments += w_rep * 1.0 # Full boost
|
|
394
|
+
elif reputation == 'Low':
|
|
395
|
+
adjustments -= w_rep * 1.0 # Full penalty
|
|
396
|
+
elif reputation == 'Medium':
|
|
397
|
+
adjustments += w_rep * 0.2 # Slight boost
|
|
398
|
+
total_weight_used += w_rep
|
|
399
|
+
|
|
400
|
+
# 2. Domain Age (10%)
|
|
401
|
+
w_age = self.weights.get('domain_age', 0.10)
|
|
402
|
+
domain_age = rule_results['source_analysis'].get('domain_age_days')
|
|
403
|
+
if domain_age is not None:
|
|
404
|
+
if domain_age > 730: # > 2 years
|
|
405
|
+
adjustments += w_age
|
|
406
|
+
elif domain_age < 90: # < 3 months
|
|
407
|
+
adjustments -= w_age
|
|
408
|
+
total_weight_used += w_age
|
|
409
|
+
|
|
410
|
+
# 3. Fact Check (20%)
|
|
411
|
+
w_fc = self.weights.get('fact_check', 0.20)
|
|
412
|
+
fact_checks = rule_results.get('fact_checking', [])
|
|
413
|
+
if fact_checks:
|
|
414
|
+
fc_score = 0
|
|
415
|
+
for fc in fact_checks:
|
|
416
|
+
rating = fc.get('rating', '').lower()
|
|
417
|
+
if rating in ['true', 'verified', 'correct']:
|
|
418
|
+
fc_score += 1
|
|
419
|
+
elif rating in ['false', 'fake', 'incorrect']:
|
|
420
|
+
fc_score -= 1
|
|
421
|
+
|
|
422
|
+
# Normalize fc_score (-1 to 1) roughly
|
|
423
|
+
if fc_score > 0: adjustments += w_fc
|
|
424
|
+
elif fc_score < 0: adjustments -= w_fc
|
|
425
|
+
total_weight_used += w_fc
|
|
426
|
+
|
|
427
|
+
# 4. Sentiment Neutrality (15%)
|
|
428
|
+
# Extreme sentiment = lower score
|
|
429
|
+
w_sent = self.weights.get('sentiment_neutrality', 0.15)
|
|
430
|
+
sentiment = nlp_results.get('sentiment', {})
|
|
431
|
+
if sentiment:
|
|
432
|
+
s_score = sentiment.get('score', 0.5)
|
|
433
|
+
# If extremely positive or negative (>0.9), penalize
|
|
434
|
+
if s_score > 0.9:
|
|
435
|
+
adjustments -= w_sent * 0.5 # Penalty for extremism
|
|
436
|
+
else:
|
|
437
|
+
adjustments += w_sent * 0.2 # Slight boost for moderation
|
|
438
|
+
total_weight_used += w_sent
|
|
439
|
+
|
|
440
|
+
# 5. Entity Presence (15%)
|
|
441
|
+
# Presence of Named Entities (PER, ORG, LOC) suggests verifyiability
|
|
442
|
+
w_ent = self.weights.get('entity_presence', 0.15)
|
|
443
|
+
entities = nlp_results.get('named_entities', [])
|
|
444
|
+
if len(entities) > 0:
|
|
445
|
+
# More entities = better (capped)
|
|
446
|
+
boost = min(1.0, len(entities) * 0.2)
|
|
447
|
+
adjustments += w_ent * boost
|
|
448
|
+
total_weight_used += w_ent
|
|
449
|
+
|
|
450
|
+
# 6. Text Coherence (15%) (Vocabulary Diversity)
|
|
451
|
+
w_coh = self.weights.get('coherence', 0.15)
|
|
452
|
+
coherence = nlp_results.get('coherence_score')
|
|
453
|
+
if coherence is not None:
|
|
454
|
+
# Coherence is usually 0.0 to 1.0
|
|
455
|
+
# Center around 0.5: >0.5 improves, <0.5 penalizes
|
|
456
|
+
adjustments += (coherence - 0.5) * w_coh
|
|
457
|
+
total_weight_used += w_coh
|
|
458
|
+
|
|
459
|
+
# Final calculation
|
|
460
|
+
# Base 0.5 + sum of weighted adjustments
|
|
461
|
+
# Adjustments are in range [-weight, +weight]
|
|
462
|
+
|
|
463
|
+
final_score = 0.5 + adjustments
|
|
464
|
+
|
|
465
|
+
return max(0.0, min(1.0, final_score))
|
|
466
|
+
|
|
467
|
+
    def generate_report(
        self,
        input_data: str,
        cleaned_text: str,
        rule_results: Dict,
        nlp_results: Dict,
        external_data: ExternalData,
        overall_score: float,
        web_content: Optional[WebContent] = None,
        graph_context: str = ""  # [NEW]
    ) -> Dict[str, Any]:
        """Generate the final evaluation report.

        NOTE(review): `cleaned_text` and `graph_context` are accepted but not
        referenced anywhere in this body — confirm whether they should feed
        into the report or can be dropped from the signature.
        """

        # Core report skeleton; French keys are part of the public report
        # schema consumed by the UI, so they must not be renamed.
        report = {
            'idRapport': f"report_{int(datetime.datetime.now().timestamp())}",
            'informationEntree': input_data,
            'dateGeneration': datetime.datetime.now().isoformat(),
            'scoreCredibilite': round(overall_score, 2),
            'resumeAnalyse': "",
            'detailsScore': {
                'base': 0.5,
                'weights': self.weights,
                'factors': self._get_score_factors(rule_results, nlp_results)
            },
            'sourcesUtilisees': [],
            'reglesAppliquees': rule_results,
            'analyseNLP': {
                'sentiment': nlp_results.get('sentiment'),
                'bias_analysis': nlp_results.get('bias_analysis'),
                'named_entities_count': len(nlp_results.get('named_entities', [])),
                'coherence_score': nlp_results.get('coherence_score'),
                'sentiment_explanation_preview': (nlp_results.get('sentiment_explanation') or [])[:3]
            },
            'metadonnees': {}
        }

        # Add web content metadata if available
        if web_content:
            if web_content.success:
                report['metadonnees']['page_title'] = web_content.title
                report['metadonnees']['meta_description'] = web_content.meta_description
                report['metadonnees']['links_count'] = len(web_content.links)
            else:
                report['metadonnees']['warning'] = f"Content scrape failed: {web_content.error}"

        # Generate the human-readable summary (user-facing text is French).
        summary_parts = []

        if web_content and not web_content.success:
            summary_parts.append(f"⚠️ ATTENTION: Impossible de lire le texte de la page ({web_content.error}). Analyse basée uniquement sur la réputation du domaine.")

        # Map the numeric score onto five credibility bands.
        if overall_score > 0.75:
            summary_parts.append("L'analyse suggère une crédibilité ÉLEVÉE.")
        elif overall_score > 0.55:
            summary_parts.append("L'analyse suggère une crédibilité MOYENNE à ÉLEVÉE.")
        elif overall_score > 0.45:
            summary_parts.append("L'analyse suggère une crédibilité MOYENNE.")
        elif overall_score > 0.25:
            summary_parts.append("L'analyse suggère une crédibilité FAIBLE à MOYENNE.")
        else:
            summary_parts.append("L'analyse suggère une crédibilité FAIBLE.")

        if external_data.source_reputation != 'Unknown':
            summary_parts.append(f"Réputation source : {external_data.source_reputation}.")

        if external_data.domain_age_days:
            years = external_data.domain_age_days / 365
            summary_parts.append(f"Âge du domaine : {years:.1f} ans.")

        if external_data.fact_checks:
            summary_parts.append(f"{len(external_data.fact_checks)} vérification(s) de faits trouvée(s).")

        report['resumeAnalyse'] = " ".join(summary_parts)

        # List sources used (primary URL, WHOIS, fact-check API).
        if self.is_url(input_data):
            report['sourcesUtilisees'].append({
                'type': 'Primary URL',
                'url': input_data
            })
        report['sourcesUtilisees'].append({
            'type': 'WHOIS Lookup',
            'status': 'Success' if (external_data.domain_info and external_data.domain_info.success) else 'Failed/N/A'
        })
        report['sourcesUtilisees'].append({
            'type': 'Fact Check API',
            'results_count': len(external_data.fact_checks)
        })

        return report
|
|
557
|
+
|
|
558
|
+
def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
|
|
559
|
+
"""Get list of factors that influenced the score (For UI)."""
|
|
560
|
+
factors = []
|
|
561
|
+
|
|
562
|
+
# 1. Reputation
|
|
563
|
+
rep = rule_results['source_analysis'].get('reputation')
|
|
564
|
+
if rep and "N/A" not in rep:
|
|
565
|
+
factors.append({
|
|
566
|
+
'factor': 'Source Reputation',
|
|
567
|
+
'value': rep,
|
|
568
|
+
'weight': f"{int(self.weights.get('source_reputation',0)*100)}%",
|
|
569
|
+
'impact': '+' if rep == 'High' else ('-' if rep == 'Low' else '0')
|
|
570
|
+
})
|
|
571
|
+
|
|
572
|
+
# 2. Fact Checks
|
|
573
|
+
if rule_results.get('fact_checking'):
|
|
574
|
+
factors.append({
|
|
575
|
+
'factor': 'Fact Checks',
|
|
576
|
+
'value': f"{len(rule_results['fact_checking'])} found",
|
|
577
|
+
'weight': f"{int(self.weights.get('fact_check',0)*100)}%",
|
|
578
|
+
'impact': 'Variable'
|
|
579
|
+
})
|
|
580
|
+
|
|
581
|
+
# 3. Entities
|
|
582
|
+
n_ent = len(nlp_results.get('named_entities', []))
|
|
583
|
+
if n_ent > 0:
|
|
584
|
+
factors.append({
|
|
585
|
+
'factor': 'Entity Presence',
|
|
586
|
+
'value': f"{n_ent} entities",
|
|
587
|
+
'weight': f"{int(self.weights.get('entity_presence',0)*100)}%",
|
|
588
|
+
'impact': '+'
|
|
589
|
+
})
|
|
590
|
+
|
|
591
|
+
# 4. Sentiment
|
|
592
|
+
sent = nlp_results.get('sentiment', {})
|
|
593
|
+
if sent:
|
|
594
|
+
factors.append({
|
|
595
|
+
'factor': 'Sentiment Neutrality',
|
|
596
|
+
'value': f"{sent.get('label')} ({sent.get('score',0):.2f})",
|
|
597
|
+
'weight': f"{int(self.weights.get('sentiment_neutrality',0)*100)}%",
|
|
598
|
+
'impact': '-' if sent.get('score', 0) > 0.9 else '0'
|
|
599
|
+
})
|
|
600
|
+
|
|
601
|
+
return factors
|
|
602
|
+
|
|
603
|
+
    def verify_information(self, input_data: str) -> Dict[str, Any]:
        """
        Main pipeline to verify credibility of input data.

        Pipeline: fetch content (if URL) -> preprocess -> external data ->
        rule-based analysis -> NLP analysis -> score -> GraphRAG context ->
        report -> persist to ontology.

        Args:
            input_data: URL or text to verify

        Returns:
            Complete evaluation report, or {"error": ...} on invalid input.
        """
        if not isinstance(input_data, str) or not input_data.strip():
            return {"error": "L'entrée doit être une chaîne non vide."}

        print(f"\n[SysCRED] === Vérification: {input_data[:100]}... ===")

        # 1. Determine input type and fetch content
        text_to_analyze = ""
        web_content = None
        is_url = self.is_url(input_data)

        if is_url:
            print("[SysCRED] Fetching web content...")
            web_content = self.api_clients.fetch_web_content(input_data)

            if web_content.success:
                text_to_analyze = web_content.text_content
                print(f"[SysCRED] ✓ Content fetched: {len(text_to_analyze)} chars")
            else:
                print(f"[SysCRED] ⚠ Fetch failed: {web_content.error}")
                print("[SysCRED] Proceeding with Domain/Metadata analysis only.")
                text_to_analyze = ""
                # We don't return error anymore, we proceed!
        else:
            text_to_analyze = input_data

        # 2. Preprocess text
        cleaned_text = self.preprocess(text_to_analyze)

        # Only error on empty text if it wasn't a failed web fetch
        # If web fetch failed, we proceed with empty text to give metadata analysis
        if not cleaned_text and not (is_url and web_content and not web_content.success):
            return {"error": "Le texte est vide après prétraitement."}
        print(f"[SysCRED] Preprocessed text: {len(cleaned_text)} chars")

        # Determine best query for Fact Checking
        fact_check_query = input_data
        if text_to_analyze and len(text_to_analyze) > 10:
            # Use start of text if available
            fact_check_query = text_to_analyze[:200]
        elif is_url and web_content and web_content.title:
            # Fallback to page title if text is missing (e.g. 403)
            fact_check_query = web_content.title

        # 3. Fetch external data (WHOIS, reputation, fact-check APIs)
        print(f"[SysCRED] Fetching external data (Query: {fact_check_query[:50]}...)...")
        external_data = self.api_clients.fetch_external_data(input_data, fc_query=fact_check_query)

        # [FIX] Handle text-only input reputation
        if not is_url:
            external_data.source_reputation = "N/A (User Input)"

        print(f"[SysCRED] ✓ Reputation: {external_data.source_reputation}, Age: {external_data.domain_age_days} days")

        # 4. Rule-based analysis
        print("[SysCRED] Running rule-based analysis...")
        rule_results = self.rule_based_analysis(cleaned_text, external_data)

        # 5. NLP analysis
        print("[SysCRED] Running NLP analysis...")
        nlp_results = self.nlp_analysis(cleaned_text)

        # 6. Calculate score
        overall_score = self.calculate_overall_score(rule_results, nlp_results)
        print(f"[SysCRED] ✓ Credibility score: {overall_score:.2f}")

        # 7. [NEW] GraphRAG Context Retrieval
        graph_context = ""
        similar_uris = []
        if self.graph_rag and 'source_analysis' in rule_results:
            domain = rule_results['source_analysis'].get('domain', '')
            # Pass keywords for text search if domain is empty or generic
            keywords = []
            if not domain and cleaned_text:
                keywords = cleaned_text.split()[:5]  # Simple keyword extraction

            context = self.graph_rag.get_context(domain, keywords=keywords)
            graph_context = context.get('full_text', '')
            similar_uris = context.get('similar_uris', [])

        if "Graph Memory" in graph_context:
            # NOTE(review): splitlines()[1] assumes the context has at least
            # two lines — would raise IndexError on a one-line context; confirm
            # the GraphRAG 'full_text' format guarantees this.
            print(f"[SysCRED] GraphRAG Context Found: {graph_context.splitlines()[1]}")

        # 8. Generate report (Updated to include context)
        report = self.generate_report(
            input_data, cleaned_text, rule_results,
            nlp_results, external_data, overall_score, web_content,
            graph_context=graph_context
        )

        # Add similar URIs to report for ontology linking
        if similar_uris:
            report['similar_claims_uris'] = similar_uris

        # 9. Save to ontology (best effort: failure does not abort the report)
        if self.ontology_manager:
            try:
                report_uri = self.ontology_manager.add_evaluation_triplets(report)
                report['ontology_uri'] = report_uri
                self.ontology_manager.save_data()
            except Exception as e:
                print(f"[SysCRED] Ontology save failed: {e}")

        print("[SysCRED] === Vérification terminée ===\n")
        return report
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
# --- Main / Testing ---
if __name__ == "__main__":
    # NOTE: `json` is already imported at module level; this re-import is
    # redundant but harmless.
    import json

    print("=" * 60)
    print("SysCRED v2.0 - Système de Vérification de Crédibilité")
    print("(c) Dominique S. Loyer - PhD Thesis Prototype")
    print("=" * 60 + "\n")

    # Initialize system (without ML models for quick testing)
    # NOTE(review): these ontology paths are hard-coded absolute paths to a
    # developer's machine — they will fail elsewhere; consider CLI arguments
    # or environment variables instead.
    system = CredibilityVerificationSystem(
        ontology_base_path="/Users/bk280625/documents041025/MonCode/sysCRED_onto26avrtil.ttl",
        ontology_data_path="/Users/bk280625/documents041025/MonCode/ontology/sysCRED_data.ttl",
        load_ml_models=False  # Set to True for full analysis
    )

    # Test cases: one reputable URL, one unknown URL, one neutral text,
    # and one deliberately sensationalist text.
    test_cases = {
        "Test URL Crédible": "https://www.lemonde.fr",
        "Test URL Inconnu": "https://example.com/article",
        "Test Texte Simple": "This is a verified and authentic news report.",
        "Test Texte Suspect": "Shocking conspiracy revealed! They don't want you to know this secret!",
    }

    results = {}
    for name, test_input in test_cases.items():
        print(f"\n{'='*50}")
        print(f"Test: {name}")
        print('='*50)

        result = system.verify_information(test_input)
        results[name] = result

        if 'error' not in result:
            print(f"\nScore: {result['scoreCredibilite']}")
            print(f"Résumé: {result['resumeAnalyse']}")
        else:
            print(f"Erreur: {result['error']}")

    # Final recap of all test scores.
    print("\n" + "="*60)
    print("Résumé des tests:")
    print("="*60)
    for name, result in results.items():
        if 'error' not in result:
            print(f"  {name}: Score = {result['scoreCredibilite']:.2f}")
        else:
            print(f"  {name}: ERREUR")
|