syscred 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,765 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Verification System Module - SysCRED v2.0
4
+ ==========================================
5
+ Main credibility verification system with real API integration.
6
+ Refactored from sys-cred-Python-27avril2025.py
7
+
8
+ (c) Dominique S. Loyer - PhD Thesis Prototype
9
+ Citation Key: loyerModelingHybridSystem2025
10
+ """
11
+
12
+ import re
13
+ import json
14
+ import datetime
15
+ from typing import Optional, Dict, Any, List
16
+ from urllib.parse import urlparse
17
+
18
+ # Transformers and ML
19
+ try:
20
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
21
+ import numpy as np
22
+ import torch
23
+ from lime.lime_text import LimeTextExplainer
24
+ HAS_ML = True
25
+ except ImportError:
26
+ HAS_ML = False
27
+ print("Warning: ML libraries not fully installed. Run: pip install transformers torch lime numpy")
28
+
29
+ try:
30
+ from sentence_transformers import SentenceTransformer, util
31
+ HAS_SBERT = True
32
+ except ImportError:
33
+ HAS_SBERT = False
34
+ print("Warning: sentence-transformers not installed. Semantic coherence will use heuristics.")
35
+
36
+ # Local imports
37
+ from syscred.api_clients import ExternalAPIClients, WebContent, ExternalData
38
+ from syscred.ontology_manager import OntologyManager
39
+ from syscred.seo_analyzer import SEOAnalyzer
40
+ from syscred.graph_rag import GraphRAG # [NEW] GraphRAG
41
+
42
+
43
class CredibilityVerificationSystem:
    """
    Neuro-symbolic credibility verification system.

    Combines:
    - Rule-based analysis (symbolic, transparent)
    - NLP/ML analysis (machine learning)
    - OWL ontology for traceability
    - External APIs for real-world data
    """

    # Fallback score weights used when syscred.config is unavailable.
    # Mirrors the documented split: reputation 25%, domain age 10%,
    # fact-check 20%, sentiment 15%, entities 15%, coherence 15%.
    DEFAULT_WEIGHTS = {
        'source_reputation': 0.25,
        'domain_age': 0.10,
        'fact_check': 0.20,
        'sentiment_neutrality': 0.15,
        'entity_presence': 0.15,
        'coherence': 0.15,
    }

    def __init__(
        self,
        google_api_key: Optional[str] = None,
        ontology_base_path: Optional[str] = None,
        ontology_data_path: Optional[str] = None,
        load_ml_models: bool = True
    ):
        """
        Initialize the credibility verification system.

        Args:
            google_api_key: API key for Google Fact Check (optional)
            ontology_base_path: Path to base ontology TTL file
            ontology_data_path: Path to store accumulated data
            load_ml_models: Whether to load ML models (disable for testing)
        """
        print("[SysCRED] Initializing Credibility Verification System v2.0...")

        # External API clients (web scraping, WHOIS, fact-check).
        self.api_clients = ExternalAPIClients(google_api_key=google_api_key)
        print("[SysCRED] API clients initialized")

        # Ontology manager + GraphRAG are optional; any init failure disables both.
        self.ontology_manager = None
        self.graph_rag = None
        if ontology_base_path or ontology_data_path:
            try:
                self.ontology_manager = OntologyManager(
                    base_ontology_path=ontology_base_path,
                    data_path=ontology_data_path
                )
                self.graph_rag = GraphRAG(self.ontology_manager)  # GraphRAG reads the ontology
                print("[SysCRED] Ontology manager & GraphRAG initialized")
            except Exception as e:
                print(f"[SysCRED] Ontology manager disabled: {e}")
                self.graph_rag = None

        # ML model slots, populated by _load_ml_models() on demand.
        self.sentiment_pipeline = None
        self.ner_pipeline = None
        self.bias_tokenizer = None
        self.bias_model = None
        self.coherence_model = None
        self.explainer = None

        if load_ml_models and HAS_ML:
            self._load_ml_models()

        # Weights for score calculation.
        # BUGFIX: the original read `config.Config.SCORE_WEIGHTS` without ever
        # importing `config`, which raised NameError on every init. Import it
        # lazily and fall back to the documented defaults when unavailable.
        try:
            from syscred import config
            self.weights = config.Config.SCORE_WEIGHTS
        except Exception as e:
            print(f"[SysCRED] Config weights unavailable ({e}); using defaults.")
            self.weights = dict(self.DEFAULT_WEIGHTS)
        print(f"[SysCRED] Using weights: {self.weights}")

        print("[SysCRED] System ready!")
110
+ def _load_ml_models(self):
111
+ """Load ML models for NLP analysis."""
112
+ print("[SysCRED] Loading ML models (this may take a moment)...")
113
+
114
+ try:
115
+ # Sentiment analysis
116
+ self.sentiment_pipeline = pipeline(
117
+ "sentiment-analysis",
118
+ model="distilbert-base-uncased-finetuned-sst-2-english"
119
+ )
120
+ print("[SysCRED] ✓ Sentiment model loaded")
121
+ except Exception as e:
122
+ print(f"[SysCRED] ✗ Sentiment model failed: {e}")
123
+
124
+ try:
125
+ # NER pipeline
126
+ self.ner_pipeline = pipeline("ner", grouped_entities=True)
127
+ print("[SysCRED] ✓ NER model loaded")
128
+ except Exception as e:
129
+ print(f"[SysCRED] ✗ NER model failed: {e}")
130
+
131
+ try:
132
+ # Bias detection - Specialized model
133
+ # Using 'd4data/bias-detection-model' or fallback to generic
134
+ bias_model_name = "d4data/bias-detection-model"
135
+ self.bias_tokenizer = AutoTokenizer.from_pretrained(bias_model_name)
136
+ self.bias_model = AutoModelForSequenceClassification.from_pretrained(bias_model_name)
137
+ print("[SysCRED] ✓ Bias model loaded (d4data)")
138
+ except Exception as e:
139
+ print(f"[SysCRED] ✗ Bias model failed: {e}. Using heuristics.")
140
+
141
+ try:
142
+ # Semantic Coherence
143
+ if HAS_SBERT:
144
+ self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
145
+ print("[SysCRED] ✓ Coherence model loaded (SBERT)")
146
+ except Exception as e:
147
+ print(f"[SysCRED] ✗ Coherence model failed: {e}")
148
+
149
+ try:
150
+ # LIME explainer
151
+ self.explainer = LimeTextExplainer(class_names=['NEGATIVE', 'POSITIVE'])
152
+ print("[SysCRED] ✓ LIME explainer loaded")
153
+ except Exception as e:
154
+ print(f"[SysCRED] ✗ LIME explainer failed: {e}")
155
+
156
+ def is_url(self, text: str) -> bool:
157
+ """Check if a string is a valid URL."""
158
+ try:
159
+ result = urlparse(text)
160
+ return all([result.scheme, result.netloc])
161
+ except ValueError:
162
+ return False
163
+
164
+ def preprocess(self, text: str) -> str:
165
+ """Clean and normalize text for analysis."""
166
+ if not isinstance(text, str):
167
+ return ""
168
+
169
+ # Remove URLs
170
+ text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
171
+ # Normalize whitespace
172
+ text = re.sub(r'\s+', ' ', text)
173
+ # Keep basic punctuation
174
+ text = re.sub(r'[^\w\s\.\?,!]', '', text)
175
+
176
+ return text.lower().strip()
177
+
178
+ def rule_based_analysis(self, text: str, external_data: ExternalData) -> Dict[str, Any]:
179
+ """
180
+ Perform rule-based analysis using symbolic reasoning.
181
+
182
+ Args:
183
+ text: Preprocessed text to analyze
184
+ external_data: Data from external APIs
185
+
186
+ Returns:
187
+ Dictionary with rule-based analysis results
188
+ """
189
+ results = {
190
+ 'linguistic_markers': {},
191
+ 'source_analysis': {},
192
+ 'timeliness_flags': [],
193
+ 'fact_checking': []
194
+ }
195
+
196
+ # 1. Linguistic markers
197
+ sensational_words = [
198
+ 'shocking', 'revealed', 'conspiracy', 'amazing', 'secret',
199
+ 'breakthrough', 'miracle', 'unbelievable', 'exclusive', 'urgent'
200
+ ]
201
+ certainty_words = [
202
+ 'verified', 'authentic', 'credible', 'proven', 'fact',
203
+ 'confirmed', 'official', 'legitimate', 'established'
204
+ ]
205
+ doubt_words = [
206
+ 'hoax', 'false', 'fake', 'unproven', 'rumor', 'allegedly',
207
+ 'claim', 'debunked', 'misleading', 'disputed'
208
+ ]
209
+
210
+ text_lower = text.lower()
211
+ results['linguistic_markers']['sensationalism'] = sum(
212
+ 1 for word in sensational_words if word in text_lower
213
+ )
214
+ results['linguistic_markers']['certainty'] = sum(
215
+ 1 for word in certainty_words if word in text_lower
216
+ )
217
+ results['linguistic_markers']['doubt'] = sum(
218
+ 1 for word in doubt_words if word in text_lower
219
+ )
220
+
221
+ # 2. Source analysis from external data
222
+ results['source_analysis']['reputation'] = external_data.source_reputation
223
+ results['source_analysis']['domain_age_days'] = external_data.domain_age_days
224
+
225
+ if external_data.domain_info:
226
+ results['source_analysis']['registrar'] = external_data.domain_info.registrar
227
+ results['source_analysis']['domain'] = external_data.domain_info.domain
228
+
229
+ # 3. Timeliness flags
230
+ if external_data.domain_age_days is not None:
231
+ if external_data.domain_age_days < 180:
232
+ results['timeliness_flags'].append('Source domain is relatively new (<6 months)')
233
+ elif external_data.domain_age_days < 365:
234
+ results['timeliness_flags'].append('Source domain is less than 1 year old')
235
+
236
+ # 4. Fact checking results
237
+ for fc in external_data.fact_checks:
238
+ results['fact_checking'].append({
239
+ 'claim': fc.claim,
240
+ 'rating': fc.rating,
241
+ 'publisher': fc.publisher,
242
+ 'url': fc.url
243
+ })
244
+
245
+ return results
246
+
247
+ def nlp_analysis(self, text: str) -> Dict[str, Any]:
248
+ """
249
+ Perform NLP-based analysis using ML models.
250
+
251
+ Args:
252
+ text: Preprocessed text to analyze
253
+
254
+ Returns:
255
+ Dictionary with NLP analysis results
256
+ """
257
+ results = {
258
+ 'sentiment': None,
259
+ 'sentiment_explanation': None,
260
+ 'bias_analysis': {'score': None, 'label': 'Unavailable'},
261
+ 'named_entities': [],
262
+ 'coherence_score': None
263
+ }
264
+
265
+ if not text:
266
+ results['sentiment'] = {'label': 'Neutral', 'score': 0.5}
267
+ return results
268
+
269
+ # 1. Sentiment analysis with LIME explanation
270
+ if self.sentiment_pipeline:
271
+ try:
272
+ main_pred = self.sentiment_pipeline(text[:512])[0]
273
+ results['sentiment'] = main_pred
274
+
275
+ if self.explainer:
276
+ def predict_proba(texts):
277
+ if isinstance(texts, str):
278
+ texts = [texts]
279
+ predictions = self.sentiment_pipeline(list(texts))
280
+ probs = []
281
+ for pred in predictions:
282
+ if pred['label'] == 'POSITIVE':
283
+ probs.append([1 - pred['score'], pred['score']])
284
+ else:
285
+ probs.append([pred['score'], 1 - pred['score']])
286
+ return np.array(probs)
287
+
288
+ explanation = self.explainer.explain_instance(
289
+ text[:512], predict_proba, num_features=6
290
+ )
291
+ results['sentiment_explanation'] = explanation.as_list()
292
+ except Exception as e:
293
+ print(f"[NLP] Sentiment error: {e}")
294
+ results['sentiment'] = {'label': 'Error', 'score': 0.0}
295
+
296
+ # 2. Bias analysis
297
+ results['bias_analysis'] = self._analyze_bias(text)
298
+
299
+ # 3. Named Entity Recognition
300
+ if self.ner_pipeline:
301
+ try:
302
+ entities = self.ner_pipeline(text[:512])
303
+ results['named_entities'] = entities
304
+ except Exception as e:
305
+ print(f"[NLP] NER error: {e}")
306
+
307
+ # 4. Semantic Coherence
308
+ results['coherence_score'] = self._calculate_coherence(text)
309
+
310
+ return results
311
+
312
+ def _analyze_bias(self, text: str) -> Dict[str, Any]:
313
+ """Analyze text for bias using ML or heuristics."""
314
+ # Method 1: ML Model
315
+ if self.bias_model and self.bias_tokenizer:
316
+ try:
317
+ inputs = self.bias_tokenizer(
318
+ text[:512], return_tensors="pt",
319
+ truncation=True, max_length=512, padding=True
320
+ )
321
+ with torch.no_grad():
322
+ logits = self.bias_model(**inputs).logits
323
+ probs = torch.softmax(logits, dim=1)[0]
324
+ # Label mapping depends on model, usually [Non-biased, Biased]
325
+ bias_score = probs[1].item()
326
+
327
+ label = " biased" if bias_score > 0.5 else "Non-biased"
328
+ return {'score': bias_score, 'label': label, 'method': 'ML (d4data)'}
329
+ except Exception as e:
330
+ print(f"[NLP] ML Bias error: {e}")
331
+
332
+ # Method 2: Heuristics
333
+ biased_words = [
334
+ 'radical', 'extremist', 'disgraceful', 'shameful', 'corrupt',
335
+ 'insane', 'idiot', 'disaster', 'propaganda', 'dictator',
336
+ 'puppet', 'regime', 'tyrant', 'treason', 'traitor'
337
+ ]
338
+ text_lower = text.lower()
339
+ count = sum(1 for w in biased_words if w in text_lower)
340
+ score = min(1.0, count * 0.15)
341
+ label = "Potentially Biased" if score > 0.3 else "Neutral"
342
+ return {'score': score, 'label': label, 'method': 'Heuristic'}
343
+
344
+ def _calculate_coherence(self, text: str) -> float:
345
+ """Calculate semantic coherence score."""
346
+ sentences = re.split(r'[.!?]+', text)
347
+ sentences = [s.strip() for s in sentences if len(s.split()) > 3]
348
+
349
+ if len(sentences) < 2:
350
+ return 0.7 # Default to neutral/good for short text, not perfect 1.0
351
+
352
+ # Method 1: SBERT Semantic Similarity
353
+ if self.coherence_model and HAS_SBERT:
354
+ try:
355
+ embeddings = self.coherence_model.encode(sentences[:10]) # Limit to 10
356
+ sims = []
357
+ for i in range(len(embeddings) - 1):
358
+ sim = util.pytorch_cos_sim(embeddings[i], embeddings[i+1])
359
+ sims.append(sim.item())
360
+ return sum(sims) / len(sims) if sims else 0.5
361
+ except Exception as e:
362
+ print(f"[NLP] SBERT error: {e}")
363
+
364
+ # Method 2: Heuristic (Sentence Length Variance & Repetition)
365
+ lengths = [len(s.split()) for s in sentences]
366
+ avg_len = sum(lengths) / len(lengths)
367
+ variance = sum((l - avg_len) ** 2 for l in lengths) / len(lengths)
368
+
369
+ # High variance suggests simpler/choppier writing usually
370
+ score = 0.8
371
+ if variance > 100: score -= 0.2
372
+ if avg_len < 5: score -= 0.2
373
+
374
+ return max(0.0, score)
375
+
376
+ def calculate_overall_score(
377
+ self,
378
+ rule_results: Dict,
379
+ nlp_results: Dict
380
+ ) -> float:
381
+ """
382
+ Calculate overall credibility score based on User-Defined Metrics.
383
+ """
384
+ score = 0.5 # Start neutral
385
+ adjustments = 0.0
386
+ total_weight_used = 0.0
387
+
388
+ # 1. Source Reputation (25%)
389
+ w_rep = self.weights.get('source_reputation', 0.25)
390
+ reputation = rule_results['source_analysis'].get('reputation', 'Unknown')
391
+ if reputation != 'Unknown' and "N/A" not in reputation:
392
+ if reputation == 'High':
393
+ adjustments += w_rep * 1.0 # Full boost
394
+ elif reputation == 'Low':
395
+ adjustments -= w_rep * 1.0 # Full penalty
396
+ elif reputation == 'Medium':
397
+ adjustments += w_rep * 0.2 # Slight boost
398
+ total_weight_used += w_rep
399
+
400
+ # 2. Domain Age (10%)
401
+ w_age = self.weights.get('domain_age', 0.10)
402
+ domain_age = rule_results['source_analysis'].get('domain_age_days')
403
+ if domain_age is not None:
404
+ if domain_age > 730: # > 2 years
405
+ adjustments += w_age
406
+ elif domain_age < 90: # < 3 months
407
+ adjustments -= w_age
408
+ total_weight_used += w_age
409
+
410
+ # 3. Fact Check (20%)
411
+ w_fc = self.weights.get('fact_check', 0.20)
412
+ fact_checks = rule_results.get('fact_checking', [])
413
+ if fact_checks:
414
+ fc_score = 0
415
+ for fc in fact_checks:
416
+ rating = fc.get('rating', '').lower()
417
+ if rating in ['true', 'verified', 'correct']:
418
+ fc_score += 1
419
+ elif rating in ['false', 'fake', 'incorrect']:
420
+ fc_score -= 1
421
+
422
+ # Normalize fc_score (-1 to 1) roughly
423
+ if fc_score > 0: adjustments += w_fc
424
+ elif fc_score < 0: adjustments -= w_fc
425
+ total_weight_used += w_fc
426
+
427
+ # 4. Sentiment Neutrality (15%)
428
+ # Extreme sentiment = lower score
429
+ w_sent = self.weights.get('sentiment_neutrality', 0.15)
430
+ sentiment = nlp_results.get('sentiment', {})
431
+ if sentiment:
432
+ s_score = sentiment.get('score', 0.5)
433
+ # If extremely positive or negative (>0.9), penalize
434
+ if s_score > 0.9:
435
+ adjustments -= w_sent * 0.5 # Penalty for extremism
436
+ else:
437
+ adjustments += w_sent * 0.2 # Slight boost for moderation
438
+ total_weight_used += w_sent
439
+
440
+ # 5. Entity Presence (15%)
441
+ # Presence of Named Entities (PER, ORG, LOC) suggests verifyiability
442
+ w_ent = self.weights.get('entity_presence', 0.15)
443
+ entities = nlp_results.get('named_entities', [])
444
+ if len(entities) > 0:
445
+ # More entities = better (capped)
446
+ boost = min(1.0, len(entities) * 0.2)
447
+ adjustments += w_ent * boost
448
+ total_weight_used += w_ent
449
+
450
+ # 6. Text Coherence (15%) (Vocabulary Diversity)
451
+ w_coh = self.weights.get('coherence', 0.15)
452
+ coherence = nlp_results.get('coherence_score')
453
+ if coherence is not None:
454
+ # Coherence is usually 0.0 to 1.0
455
+ # Center around 0.5: >0.5 improves, <0.5 penalizes
456
+ adjustments += (coherence - 0.5) * w_coh
457
+ total_weight_used += w_coh
458
+
459
+ # Final calculation
460
+ # Base 0.5 + sum of weighted adjustments
461
+ # Adjustments are in range [-weight, +weight]
462
+
463
+ final_score = 0.5 + adjustments
464
+
465
+ return max(0.0, min(1.0, final_score))
466
+
467
+ def generate_report(
468
+ self,
469
+ input_data: str,
470
+ cleaned_text: str,
471
+ rule_results: Dict,
472
+ nlp_results: Dict,
473
+ external_data: ExternalData,
474
+ overall_score: float,
475
+ web_content: Optional[WebContent] = None,
476
+ graph_context: str = "" # [NEW]
477
+ ) -> Dict[str, Any]:
478
+ """Generate the final evaluation report."""
479
+
480
+ report = {
481
+ 'idRapport': f"report_{int(datetime.datetime.now().timestamp())}",
482
+ 'informationEntree': input_data,
483
+ 'dateGeneration': datetime.datetime.now().isoformat(),
484
+ 'scoreCredibilite': round(overall_score, 2),
485
+ 'resumeAnalyse': "",
486
+ 'detailsScore': {
487
+ 'base': 0.5,
488
+ 'weights': self.weights,
489
+ 'factors': self._get_score_factors(rule_results, nlp_results)
490
+ },
491
+ 'sourcesUtilisees': [],
492
+ 'reglesAppliquees': rule_results,
493
+ 'analyseNLP': {
494
+ 'sentiment': nlp_results.get('sentiment'),
495
+ 'bias_analysis': nlp_results.get('bias_analysis'),
496
+ 'named_entities_count': len(nlp_results.get('named_entities', [])),
497
+ 'coherence_score': nlp_results.get('coherence_score'),
498
+ 'sentiment_explanation_preview': (nlp_results.get('sentiment_explanation') or [])[:3]
499
+ },
500
+ 'metadonnees': {}
501
+ }
502
+
503
+ # Add web content metadata if available
504
+ if web_content:
505
+ if web_content.success:
506
+ report['metadonnees']['page_title'] = web_content.title
507
+ report['metadonnees']['meta_description'] = web_content.meta_description
508
+ report['metadonnees']['links_count'] = len(web_content.links)
509
+ else:
510
+ report['metadonnees']['warning'] = f"Content scrape failed: {web_content.error}"
511
+
512
+ # Generate summary
513
+ summary_parts = []
514
+
515
+ if web_content and not web_content.success:
516
+ summary_parts.append(f"⚠️ ATTENTION: Impossible de lire le texte de la page ({web_content.error}). Analyse basée uniquement sur la réputation du domaine.")
517
+
518
+ if overall_score > 0.75:
519
+ summary_parts.append("L'analyse suggère une crédibilité ÉLEVÉE.")
520
+ elif overall_score > 0.55:
521
+ summary_parts.append("L'analyse suggère une crédibilité MOYENNE à ÉLEVÉE.")
522
+ elif overall_score > 0.45:
523
+ summary_parts.append("L'analyse suggère une crédibilité MOYENNE.")
524
+ elif overall_score > 0.25:
525
+ summary_parts.append("L'analyse suggère une crédibilité FAIBLE à MOYENNE.")
526
+ else:
527
+ summary_parts.append("L'analyse suggère une crédibilité FAIBLE.")
528
+
529
+ if external_data.source_reputation != 'Unknown':
530
+ summary_parts.append(f"Réputation source : {external_data.source_reputation}.")
531
+
532
+ if external_data.domain_age_days:
533
+ years = external_data.domain_age_days / 365
534
+ summary_parts.append(f"Âge du domaine : {years:.1f} ans.")
535
+
536
+ if external_data.fact_checks:
537
+ summary_parts.append(f"{len(external_data.fact_checks)} vérification(s) de faits trouvée(s).")
538
+
539
+ report['resumeAnalyse'] = " ".join(summary_parts)
540
+
541
+ # List sources used
542
+ if self.is_url(input_data):
543
+ report['sourcesUtilisees'].append({
544
+ 'type': 'Primary URL',
545
+ 'url': input_data
546
+ })
547
+ report['sourcesUtilisees'].append({
548
+ 'type': 'WHOIS Lookup',
549
+ 'status': 'Success' if (external_data.domain_info and external_data.domain_info.success) else 'Failed/N/A'
550
+ })
551
+ report['sourcesUtilisees'].append({
552
+ 'type': 'Fact Check API',
553
+ 'results_count': len(external_data.fact_checks)
554
+ })
555
+
556
+ return report
557
+
558
+ def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
559
+ """Get list of factors that influenced the score (For UI)."""
560
+ factors = []
561
+
562
+ # 1. Reputation
563
+ rep = rule_results['source_analysis'].get('reputation')
564
+ if rep and "N/A" not in rep:
565
+ factors.append({
566
+ 'factor': 'Source Reputation',
567
+ 'value': rep,
568
+ 'weight': f"{int(self.weights.get('source_reputation',0)*100)}%",
569
+ 'impact': '+' if rep == 'High' else ('-' if rep == 'Low' else '0')
570
+ })
571
+
572
+ # 2. Fact Checks
573
+ if rule_results.get('fact_checking'):
574
+ factors.append({
575
+ 'factor': 'Fact Checks',
576
+ 'value': f"{len(rule_results['fact_checking'])} found",
577
+ 'weight': f"{int(self.weights.get('fact_check',0)*100)}%",
578
+ 'impact': 'Variable'
579
+ })
580
+
581
+ # 3. Entities
582
+ n_ent = len(nlp_results.get('named_entities', []))
583
+ if n_ent > 0:
584
+ factors.append({
585
+ 'factor': 'Entity Presence',
586
+ 'value': f"{n_ent} entities",
587
+ 'weight': f"{int(self.weights.get('entity_presence',0)*100)}%",
588
+ 'impact': '+'
589
+ })
590
+
591
+ # 4. Sentiment
592
+ sent = nlp_results.get('sentiment', {})
593
+ if sent:
594
+ factors.append({
595
+ 'factor': 'Sentiment Neutrality',
596
+ 'value': f"{sent.get('label')} ({sent.get('score',0):.2f})",
597
+ 'weight': f"{int(self.weights.get('sentiment_neutrality',0)*100)}%",
598
+ 'impact': '-' if sent.get('score', 0) > 0.9 else '0'
599
+ })
600
+
601
+ return factors
602
+
603
+ def verify_information(self, input_data: str) -> Dict[str, Any]:
604
+ """
605
+ Main pipeline to verify credibility of input data.
606
+
607
+ Args:
608
+ input_data: URL or text to verify
609
+
610
+ Returns:
611
+ Complete evaluation report
612
+ """
613
+ if not isinstance(input_data, str) or not input_data.strip():
614
+ return {"error": "L'entrée doit être une chaîne non vide."}
615
+
616
+ print(f"\n[SysCRED] === Vérification: {input_data[:100]}... ===")
617
+
618
+ # 1. Determine input type and fetch content
619
+ text_to_analyze = ""
620
+ web_content = None
621
+ is_url = self.is_url(input_data)
622
+
623
+ if is_url:
624
+ print("[SysCRED] Fetching web content...")
625
+ web_content = self.api_clients.fetch_web_content(input_data)
626
+
627
+ if web_content.success:
628
+ text_to_analyze = web_content.text_content
629
+ print(f"[SysCRED] ✓ Content fetched: {len(text_to_analyze)} chars")
630
+ else:
631
+ print(f"[SysCRED] ⚠ Fetch failed: {web_content.error}")
632
+ print("[SysCRED] Proceeding with Domain/Metadata analysis only.")
633
+ text_to_analyze = ""
634
+ # We don't return error anymore, we proceed!
635
+ else:
636
+ text_to_analyze = input_data
637
+
638
+ # 2. Preprocess text
639
+ cleaned_text = self.preprocess(text_to_analyze)
640
+
641
+ # Only error on empty text if it wasn't a failed web fetch
642
+ # If web fetch failed, we proceed with empty text to give metadata analysis
643
+ if not cleaned_text and not (is_url and web_content and not web_content.success):
644
+ return {"error": "Le texte est vide après prétraitement."}
645
+ print(f"[SysCRED] Preprocessed text: {len(cleaned_text)} chars")
646
+
647
+ # Determine best query for Fact Checking
648
+ fact_check_query = input_data
649
+ if text_to_analyze and len(text_to_analyze) > 10:
650
+ # Use start of text if available
651
+ fact_check_query = text_to_analyze[:200]
652
+ elif is_url and web_content and web_content.title:
653
+ # Fallback to page title if text is missing (e.g. 403)
654
+ fact_check_query = web_content.title
655
+
656
+ # 3. Fetch external data
657
+ print(f"[SysCRED] Fetching external data (Query: {fact_check_query[:50]}...)...")
658
+ external_data = self.api_clients.fetch_external_data(input_data, fc_query=fact_check_query)
659
+
660
+ # [FIX] Handle text-only input reputation
661
+ if not is_url:
662
+ external_data.source_reputation = "N/A (User Input)"
663
+
664
+ print(f"[SysCRED] ✓ Reputation: {external_data.source_reputation}, Age: {external_data.domain_age_days} days")
665
+
666
+ # 4. Rule-based analysis
667
+ print("[SysCRED] Running rule-based analysis...")
668
+ rule_results = self.rule_based_analysis(cleaned_text, external_data)
669
+
670
+ # 5. NLP analysis
671
+ print("[SysCRED] Running NLP analysis...")
672
+ nlp_results = self.nlp_analysis(cleaned_text)
673
+
674
+ # 6. Calculate score
675
+ overall_score = self.calculate_overall_score(rule_results, nlp_results)
676
+ print(f"[SysCRED] ✓ Credibility score: {overall_score:.2f}")
677
+
678
+ # 7. [NEW] GraphRAG Context Retrieval
679
+ graph_context = ""
680
+ similar_uris = []
681
+ if self.graph_rag and 'source_analysis' in rule_results:
682
+ domain = rule_results['source_analysis'].get('domain', '')
683
+ # Pass keywords for text search if domain is empty or generic
684
+ keywords = []
685
+ if not domain and cleaned_text:
686
+ keywords = cleaned_text.split()[:5] # Simple keyword extraction
687
+
688
+ context = self.graph_rag.get_context(domain, keywords=keywords)
689
+ graph_context = context.get('full_text', '')
690
+ similar_uris = context.get('similar_uris', [])
691
+
692
+ if "Graph Memory" in graph_context:
693
+ print(f"[SysCRED] GraphRAG Context Found: {graph_context.splitlines()[1]}")
694
+
695
+ # 8. Generate report (Updated to include context)
696
+ report = self.generate_report(
697
+ input_data, cleaned_text, rule_results,
698
+ nlp_results, external_data, overall_score, web_content,
699
+ graph_context=graph_context
700
+ )
701
+
702
+ # Add similar URIs to report for ontology linking
703
+ if similar_uris:
704
+ report['similar_claims_uris'] = similar_uris
705
+
706
+ # 9. Save to ontology
707
+ if self.ontology_manager:
708
+ try:
709
+ report_uri = self.ontology_manager.add_evaluation_triplets(report)
710
+ report['ontology_uri'] = report_uri
711
+ self.ontology_manager.save_data()
712
+ except Exception as e:
713
+ print(f"[SysCRED] Ontology save failed: {e}")
714
+
715
+ print("[SysCRED] === Vérification terminée ===\n")
716
+ return report
717
+
718
+
719
# --- Main / Testing ---
if __name__ == "__main__":
    # FIX: removed redundant `import json` — it is already imported at module level.

    print("=" * 60)
    print("SysCRED v2.0 - Système de Vérification de Crédibilité")
    print("(c) Dominique S. Loyer - PhD Thesis Prototype")
    print("=" * 60 + "\n")

    # Initialize the system (ML models disabled for a quick smoke test).
    # TODO(review): hard-coded local paths — consider CLI args or env vars.
    system = CredibilityVerificationSystem(
        ontology_base_path="/Users/bk280625/documents041025/MonCode/sysCRED_onto26avrtil.ttl",
        ontology_data_path="/Users/bk280625/documents041025/MonCode/ontology/sysCRED_data.ttl",
        load_ml_models=False  # Set to True for full analysis
    )

    # Smoke-test inputs: credible URL, unknown URL, plain text, suspect text.
    test_cases = {
        "Test URL Crédible": "https://www.lemonde.fr",
        "Test URL Inconnu": "https://example.com/article",
        "Test Texte Simple": "This is a verified and authentic news report.",
        "Test Texte Suspect": "Shocking conspiracy revealed! They don't want you to know this secret!",
    }

    results = {}
    for name, test_input in test_cases.items():
        print(f"\n{'='*50}")
        print(f"Test: {name}")
        print('='*50)

        result = system.verify_information(test_input)
        results[name] = result

        if 'error' not in result:
            print(f"\nScore: {result['scoreCredibilite']}")
            print(f"Résumé: {result['resumeAnalyse']}")
        else:
            print(f"Erreur: {result['error']}")

    # Final summary table.
    print("\n" + "="*60)
    print("Résumé des tests:")
    print("="*60)
    for name, result in results.items():
        if 'error' not in result:
            print(f"  {name}: Score = {result['scoreCredibilite']:.2f}")
        else:
            print(f"  {name}: ERREUR")