syscred 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syscred/__init__.py +41 -0
- syscred/api_clients.py +560 -0
- syscred/backend_app.py +363 -0
- syscred/config.py +275 -0
- syscred/database.py +54 -0
- syscred/debug_factcheck.py +43 -0
- syscred/debug_graph_json.py +58 -0
- syscred/debug_init.py +33 -0
- syscred/debug_local_server.py +25 -0
- syscred/diagnose_imports.py +37 -0
- syscred/eval_metrics.py +349 -0
- syscred/graph_rag.py +171 -0
- syscred/ir_engine.py +410 -0
- syscred/ontology_manager.py +509 -0
- syscred/run_benchmark.py +135 -0
- syscred/seo_analyzer.py +610 -0
- syscred/setup.py +65 -0
- syscred/test_graphrag.py +87 -0
- syscred/test_phase1.py +28 -0
- syscred/test_phase2.py +55 -0
- syscred/test_suite.py +64 -0
- syscred/verification_system.py +765 -0
- syscred-2.2.0.dist-info/METADATA +259 -0
- syscred-2.2.0.dist-info/RECORD +28 -0
- syscred-2.2.0.dist-info/WHEEL +5 -0
- syscred-2.2.0.dist-info/entry_points.txt +3 -0
- syscred-2.2.0.dist-info/licenses/LICENSE +21 -0
- syscred-2.2.0.dist-info/top_level.txt +1 -0
syscred/seo_analyzer.py
ADDED
|
@@ -0,0 +1,610 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
SEO Analyzer Module - SysCRED
|
|
4
|
+
==============================
|
|
5
|
+
Provides SEO analysis and Information Retrieval metrics for credibility assessment.
|
|
6
|
+
|
|
7
|
+
Implements:
|
|
8
|
+
- TF-IDF calculation
|
|
9
|
+
- BM25 scoring
|
|
10
|
+
- PageRank estimation/explanation
|
|
11
|
+
- SEO meta tag analysis
|
|
12
|
+
- Backlink quality assessment
|
|
13
|
+
|
|
14
|
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
|
15
|
+
Citation Key: loyerModelingHybridSystem2025
|
|
16
|
+
|
|
17
|
+
Note sur la scalabilité:
|
|
18
|
+
- Pour des corpus de grande taille, envisager Cython ou Rust pour TF-IDF/BM25
|
|
19
|
+
- Les calculs matriciels peuvent bénéficier de NumPy optimisé ou de bibliothèques C
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import math
|
|
23
|
+
import re
|
|
24
|
+
from typing import List, Dict, Tuple, Optional, Any
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from collections import Counter
|
|
27
|
+
from urllib.parse import urlparse
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
import numpy as np
|
|
31
|
+
HAS_NUMPY = True
|
|
32
|
+
except ImportError:
|
|
33
|
+
HAS_NUMPY = False
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# --- Data Classes ---
|
|
37
|
+
|
|
38
|
+
@dataclass
class SEOAnalysis:
    """Results of SEO analysis for a webpage."""
    url: str  # Analyzed page URL
    title_length: int  # Title length in characters (optimal ~50-60)
    title_has_keywords: bool  # True when the title shares a top-5 content term
    meta_description_length: int  # Meta description length in characters
    has_meta_keywords: bool  # True when keyword-density data could be computed
    heading_structure: Dict[str, int]  # h1, h2, h3 counts
    word_count: int  # Token count of the main text after stopword removal
    keyword_density: Dict[str, float]  # Top terms -> share of total tokens
    readability_score: float  # 0-1, derived from average sentence length
    seo_score: float  # Overall 0-1 score
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class PageRankExplanation:
    """Explainable PageRank estimation."""
    url: str  # Target URL
    estimated_pr: float  # Estimated PageRank-like score, clamped to [0, 1]
    factors: List[Dict[str, Any]]  # Per factor: name / value / contribution / description
    explanation_text: str  # Human-readable, multi-line explanation (French)
    confidence: float  # 0-1: fraction (out of 4) of factors with a positive contribution
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class IRMetrics:
    """Information Retrieval metrics for a document."""
    tf_idf_scores: Dict[str, float]  # term -> TF-IDF weight
    bm25_score: float  # BM25 relevance vs. the optional query (0.0 when no query)
    top_terms: List[Tuple[str, float]]  # Up to 10 highest-weighted TF-IDF terms
    document_length: int  # Token count after stopword removal
    avg_term_frequency: float  # Mean TF over the distinct terms
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class SEOAnalyzer:
|
|
74
|
+
"""
|
|
75
|
+
Analyze SEO factors and compute IR metrics for credibility assessment.
|
|
76
|
+
|
|
77
|
+
This module helps explain WHY a URL might rank well (or poorly) in search engines,
|
|
78
|
+
which is a factor in its credibility assessment.
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
# BM25 parameters (classic values)
|
|
82
|
+
BM25_K1 = 1.5 # Term frequency saturation
|
|
83
|
+
BM25_B = 0.75 # Length normalization
|
|
84
|
+
|
|
85
|
+
# Stopwords (expandable)
|
|
86
|
+
STOPWORDS = {
|
|
87
|
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
|
88
|
+
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
|
|
89
|
+
'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
|
|
90
|
+
'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
|
|
91
|
+
'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them',
|
|
92
|
+
'he', 'she', 'him', 'her', 'his', 'my', 'your', 'our', 'their',
|
|
93
|
+
'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how',
|
|
94
|
+
'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other',
|
|
95
|
+
'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
|
|
96
|
+
'than', 'too', 'very', 'just', 'also', 'now', 'here', 'there',
|
|
97
|
+
# French stopwords
|
|
98
|
+
'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'et', 'ou',
|
|
99
|
+
'mais', 'donc', 'car', 'ni', 'que', 'qui', 'quoi', 'dont', 'où',
|
|
100
|
+
'ce', 'cette', 'ces', 'mon', 'ma', 'mes', 'ton', 'ta', 'tes',
|
|
101
|
+
'son', 'sa', 'ses', 'notre', 'nos', 'votre', 'vos', 'leur', 'leurs',
|
|
102
|
+
'je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles', 'on',
|
|
103
|
+
'est', 'sont', 'être', 'avoir', 'fait', 'faire', 'dit', 'dire',
|
|
104
|
+
'plus', 'moins', 'très', 'bien', 'tout', 'tous', 'toute', 'toutes',
|
|
105
|
+
'pour', 'par', 'sur', 'sous', 'avec', 'sans', 'dans', 'en', 'au', 'aux'
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
def __init__(self):
|
|
109
|
+
"""Initialize the SEO analyzer."""
|
|
110
|
+
# Reference corpus statistics (can be updated with real data)
|
|
111
|
+
self.avg_doc_length = 500 # Average document length in words
|
|
112
|
+
self.corpus_size = 1000 # Number of documents in reference corpus
|
|
113
|
+
# IDF values for common terms (placeholder - would be computed from real corpus)
|
|
114
|
+
self.idf_cache = {}
|
|
115
|
+
|
|
116
|
+
def tokenize(self, text: str, remove_stopwords: bool = True) -> List[str]:
|
|
117
|
+
"""
|
|
118
|
+
Tokenize text into words.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
text: Input text
|
|
122
|
+
remove_stopwords: Whether to remove stopwords
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
List of tokens
|
|
126
|
+
"""
|
|
127
|
+
if not text:
|
|
128
|
+
return []
|
|
129
|
+
|
|
130
|
+
# Lowercase and extract words
|
|
131
|
+
text = text.lower()
|
|
132
|
+
tokens = re.findall(r'\b[a-zA-ZÀ-ÿ]{2,}\b', text)
|
|
133
|
+
|
|
134
|
+
if remove_stopwords:
|
|
135
|
+
tokens = [t for t in tokens if t not in self.STOPWORDS]
|
|
136
|
+
|
|
137
|
+
return tokens
|
|
138
|
+
|
|
139
|
+
def calculate_tf(self, tokens: List[str]) -> Dict[str, float]:
|
|
140
|
+
"""
|
|
141
|
+
Calculate Term Frequency for each token.
|
|
142
|
+
|
|
143
|
+
TF(t) = (count of t in document) / (total terms in document)
|
|
144
|
+
"""
|
|
145
|
+
if not tokens:
|
|
146
|
+
return {}
|
|
147
|
+
|
|
148
|
+
term_counts = Counter(tokens)
|
|
149
|
+
total_terms = len(tokens)
|
|
150
|
+
|
|
151
|
+
return {term: count / total_terms for term, count in term_counts.items()}
|
|
152
|
+
|
|
153
|
+
def calculate_idf(self, term: str, doc_frequency: int = None) -> float:
|
|
154
|
+
"""
|
|
155
|
+
Calculate Inverse Document Frequency.
|
|
156
|
+
|
|
157
|
+
IDF(t) = log(N / (1 + df(t)))
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
term: The term to calculate IDF for
|
|
161
|
+
doc_frequency: Number of documents containing the term
|
|
162
|
+
(if None, use heuristic based on term length)
|
|
163
|
+
"""
|
|
164
|
+
if term in self.idf_cache:
|
|
165
|
+
return self.idf_cache[term]
|
|
166
|
+
|
|
167
|
+
if doc_frequency is None:
|
|
168
|
+
# Heuristic: shorter common words appear in more documents
|
|
169
|
+
if len(term) <= 3:
|
|
170
|
+
doc_frequency = self.corpus_size * 0.5
|
|
171
|
+
elif len(term) <= 5:
|
|
172
|
+
doc_frequency = self.corpus_size * 0.3
|
|
173
|
+
elif len(term) <= 8:
|
|
174
|
+
doc_frequency = self.corpus_size * 0.1
|
|
175
|
+
else:
|
|
176
|
+
doc_frequency = self.corpus_size * 0.05
|
|
177
|
+
|
|
178
|
+
idf = math.log(self.corpus_size / (1 + doc_frequency))
|
|
179
|
+
self.idf_cache[term] = idf
|
|
180
|
+
return idf
|
|
181
|
+
|
|
182
|
+
def calculate_tf_idf(self, text: str) -> Dict[str, float]:
|
|
183
|
+
"""
|
|
184
|
+
Calculate TF-IDF scores for all terms in a document.
|
|
185
|
+
|
|
186
|
+
TF-IDF(t,d) = TF(t,d) × IDF(t)
|
|
187
|
+
|
|
188
|
+
Args:
|
|
189
|
+
text: Document text
|
|
190
|
+
|
|
191
|
+
Returns:
|
|
192
|
+
Dictionary of term -> TF-IDF score
|
|
193
|
+
"""
|
|
194
|
+
tokens = self.tokenize(text)
|
|
195
|
+
tf_scores = self.calculate_tf(tokens)
|
|
196
|
+
|
|
197
|
+
tf_idf = {}
|
|
198
|
+
for term, tf in tf_scores.items():
|
|
199
|
+
idf = self.calculate_idf(term)
|
|
200
|
+
tf_idf[term] = tf * idf
|
|
201
|
+
|
|
202
|
+
return tf_idf
|
|
203
|
+
|
|
204
|
+
def calculate_bm25(
|
|
205
|
+
self,
|
|
206
|
+
query: str,
|
|
207
|
+
document: str,
|
|
208
|
+
k1: float = None,
|
|
209
|
+
b: float = None
|
|
210
|
+
) -> float:
|
|
211
|
+
"""
|
|
212
|
+
Calculate BM25 relevance score between query and document.
|
|
213
|
+
|
|
214
|
+
BM25(D, Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))
|
|
215
|
+
|
|
216
|
+
Args:
|
|
217
|
+
query: Query string
|
|
218
|
+
document: Document text
|
|
219
|
+
k1: Term frequency saturation parameter
|
|
220
|
+
b: Length normalization parameter
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
BM25 score
|
|
224
|
+
"""
|
|
225
|
+
k1 = k1 or self.BM25_K1
|
|
226
|
+
b = b or self.BM25_B
|
|
227
|
+
|
|
228
|
+
query_tokens = self.tokenize(query)
|
|
229
|
+
doc_tokens = self.tokenize(document, remove_stopwords=False)
|
|
230
|
+
|
|
231
|
+
if not query_tokens or not doc_tokens:
|
|
232
|
+
return 0.0
|
|
233
|
+
|
|
234
|
+
doc_length = len(doc_tokens)
|
|
235
|
+
doc_term_counts = Counter(doc_tokens)
|
|
236
|
+
|
|
237
|
+
score = 0.0
|
|
238
|
+
for term in query_tokens:
|
|
239
|
+
if term not in doc_term_counts:
|
|
240
|
+
continue
|
|
241
|
+
|
|
242
|
+
tf = doc_term_counts[term]
|
|
243
|
+
idf = self.calculate_idf(term)
|
|
244
|
+
|
|
245
|
+
numerator = tf * (k1 + 1)
|
|
246
|
+
denominator = tf + k1 * (1 - b + b * doc_length / self.avg_doc_length)
|
|
247
|
+
|
|
248
|
+
score += idf * (numerator / denominator)
|
|
249
|
+
|
|
250
|
+
return score
|
|
251
|
+
|
|
252
|
+
def analyze_seo(
|
|
253
|
+
self,
|
|
254
|
+
url: str,
|
|
255
|
+
title: Optional[str],
|
|
256
|
+
meta_description: Optional[str],
|
|
257
|
+
text_content: str,
|
|
258
|
+
headings: Dict[str, List[str]] = None
|
|
259
|
+
) -> SEOAnalysis:
|
|
260
|
+
"""
|
|
261
|
+
Perform comprehensive SEO analysis.
|
|
262
|
+
|
|
263
|
+
Args:
|
|
264
|
+
url: Page URL
|
|
265
|
+
title: Page title
|
|
266
|
+
meta_description: Meta description
|
|
267
|
+
text_content: Main text content
|
|
268
|
+
headings: Dictionary of heading levels (h1, h2, etc.) and their texts
|
|
269
|
+
|
|
270
|
+
Returns:
|
|
271
|
+
SEOAnalysis with all metrics
|
|
272
|
+
"""
|
|
273
|
+
tokens = self.tokenize(text_content)
|
|
274
|
+
word_count = len(tokens)
|
|
275
|
+
|
|
276
|
+
# Title analysis
|
|
277
|
+
title_length = len(title) if title else 0
|
|
278
|
+
title_tokens = self.tokenize(title) if title else []
|
|
279
|
+
|
|
280
|
+
# Check if title contains main keywords from content
|
|
281
|
+
content_top_terms = Counter(tokens).most_common(10)
|
|
282
|
+
title_has_keywords = any(
|
|
283
|
+
term in title_tokens
|
|
284
|
+
for term, _ in content_top_terms[:5]
|
|
285
|
+
) if title_tokens else False
|
|
286
|
+
|
|
287
|
+
# Meta description analysis
|
|
288
|
+
meta_length = len(meta_description) if meta_description else 0
|
|
289
|
+
|
|
290
|
+
# Heading structure
|
|
291
|
+
headings = headings or {}
|
|
292
|
+
heading_structure = {
|
|
293
|
+
'h1': len(headings.get('h1', [])),
|
|
294
|
+
'h2': len(headings.get('h2', [])),
|
|
295
|
+
'h3': len(headings.get('h3', []))
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
# Keyword density (top 5 terms)
|
|
299
|
+
keyword_density = {}
|
|
300
|
+
for term, count in Counter(tokens).most_common(5):
|
|
301
|
+
keyword_density[term] = count / word_count if word_count > 0 else 0
|
|
302
|
+
|
|
303
|
+
# Readability score (simple metric based on average word/sentence length)
|
|
304
|
+
sentences = re.split(r'[.!?]+', text_content)
|
|
305
|
+
avg_sentence_length = word_count / len(sentences) if sentences else 0
|
|
306
|
+
|
|
307
|
+
# Convert to readability score (0-1, where 1 is optimal ~15-20 words/sentence)
|
|
308
|
+
if 15 <= avg_sentence_length <= 20:
|
|
309
|
+
readability_score = 1.0
|
|
310
|
+
elif 10 <= avg_sentence_length <= 25:
|
|
311
|
+
readability_score = 0.8
|
|
312
|
+
elif 5 <= avg_sentence_length <= 30:
|
|
313
|
+
readability_score = 0.6
|
|
314
|
+
else:
|
|
315
|
+
readability_score = 0.4
|
|
316
|
+
|
|
317
|
+
# Overall SEO score
|
|
318
|
+
seo_factors = []
|
|
319
|
+
|
|
320
|
+
# Title score (optimal: 50-60 chars)
|
|
321
|
+
if 50 <= title_length <= 60:
|
|
322
|
+
seo_factors.append(1.0)
|
|
323
|
+
elif 30 <= title_length <= 70:
|
|
324
|
+
seo_factors.append(0.7)
|
|
325
|
+
else:
|
|
326
|
+
seo_factors.append(0.3)
|
|
327
|
+
|
|
328
|
+
# Meta description (optimal: 150-160 chars)
|
|
329
|
+
if 150 <= meta_length <= 160:
|
|
330
|
+
seo_factors.append(1.0)
|
|
331
|
+
elif 100 <= meta_length <= 200:
|
|
332
|
+
seo_factors.append(0.7)
|
|
333
|
+
else:
|
|
334
|
+
seo_factors.append(0.3)
|
|
335
|
+
|
|
336
|
+
# Has exactly one H1
|
|
337
|
+
seo_factors.append(1.0 if heading_structure['h1'] == 1 else 0.5)
|
|
338
|
+
|
|
339
|
+
# Content length (optimal: 300+ words)
|
|
340
|
+
if word_count >= 1000:
|
|
341
|
+
seo_factors.append(1.0)
|
|
342
|
+
elif word_count >= 500:
|
|
343
|
+
seo_factors.append(0.8)
|
|
344
|
+
elif word_count >= 300:
|
|
345
|
+
seo_factors.append(0.6)
|
|
346
|
+
else:
|
|
347
|
+
seo_factors.append(0.3)
|
|
348
|
+
|
|
349
|
+
seo_score = sum(seo_factors) / len(seo_factors) if seo_factors else 0.5
|
|
350
|
+
|
|
351
|
+
return SEOAnalysis(
|
|
352
|
+
url=url,
|
|
353
|
+
title_length=title_length,
|
|
354
|
+
title_has_keywords=title_has_keywords,
|
|
355
|
+
meta_description_length=meta_length,
|
|
356
|
+
has_meta_keywords=bool(keyword_density),
|
|
357
|
+
heading_structure=heading_structure,
|
|
358
|
+
word_count=word_count,
|
|
359
|
+
keyword_density=keyword_density,
|
|
360
|
+
readability_score=readability_score,
|
|
361
|
+
seo_score=seo_score
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
def estimate_pagerank(
|
|
365
|
+
self,
|
|
366
|
+
url: str,
|
|
367
|
+
backlinks: List[Dict[str, Any]] = None,
|
|
368
|
+
domain_age_days: int = None,
|
|
369
|
+
source_reputation: str = None
|
|
370
|
+
) -> PageRankExplanation:
|
|
371
|
+
"""
|
|
372
|
+
Estimate and explain PageRank-like score.
|
|
373
|
+
|
|
374
|
+
This is NOT the actual Google PageRank, but an explainable approximation
|
|
375
|
+
based on available factors that contribute to search ranking.
|
|
376
|
+
|
|
377
|
+
PageRank Formula (simplified):
|
|
378
|
+
PR(A) = (1-d) + d × Σ (PR(Ti) / C(Ti))
|
|
379
|
+
|
|
380
|
+
Where:
|
|
381
|
+
- d = damping factor (0.85)
|
|
382
|
+
- Ti = pages pointing to A
|
|
383
|
+
- C(Ti) = number of outgoing links from Ti
|
|
384
|
+
|
|
385
|
+
Args:
|
|
386
|
+
url: Target URL
|
|
387
|
+
backlinks: List of backlink information
|
|
388
|
+
domain_age_days: Age of the domain in days
|
|
389
|
+
source_reputation: Known reputation level
|
|
390
|
+
|
|
391
|
+
Returns:
|
|
392
|
+
PageRankExplanation with estimated score and factors
|
|
393
|
+
"""
|
|
394
|
+
d = 0.85 # Damping factor
|
|
395
|
+
base_pr = (1 - d) # Starting PageRank
|
|
396
|
+
|
|
397
|
+
factors = []
|
|
398
|
+
pr_contributions = []
|
|
399
|
+
|
|
400
|
+
# Factor 1: Domain Age
|
|
401
|
+
if domain_age_days is not None:
|
|
402
|
+
if domain_age_days > 365 * 5: # > 5 years
|
|
403
|
+
age_contribution = 0.3
|
|
404
|
+
age_description = "Domaine ancien (5+ ans) - forte confiance"
|
|
405
|
+
elif domain_age_days > 365 * 2: # > 2 years
|
|
406
|
+
age_contribution = 0.2
|
|
407
|
+
age_description = "Domaine établi (2-5 ans) - bonne confiance"
|
|
408
|
+
elif domain_age_days > 365: # > 1 year
|
|
409
|
+
age_contribution = 0.1
|
|
410
|
+
age_description = "Domaine récent (1-2 ans) - confiance modérée"
|
|
411
|
+
else:
|
|
412
|
+
age_contribution = 0.0
|
|
413
|
+
age_description = "Domaine très récent (<1 an) - confiance faible"
|
|
414
|
+
|
|
415
|
+
factors.append({
|
|
416
|
+
'name': 'Domain Age',
|
|
417
|
+
'value': f"{domain_age_days} days ({domain_age_days/365:.1f} years)",
|
|
418
|
+
'contribution': age_contribution,
|
|
419
|
+
'description': age_description
|
|
420
|
+
})
|
|
421
|
+
pr_contributions.append(age_contribution)
|
|
422
|
+
|
|
423
|
+
# Factor 2: Source Reputation
|
|
424
|
+
if source_reputation:
|
|
425
|
+
if source_reputation == 'High':
|
|
426
|
+
rep_contribution = 0.3
|
|
427
|
+
rep_description = "Source réputée - équivalent à beaucoup de backlinks de qualité"
|
|
428
|
+
elif source_reputation == 'Medium':
|
|
429
|
+
rep_contribution = 0.15
|
|
430
|
+
rep_description = "Source connue - équivalent à quelques backlinks de qualité"
|
|
431
|
+
else:
|
|
432
|
+
rep_contribution = 0.0
|
|
433
|
+
rep_description = "Source inconnue ou peu fiable - pas de boost de réputation"
|
|
434
|
+
|
|
435
|
+
factors.append({
|
|
436
|
+
'name': 'Source Reputation',
|
|
437
|
+
'value': source_reputation,
|
|
438
|
+
'contribution': rep_contribution,
|
|
439
|
+
'description': rep_description
|
|
440
|
+
})
|
|
441
|
+
pr_contributions.append(rep_contribution)
|
|
442
|
+
|
|
443
|
+
# Factor 3: Backlinks (if available)
|
|
444
|
+
backlinks = backlinks or []
|
|
445
|
+
if backlinks:
|
|
446
|
+
# Estimate backlink contribution
|
|
447
|
+
high_quality_count = sum(1 for bl in backlinks if bl.get('quality', 'low') == 'high')
|
|
448
|
+
medium_quality_count = sum(1 for bl in backlinks if bl.get('quality', 'low') == 'medium')
|
|
449
|
+
|
|
450
|
+
# Each high-quality backlink contributes more
|
|
451
|
+
backlink_contribution = min(0.3, high_quality_count * 0.05 + medium_quality_count * 0.02)
|
|
452
|
+
|
|
453
|
+
factors.append({
|
|
454
|
+
'name': 'Backlinks',
|
|
455
|
+
'value': f"{len(backlinks)} total ({high_quality_count} high quality)",
|
|
456
|
+
'contribution': backlink_contribution,
|
|
457
|
+
'description': f"Liens entrants détectés - contribution au classement"
|
|
458
|
+
})
|
|
459
|
+
pr_contributions.append(backlink_contribution)
|
|
460
|
+
|
|
461
|
+
# Factor 4: Domain type (TLD)
|
|
462
|
+
parsed = urlparse(url)
|
|
463
|
+
domain = parsed.netloc
|
|
464
|
+
|
|
465
|
+
if domain.endswith('.edu') or domain.endswith('.gov'):
|
|
466
|
+
tld_contribution = 0.2
|
|
467
|
+
tld_description = "Domaine .edu/.gov - haute autorité institutionnelle"
|
|
468
|
+
elif domain.endswith('.ac.uk') or domain.endswith('.gouv.fr'):
|
|
469
|
+
tld_contribution = 0.15
|
|
470
|
+
tld_description = "Domaine académique/gouvernemental - bonne autorité"
|
|
471
|
+
elif domain.endswith('.org'):
|
|
472
|
+
tld_contribution = 0.05
|
|
473
|
+
tld_description = "Domaine .org - légère autorité"
|
|
474
|
+
else:
|
|
475
|
+
tld_contribution = 0.0
|
|
476
|
+
tld_description = "Domaine commercial standard"
|
|
477
|
+
|
|
478
|
+
factors.append({
|
|
479
|
+
'name': 'Domain Type (TLD)',
|
|
480
|
+
'value': domain,
|
|
481
|
+
'contribution': tld_contribution,
|
|
482
|
+
'description': tld_description
|
|
483
|
+
})
|
|
484
|
+
pr_contributions.append(tld_contribution)
|
|
485
|
+
|
|
486
|
+
# Calculate final estimated PageRank
|
|
487
|
+
total_contribution = sum(pr_contributions)
|
|
488
|
+
estimated_pr = base_pr + d * total_contribution
|
|
489
|
+
estimated_pr = min(1.0, max(0.0, estimated_pr)) # Clamp to [0, 1]
|
|
490
|
+
|
|
491
|
+
# Generate explanation
|
|
492
|
+
explanation_parts = [
|
|
493
|
+
f"PageRank estimé: {estimated_pr:.3f}",
|
|
494
|
+
f"",
|
|
495
|
+
f"Formule: PR = (1-d) + d × Σ(contributions)",
|
|
496
|
+
f" PR = {base_pr:.2f} + {d:.2f} × {total_contribution:.2f}",
|
|
497
|
+
f"",
|
|
498
|
+
f"Facteurs contributifs:"
|
|
499
|
+
]
|
|
500
|
+
|
|
501
|
+
for factor in factors:
|
|
502
|
+
explanation_parts.append(
|
|
503
|
+
f" • {factor['name']}: +{factor['contribution']:.2f} - {factor['description']}"
|
|
504
|
+
)
|
|
505
|
+
|
|
506
|
+
# Confidence based on how many factors we have data for
|
|
507
|
+
confidence = min(1.0, len([f for f in factors if f['contribution'] > 0]) / 4)
|
|
508
|
+
|
|
509
|
+
return PageRankExplanation(
|
|
510
|
+
url=url,
|
|
511
|
+
estimated_pr=estimated_pr,
|
|
512
|
+
factors=factors,
|
|
513
|
+
explanation_text="\n".join(explanation_parts),
|
|
514
|
+
confidence=confidence
|
|
515
|
+
)
|
|
516
|
+
|
|
517
|
+
def get_ir_metrics(self, text: str, query: str = None) -> IRMetrics:
|
|
518
|
+
"""
|
|
519
|
+
Get comprehensive IR metrics for a document.
|
|
520
|
+
|
|
521
|
+
Args:
|
|
522
|
+
text: Document text
|
|
523
|
+
query: Optional query for BM25 calculation
|
|
524
|
+
|
|
525
|
+
Returns:
|
|
526
|
+
IRMetrics with TF-IDF, BM25, and other metrics
|
|
527
|
+
"""
|
|
528
|
+
tokens = self.tokenize(text)
|
|
529
|
+
tf_idf = self.calculate_tf_idf(text)
|
|
530
|
+
|
|
531
|
+
# Top terms by TF-IDF
|
|
532
|
+
top_terms = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)[:10]
|
|
533
|
+
|
|
534
|
+
# BM25 score (if query provided)
|
|
535
|
+
bm25_score = 0.0
|
|
536
|
+
if query:
|
|
537
|
+
bm25_score = self.calculate_bm25(query, text)
|
|
538
|
+
|
|
539
|
+
# Average term frequency
|
|
540
|
+
tf = self.calculate_tf(tokens)
|
|
541
|
+
avg_tf = sum(tf.values()) / len(tf) if tf else 0
|
|
542
|
+
|
|
543
|
+
return IRMetrics(
|
|
544
|
+
tf_idf_scores=tf_idf,
|
|
545
|
+
bm25_score=bm25_score,
|
|
546
|
+
top_terms=top_terms,
|
|
547
|
+
document_length=len(tokens),
|
|
548
|
+
avg_term_frequency=avg_tf
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
# --- Testing ---
|
|
553
|
+
if __name__ == "__main__":
    # Lightweight smoke tests, exercised only when the module is run directly.
    banner = "=" * 60
    print(banner)
    print("SysCRED SEO Analyzer - Tests")
    print(banner + "\n")

    tool = SEOAnalyzer()

    sample_text = """
    The credibility of online information is crucial in today's digital age.
    Fact-checking organizations help verify claims and identify misinformation.
    Source reputation and domain age are important credibility factors.
    """

    # 1) TF-IDF: rank the sample's terms by weight and show the strongest five.
    print("1. Testing TF-IDF calculation...")
    weights = tool.calculate_tf_idf(sample_text)
    ranked = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)
    print(" Top 5 TF-IDF terms:")
    for term, score in ranked[:5]:
        print(f" {term}: {score:.4f}")
    print()

    # 2) BM25: relevance of a two-term query against the sample text.
    print("2. Testing BM25 scoring...")
    query = "credibility verification"
    relevance = tool.calculate_bm25(query, sample_text)
    print(f" Query: '{query}'")
    print(f" BM25 Score: {relevance:.4f}")
    print()

    # 3) Full SEO report for a synthetic article.
    print("3. Testing SEO analysis...")
    report = tool.analyze_seo(
        url="https://example.com/article",
        title="Understanding Online Credibility - A Complete Guide",
        meta_description="Learn about the key factors that determine the credibility of online information sources.",
        text_content=sample_text
    )
    print(f" Title length: {report.title_length} chars")
    print(f" Meta description length: {report.meta_description_length} chars")
    print(f" Word count: {report.word_count}")
    print(f" SEO Score: {report.seo_score:.2f}")
    print()

    # 4) Explainable PageRank estimate for a long-lived, reputable domain.
    print("4. Testing PageRank estimation...")
    estimate = tool.estimate_pagerank(
        url="https://www.lemonde.fr/article",
        domain_age_days=9125,  # ~25 years
        source_reputation="High"
    )
    print(f" Estimated PageRank: {estimate.estimated_pr:.3f}")
    print(f" Confidence: {estimate.confidence:.2f}")
    print("\n Explanation:")
    print(" " + estimate.explanation_text.replace("\n", "\n "))

    print("\n" + banner)
    print("Tests complete!")
    print(banner)
|
syscred/setup.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
SysCRED - Système de Vérification de Crédibilité
|
|
4
|
+
=================================================
|
|
5
|
+
PhD Thesis Prototype - Neuro-Symbolic Credibility Verification
|
|
6
|
+
|
|
7
|
+
(c) Dominique S. Loyer
|
|
8
|
+
Citation Key: loyerModelingHybridSystem2025
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from setuptools import setup, find_packages
|
|
12
|
+
|
|
13
|
+
def _read_long_description() -> str:
    """Return the README.md contents for the PyPI long description.

    Opens the file with an explicit UTF-8 encoding and closes it
    deterministically; returns an empty string when the README is absent
    (the original `open(...).read()` leaked the file handle and raced a
    separate `os.path.exists` check).
    """
    try:
        with open("README.md", encoding="utf-8") as readme:
            return readme.read()
    except OSError:
        return ""


setup(
    name="syscred",
    # NOTE(review): the published wheel is versioned 2.2.0 while this file
    # says 2.0.0 — confirm which value is authoritative before release.
    version="2.0.0",
    author="Dominique S. Loyer",
    author_email="loyer.dominique_sebastien@courrier.uqam.ca",
    description="Neuro-Symbolic Credibility Verification System",
    long_description=_read_long_description(),
    long_description_content_type="text/markdown",
    url="https://github.com/DominiqueLoyer/syscred",
    packages=find_packages(),
    python_requires=">=3.9",
    # Core runtime dependencies; optional feature sets live in extras_require.
    install_requires=[
        "requests>=2.28.0",
        "beautifulsoup4>=4.11.0",
        "rdflib>=6.0.0",
        "nltk>=3.7",
    ],
    extras_require={
        "ml": [
            "torch>=2.0.0",
            "transformers>=4.30.0",
            "numpy>=1.23.0,<2.0",
        ],
        "ir": [
            "pyserini>=0.21.0",
            "pytrec_eval>=0.5",
        ],
        "web": [
            "flask>=2.0.0",
            "flask-cors>=3.0.0",
        ],
        # "full" is the union of ml + ir + web, plus explainability (lime).
        "full": [
            "torch>=2.0.0",
            "transformers>=4.30.0",
            "numpy>=1.23.0,<2.0",
            "pyserini>=0.21.0",
            "pytrec_eval>=0.5",
            "flask>=2.0.0",
            "flask-cors>=3.0.0",
            "lime>=0.2.0",
        ],
    },
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    keywords="credibility verification nlp ontology information-retrieval",
)
|