syscred 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
syscred/graph_rag.py ADDED
@@ -0,0 +1,171 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ GraphRAG Module - SysCRED
4
+ =========================
5
+ Retrieves context from the Knowledge Graph to enhance verification.
6
+ Transforms "Passive" Graph into "Active" Context.
7
+
8
+ (c) Dominique S. Loyer - PhD Thesis Prototype
9
+ """
10
+
11
+ from typing import List, Dict, Any, Optional
12
+ from syscred.ontology_manager import OntologyManager
13
+
14
class GraphRAG:
    """
    Retrieval Augmented Generation using the Semantic Knowledge Graph.

    Wraps an ontology manager that exposes ``base_graph`` and ``data_graph``
    (combined with ``+`` and queried with SPARQL) and condenses the stored
    evaluation reports into short natural-language context strings.
    """

    def __init__(self, ontology_manager: "OntologyManager"):
        # May be None/falsy; get_context() then returns a placeholder result.
        self.om = ontology_manager

    @staticmethod
    def _escape_sparql_literal(value: str) -> str:
        """
        Escape *value* so it can sit inside a double-quoted SPARQL literal.

        Without this, a quote or backslash in caller-supplied text would
        terminate the literal early and change (or break) the query.
        """
        escaped = str(value)
        # Backslash must be handled first so later escapes are not doubled.
        for raw, replacement in (
            ("\\", "\\\\"),
            ('"', '\\"'),
            ("\n", "\\n"),
            ("\r", "\\r"),
            ("\t", "\\t"),
        ):
            escaped = escaped.replace(raw, replacement)
        return escaped

    def get_context(self, domain: str, keywords: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Retrieve context for a specific verification task.

        Args:
            domain: The domain being analyzed (e.g., 'lemonde.fr')
            keywords: Optional keywords from the claim, used to look up
                similar claims. ``None`` or an empty list skips that lookup.

        Returns:
            Dictionary with:
              - "full_text": all context parts joined by blank lines, or a
                placeholder sentence when nothing was found
              - "source_history": the source-history summary ("" on error)
              - "similar_uris": report URIs of similar claims (list of str)
            When no ontology manager is set, the legacy "graph_context" key
            is returned as well, alongside the three keys above.
        """
        if not self.om:
            unavailable = "No ontology manager available."
            return {
                "graph_context": unavailable,  # legacy key, kept for callers
                "full_text": unavailable,
                "source_history": "",
                "similar_uris": [],
            }

        context_parts = []

        # 1. Source History
        source_history = self._get_source_history(domain)
        if source_history:
            context_parts.append(source_history)

        # 2. Pattern Matching (Similar Claims)
        similar_uris = []
        if keywords:
            similar_result = self._find_similar_claims(keywords)
            if similar_result["text"]:
                context_parts.append(similar_result["text"])
                similar_uris = similar_result["uris"]

        if context_parts:
            full_context = "\n\n".join(context_parts)
        else:
            full_context = "No prior knowledge found in the graph."

        return {
            "full_text": full_context,
            "source_history": source_history,
            "similar_uris": similar_uris,  # URIs for linking
        }

    def _get_source_history(self, domain: str) -> str:
        """
        Query the graph for previous evaluations of this domain.

        Looks at the 5 most recent reports whose information URL contains
        *domain* and summarizes their count, average score and latest verdict.

        Returns:
            A summary string, a "no previous evaluations" sentence when the
            query matches nothing, or "" when *domain* is empty or the query
            fails (the error is printed).
        """
        if not domain:
            return ""

        # The domain is embedded in a SPARQL string literal, so escape it.
        query = """
        PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>

        SELECT ?score ?level ?timestamp
        WHERE {
            ?info cred:informationURL ?url .
            ?request cred:concernsInformation ?info .
            ?report cred:isReportOf ?request .
            ?report cred:credibilityScoreValue ?score .
            ?report cred:assignsCredibilityLevel ?level .
            ?report cred:completionTimestamp ?timestamp .
            FILTER(CONTAINS(STR(?url), "%s"))
        }
        ORDER BY DESC(?timestamp)
        LIMIT 5
        """ % self._escape_sparql_literal(domain)

        results = []
        try:
            combined = self.om.base_graph + self.om.data_graph
            for row in combined.query(query):
                results.append({
                    "score": float(row.score),
                    # Keep only the fragment after '#' of the level URI.
                    "level": str(row.level).split('#')[-1],
                    # Keep only the date part of an ISO-style timestamp.
                    "date": str(row.timestamp).split('T')[0],
                })
        except Exception as e:
            print(f"[GraphRAG] Query error: {e}")
            return ""

        if not results:
            return f"The graph contains no previous evaluations for {domain}."

        # Summarize (results are ordered newest first).
        count = len(results)
        avg_score = sum(r['score'] for r in results) / count
        latest = results[0]

        return (
            f"Graph Memory for '{domain}':\n"
            f"- Analyzed {count} times previously.\n"
            f"- Average Credibility Score: {avg_score:.2f} / 1.0\n"
            f"- Most recent verdict ({latest['date']}): {latest['level']}.\n"
        )

    def _find_similar_claims(self, keywords: List[str]) -> Dict[str, Any]:
        """
        Find evaluation history for content containing specific keywords.

        Keywords of 3 characters or fewer are ignored; the rest are matched
        literally (case-insensitive, OR logic) against stored content.

        Returns:
            Dict with 'text' (for LLM) and 'uris' (for Graph linking). Both
            are empty when there is nothing to search for, nothing matches,
            or the query fails (the error is printed).
        """
        import re  # local import: only needed to escape keywords

        if not keywords:
            return {"text": "", "uris": []}

        clean_kws = [k for k in keywords if len(k) > 3]  # Skip short words
        if not clean_kws:
            return {"text": "", "uris": []}

        # Build REGEX filter for keywords (OR logic), e.g. (fake|hoax).
        # Each keyword is regex-escaped so metacharacters match literally,
        # then the whole pattern is escaped for the SPARQL string literal.
        regex_pattern = "|".join(re.escape(k) for k in clean_kws)

        query = """
        PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>

        SELECT ?report ?content ?score ?level ?timestamp
        WHERE {
            ?info cred:informationContent ?content .
            ?request cred:concernsInformation ?info .
            ?report cred:isReportOf ?request .
            ?report cred:credibilityScoreValue ?score .
            ?report cred:assignsCredibilityLevel ?level .
            ?report cred:completionTimestamp ?timestamp .
            FILTER(REGEX(?content, "%s", "i"))
        }
        ORDER BY DESC(?timestamp)
        LIMIT 3
        """ % self._escape_sparql_literal(regex_pattern)

        results = []
        try:
            combined = self.om.base_graph + self.om.data_graph
            for row in combined.query(query):
                results.append({
                    "uri": str(row.report),
                    "content": str(row.content)[:100] + "...",
                    "score": float(row.score),
                    "verdict": str(row.level).split('#')[-1],
                })
        except Exception as e:
            print(f"[GraphRAG] Similar claims error: {e}")
            return {"text": "", "uris": []}

        if not results:
            return {"text": "", "uris": []}

        lines = [f"Found {len(results)} similar claims in history:"]
        lines.extend(
            f"- \"{r['content']}\" ({r['verdict']}, Score: {r['score']:.2f})"
            for r in results
        )

        return {
            "text": "\n".join(lines),
            "uris": [r['uri'] for r in results],
        }
syscred/ir_engine.py ADDED
@@ -0,0 +1,410 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ IR Engine Module - SysCRED
4
+ ===========================
5
+ Information Retrieval engine extracted from TREC AP88-90 project.
6
+
7
+ Features:
8
+ - TF-IDF calculation (custom and via Pyserini)
9
+ - BM25 scoring (via Lucene/Pyserini)
10
+ - Query Likelihood Dirichlet (QLD)
11
+ - Pseudo-Relevance Feedback (PRF)
12
+ - Porter Stemming integration
13
+
14
+ Based on: TREC_AP88-90_5juin2025.py
15
+ (c) Dominique S. Loyer - PhD Thesis Prototype
16
+ Citation Key: loyerEvaluationModelesRecherche2025
17
+ """
18
+
19
+ import re
20
+ import math
21
+ from typing import Dict, List, Tuple, Optional, Any
22
+ from dataclasses import dataclass
23
+ from collections import Counter
24
+
25
+ # Check for optional dependencies
26
+ try:
27
+ import nltk
28
+ from nltk.corpus import stopwords
29
+ from nltk.stem import PorterStemmer
30
+ from nltk.tokenize import word_tokenize
31
+ HAS_NLTK = True
32
+ except ImportError:
33
+ HAS_NLTK = False
34
+
35
+ try:
36
+ from pyserini.search.lucene import LuceneSearcher
37
+ HAS_PYSERINI = True
38
+ except ImportError:
39
+ HAS_PYSERINI = False
40
+
41
+
42
+ # --- Data Classes ---
43
+
44
@dataclass
class SearchResult:
    """One ranked hit returned for a query."""
    # Identifier of the retrieved document.
    doc_id: str
    # Retrieval score assigned by the ranking model.
    score: float
    # 1-based position of this hit in the result list.
    rank: int
    # Optional text excerpt; left as None when no snippet is produced.
    snippet: Optional[str] = None
51
+
52
+
53
@dataclass
class SearchResponse:
    """Everything returned for one executed query."""
    # Identifier of the query (used as the first column of a TREC run line).
    query_id: str
    # The query text as supplied by the caller.
    query_text: str
    # Ranked hits for this query.
    results: List[SearchResult]
    # Name of the scoring model: 'bm25', 'qld', 'tfidf'
    model: str
    # Number of hits reported for the query.
    total_hits: int
    # Elapsed search time in milliseconds.
    search_time_ms: float
62
+
63
+
64
class IREngine:
    """
    Information Retrieval engine with multiple scoring methods.

    Supports:
    - Built-in TF-IDF/BM25 (no dependencies)
    - Pyserini/Lucene BM25 and QLD (if pyserini installed)
    - Query expansion with Pseudo-Relevance Feedback
    """

    # BM25 default parameters
    BM25_K1 = 0.9
    BM25_B = 0.4

    # Tokenizer used when NLTK (or its tokenizer data) is unavailable.
    _WORD_RE = re.compile(r'\b[a-z]+\b')

    # Minimal English stopword list used when NLTK stopwords cannot be loaded.
    _FALLBACK_STOPWORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
        'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
        'were', 'been', 'be', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'could', 'should', 'may', 'might',
        'must', 'shall', 'can', 'need', 'this', 'that', 'these',
        'those', 'it', 'its', 'they', 'them', 'he', 'she', 'him',
        'her', 'his', 'we', 'you', 'i', 'my', 'your', 'our', 'their',
    })

    def __init__(self, index_path: Optional[str] = None, use_stemming: bool = True):
        """
        Initialize the IR engine.

        Args:
            index_path: Path to Lucene/Pyserini index (optional)
            use_stemming: Whether to apply Porter stemming (only effective
                when NLTK is installed)
        """
        self.index_path = index_path
        self.use_stemming = use_stemming
        self.searcher = None

        # Initialize NLTK components
        if HAS_NLTK:
            try:
                self.stopwords = set(stopwords.words('english'))
            except LookupError:
                print("[IREngine] Downloading NLTK resources...")
                for resource in ('stopwords', 'punkt', 'punkt_tab'):
                    nltk.download(resource, quiet=True)
                try:
                    self.stopwords = set(stopwords.words('english'))
                except LookupError:
                    # Download did not make the corpus available (e.g. no
                    # network); degrade to the built-in list, don't crash.
                    self.stopwords = set(self._FALLBACK_STOPWORDS)
            self.stemmer = PorterStemmer() if use_stemming else None
        else:
            # Fallback stopwords
            self.stopwords = set(self._FALLBACK_STOPWORDS)
            self.stemmer = None

        # Initialize Pyserini searcher if available
        if HAS_PYSERINI and index_path:
            try:
                self.searcher = LuceneSearcher(index_path)
                print(f"[IREngine] Pyserini searcher initialized with index: {index_path}")
            except Exception as e:
                # Best effort: the engine stays usable without a searcher.
                print(f"[IREngine] Failed to initialize Pyserini: {e}")

    def preprocess(self, text: str) -> str:
        """
        Preprocess text with tokenization, stopword removal, and optional stemming.

        This matches the TREC preprocessing pipeline.

        Returns:
            The surviving tokens joined by single spaces; "" when *text* is
            not a string.
        """
        if not isinstance(text, str):
            return ""

        text = text.lower()

        tokens = None
        if HAS_NLTK:
            try:
                tokens = word_tokenize(text)
            except LookupError:
                tokens = None  # tokenizer data missing -> regex fallback
        if tokens is None:
            tokens = self._WORD_RE.findall(text)

        # Filter stopwords and non-alpha
        filtered = [t for t in tokens if t.isalpha() and t not in self.stopwords]

        # Apply stemming if enabled
        if self.stemmer:
            filtered = [self.stemmer.stem(t) for t in filtered]

        return ' '.join(filtered)

    def calculate_tf(self, tokens: List[str]) -> Dict[str, float]:
        """Calculate term frequency (count / number of tokens) per term."""
        if not tokens:
            return {}
        total = len(tokens)
        return {term: count / total for term, count in Counter(tokens).items()}

    def calculate_bm25_score(
        self,
        query_terms: List[str],
        doc_terms: List[str],
        doc_length: int,
        avg_doc_length: float,
        doc_freq: Dict[str, int],
        corpus_size: int
    ) -> float:
        """
        Calculate BM25 score for a document.

        BM25(D, Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))

        Args:
            query_terms: Preprocessed query tokens (repeats are scored again)
            doc_terms: Preprocessed document tokens
            doc_length: Document length |D|
            avg_doc_length: Average document length; when it is not
                positive, length normalization is treated as neutral
            doc_freq: Document frequency per term (missing terms count as 1)
            corpus_size: Number of documents in the corpus
        """
        k1 = self.BM25_K1
        b = self.BM25_B

        # Length normalization is the same for every term, so compute once.
        if avg_doc_length > 0:
            length_norm = 1 - b + b * doc_length / avg_doc_length
        else:
            # Avoid ZeroDivisionError on an empty/unknown corpus.
            length_norm = 1.0

        doc_term_counts = Counter(doc_terms)
        score = 0.0

        for term in query_terms:
            tf = doc_term_counts.get(term, 0)
            if not tf:
                continue

            df = doc_freq.get(term, 1)
            idf = math.log((corpus_size - df + 0.5) / (df + 0.5) + 1)

            score += idf * (tf * (k1 + 1)) / (tf + k1 * length_norm)

        return score

    def search_pyserini(
        self,
        query: str,
        model: str = 'bm25',
        k: int = 100,
        query_id: str = "Q1"
    ) -> "SearchResponse":
        """
        Search using Pyserini/Lucene.

        Args:
            query: Query text
            model: 'bm25' or 'qld'; any other value uses the searcher's
                default BM25 settings
            k: Number of results
            query_id: Query identifier

        Raises:
            RuntimeError: If no Pyserini searcher was initialized.
        """
        import time

        if not self.searcher:
            raise RuntimeError("Pyserini searcher not initialized. Provide index_path.")

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-search.
        start = time.perf_counter()

        # Configure similarity
        if model == 'bm25':
            self.searcher.set_bm25(k1=self.BM25_K1, b=self.BM25_B)
        elif model == 'qld':
            self.searcher.set_qld()
        else:
            self.searcher.set_bm25()

        # Preprocess query
        processed_query = self.preprocess(query)

        # Search
        hits = self.searcher.search(processed_query, k=k)

        results = [
            SearchResult(doc_id=hit.docid, score=hit.score, rank=rank)
            for rank, hit in enumerate(hits, start=1)
        ]

        elapsed = (time.perf_counter() - start) * 1000

        return SearchResponse(
            query_id=query_id,
            query_text=query,
            results=results,
            model=model,
            total_hits=len(results),
            search_time_ms=elapsed
        )

    def pseudo_relevance_feedback(
        self,
        query: str,
        top_docs_texts: List[str],
        num_expansion_terms: int = 10
    ) -> str:
        """
        Expand query using Pseudo-Relevance Feedback (PRF).

        Uses top-k retrieved documents to find expansion terms: the most
        frequent preprocessed tokens that are not already in the query.

        Returns:
            The original query followed by the expansion terms, or the
            original query unchanged when no expansion term is found.
        """
        # Set membership keeps the per-token check O(1).
        query_tokens = set(self.preprocess(query).split())

        # Collect terms from top documents, skipping original query terms
        expansion_candidates = Counter()
        for doc_text in top_docs_texts:
            expansion_candidates.update(
                token
                for token in self.preprocess(doc_text).split()
                if token not in query_tokens
            )

        # Get top expansion terms
        expansion_terms = [
            term for term, _ in expansion_candidates.most_common(num_expansion_terms)
        ]

        if not expansion_terms:
            # Nothing to add: avoid returning the query with a stray space.
            return query

        # Create expanded query
        return query + ' ' + ' '.join(expansion_terms)

    def format_trec_run(
        self,
        responses: List["SearchResponse"],
        run_tag: str
    ) -> str:
        """
        Format results in TREC run file format.

        Format: query_id Q0 doc_id rank score run_tag
        (one line per result, scores with 6 decimals, no trailing newline)
        """
        return '\n'.join(
            f"{response.query_id} Q0 {result.doc_id} "
            f"{result.rank} {result.score:.6f} {run_tag}"
            for response in responses
            for result in response.results
        )
295
+
296
+
297
+ # --- Kaggle/Colab Utilities ---
298
+
299
def setup_kaggle_environment():
    """
    Setup environment for Kaggle notebooks.

    Reports whether a CUDA GPU is visible, pip-installs the packages the
    project uses, and downloads the NLTK resources. Every step is best
    effort: a failure is printed and the remaining steps still run.
    """
    import subprocess
    import sys

    print("=" * 60)
    print("SysCRED - Kaggle Environment Setup")
    print("=" * 60)

    # Check for a CUDA GPU (skipped when PyTorch is not installed)
    try:
        import torch
    except ImportError:
        print("✗ PyTorch not installed - GPU check skipped")
    else:
        if torch.cuda.is_available():
            print(f"✓ GPU available: {torch.cuda.get_device_name(0)}")
        else:
            print("✗ No GPU detected")

    # Install required packages
    packages = [
        'pyserini',
        'transformers',
        'pytrec_eval',
        'nltk',
        'rdflib',
    ]

    print("\nInstalling packages...")
    for pkg in packages:
        try:
            subprocess.run(
                [sys.executable, '-m', 'pip', 'install', '-q', pkg],
                check=True,
                capture_output=True
            )
        except (subprocess.CalledProcessError, OSError):
            # Narrow catch: Ctrl-C / SystemExit must still stop the setup.
            print(f" ✗ {pkg} - install failed")
        else:
            print(f" ✓ {pkg}")

    # Download NLTK resources
    try:
        import nltk
    except ImportError:
        print("✗ nltk not available - resource download skipped")
    else:
        for resource in ['stopwords', 'punkt', 'punkt_tab', 'wordnet']:
            try:
                nltk.download(resource, quiet=True)
            except Exception as exc:
                # Still best effort, but no longer silent.
                print(f" ✗ NLTK resource {resource} - download failed: {exc}")

    print("\n✓ Environment setup complete")
345
+
346
+
347
def load_kaggle_dataset(dataset_path: str) -> Optional[str]:
    """
    Check that a Kaggle dataset path exists and report the outcome.

    Args:
        dataset_path: Path like '/kaggle/input/trec-ap88-90'

    Returns:
        *dataset_path* unchanged when it exists on disk, otherwise None
        (after printing a hint about adding the dataset to the notebook).
    """
    import os

    if not os.path.exists(dataset_path):
        print(f"✗ Dataset not found: {dataset_path}")
        print("Make sure to add the dataset to your Kaggle notebook.")
        return None

    print(f"✓ Dataset found: {dataset_path}")
    return dataset_path
363
+
364
+
365
+ # --- Testing ---
366
if __name__ == "__main__":
    # Manual smoke run: exercises preprocessing, BM25 scoring and PRF on a
    # tiny in-memory example and prints the results.
    banner = "=" * 60

    print(banner)
    print("SysCRED IR Engine - Tests")
    print(banner)

    engine = IREngine(use_stemming=True)

    # 1) Preprocessing
    print("\n1. Testing preprocessing...")
    sample = "Information Retrieval systems help users find relevant documents."
    processed = engine.preprocess(sample)
    print(f" Original: {sample}")
    print(f" Processed: {processed}")

    # 2) BM25 on the same sentence, treated as a one-document example
    print("\n2. Testing BM25 calculation...")
    query_terms = engine.preprocess("information retrieval").split()
    doc_terms = engine.preprocess(sample).split()
    bm25_inputs = {
        "query_terms": query_terms,
        "doc_terms": doc_terms,
        "doc_length": len(doc_terms),
        "avg_doc_length": 10,
        "doc_freq": {'inform': 5, 'retriev': 3},
        "corpus_size": 100,
    }
    score = engine.calculate_bm25_score(**bm25_inputs)
    print(f" BM25 Score: {score:.4f}")

    # 3) Pseudo-Relevance Feedback over three toy "top documents"
    print("\n3. Testing Pseudo-Relevance Feedback...")
    feedback_docs = [
        "Information retrieval is finding relevant documents in a collection.",
        "Search engines use retrieval models like BM25 and TF-IDF.",
        "Query expansion improves retrieval effectiveness.",
    ]
    expanded = engine.pseudo_relevance_feedback(
        query="information retrieval",
        top_docs_texts=feedback_docs,
    )
    print(" Original query: information retrieval")
    print(f" Expanded query: {expanded}")

    print("\n" + banner)
    print("Tests complete!")
    print(banner)