syscred 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syscred/__init__.py +41 -0
- syscred/api_clients.py +560 -0
- syscred/backend_app.py +363 -0
- syscred/config.py +275 -0
- syscred/database.py +54 -0
- syscred/debug_factcheck.py +43 -0
- syscred/debug_graph_json.py +58 -0
- syscred/debug_init.py +33 -0
- syscred/debug_local_server.py +25 -0
- syscred/diagnose_imports.py +37 -0
- syscred/eval_metrics.py +349 -0
- syscred/graph_rag.py +171 -0
- syscred/ir_engine.py +410 -0
- syscred/ontology_manager.py +509 -0
- syscred/run_benchmark.py +135 -0
- syscred/seo_analyzer.py +610 -0
- syscred/setup.py +65 -0
- syscred/test_graphrag.py +87 -0
- syscred/test_phase1.py +28 -0
- syscred/test_phase2.py +55 -0
- syscred/test_suite.py +64 -0
- syscred/verification_system.py +765 -0
- syscred-2.2.0.dist-info/METADATA +259 -0
- syscred-2.2.0.dist-info/RECORD +28 -0
- syscred-2.2.0.dist-info/WHEEL +5 -0
- syscred-2.2.0.dist-info/entry_points.txt +3 -0
- syscred-2.2.0.dist-info/licenses/LICENSE +21 -0
- syscred-2.2.0.dist-info/top_level.txt +1 -0
syscred/graph_rag.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
GraphRAG Module - SysCRED
|
|
4
|
+
=========================
|
|
5
|
+
Retrieves context from the Knowledge Graph to enhance verification.
|
|
6
|
+
Transforms "Passive" Graph into "Active" Context.
|
|
7
|
+
|
|
8
|
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import List, Dict, Any, Optional
|
|
12
|
+
from syscred.ontology_manager import OntologyManager
|
|
13
|
+
|
|
14
|
+
class GraphRAG:
    """
    Retrieval Augmented Generation using the Semantic Knowledge Graph.

    Builds short natural-language summaries of what the knowledge graph
    already records about a source domain and about previously evaluated
    content matching a set of keywords. The graphs are read from the
    ontology manager's ``base_graph`` and ``data_graph`` attributes, which
    are combined with ``+`` and queried with SPARQL.
    """

    # Characters escaped before a keyword is embedded in a regular expression,
    # so that keywords are always matched literally.
    _REGEX_META = frozenset("\\.^$*+?()[]{}|")

    def __init__(self, ontology_manager: "OntologyManager"):
        """
        Args:
            ontology_manager: Object exposing ``base_graph`` and ``data_graph``.
                May be falsy, in which case ``get_context`` reports that no
                ontology manager is available.
        """
        self.om = ontology_manager

    @staticmethod
    def _escape_sparql_string(value: str) -> str:
        """Escape *value* for use inside a double-quoted SPARQL string literal."""
        return (
            str(value)
            .replace("\\", "\\\\")  # must run first so later escapes are not doubled
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    @classmethod
    def _escape_regex(cls, keyword: str) -> str:
        """Backslash-escape regex metacharacters so *keyword* matches literally."""
        return "".join("\\" + ch if ch in cls._REGEX_META else ch for ch in keyword)

    def get_context(self, domain: str, keywords: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Retrieve context for a specific verification task.

        Args:
            domain: The domain being analyzed (e.g., 'lemonde.fr')
            keywords: Optional list of keywords from the claim. When given,
                previously evaluated content matching any keyword is added
                to the context.

        Returns:
            When no ontology manager is set, ``{"graph_context": <message>}``.
            Otherwise a dictionary with:
              - "full_text": all context parts joined by blank lines, or a
                placeholder sentence when nothing was found
              - "source_history": the summary for ``domain`` (may be "")
              - "similar_uris": list of report URIs of similar claims
        """
        if not self.om:
            return {"graph_context": "No ontology manager available."}

        context_parts = []

        # 1. Source History
        source_history = self._get_source_history(domain)
        if source_history:
            context_parts.append(source_history)

        # 2. Pattern Matching (Similar Claims)
        similar_uris = []
        if keywords:
            similar_result = self._find_similar_claims(keywords)
            if similar_result["text"]:
                context_parts.append(similar_result["text"])
                similar_uris = similar_result["uris"]

        full_context = "\n\n".join(context_parts) if context_parts else "No prior knowledge found in the graph."

        return {
            "full_text": full_context,
            "source_history": source_history,
            "similar_uris": similar_uris  # [NEW] Return URIs for linking
        }

    def _get_source_history(self, domain: str) -> str:
        """
        Query the graph for previous evaluations of this domain.

        Matches every information URL that contains ``domain`` as a substring
        and summarizes at most the 5 most recent reports.

        Returns:
            "" when ``domain`` is empty or the query fails (the error is
            printed), a "no previous evaluations" sentence when nothing
            matches, otherwise a multi-line summary.
        """
        if not domain:
            return ""

        # The domain ends up inside a quoted SPARQL literal; escape it so that
        # quotes or backslashes cannot break out of (or alter) the query.
        safe_domain = self._escape_sparql_string(domain)

        # We reuse the specific query logic but tailored for retrieval
        query = """
        PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>

        SELECT ?score ?level ?timestamp
        WHERE {
            ?info cred:informationURL ?url .
            ?request cred:concernsInformation ?info .
            ?report cred:isReportOf ?request .
            ?report cred:credibilityScoreValue ?score .
            ?report cred:assignsCredibilityLevel ?level .
            ?report cred:completionTimestamp ?timestamp .
            FILTER(CONTAINS(STR(?url), "%s"))
        }
        ORDER BY DESC(?timestamp)
        LIMIT 5
        """ % safe_domain

        results = []
        try:
            combined = self.om.base_graph + self.om.data_graph
            for row in combined.query(query):
                results.append({
                    "score": float(row.score),
                    # Keep only the fragment after '#' of the level URI.
                    "level": str(row.level).split('#')[-1],
                    # Keep only the date part of an ISO-style timestamp.
                    "date": str(row.timestamp).split('T')[0]
                })
        except Exception as e:
            # Best-effort retrieval: a graph failure must not abort verification.
            print(f"[GraphRAG] Query error: {e}")
            return ""

        if not results:
            return f"The graph contains no previous evaluations for {domain}."

        # Summarize
        # NOTE: the query is capped at 5 rows, so count and average cover at
        # most the 5 most recent reports.
        count = len(results)
        avg_score = sum(r['score'] for r in results) / count
        last_verdict = results[0]['level']

        summary = (
            f"Graph Memory for '{domain}':\n"
            f"- Analyzed {count} times previously.\n"
            f"- Average Credibility Score: {avg_score:.2f} / 1.0\n"
            f"- Most recent verdict ({results[0]['date']}): {last_verdict}.\n"
        )

        return summary

    def _find_similar_claims(self, keywords: List[str]) -> Dict[str, Any]:
        """
        Find evaluation history for content containing specific keywords.

        Keywords of 3 characters or fewer are ignored; the remaining ones are
        matched literally and case-insensitively with OR logic. At most the 3
        most recent matching reports are returned.

        Returns dict with 'text' (for LLM) and 'uris' (for Graph linking).
        Both are empty when there are no usable keywords, no matches, or the
        query fails (the error is printed).
        """
        if not keywords:
            return {"text": "", "uris": []}

        # Build REGEX filter for keywords (OR logic)
        # e.g., (fake|hoax|conspiracy)
        clean_kws = [k for k in keywords if len(k) > 3]  # Skip short words
        if not clean_kws:
            return {"text": "", "uris": []}

        # Escape twice: first so each keyword is a literal inside the regex,
        # then so the whole pattern is a valid quoted SPARQL string.
        regex_pattern = "|".join(self._escape_regex(k) for k in clean_kws)
        safe_pattern = self._escape_sparql_string(regex_pattern)

        query = """
        PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>

        SELECT ?report ?content ?score ?level ?timestamp
        WHERE {
            ?info cred:informationContent ?content .
            ?request cred:concernsInformation ?info .
            ?report cred:isReportOf ?request .
            ?report cred:credibilityScoreValue ?score .
            ?report cred:assignsCredibilityLevel ?level .
            ?report cred:completionTimestamp ?timestamp .
            FILTER(REGEX(?content, "%s", "i"))
        }
        ORDER BY DESC(?timestamp)
        LIMIT 3
        """ % safe_pattern

        results = []
        try:
            combined = self.om.base_graph + self.om.data_graph
            for row in combined.query(query):
                results.append({
                    "uri": str(row.report),
                    # Preview only: first 100 characters followed by an ellipsis.
                    "content": str(row.content)[:100] + "...",
                    "score": float(row.score),
                    "verdict": str(row.level).split('#')[-1]
                })
        except Exception as e:
            # Best-effort retrieval: a graph failure must not abort verification.
            print(f"[GraphRAG] Similar claims error: {e}")
            return {"text": "", "uris": []}

        if not results:
            return {"text": "", "uris": []}

        lines = [f"Found {len(results)} similar claims in history:"]
        for r in results:
            lines.append(f"- \"{r['content']}\" ({r['verdict']}, Score: {r['score']:.2f})")

        return {
            "text": "\n".join(lines),
            "uris": [r['uri'] for r in results]
        }
|
syscred/ir_engine.py
ADDED
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
IR Engine Module - SysCRED
|
|
4
|
+
===========================
|
|
5
|
+
Information Retrieval engine extracted from TREC AP88-90 project.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- TF-IDF calculation (custom and via Pyserini)
|
|
9
|
+
- BM25 scoring (via Lucene/Pyserini)
|
|
10
|
+
- Query Likelihood Dirichlet (QLD)
|
|
11
|
+
- Pseudo-Relevance Feedback (PRF)
|
|
12
|
+
- Porter Stemming integration
|
|
13
|
+
|
|
14
|
+
Based on: TREC_AP88-90_5juin2025.py
|
|
15
|
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
|
16
|
+
Citation Key: loyerEvaluationModelesRecherche2025
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
import math
|
|
21
|
+
from typing import Dict, List, Tuple, Optional, Any
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
from collections import Counter
|
|
24
|
+
|
|
25
|
+
# Check for optional dependencies
|
|
26
|
+
try:
|
|
27
|
+
import nltk
|
|
28
|
+
from nltk.corpus import stopwords
|
|
29
|
+
from nltk.stem import PorterStemmer
|
|
30
|
+
from nltk.tokenize import word_tokenize
|
|
31
|
+
HAS_NLTK = True
|
|
32
|
+
except ImportError:
|
|
33
|
+
HAS_NLTK = False
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from pyserini.search.lucene import LuceneSearcher
|
|
37
|
+
HAS_PYSERINI = True
|
|
38
|
+
except ImportError:
|
|
39
|
+
HAS_PYSERINI = False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# --- Data Classes ---
|
|
43
|
+
|
|
44
|
+
@dataclass
class SearchResult:
    """A single search result (one ranked document for one query)."""
    # Identifier of the retrieved document.
    doc_id: str
    # Retrieval score assigned by the ranking model.
    score: float
    # Position in the result list; search_pyserini assigns ranks starting at 1.
    rank: int
    # Optional text excerpt; defaults to None when not provided.
    snippet: Optional[str] = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class SearchResponse:
    """Complete search response: the query, its ranked results and run metadata."""
    # Identifier of the query (used as the first column of a TREC run line).
    query_id: str
    # Query text as given by the caller.
    query_text: str
    # Ranked results for the query.
    results: List[SearchResult]
    # Name of the ranking model requested by the caller.
    model: str  # 'bm25', 'qld', 'tfidf'
    # Number of hits; search_pyserini sets this to len(results).
    total_hits: int
    # Elapsed search time in milliseconds.
    search_time_ms: float
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class IREngine:
    """
    Information Retrieval engine with multiple scoring methods.

    Supports:
    - Built-in TF-IDF/BM25 (no dependencies)
    - Pyserini/Lucene BM25 and QLD (if pyserini installed)
    - Query expansion with Pseudo-Relevance Feedback
    """

    # BM25 default parameters
    BM25_K1 = 0.9
    BM25_B = 0.4

    # Small English stopword list used when NLTK, or its stopword corpus,
    # is not available.
    _FALLBACK_STOPWORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
        'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
        'were', 'been', 'be', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'could', 'should', 'may', 'might',
        'must', 'shall', 'can', 'need', 'this', 'that', 'these',
        'those', 'it', 'its', 'they', 'them', 'he', 'she', 'him',
        'her', 'his', 'we', 'you', 'i', 'my', 'your', 'our', 'their'
    })

    def __init__(self, index_path: Optional[str] = None, use_stemming: bool = True):
        """
        Initialize the IR engine.

        Args:
            index_path: Path to Lucene/Pyserini index (optional)
            use_stemming: Whether to apply Porter stemming (only effective
                when NLTK is importable)
        """
        self.index_path = index_path
        self.use_stemming = use_stemming
        self.searcher = None
        self.stemmer = None

        # Initialize NLTK components
        if HAS_NLTK:
            self.stopwords = self._load_nltk_stopwords()
            self.stemmer = PorterStemmer() if use_stemming else None
        else:
            # Fallback stopwords (copied so instances can modify their own set)
            self.stopwords = set(self._FALLBACK_STOPWORDS)

        # Initialize Pyserini searcher if available
        if HAS_PYSERINI and index_path:
            try:
                self.searcher = LuceneSearcher(index_path)
                print(f"[IREngine] Pyserini searcher initialized with index: {index_path}")
            except Exception as e:
                # Leave self.searcher as None; search_pyserini reports this.
                print(f"[IREngine] Failed to initialize Pyserini: {e}")

    def _load_nltk_stopwords(self) -> set:
        """Return the NLTK English stopwords, downloading them once if missing.

        Falls back to the built-in list when the corpus is still unavailable
        after the download attempt (e.g. no network access).
        """
        try:
            return set(stopwords.words('english'))
        except LookupError:
            print("[IREngine] Downloading NLTK resources...")
            for resource in ('stopwords', 'punkt', 'punkt_tab'):
                nltk.download(resource, quiet=True)

        try:
            return set(stopwords.words('english'))
        except LookupError:
            print("[IREngine] NLTK stopwords unavailable; using built-in fallback list.")
            return set(self._FALLBACK_STOPWORDS)

    def preprocess(self, text: str) -> str:
        """
        Preprocess text with tokenization, stopword removal, and optional stemming.

        Intended to match the TREC preprocessing pipeline.

        Returns:
            The surviving tokens joined by single spaces, or "" when ``text``
            is not a string.
        """
        if not isinstance(text, str):
            return ""

        text = text.lower()

        if HAS_NLTK:
            try:
                tokens = word_tokenize(text)
            except LookupError:
                # Fallback tokenization (tokenizer data not installed)
                tokens = re.findall(r'\b[a-z]+\b', text)
        else:
            tokens = re.findall(r'\b[a-z]+\b', text)

        # Filter stopwords and non-alpha
        filtered = [t for t in tokens if t.isalpha() and t not in self.stopwords]

        # Apply stemming if enabled
        if self.stemmer:
            filtered = [self.stemmer.stem(t) for t in filtered]

        return ' '.join(filtered)

    def calculate_tf(self, tokens: List[str]) -> Dict[str, float]:
        """Calculate term frequency (count divided by the number of tokens)."""
        if not tokens:
            return {}
        counts = Counter(tokens)
        total = len(tokens)
        return {term: count / total for term, count in counts.items()}

    def calculate_bm25_score(
        self,
        query_terms: List[str],
        doc_terms: List[str],
        doc_length: int,
        avg_doc_length: float,
        doc_freq: Dict[str, int],
        corpus_size: int
    ) -> float:
        """
        Calculate BM25 score for a document.

        BM25(D, Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))

        Args:
            query_terms: Preprocessed query tokens (repeated terms count repeatedly)
            doc_terms: Preprocessed document tokens
            doc_length: Length of the document used for normalization
            avg_doc_length: Average document length in the corpus; when it is
                not positive, length normalization is skipped (|D|/avgdl = 1)
            doc_freq: Document frequency per term; missing terms default to 1
            corpus_size: Number of documents in the corpus

        Returns:
            The summed score over query terms present in the document
            (0.0 when none are present).
        """
        doc_term_counts = Counter(doc_terms)

        # The length-normalization part of the denominator is the same for
        # every term, so compute it once. Guard against a zero/negative
        # average length, which would otherwise raise ZeroDivisionError.
        if avg_doc_length > 0:
            scaled_length = self.BM25_B * doc_length / avg_doc_length
        else:
            scaled_length = self.BM25_B
        length_norm = self.BM25_K1 * (1 - self.BM25_B + scaled_length)

        score = 0.0
        for term in query_terms:
            if term not in doc_term_counts:
                continue

            tf = doc_term_counts[term]
            df = doc_freq.get(term, 1)
            idf = math.log((corpus_size - df + 0.5) / (df + 0.5) + 1)

            numerator = tf * (self.BM25_K1 + 1)
            denominator = tf + length_norm

            score += idf * (numerator / denominator)

        return score

    def search_pyserini(
        self,
        query: str,
        model: str = 'bm25',
        k: int = 100,
        query_id: str = "Q1"
    ) -> "SearchResponse":
        """
        Search using Pyserini/Lucene.

        Args:
            query: Query text
            model: 'bm25' or 'qld'; any other value uses the searcher's
                default BM25 settings
            k: Number of results
            query_id: Query identifier

        Returns:
            A SearchResponse whose results are ranked from 1 and whose
            ``query_text`` is the original (unprocessed) query.

        Raises:
            RuntimeError: If no Pyserini searcher was initialized.
        """
        import time
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the system clock is adjusted mid-search.
        start = time.perf_counter()

        if not self.searcher:
            raise RuntimeError("Pyserini searcher not initialized. Provide index_path.")

        # Configure similarity
        if model == 'bm25':
            self.searcher.set_bm25(k1=self.BM25_K1, b=self.BM25_B)
        elif model == 'qld':
            self.searcher.set_qld()
        else:
            self.searcher.set_bm25()

        # Preprocess query
        processed_query = self.preprocess(query)

        # Search
        hits = self.searcher.search(processed_query, k=k)

        results = [
            SearchResult(doc_id=hit.docid, score=hit.score, rank=position)
            for position, hit in enumerate(hits, start=1)
        ]

        elapsed = (time.perf_counter() - start) * 1000

        return SearchResponse(
            query_id=query_id,
            query_text=query,
            results=results,
            model=model,
            total_hits=len(results),
            search_time_ms=elapsed
        )

    def pseudo_relevance_feedback(
        self,
        query: str,
        top_docs_texts: List[str],
        num_expansion_terms: int = 10
    ) -> str:
        """
        Expand query using Pseudo-Relevance Feedback (PRF).

        Uses top-k retrieved documents to find expansion terms: the most
        frequent preprocessed document tokens that do not already appear in
        the preprocessed query.

        Returns:
            The original query followed by the expansion terms, or the query
            unchanged when no expansion term was found.
        """
        # A set gives O(1) membership tests in the inner loop.
        query_tokens = set(self.preprocess(query).split())

        # Collect terms from top documents
        expansion_candidates = Counter()
        for doc_text in top_docs_texts:
            # Count terms not in original query
            expansion_candidates.update(
                token
                for token in self.preprocess(doc_text).split()
                if token not in query_tokens
            )

        # Get top expansion terms
        expansion_terms = [term for term, _ in expansion_candidates.most_common(num_expansion_terms)]

        # Nothing to add: return the query as-is rather than with a stray
        # trailing space.
        if not expansion_terms:
            return query

        # Create expanded query
        return query + ' ' + ' '.join(expansion_terms)

    def format_trec_run(
        self,
        responses: "List[SearchResponse]",
        run_tag: str
    ) -> str:
        """
        Format results in TREC run file format.

        Format: query_id Q0 doc_id rank score run_tag

        Returns:
            One line per result, newline-separated, with no trailing newline.
        """
        return '\n'.join(
            f"{response.query_id} Q0 {result.doc_id} "
            f"{result.rank} {result.score:.6f} {run_tag}"
            for response in responses
            for result in response.results
        )
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# --- Kaggle/Colab Utilities ---
|
|
298
|
+
|
|
299
|
+
def setup_kaggle_environment():
    """Setup environment for Kaggle notebooks.

    Reports whether a CUDA GPU is visible, pip-installs the packages the
    project relies on, and downloads the NLTK resources. Every step is
    best-effort: a failure is printed and the remaining steps still run.
    """
    import subprocess
    import sys

    print("=" * 60)
    print("SysCRED - Kaggle Environment Setup")
    print("=" * 60)

    # Check for a CUDA GPU. torch is optional here: without it we only skip
    # the check instead of aborting the whole setup.
    try:
        import torch
    except ImportError:
        print("✗ PyTorch not installed - skipping GPU check")
    else:
        if torch.cuda.is_available():
            print(f"✓ GPU available: {torch.cuda.get_device_name(0)}")
        else:
            print("✗ No GPU detected")

    # Install required packages
    packages = [
        'pyserini',
        'transformers',
        'pytrec_eval',
        'nltk',
        'rdflib'
    ]

    print("\nInstalling packages...")
    for pkg in packages:
        try:
            subprocess.run(
                [sys.executable, '-m', 'pip', 'install', '-q', pkg],
                check=True,
                capture_output=True
            )
            print(f" ✓ {pkg}")
        except (subprocess.CalledProcessError, OSError):
            # Catch only install failures so Ctrl-C still interrupts the loop.
            print(f" ✗ {pkg} - install failed")

    # Download NLTK resources
    try:
        import nltk
    except ImportError:
        # The nltk install above may have failed.
        print("✗ nltk not importable - skipping NLTK resource download")
    else:
        for resource in ['stopwords', 'punkt', 'punkt_tab', 'wordnet']:
            try:
                nltk.download(resource, quiet=True)
            except Exception as exc:
                print(f" ✗ NLTK resource '{resource}' - download failed: {exc}")

    print("\n✓ Environment setup complete")
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def load_kaggle_dataset(dataset_path: str) -> Optional[str]:
    """
    Load a Kaggle dataset.

    Only checks that the path exists and reports the outcome; no data is read.

    Args:
        dataset_path: Path like '/kaggle/input/trec-ap88-90'

    Returns:
        ``dataset_path`` unchanged when it exists, otherwise ``None``.
    """
    import os

    if not os.path.exists(dataset_path):
        print(f"✗ Dataset not found: {dataset_path}")
        print("Make sure to add the dataset to your Kaggle notebook.")
        return None

    print(f"✓ Dataset found: {dataset_path}")
    return dataset_path
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# --- Testing ---
|
|
366
|
+
if __name__ == "__main__":
    # Manual smoke test: runs preprocessing, BM25 scoring and query expansion
    # on small in-memory examples and prints the results.
    banner = "=" * 60
    print(banner)
    print("SysCRED IR Engine - Tests")
    print(banner)

    ir = IREngine(use_stemming=True)

    # --- 1. Preprocessing ---
    print("\n1. Testing preprocessing...")
    sentence = "Information Retrieval systems help users find relevant documents."
    cleaned = ir.preprocess(sentence)
    print(f" Original: {sentence}")
    print(f" Processed: {cleaned}")

    # --- 2. BM25 ---
    print("\n2. Testing BM25 calculation...")
    q_tokens = ir.preprocess("information retrieval").split()
    d_tokens = ir.preprocess(sentence).split()
    bm25 = ir.calculate_bm25_score(
        query_terms=q_tokens,
        doc_terms=d_tokens,
        doc_length=len(d_tokens),
        avg_doc_length=10,
        doc_freq={'inform': 5, 'retriev': 3},
        corpus_size=100,
    )
    print(f" BM25 Score: {bm25:.4f}")

    # --- 3. Pseudo-Relevance Feedback ---
    print("\n3. Testing Pseudo-Relevance Feedback...")
    feedback_docs = [
        "Information retrieval is finding relevant documents in a collection.",
        "Search engines use retrieval models like BM25 and TF-IDF.",
        "Query expansion improves retrieval effectiveness.",
    ]
    expanded_query = ir.pseudo_relevance_feedback(
        query="information retrieval",
        top_docs_texts=feedback_docs,
    )
    print(" Original query: information retrieval")
    print(f" Expanded query: {expanded_query}")

    print("\n" + banner)
    print("Tests complete!")
    print(banner)
|