syscred 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syscred/__init__.py +41 -0
- syscred/api_clients.py +560 -0
- syscred/backend_app.py +363 -0
- syscred/config.py +275 -0
- syscred/database.py +54 -0
- syscred/debug_factcheck.py +43 -0
- syscred/debug_graph_json.py +58 -0
- syscred/debug_init.py +33 -0
- syscred/debug_local_server.py +25 -0
- syscred/diagnose_imports.py +37 -0
- syscred/eval_metrics.py +349 -0
- syscred/graph_rag.py +171 -0
- syscred/ir_engine.py +410 -0
- syscred/ontology_manager.py +509 -0
- syscred/run_benchmark.py +135 -0
- syscred/seo_analyzer.py +610 -0
- syscred/setup.py +65 -0
- syscred/test_graphrag.py +87 -0
- syscred/test_phase1.py +28 -0
- syscred/test_phase2.py +55 -0
- syscred/test_suite.py +64 -0
- syscred/verification_system.py +765 -0
- syscred-2.2.0.dist-info/METADATA +259 -0
- syscred-2.2.0.dist-info/RECORD +28 -0
- syscred-2.2.0.dist-info/WHEEL +5 -0
- syscred-2.2.0.dist-info/entry_points.txt +3 -0
- syscred-2.2.0.dist-info/licenses/LICENSE +21 -0
- syscred-2.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import os
import sys

import requests
from dotenv import load_dotenv

# Load environment variables.
# An explicit file can be supplied through SYSCRED_DOTENV_PATH; otherwise
# python-dotenv looks for a ".env" file starting from this script's directory
# and walking up through the parent folders. This replaces a path that was
# hard-coded to one developer's home directory.
load_dotenv(dotenv_path=os.getenv('SYSCRED_DOTENV_PATH'))

API_KEY = os.getenv('SYSCRED_GOOGLE_API_KEY')

# Validate BEFORE slicing: slicing None raises TypeError, which used to hide
# the friendly error message below.
if not API_KEY:
    print("❌ Error: API Key not found in .env")
    sys.exit(1)

# Only the first and last five characters are echoed, never the whole key.
print(f"Loaded API Key: {API_KEY[:5]}...{API_KEY[-5:]}")

query = "La terre est plate"
url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
params = {
    'key': API_KEY,
    'query': query,
}

# Seconds to wait for the API before giving up (requests waits forever by default).
REQUEST_TIMEOUT = 10

print(f"\nSending request for query: '{query}'...")
try:
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    print(f"Status Code: {response.status_code}")

    if response.status_code == 200:
        data = response.json()
        claims = data.get('claims', [])
        print(f"✅ Success! Found {len(claims)} claims.")
        # Show at most the first three claims with their first review.
        for i, claim in enumerate(claims[:3]):
            print(f"\n--- Result {i+1} ---")
            print(f"Claim: {claim.get('text')}")
            print(f"Claimant: {claim.get('claimant')}")
            reviews = claim.get('claimReview', [])
            if reviews:
                print(f"Rating: {reviews[0].get('textualRating')}")
                print(f"URL: {reviews[0].get('url')}")
    else:
        print(f"❌ API Error: {response.text}")

except (requests.RequestException, ValueError) as e:
    # RequestException: network / timeout problems; ValueError: body was not JSON.
    print(f"❌ Connection Error: {e}")
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
# Add project root to path (one level up from this script)
|
|
6
|
+
sys.path.append(str(Path(__file__).parent.parent))
|
|
7
|
+
|
|
8
|
+
from syscred.ontology_manager import OntologyManager
|
|
9
|
+
from syscred.config import config
|
|
10
|
+
|
|
11
|
+
def debug_graph():
    """Print diagnostics about the ontology graph served by OntologyManager.

    Loads the manager from the configured ontology paths, prints its
    statistics, then prints up to three sample nodes of the graph JSON.
    When the graph has no nodes, runs a SPARQL query listing the five most
    recent evaluation reports instead.
    """
    print("=== Debugging Ontology Graph Extraction ===")

    # Initialize manager
    ontology_file = str(config.ONTOLOGY_BASE_PATH)
    data_file = str(config.ONTOLOGY_DATA_PATH)

    print(f"Loading data from: {data_file}")
    manager = OntologyManager(base_ontology_path=ontology_file, data_path=data_file)

    # Get Stats
    stats = manager.get_statistics()
    print(f"Total Triples: {stats['total_triples']}")
    print(f"Evaluations: {stats.get('total_evaluations', 'N/A')}")

    # Try getting graph JSON
    print("\nExtracting Graph JSON...")
    graph_data = manager.get_graph_json()

    nodes = graph_data.get('nodes', [])
    links = graph_data.get('links', [])

    node_total = len(nodes)
    print(f"Nodes found: {node_total}")
    print(f"Links found: {len(links)}")

    if node_total > 0:
        print("\n--- Sample Nodes ---")
        for sample in nodes[:3]:
            print(json.dumps(sample, indent=2))
        return

    print("\n❌ No nodes found! Checking latest report query...")
    # Manually run the query to see what's wrong
    query = """
        PREFIX cred: <http://www.dic9335.uqam.ca/ontologies/credibility-verification#>
        SELECT ?report ?timestamp WHERE {
            ?report a cred:RapportEvaluation .
            ?report cred:completionTimestamp ?timestamp .
        }
        ORDER BY DESC(?timestamp)
        LIMIT 5
        """
    print(f"Running SPARQL:\n{query}")
    for row in manager.data_graph.query(query):
        print(f"Found Report: {row.report} at {row.timestamp}")


if __name__ == "__main__":
    debug_graph()
|
syscred/debug_init.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
|
|
2
|
+
import sys
import os
import traceback

# Setup path: make the directory above this script importable so that the
# "syscred" package resolves when the script is run directly.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from syscred.verification_system import CredibilityVerificationSystem
from syscred.config import config
from syscred.seo_analyzer import SEOAnalyzer

print("=== DEBUG INITIALIZATION ===")
try:
    print("[1] Config check:")
    print(f" Base Ontology: {config.ONTOLOGY_BASE_PATH}")
    print(f" Data Path: {config.ONTOLOGY_DATA_PATH}")

    print("\n[2] Initializing SEO Analyzer...")
    seo = SEOAnalyzer()
    print(" OK")

    print("\n[3] Initializing Verification System...")
    # Named "system" rather than "sys": the old name rebound the imported
    # sys module for the rest of the script.
    system = CredibilityVerificationSystem(
        ontology_base_path=config.ONTOLOGY_BASE_PATH,
        ontology_data_path=config.ONTOLOGY_DATA_PATH,
        load_ml_models=False  # Disable ML for basic init test
    )
    print(" OK - System initialized successfully.")

except Exception as e:
    # Diagnostic boundary: report any initialization failure with its traceback.
    print(f"\n❌ FATAL ERROR: {e}")
    traceback.print_exc()
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import requests
import json

url = "http://localhost:5001/api/verify"
payload = {
    "input_data": "la terre est plate",
    "include_seo": True
}
headers = {'Content-Type': 'application/json'}

# (connect, read) timeouts in seconds. Without a timeout requests blocks
# forever if the local server accepts the connection but never answers.
REQUEST_TIMEOUT = (5, 120)

try:
    print(f"Sending POST to {url} with payload: {payload}")
    response = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    print(f"Status: {response.status_code}")

    if response.status_code == 200:
        data = response.json()
        print("\n--- JSON RESPONSE PARTIAL ---")
        # Only the fact-checking part of the applied rules is displayed.
        facts = data.get('reglesAppliquees', {}).get('fact_checking', [])
        print(f"Fact Checks Count: {len(facts)}")
        print("Fact Checks Items:", json.dumps(facts, indent=2, ensure_ascii=False))
    else:
        print("Error:", response.text)
except (requests.RequestException, ValueError) as e:
    # RequestException: server unreachable or timed out; ValueError: body was not JSON.
    print(f"Connection failed: {e}")
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
|
|
2
|
+
import importlib
import sys
import os
import traceback

# Make the directory above this script importable so "syscred" resolves
# when the script is run directly.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# (label shown to the user, module to import, attribute that must exist in it)
IMPORT_CHECKS = [
    ("config", "syscred.config", "config"),
    ("database", "syscred.database", "init_db"),
    ("ontology_manager", "syscred.ontology_manager", "OntologyManager"),
    ("verification_system", "syscred.verification_system", "CredibilityVerificationSystem"),
]


def check_import(module_name, attr_name):
    """Import *module_name* and look up *attr_name* in it.

    Returns True when both succeed. On any exception the traceback is
    printed and False is returned, so that one broken module does not stop
    the remaining checks.
    """
    try:
        module = importlib.import_module(module_name)
        getattr(module, attr_name)
    except Exception:
        # Deliberately broad: this is a diagnostic tool and every kind of
        # import-time failure is worth showing.
        traceback.print_exc()
        return False
    return True


print("--- DIAGNOSTIC START ---")
for step, (label, module_name, attr_name) in enumerate(IMPORT_CHECKS, start=1):
    print(f"[{step}] Importing {label}...")
    if check_import(module_name, attr_name):
        print(" OK")

print("--- DIAGNOSTIC END ---")
|
syscred/eval_metrics.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Evaluation Metrics Module - SysCRED
|
|
4
|
+
====================================
|
|
5
|
+
Information Retrieval evaluation metrics for TREC-style experiments.
|
|
6
|
+
|
|
7
|
+
Metrics:
|
|
8
|
+
- MAP (Mean Average Precision)
|
|
9
|
+
- NDCG (Normalized Discounted Cumulative Gain)
|
|
10
|
+
- P@K (Precision at K)
|
|
11
|
+
- Recall@K
|
|
12
|
+
- MRR (Mean Reciprocal Rank)
|
|
13
|
+
|
|
14
|
+
Based on pytrec_eval for official TREC evaluation.
|
|
15
|
+
|
|
16
|
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
|
17
|
+
Citation Key: loyerEvaluationModelesRecherche2025
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import math
|
|
21
|
+
from typing import Dict, List, Tuple, Any
|
|
22
|
+
from collections import defaultdict
|
|
23
|
+
|
|
24
|
+
# Check for pytrec_eval
|
|
25
|
+
try:
|
|
26
|
+
import pytrec_eval
|
|
27
|
+
HAS_PYTREC_EVAL = True
|
|
28
|
+
except ImportError:
|
|
29
|
+
HAS_PYTREC_EVAL = False
|
|
30
|
+
print("[EvalMetrics] pytrec_eval not installed. Using built-in metrics.")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class EvaluationMetrics:
    """
    IR Evaluation metrics using pytrec_eval or built-in implementations.

    Supports TREC-style evaluation with:
    - Official pytrec_eval (if available)
    - Fallback pure-Python implementations

    The built-in metrics are static methods and can be called without an
    instance. Every cutoff-based metric returns 0.0 for k <= 0.
    """

    def __init__(self):
        """Initialize the metrics calculator."""
        # evaluate_run() uses pytrec_eval only when this flag AND the
        # module-level HAS_PYTREC_EVAL are both true, so setting it to False
        # on an instance forces the built-in implementations.
        self.use_pytrec = HAS_PYTREC_EVAL

    # --- Built-in Metric Implementations ---

    @staticmethod
    def precision_at_k(retrieved: List[str], relevant: set, k: int) -> float:
        """
        Calculate Precision@K.

        P@K = |relevant ∩ retrieved[:k]| / k

        The denominator is always k, even when fewer than k documents
        were retrieved.
        """
        if k <= 0:
            return 0.0
        hits = sum(1 for doc in retrieved[:k] if doc in relevant)
        return hits / k

    @staticmethod
    def recall_at_k(retrieved: List[str], relevant: set, k: int) -> float:
        """
        Calculate Recall@K.

        R@K = |relevant ∩ retrieved[:k]| / |relevant|

        Returns 0.0 when there are no relevant documents or k <= 0.
        """
        # A negative k would make retrieved[:k] slice from the END of the
        # ranking, so it is rejected like in precision_at_k.
        if not relevant or k <= 0:
            return 0.0
        hits = sum(1 for doc in retrieved[:k] if doc in relevant)
        return hits / len(relevant)

    @staticmethod
    def average_precision(retrieved: List[str], relevant: set) -> float:
        """
        Calculate Average Precision for a single query.

        AP = (1/|relevant|) × Σ (P@k × rel(k))

        Returns 0.0 when there are no relevant documents.
        """
        if not relevant:
            return 0.0

        hits = 0
        sum_precision = 0.0

        for rank, doc in enumerate(retrieved, start=1):
            if doc in relevant:
                hits += 1
                # Precision at the rank of this relevant document.
                sum_precision += hits / rank

        return sum_precision / len(relevant)

    @staticmethod
    def dcg_at_k(retrieved: List[str], relevance: Dict[str, int], k: int) -> float:
        """
        Calculate DCG@K (Discounted Cumulative Gain).

        DCG@K = Σ (2^rel(i) - 1) / log2(i + 2)

        i is the 0-based position in the ranking; documents missing from
        `relevance` count as relevance 0. Returns 0.0 for k <= 0.
        """
        if k <= 0:
            return 0.0
        # Start value 0.0 keeps the result a float for an empty ranking.
        return sum(
            ((2 ** relevance.get(doc, 0) - 1) / math.log2(i + 2)
             for i, doc in enumerate(retrieved[:k])),
            0.0,
        )

    @staticmethod
    def ndcg_at_k(retrieved: List[str], relevance: Dict[str, int], k: int) -> float:
        """
        Calculate NDCG@K (Normalized DCG).

        NDCG@K = DCG@K / IDCG@K

        IDCG is the DCG of the k highest judged relevance values in
        descending order. Returns 0.0 when IDCG is not positive or k <= 0.
        """
        if k <= 0:
            return 0.0

        dcg = EvaluationMetrics.dcg_at_k(retrieved, relevance, k)

        # Calculate IDCG (ideal DCG)
        ideal_rels = sorted(relevance.values(), reverse=True)[:k]
        idcg = sum(
            ((2 ** rel - 1) / math.log2(i + 2) for i, rel in enumerate(ideal_rels)),
            0.0,
        )

        return dcg / idcg if idcg > 0 else 0.0

    @staticmethod
    def reciprocal_rank(retrieved: List[str], relevant: set) -> float:
        """
        Calculate Reciprocal Rank.

        RR = 1 / rank of first relevant document

        Returns 0.0 when no relevant document was retrieved.
        """
        for rank, doc in enumerate(retrieved, start=1):
            if doc in relevant:
                return 1.0 / rank
        return 0.0

    # --- TREC-Style Evaluation ---

    def evaluate_run(
        self,
        run: Dict[str, List[Tuple[str, float]]],
        qrels: Dict[str, Dict[str, int]],
        metrics: List[str] = None
    ) -> Dict[str, Dict[str, float]]:
        """
        Evaluate a run against qrels (relevance judgments).

        Args:
            run: {query_id: [(doc_id, score), ...]}
            qrels: {query_id: {doc_id: relevance}}
            metrics: List of metrics to compute
                ['map', 'ndcg', 'P_5', 'P_10', 'recall_100'].
                Defaults to map, ndcg, P_5, P_10, P_20, recall_100
                and recip_rank.

        Returns:
            {query_id: {metric: value}}
        """
        if metrics is None:
            metrics = ['map', 'ndcg', 'P_5', 'P_10', 'P_20', 'recall_100', 'recip_rank']

        if self.use_pytrec and HAS_PYTREC_EVAL:
            return self._evaluate_pytrec(run, qrels, metrics)
        return self._evaluate_builtin(run, qrels, metrics)

    def _evaluate_pytrec(
        self,
        run: Dict[str, List[Tuple[str, float]]],
        qrels: Dict[str, Dict[str, int]],
        metrics: List[str]
    ) -> Dict[str, Dict[str, float]]:
        """Evaluate using pytrec_eval."""
        # Convert run format for pytrec_eval: {qid: {doc_id: score}}.
        # Scores are coerced to float because pytrec_eval rejects other
        # numeric types (e.g. int) in a run.
        pytrec_run = {
            qid: {doc_id: float(score) for doc_id, score in docs}
            for qid, docs in run.items()
        }

        # Create evaluator
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, set(metrics))

        # Evaluate
        return evaluator.evaluate(pytrec_run)

    def _evaluate_builtin(
        self,
        run: Dict[str, List[Tuple[str, float]]],
        qrels: Dict[str, Dict[str, int]],
        metrics: List[str]
    ) -> Dict[str, Dict[str, float]]:
        """
        Evaluate using built-in implementations.

        Queries absent from `qrels` are skipped, and metric names that match
        none of the supported patterns are ignored. The ranking is taken in
        the order given in `run`; scores are not used for re-sorting here.
        """
        results = {}

        for qid, docs_scores in run.items():
            if qid not in qrels:
                continue

            q_results = {}
            retrieved = [doc_id for doc_id, _ in docs_scores]
            relevance = qrels[qid]
            # Binary relevance: any judgment above 0 counts as relevant.
            relevant = {doc_id for doc_id, rel in relevance.items() if rel > 0}

            for metric in metrics:
                if metric == 'map':
                    q_results['map'] = self.average_precision(retrieved, relevant)
                elif metric == 'ndcg':
                    # Un-cut NDCG is computed with a cutoff of 1000 documents.
                    q_results['ndcg'] = self.ndcg_at_k(retrieved, relevance, 1000)
                elif metric.startswith('ndcg_cut_'):
                    k = int(metric.split('_')[-1])
                    q_results[metric] = self.ndcg_at_k(retrieved, relevance, k)
                elif metric.startswith('P_'):
                    k = int(metric.split('_')[-1])
                    q_results[metric] = self.precision_at_k(retrieved, relevant, k)
                elif metric.startswith('recall_'):
                    k = int(metric.split('_')[-1])
                    q_results[metric] = self.recall_at_k(retrieved, relevant, k)
                elif metric == 'recip_rank':
                    q_results['recip_rank'] = self.reciprocal_rank(retrieved, relevant)

            results[qid] = q_results

        return results

    def compute_aggregate(
        self,
        results: Dict[str, Dict[str, float]]
    ) -> Dict[str, float]:
        """
        Compute aggregate metrics across all queries.

        Returns mean values for each metric. Each mean is taken over the
        queries that actually report that metric; an empty input gives {}.
        """
        if not results:
            return {}

        aggregated = defaultdict(list)
        for per_query in results.values():
            for metric, value in per_query.items():
                aggregated[metric].append(value)

        return {metric: sum(values) / len(values)
                for metric, values in aggregated.items()}

    def format_results(
        self,
        results: Dict[str, Dict[str, float]],
        include_per_query: bool = False
    ) -> str:
        """
        Format results as a readable string.

        The aggregate section always comes first; the per-query section is
        appended only when `include_per_query` is true. Metrics and query
        ids are listed in sorted order.
        """
        rule = "=" * 50
        lines = []

        # Aggregate
        agg = self.compute_aggregate(results)
        lines.extend([rule, "AGGREGATE METRICS", rule])
        for metric, value in sorted(agg.items()):
            lines.append(f" {metric:20s}: {value:.4f}")

        # Per-query (optional)
        if include_per_query:
            lines.extend(["", rule, "PER-QUERY METRICS", rule])
            for qid in sorted(results.keys()):
                lines.append(f"\nQuery {qid}:")
                for metric, value in sorted(results[qid].items()):
                    lines.append(f" {metric:20s}: {value:.4f}")

        return '\n'.join(lines)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def parse_qrels_file(filepath: str) -> Dict[str, Dict[str, int]]:
    """
    Parse a TREC qrels file.

    Format: query_id 0 doc_id relevance

    Lines with fewer than four whitespace-separated fields are skipped;
    extra fields are ignored. When a (query, doc) pair is listed more than
    once, the last judgment wins.

    Returns:
        {query_id: {doc_id: relevance}}

    Raises:
        ValueError: if a relevance field is not an integer.
    """
    qrels = defaultdict(dict)
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 4:
                continue
            qid, _, docid, rel = parts[:4]
            qrels[qid][docid] = int(rel)
    return dict(qrels)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def parse_run_file(filepath: str) -> Dict[str, List[Tuple[str, float]]]:
    """
    Parse a TREC run file.

    Format: query_id Q0 doc_id rank score run_tag

    Lines with fewer than five whitespace-separated fields are skipped.
    The rank column is ignored: each query's documents are ordered by score,
    highest first, and documents with equal scores keep their file order.

    Returns:
        {query_id: [(doc_id, score), ...]}

    Raises:
        ValueError: if a score field is not a number.
    """
    run = defaultdict(list)
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue
            qid, _, docid, _rank, score = parts[:5]
            run[qid].append((docid, float(score)))

    # Sort by score descending (list.sort is stable, so ties keep file order)
    for ranking in run.values():
        ranking.sort(key=lambda pair: pair[1], reverse=True)

    return dict(run)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# --- Testing ---
|
|
311
|
+
if __name__ == "__main__":
    # Smoke test: prints every built-in metric on a small hand-made ranking,
    # then evaluates a two-query run end to end.
    print("=" * 60)
    print("SysCRED Evaluation Metrics - Tests")
    print("=" * 60)

    metrics = EvaluationMetrics()
    print(f"\nUsing pytrec_eval: {metrics.use_pytrec}")

    # Test data
    retrieved = ['doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8', 'doc9', 'doc10']
    relevant = {'doc1', 'doc3', 'doc5', 'doc8'}
    relevance = {'doc1': 2, 'doc3': 1, 'doc5': 2, 'doc8': 1}

    print("\n--- Built-in Metrics Tests ---")
    print(f"P@5: {metrics.precision_at_k(retrieved, relevant, 5):.4f}")
    print(f"P@10: {metrics.precision_at_k(retrieved, relevant, 10):.4f}")
    print(f"R@5: {metrics.recall_at_k(retrieved, relevant, 5):.4f}")
    print(f"R@10: {metrics.recall_at_k(retrieved, relevant, 10):.4f}")
    print(f"AP: {metrics.average_precision(retrieved, relevant):.4f}")
    print(f"NDCG@10: {metrics.ndcg_at_k(retrieved, relevance, 10):.4f}")
    print(f"RR: {metrics.reciprocal_rank(retrieved, relevant):.4f}")

    # Test run evaluation
    print("\n--- Run Evaluation Test ---")
    # Scores are floats: the run is typed as (doc_id, float) pairs and
    # pytrec_eval rejects integer scores when it is the active backend.
    run = {
        'Q1': [(doc, float(10 - i)) for i, doc in enumerate(retrieved)],
        'Q2': [('doc2', 10.0), ('doc1', 9.0), ('doc4', 8.0), ('doc3', 7.0)]
    }
    qrels = {
        'Q1': relevance,
        'Q2': {'doc1': 1, 'doc3': 2}
    }

    results = metrics.evaluate_run(run, qrels)
    print(metrics.format_results(results))

    print("\n" + "=" * 60)
    print("Tests complete!")
    print("=" * 60)
|