mirage-benchmark 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mirage-benchmark might be problematic.
- mirage/__init__.py +83 -0
- mirage/cli.py +150 -0
- mirage/core/__init__.py +52 -0
- mirage/core/config.py +248 -0
- mirage/core/llm.py +1745 -0
- mirage/core/prompts.py +884 -0
- mirage/embeddings/__init__.py +31 -0
- mirage/embeddings/models.py +512 -0
- mirage/embeddings/rerankers_multimodal.py +766 -0
- mirage/embeddings/rerankers_text.py +149 -0
- mirage/evaluation/__init__.py +26 -0
- mirage/evaluation/metrics.py +2223 -0
- mirage/evaluation/metrics_optimized.py +2172 -0
- mirage/pipeline/__init__.py +45 -0
- mirage/pipeline/chunker.py +545 -0
- mirage/pipeline/context.py +1003 -0
- mirage/pipeline/deduplication.py +491 -0
- mirage/pipeline/domain.py +514 -0
- mirage/pipeline/pdf_processor.py +598 -0
- mirage/pipeline/qa_generator.py +798 -0
- mirage/utils/__init__.py +31 -0
- mirage/utils/ablation.py +360 -0
- mirage/utils/preflight.py +663 -0
- mirage/utils/stats.py +626 -0
- mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
- mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
- mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
- mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
- mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
- mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0
mirage/evaluation/metrics.py
@@ -0,0 +1,2223 @@
import os
import json
import numpy as np
import pandas as pd
from typing import List, Dict, Any, Optional
from collections import Counter

# Import prompts for LLM-as-a-Judge metrics
try:
    from prompt import PROMPTS_METRICS
except ImportError:
    PROMPTS_METRICS = {}
    print("Warning: Could not import PROMPTS_METRICS from prompt.py")

# Ragas Imports for Standard RAG Metrics
try:
    from ragas import evaluate
    from ragas.metrics import (
        faithfulness,
        answer_relevancy,  # Note: RAGAS 0.4.x uses 'answer_relevancy' not 'answer_relevance'
        context_precision,
        context_recall,
    )
    from datasets import Dataset
    RAGAS_AVAILABLE = True
    print("✅ RAGAS metrics loaded successfully")
except ImportError as e:
    RAGAS_AVAILABLE = False
    print(f"Warning: 'ragas' or 'datasets' not installed. Error: {e}")

# Optional advanced metrics (may not be available in all ragas versions)
HAS_ENTITY_RECALL = False
HAS_NOISE_SENSITIVITY = False
HAS_MULTIMODAL = False

if RAGAS_AVAILABLE:
    try:
        from ragas.metrics import context_entity_recall
        HAS_ENTITY_RECALL = True
    except ImportError:
        print("Info: context_entity_recall not available in this ragas version.")

    try:
        from ragas.metrics import noise_sensitivity_relevant
        HAS_NOISE_SENSITIVITY = True
    except ImportError:
        print("Info: noise_sensitivity metrics not available in this ragas version.")

    try:
        from ragas.metrics import multimodal_faithfulness, multimodal_relevance
        HAS_MULTIMODAL = True
    except ImportError:
        print("Info: multimodal metrics not available in this ragas version.")

# LangChain Imports
try:
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_core.output_parsers import StrOutputParser
    LANGCHAIN_AVAILABLE = True
except ImportError:
    try:
        from langchain.prompts import ChatPromptTemplate
        from langchain.output_parsers import StrOutputParser
        LANGCHAIN_AVAILABLE = True
    except ImportError:
        LANGCHAIN_AVAILABLE = False
        print("Warning: 'langchain' not installed.")

# Output parsers (may be in different locations)
try:
    from langchain.output_parsers import ResponseSchema, StructuredOutputParser
except ImportError:
    try:
        from langchain_community.output_parsers import ResponseSchema, StructuredOutputParser
    except ImportError:
        try:
            from langchain_core.output_parsers import ResponseSchema, StructuredOutputParser
        except ImportError:
            # Define minimal fallbacks
            ResponseSchema = None
            StructuredOutputParser = None

# LangChain Google Gemini Imports
try:
    from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False
    print("Info: 'langchain-google-genai' not installed. Trying OpenAI...")

# LangChain OpenAI Imports (fallback)
try:
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    print("Info: 'langchain_openai' not installed.")

# Data Science Imports
try:
    from sklearn.metrics.pairwise import cosine_similarity
except ImportError:
    print("Warning: 'scikit-learn' not installed.")

try:
    from scipy.stats import entropy
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
    print("Warning: 'scipy' not installed. Domain coverage metrics may fail.")

# VLM Import for multimodal evaluation
try:
    from call_llm import call_vlm_interweaved, batch_call_llm, batch_call_vlm_interweaved
    VLM_AVAILABLE = True
    BATCH_AVAILABLE = True
except ImportError:
    VLM_AVAILABLE = False
    BATCH_AVAILABLE = False
    print("Warning: 'call_llm' module not available. VLM-based multimodal metrics will be skipped.")

class MultimodalFrameworkEvaluator:
    def __init__(self, model_name=None, embedding_model=None, use_gemini=True):
        """
        Initialize the evaluator.
        Args:
            model_name: The LLM to use as a Judge (auto-detected if None)
            embedding_model: Model for diversity calculations (auto-detected if None)
            use_gemini: If True, prefer Gemini over OpenAI
        """
        # Determine which API to use
        if use_gemini and GEMINI_AVAILABLE and os.getenv("GOOGLE_API_KEY"):
            model_name = model_name or "gemini-2.0-flash"
            embedding_model = embedding_model or "models/text-embedding-004"
            print(f"Using Gemini API with model: {model_name}")
            self.llm = ChatGoogleGenerativeAI(model=model_name, temperature=0)
            self.embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
            self.api_type = "gemini"
        elif OPENAI_AVAILABLE and os.getenv("OPENAI_API_KEY"):
            model_name = model_name or "gpt-4-turbo"
            embedding_model = embedding_model or "text-embedding-3-small"
            print(f"Using OpenAI API with model: {model_name}")
            self.llm = ChatOpenAI(model=model_name, temperature=0)
            self.embeddings = OpenAIEmbeddings(model=embedding_model)
            self.api_type = "openai"
        elif use_gemini and GEMINI_AVAILABLE:
            # Try Gemini without env var check (will fail if key not set)
            model_name = model_name or "gemini-2.0-flash"
            embedding_model = embedding_model or "models/text-embedding-004"
            print(f"Using Gemini API with model: {model_name}")
            self.llm = ChatGoogleGenerativeAI(model=model_name, temperature=0)
            self.embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
            self.api_type = "gemini"
        else:
            raise RuntimeError("No API available. Install langchain-google-genai or langchain-openai and set API key.")

        # Ragas metrics configuration (only if ragas available)
        self.ragas_metrics = []
        if RAGAS_AVAILABLE:
            self.ragas_metrics = [
                faithfulness,
                answer_relevancy,
                context_precision,
                context_recall,
            ]
            # Add optional metrics if available
            if HAS_ENTITY_RECALL:
                self.ragas_metrics.append(context_entity_recall)
            if HAS_NOISE_SENSITIVITY:
                self.ragas_metrics.append(noise_sensitivity_relevant)
            if HAS_MULTIMODAL:
                self.ragas_metrics.extend([multimodal_faithfulness, multimodal_relevance])

    def load_dataset(self, json_path: str) -> List[Dict]:
        """
        Loads the generated QA dataset.
        Expected JSON format: List of dicts with keys:
        ['question', 'answer', 'contexts', 'ground_truth' (optional), 'metadata']
        """
        with open(json_path, 'r') as f:
            data = json.load(f)
        return data

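    # [Editor's note - illustrative sketch, not part of the released file] Given
    # the docstring above, load_dataset() is assumed to expect a JSON array of
    # records such as the hypothetical one below (values invented for illustration;
    # transform_qa_data() further down produces this shape):
    #
    #   [
    #     {
    #       "question": "Which subsystem feeds the block shown in Figure 3?",
    #       "answer": "The power-management unit.",
    #       "contexts": ["text of retrieved chunk 1", "text of retrieved chunk 2"],
    #       "ground_truth": "The power-management unit.",
    #       "metadata": {"type": "visual"}
    #     }
    #   ]
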
    # ==========================================
    # 1. STANDARD RAG METRICS (RAGAS)
    # ==========================================
    def evaluate_ragas_standard(self, data: List[Dict]) -> pd.DataFrame:
        """
        Evaluates Faithfulness, Relevance, Precision, and Recall using Ragas.
        Uses the same LLM backend (Gemini/OpenAI) as configured in the evaluator.
        """
        print("--- Running Standard RAG Metrics (Ragas) ---")

        # Import RAGAS LLM wrapper
        try:
            from ragas.llms import LangchainLLMWrapper
            from ragas.embeddings import LangchainEmbeddingsWrapper
        except ImportError:
            # Older RAGAS versions
            LangchainLLMWrapper = None
            LangchainEmbeddingsWrapper = None

        # RAGAS 0.4.x requires specific column names
        # user_input (question), response (answer), retrieved_contexts, reference (ground_truth)
        ragas_data = {
            "user_input": [d.get('question', "") for d in data],
            "response": [d.get('answer', "") for d in data],
            "retrieved_contexts": [d.get('contexts', []) for d in data],
            # Use answer as reference if not provided
            "reference": [d.get('ground_truth', d.get('answer', "")) for d in data]
        }

        dataset = Dataset.from_dict(ragas_data)

        # Wrap LLM and embeddings for RAGAS
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)

            # Configure parallel execution
            try:
                from ragas import RunConfig
                # Increase parallelism for faster evaluation
                # max_workers=64 for high throughput, timeout=300 for long contexts
                run_config = RunConfig(
                    max_workers=64,  # Parallel LLM calls
                    timeout=300,     # 5 min timeout per call
                    max_retries=3,   # Retry on failures
                )
                print(f" Using parallel execution with {run_config.max_workers} workers...")
            except ImportError:
                run_config = None

            if LangchainLLMWrapper and LangchainEmbeddingsWrapper:
                ragas_llm = LangchainLLMWrapper(self.llm)
                ragas_embeddings = LangchainEmbeddingsWrapper(self.embeddings)

                eval_kwargs = {
                    "dataset": dataset,
                    "metrics": self.ragas_metrics,
                    "llm": ragas_llm,
                    "embeddings": ragas_embeddings,
                }
                if run_config:
                    eval_kwargs["run_config"] = run_config

                results = evaluate(**eval_kwargs)
            else:
                # Fallback for older RAGAS versions
                results = evaluate(
                    dataset=dataset,
                    metrics=self.ragas_metrics,
                    llm=self.llm,
                    embeddings=self.embeddings,
                )

        return results.to_pandas()

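    # [Editor's note - illustrative sketch, not part of the released file] The
    # DataFrame returned above carries one row per QA item with one column per
    # metric registered in self.ragas_metrics; the summary code in
    # run_full_evaluation() below reads them roughly as:
    #
    #   df = evaluator.evaluate_ragas_standard(data)   # hypothetical call
    #   df[["faithfulness", "answer_relevancy",
    #       "context_precision", "context_recall"]].mean()
    #
    # Optional columns (e.g. context_entity_recall) appear only when the matching
    # metric could be imported at module load time.
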
    # ==========================================
    # 2. CUSTOM: REASONING COMPLEXITY (MULTI-HOP) - LLM-as-a-Judge
    # ==========================================
    def evaluate_multihop_reasoning(self, question: str, answer: str, contexts: List[str]):
        """
        Uses LLM-as-a-Judge to determine if a question is truly multi-hop.
        Returns: Dict with hop_count (int), reasoning_score (float 0-1), bridge_entity (str)
        """
        import re

        if "multihop_reasoning" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['multihop_reasoning'] not found in prompt.py")
        prompt_template = PROMPTS_METRICS["multihop_reasoning"]
        prompt = ChatPromptTemplate.from_template(prompt_template)
        chain = prompt | self.llm

        try:
            response = chain.invoke({
                "contexts": "\n".join(contexts),
                "question": question,
                "answer": answer
            })
            content = response.content.strip()

            # Parse the response
            hop_match = re.search(r'HOP_COUNT:\s*(\d+)', content)
            score_match = re.search(r'REASONING_SCORE:\s*([\d.]+)', content)
            bridge_match = re.search(r'BRIDGE_ENTITY:\s*(.+?)(?:\n|$)', content)

            hop_count = int(hop_match.group(1)) if hop_match else 1
            reasoning_score = float(score_match.group(1)) if score_match else 0.5
            reasoning_score = min(1.0, max(0.0, reasoning_score))
            bridge_entity = bridge_match.group(1).strip() if bridge_match else "None"

            return {
                "hop_count": hop_count,
                "reasoning_score": reasoning_score,
                "bridge_entity": bridge_entity
            }
        except Exception as e:
            print(f"Error in multihop eval: {e}")
            return {"hop_count": 1, "reasoning_score": 0.0, "bridge_entity": "Error"}

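    # [Editor's note - illustrative sketch, not part of the released file] The
    # regexes above assume the judge replies in a line-oriented format; a
    # hypothetical reply (values invented) would be:
    #
    #   HOP_COUNT: 2
    #   REASONING_SCORE: 0.8
    #   BRIDGE_ENTITY: PCIe root complex
    #
    # Replies that do not match fall back to the defaults (hop_count=1,
    # reasoning_score=0.5, bridge_entity="None").
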
    # ==========================================
    # 3. CUSTOM: VISUAL DEPENDENCY (BLIND TEST) - LLM-as-a-Judge
    # ==========================================
    def evaluate_visual_dependency(self, question: str, text_contexts: List[str]):
        """
        The 'Blind Test': Can the question be answered using ONLY text contexts?
        High Score (1.0) = Good for Multimodal (Model FAILED to answer without image).
        Low Score (0.0) = Bad for Multimodal (Model could answer using text only).
        """
        if "visual_dependency" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['visual_dependency'] not found in prompt.py")
        prompt_template = PROMPTS_METRICS["visual_dependency"]
        prompt = ChatPromptTemplate.from_template(prompt_template)

        chain = prompt | self.llm
        try:
            response = chain.invoke({"contexts": "\n".join(text_contexts), "question": question})
            content = response.content.strip()

            # If LLM says it's missing visual info, that's a PASS (1.0) for Visual Dependency
            is_dependent = "MISSING_VISUAL" in content.upper()
            return 1.0 if is_dependent else 0.0
        except Exception as e:
            print(f"Error in visual eval: {e}")
            return 0.0

    # ==========================================
    # 4. CUSTOM: MULTIMODAL VLM METRICS
    # ==========================================
    def evaluate_multimodal_faithfulness_vlm(self, question: str, answer: str, context_chunks: List[Dict]) -> Dict:
        """
        VLM-based faithfulness evaluation: Does the answer faithfully represent
        information from BOTH text and visual contexts?

        Returns:
            Dict with 'score' (0-1), 'text_supported', 'visual_supported', 'explanation'
        """
        if not VLM_AVAILABLE:
            return {"score": 0.0, "text_supported": False, "visual_supported": False,
                    "explanation": "VLM not available"}

        if "multimodal_faithfulness_vlm" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['multimodal_faithfulness_vlm'] not found in prompt.py")
        prompt = PROMPTS_METRICS["multimodal_faithfulness_vlm"].format(question=question, answer=answer)

        try:
            response = call_vlm_interweaved(prompt, context_chunks)

            # Parse response
            text_supported = "TEXT_SUPPORTED: YES" in response.upper()
            visual_supported = "VISUAL_SUPPORTED: YES" in response.upper()
            visual_na = "VISUAL_SUPPORTED: NA" in response.upper()

            # Extract score
            import re
            score_match = re.search(r'FAITHFULNESS_SCORE:\s*([\d.]+)', response)
            score = float(score_match.group(1)) if score_match else 0.5
            score = min(1.0, max(0.0, score))

            # Extract explanation
            exp_match = re.search(r'EXPLANATION:\s*(.+?)(?:\n|$)', response, re.DOTALL)
            explanation = exp_match.group(1).strip() if exp_match else ""

            return {
                "score": score,
                "text_supported": text_supported,
                "visual_supported": visual_supported if not visual_na else None,
                "explanation": explanation[:200]
            }
        except Exception as e:
            print(f"Error in multimodal faithfulness eval: {e}")
            return {"score": 0.0, "text_supported": False, "visual_supported": False,
                    "explanation": f"Error: {str(e)}"}

    def evaluate_multimodal_answer_quality_vlm(self, question: str, answer: str, context_chunks: List[Dict]) -> Dict:
        """
        VLM-based answer quality evaluation considering multimodal context.

        Returns:
            Dict with 'completeness', 'accuracy', 'uses_visual_info', 'overall_score'
        """
        if not VLM_AVAILABLE:
            return {"completeness": 0.0, "accuracy": 0.0, "uses_visual_info": False, "overall_score": 0.0}

        if "multimodal_answer_quality_vlm" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['multimodal_answer_quality_vlm'] not found in prompt.py")
        prompt = PROMPTS_METRICS["multimodal_answer_quality_vlm"].format(question=question, answer=answer)

        try:
            response = call_vlm_interweaved(prompt, context_chunks)

            # Parse scores
            import re
            completeness = 0.5
            accuracy = 0.5
            overall = 0.5

            comp_match = re.search(r'COMPLETENESS:\s*([\d.]+)', response)
            if comp_match:
                completeness = min(1.0, max(0.0, float(comp_match.group(1))))

            acc_match = re.search(r'ACCURACY:\s*([\d.]+)', response)
            if acc_match:
                accuracy = min(1.0, max(0.0, float(acc_match.group(1))))

            overall_match = re.search(r'OVERALL_SCORE:\s*([\d.]+)', response)
            if overall_match:
                overall = min(1.0, max(0.0, float(overall_match.group(1))))

            uses_visual = "VISUAL_INFO_USED: YES" in response.upper()

            return {
                "completeness": completeness,
                "accuracy": accuracy,
                "uses_visual_info": uses_visual,
                "overall_score": overall
            }
        except Exception as e:
            print(f"Error in multimodal answer quality eval: {e}")
            return {"completeness": 0.0, "accuracy": 0.0, "uses_visual_info": False, "overall_score": 0.0}

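    # [Editor's note - illustrative sketch, not part of the released file] The two
    # VLM metrics above pass `context_chunks` straight to call_vlm_interweaved();
    # judging from transform_qa_data() further down, each chunk is assumed to look
    # roughly like
    #
    #   {"content": "text of the chunk", "image_path": "figures/fig_3.png"}
    #
    # with image_path null for text-only chunks (the path shown is hypothetical).
    # The VLM reply is then scanned for TEXT_SUPPORTED / VISUAL_SUPPORTED /
    # FAITHFULNESS_SCORE style markers, analogous to the multi-hop judge format.
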
    # ==========================================
    # 5. CUSTOM: DATASET DIVERSITY
    # ==========================================
    def evaluate_semantic_diversity(self, questions: List[str]):
        """
        Calculates diversity based on cosine distance of question embeddings.
        Returns: diversity_score (0-1, higher is better)
        """
        print("--- Calculating Semantic Diversity ---")
        if not questions:
            return 0.0

        embeddings = self.embeddings.embed_documents(questions)
        matrix = np.array(embeddings)

        # Calculate cosine similarity matrix
        sim_matrix = cosine_similarity(matrix)

        # We want diversity (distance), so we look at 1 - average_similarity
        # Exclude diagonal (self-similarity is always 1)
        np.fill_diagonal(sim_matrix, np.nan)
        avg_similarity = np.nanmean(sim_matrix)

        diversity_score = 1 - avg_similarity
        return diversity_score

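    # [Editor's note - illustrative worked example, not part of the released file]
    # If three questions have pairwise cosine similarities of 0.9, 0.2 and 0.4, the
    # off-diagonal mean similarity is (0.9 + 0.2 + 0.4) / 3 = 0.5, so the reported
    # diversity is 1 - 0.5 = 0.5. Near-duplicate questions push the score toward 0,
    # unrelated questions push it toward 1.
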
    # ==========================================
    # 6. CUSTOM: CONTEXT NECESSITY (Anti-Parametric Bias)
    # ==========================================
    def evaluate_context_necessity(self, question: str, answer: str, context: str) -> Dict:
        """
        Measures if the question REQUIRES the context to be answered correctly.
        Tests anti-parametric bias by checking if LLM can answer without context.

        Objective: Ensure the question tests retrieval, not just parametric knowledge.

        Mathematical Intuition: Maximizes information gain from context.
        High score = context is necessary (good for RAG evaluation).
        Low score = answerable from parametric knowledge (bad for RAG evaluation).

        Input:
        - question: The question string
        - answer: The ground truth answer
        - context: The provided context

        Output:
        Dict with:
        - context_necessity_score: Float 0-1 (1 = context essential, 0 = not needed)
        - without_context_correct: Boolean (did LLM answer correctly without context?)
        - with_context_correct: Boolean (did LLM answer correctly with context?)
        - explanation: String explaining the assessment

        Interpretation:
        - 0.8-1.0: Excellent - question strictly requires context
        - 0.5-0.8: Moderate - context helps but partial answers possible
        - 0.0-0.5: Poor - answerable from parametric knowledge
        """
        # Step 1: Ask LLM to answer WITHOUT context
        if "context_necessity_without" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['context_necessity_without'] not found in prompt.py")
        prompt_without_template = PROMPTS_METRICS["context_necessity_without"]
        prompt_without = ChatPromptTemplate.from_template(prompt_without_template)

        # Step 2: Ask LLM to verify answer WITH context
        if "context_necessity_verify" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['context_necessity_verify'] not found in prompt.py")
        prompt_verify_template = PROMPTS_METRICS["context_necessity_verify"]
        prompt_verify = ChatPromptTemplate.from_template(prompt_verify_template)

        try:
            # Get answer without context
            chain_without = prompt_without | self.llm
            response_without = chain_without.invoke({"question": question})
            answer_without_context = response_without.content.strip()

            # Check if model refused to answer
            refused = "CANNOT_ANSWER" in answer_without_context.upper()

            if refused:
                # Model couldn't answer without context - high context necessity
                return {
                    "context_necessity_score": 1.0,
                    "without_context_correct": False,
                    "with_context_correct": True,  # Assumed since we have the answer
                    "answer_without_context": answer_without_context[:200],
                    "explanation": "Model could not answer without context - context is essential"
                }

            # Verify if the answer without context matches ground truth
            chain_verify = prompt_verify | self.llm
            verify_response = chain_verify.invoke({
                "ground_truth": answer,
                "model_answer": answer_without_context
            })
            verify_content = verify_response.content.strip().upper()

            if "MATCH: YES" in verify_content:
                # Model answered correctly without context - low context necessity
                return {
                    "context_necessity_score": 0.0,
                    "without_context_correct": True,
                    "with_context_correct": True,
                    "answer_without_context": answer_without_context[:200],
                    "explanation": "Model answered correctly without context - question may test parametric knowledge"
                }
            elif "MATCH: PARTIAL" in verify_content:
                # Partial match - moderate context necessity
                return {
                    "context_necessity_score": 0.5,
                    "without_context_correct": False,
                    "with_context_correct": True,
                    "answer_without_context": answer_without_context[:200],
                    "explanation": "Model partially answered without context - context adds value"
                }
            else:
                # No match - high context necessity
                return {
                    "context_necessity_score": 0.9,
                    "without_context_correct": False,
                    "with_context_correct": True,
                    "answer_without_context": answer_without_context[:200],
                    "explanation": "Model answered incorrectly without context - context is necessary"
                }

        except Exception as e:
            print(f"Error in context necessity eval: {e}")
            return {
                "context_necessity_score": 0.5,
                "without_context_correct": None,
                "with_context_correct": None,
                "answer_without_context": "",
                "explanation": f"Error: {str(e)}"
            }

    def batch_evaluate_context_necessity(self, qa_items: List[Dict]) -> List[Dict]:
        """
        Batch evaluation of context necessity using concurrent API calls.

        Args:
            qa_items: List of dicts with 'question', 'answer', 'context' keys

        Returns:
            List of result dicts in same order
        """
        if not BATCH_AVAILABLE:
            # Fallback to sequential
            return [self.evaluate_context_necessity(
                item['question'], item['answer'], item['context']
            ) for item in qa_items]

        if "context_necessity_without" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['context_necessity_without'] not found")
        if "context_necessity_verify" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['context_necessity_verify'] not found")

        prompt_without_template = PROMPTS_METRICS["context_necessity_without"]
        prompt_verify_template = PROMPTS_METRICS["context_necessity_verify"]

        # Phase 1: Batch "answer without context" calls
        prompts_without = []
        for item in qa_items:
            prompt = prompt_without_template.replace("{question}", item['question'])
            prompts_without.append(prompt)

        print(f" ⚡ Phase 1: Batch answering {len(prompts_without)} questions without context...")
        answers_without = batch_call_llm(prompts_without, show_progress=False)

        # Phase 2: Batch verification calls for non-refused answers
        verify_prompts = []
        verify_indices = []
        results = [None] * len(qa_items)

        for i, (item, answer_without) in enumerate(zip(qa_items, answers_without)):
            if answer_without.startswith("ERROR:"):
                results[i] = {
                    "context_necessity_score": 0.5,
                    "without_context_correct": None,
                    "with_context_correct": None,
                    "answer_without_context": "",
                    "explanation": f"Error: {answer_without}"
                }
            elif "CANNOT_ANSWER" in answer_without.upper():
                results[i] = {
                    "context_necessity_score": 1.0,
                    "without_context_correct": False,
                    "with_context_correct": True,
                    "answer_without_context": answer_without[:200],
                    "explanation": "Model could not answer without context - context is essential"
                }
            else:
                # Need to verify
                prompt = prompt_verify_template.replace(
                    "{ground_truth}", item['answer']
                ).replace("{model_answer}", answer_without)
                verify_prompts.append(prompt)
                verify_indices.append(i)

        if verify_prompts:
            print(f" ⚡ Phase 2: Batch verifying {len(verify_prompts)} answers...")
            verify_responses = batch_call_llm(verify_prompts, show_progress=False)

            for idx, verify_content in zip(verify_indices, verify_responses):
                answer_without = answers_without[idx]
                verify_upper = verify_content.upper() if verify_content else ""

                if "MATCH: YES" in verify_upper:
                    results[idx] = {
                        "context_necessity_score": 0.0,
                        "without_context_correct": True,
                        "with_context_correct": True,
                        "answer_without_context": answer_without[:200],
                        "explanation": "Model answered correctly without context - question may test parametric knowledge"
                    }
                elif "MATCH: PARTIAL" in verify_upper:
                    results[idx] = {
                        "context_necessity_score": 0.5,
                        "without_context_correct": False,
                        "with_context_correct": True,
                        "answer_without_context": answer_without[:200],
                        "explanation": "Model partially answered without context - context adds value"
                    }
                else:
                    results[idx] = {
                        "context_necessity_score": 0.9,
                        "without_context_correct": False,
                        "with_context_correct": True,
                        "answer_without_context": answer_without[:200],
                        "explanation": "Model answered incorrectly without context - context is necessary"
                    }

        return results

    def batch_evaluate_multihop_reasoning(self, qa_items: List[Dict]) -> List[Dict]:
        """
        Batch evaluation of multihop reasoning using concurrent API calls.

        Args:
            qa_items: List of dicts with 'question', 'answer', 'contexts' keys

        Returns:
            List of result dicts with hop_count, reasoning_score, bridge_entity
        """
        if not BATCH_AVAILABLE:
            # Fallback to sequential
            return [self.evaluate_multihop_reasoning(
                item['question'], item['answer'], item['contexts']
            ) for item in qa_items]

        if "multihop_reasoning" not in PROMPTS_METRICS:
            raise ValueError("PROMPTS_METRICS['multihop_reasoning'] not found")

        prompt_template = PROMPTS_METRICS["multihop_reasoning"]

        prompts = []
        for item in qa_items:
            contexts_str = "\n".join(item['contexts']) if isinstance(item['contexts'], list) else item['contexts']
            prompt = prompt_template.replace(
                "{contexts}", contexts_str
            ).replace("{question}", item['question']).replace("{answer}", item['answer'])
            prompts.append(prompt)

        print(f" ⚡ Batch evaluating {len(prompts)} multihop reasoning questions...")
        responses = batch_call_llm(prompts, show_progress=False)

        results = []
        import re
        for response in responses:
            if response.startswith("ERROR:"):
                results.append({"hop_count": 1, "reasoning_score": 0.0, "bridge_entity": "Error"})
                continue

            hop_match = re.search(r'HOP_COUNT:\s*(\d+)', response)
            score_match = re.search(r'REASONING_SCORE:\s*([\d.]+)', response)
            bridge_match = re.search(r'BRIDGE_ENTITY:\s*(.+?)(?:\n|$)', response)

            hop_count = int(hop_match.group(1)) if hop_match else 1
            reasoning_score = float(score_match.group(1)) if score_match else 0.5
            reasoning_score = min(1.0, max(0.0, reasoning_score))
            bridge_entity = bridge_match.group(1).strip() if bridge_match else "None"

            results.append({
                "hop_count": hop_count,
                "reasoning_score": reasoning_score,
                "bridge_entity": bridge_entity
            })

        return results

    # ==========================================
    # 7. CUSTOM: DOMAIN COVERAGE
    # ==========================================
    def evaluate_domain_coverage(self, qa_data: List[Dict], corpus_chunks: List[Dict]) -> Dict:
        """
        Measures how well the QA dataset covers the source corpus.
        Prevents sampling bias and ensures comprehensive evaluation.

        Objective: Ensure QA dataset comprehensively tests knowledge across the corpus.

        Mathematical Intuition: Minimizes Jensen-Shannon divergence between
        topic distributions: min D_JS(P_topics(D) || P_topics(C))

        Input:
        - qa_data: List of QA pairs with chunk references
        - corpus_chunks: List of all corpus chunks with metadata

        Output:
        Dict with:
        - chunk_coverage: Float 0-1 (proportion of corpus chunks covered)
        - file_coverage: Float 0-1 (proportion of source files covered)
        - chunk_type_coverage: Dict (coverage by chunk type)
        - topic_divergence: Float 0-1 (JS divergence, lower is better)
        - uncovered_chunks: Int (number of chunks not referenced)
        - coverage_by_file: Dict (coverage breakdown by file)

        Interpretation:
        - chunk_coverage 0.8+: Excellent corpus coverage
        - chunk_coverage 0.5-0.8: Moderate coverage, some gaps
        - chunk_coverage <0.5: Poor coverage, significant gaps
        - topic_divergence <0.2: Good topic balance
        - topic_divergence >0.5: Significant topic imbalance
        """
        # entropy imported at module level from scipy.stats

        # Build corpus index
        corpus_index = {}
        corpus_by_file = Counter()
        corpus_by_type = Counter()

        for chunk in corpus_chunks:
            key = (chunk.get('file_name'), str(chunk.get('chunk_id')))
            corpus_index[key] = chunk
            corpus_by_file[chunk.get('file_name', 'unknown')] += 1
            corpus_by_type[chunk.get('chunk_type', 'unknown')] += 1

        # Track covered chunks
        covered_chunks = set()
        covered_by_file = Counter()
        covered_by_type = Counter()

        for qa in qa_data:
            for chunk_ref in qa.get('chunks_added', []):
                key = (chunk_ref.get('file_name'), str(chunk_ref.get('chunk_id')))
                if key in corpus_index:
                    covered_chunks.add(key)
                    chunk_info = corpus_index[key]
                    covered_by_file[chunk_info.get('file_name', 'unknown')] += 1
                    covered_by_type[chunk_info.get('chunk_type', 'unknown')] += 1

        # Calculate coverage metrics
        total_corpus = len(corpus_chunks)
        total_covered = len(covered_chunks)
        chunk_coverage = total_covered / total_corpus if total_corpus > 0 else 0.0

        # File coverage
        files_in_corpus = set(corpus_by_file.keys())
        files_covered = set(covered_by_file.keys())
        file_coverage = len(files_covered) / len(files_in_corpus) if files_in_corpus else 0.0

        # Coverage by file
        coverage_by_file = {}
        for file_name in files_in_corpus:
            total_in_file = corpus_by_file[file_name]
            covered_in_file = len([k for k in covered_chunks if k[0] == file_name])
            coverage_by_file[file_name] = {
                "total_chunks": total_in_file,
                "covered_chunks": covered_in_file,
                "coverage_rate": covered_in_file / total_in_file if total_in_file > 0 else 0.0
            }

        # Coverage by chunk type
        chunk_type_coverage = {}
        for chunk_type in corpus_by_type.keys():
            total_of_type = corpus_by_type[chunk_type]
            covered_of_type = sum(1 for k in covered_chunks
                                  if corpus_index.get(k, {}).get('chunk_type') == chunk_type)
            chunk_type_coverage[chunk_type] = {
                "total": total_of_type,
                "covered": covered_of_type,
                "coverage_rate": covered_of_type / total_of_type if total_of_type > 0 else 0.0
            }

        # Calculate Jensen-Shannon divergence for topic distribution
        # Using file distribution as proxy for topic distribution
        all_files = sorted(files_in_corpus)
        corpus_dist = np.array([corpus_by_file.get(f, 0) for f in all_files], dtype=float)
        corpus_dist = corpus_dist / corpus_dist.sum() if corpus_dist.sum() > 0 else corpus_dist

        qa_file_counts = Counter()
        for qa in qa_data:
            for chunk_ref in qa.get('chunks_added', []):
                qa_file_counts[chunk_ref.get('file_name')] += 1

        qa_dist = np.array([qa_file_counts.get(f, 0) for f in all_files], dtype=float)
        qa_dist = qa_dist / qa_dist.sum() if qa_dist.sum() > 0 else qa_dist

        # Jensen-Shannon divergence (symmetric KL divergence)
        # Add small epsilon to avoid log(0)
        eps = 1e-10
        corpus_dist = corpus_dist + eps
        qa_dist = qa_dist + eps
        corpus_dist = corpus_dist / corpus_dist.sum()
        qa_dist = qa_dist / qa_dist.sum()

        m = 0.5 * (corpus_dist + qa_dist)
        js_divergence = 0.5 * (entropy(corpus_dist, m) + entropy(qa_dist, m))

        return {
            "chunk_coverage": chunk_coverage,
            "file_coverage": file_coverage,
            "chunks_covered": total_covered,
            "chunks_total": total_corpus,
            "uncovered_chunks": total_corpus - total_covered,
            "topic_divergence_js": float(js_divergence),
            "chunk_type_coverage": chunk_type_coverage,
            "coverage_by_file": coverage_by_file
        }

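    # [Editor's note - illustrative worked example, not part of the released file]
    # The divergence above is the Jensen-Shannon divergence built from scipy's
    # entropy(p, q), which computes the KL divergence KL(p || q):
    #
    #   JS(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M),   M = 0.5 * (P + Q)
    #
    # For a two-file corpus with P = [0.5, 0.5] and a QA distribution
    # Q = [0.9, 0.1], M = [0.7, 0.3] and JS ≈ 0.10 (natural log), so values near 0
    # mean the QA set mirrors the corpus file distribution.
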
    # ==========================================
    # MAIN PIPELINE
    # ==========================================
    def run_full_evaluation(self, json_path: str, output_path: str = "eval_report.json"):
        """
        Runs the full suite of metrics on the provided JSON dataset.
        """
        data = self.load_dataset(json_path)
        metrics_log = []

        print(f"Starting evaluation for {len(data)} items...")

        # 1. Run Standard Metrics (Batch)
        # Note: Ragas requires 'ground_truth' for some metrics.
        # If your dataset is purely synthetic without human labels,
        # context_precision/recall might be approximations based on generated answers.
        ragas_df = self.evaluate_ragas_standard(data)

        # 2. Run Custom Agentic Metrics (Iterative)
        questions = [d['question'] for d in data]

        for i, item in enumerate(data):
            if i % 5 == 0:
                print(f"Processing item {i+1}/{len(data)}...")

            # A. Multi-hop Evaluation
            mh_res = self.evaluate_multihop_reasoning(
                item['question'],
                item['answer'],
                item['contexts']
            )

            # B. Visual Dependency (Only for items marked as visual/multimodal)
            # Checks metadata type or if image_contexts exist
            is_visual = (
                item.get('metadata', {}).get('type') in ['visual', 'chart', 'table'] or
                len(item.get('image_contexts', [])) > 0
            )

            vis_score = 0.0
            if is_visual:
                # For the blind test, we pass ONLY text contexts, excluding image descriptions
                vis_score = self.evaluate_visual_dependency(item['question'], item['contexts'])

            # C. VLM-based Multimodal Metrics (only if images available)
            context_chunks = item.get('context_chunks', [])
            vlm_faithfulness = None
            vlm_answer_quality = None

            if is_visual and VLM_AVAILABLE and context_chunks:
                print(f" Running VLM multimodal evaluation for item {i+1}...")
                vlm_faithfulness = self.evaluate_multimodal_faithfulness_vlm(
                    item['question'], item['answer'], context_chunks
                )
                vlm_answer_quality = self.evaluate_multimodal_answer_quality_vlm(
                    item['question'], item['answer'], context_chunks
                )

            metrics_log.append({
                "hop_count": mh_res['hop_count'],
                "reasoning_score": mh_res['reasoning_score'],
                "bridge_entity": mh_res['bridge_entity'],
                "visual_dependency": vis_score if is_visual else None,
                "is_visual": is_visual,
                # VLM Multimodal metrics
                "vlm_faithfulness_score": vlm_faithfulness['score'] if vlm_faithfulness else None,
                "vlm_text_supported": vlm_faithfulness['text_supported'] if vlm_faithfulness else None,
                "vlm_visual_supported": vlm_faithfulness['visual_supported'] if vlm_faithfulness else None,
                "vlm_completeness": vlm_answer_quality['completeness'] if vlm_answer_quality else None,
                "vlm_accuracy": vlm_answer_quality['accuracy'] if vlm_answer_quality else None,
                "vlm_uses_visual": vlm_answer_quality['uses_visual_info'] if vlm_answer_quality else None,
                "vlm_overall_score": vlm_answer_quality['overall_score'] if vlm_answer_quality else None,
            })

        # 3. Diversity Evaluation
        diversity_score = self.evaluate_semantic_diversity(questions)

        # 4. Aggregate Results
        custom_df = pd.DataFrame(metrics_log)
        final_df = pd.concat([ragas_df, custom_df], axis=1)

        # Calculate Summary Statistics
        rag_quality = {
            "Faithfulness": final_df['faithfulness'].mean(),
            "Answer_Relevance": final_df['answer_relevancy'].mean() if 'answer_relevancy' in final_df.columns else final_df.get('answer_relevance', pd.Series([0])).mean(),
            "Context_Precision": final_df['context_precision'].mean(),
            "Context_Recall": final_df['context_recall'].mean(),
        }

        # Add optional metrics if they were computed
        if HAS_ENTITY_RECALL and 'context_entity_recall' in final_df.columns:
            rag_quality["Context_Entity_Recall"] = final_df['context_entity_recall'].mean()
        if HAS_NOISE_SENSITIVITY and 'noise_sensitivity_relevant' in final_df.columns:
            rag_quality["Noise_Sensitivity"] = final_df['noise_sensitivity_relevant'].mean()

        multimodal_quality = {
            "Visual_Necessity_Rate": final_df[final_df['is_visual'] == True]['visual_dependency'].mean()
            if not final_df[final_df['is_visual'] == True].empty else 0.0
        }

        # Add multimodal ragas metrics if available
        if HAS_MULTIMODAL:
            if 'multimodal_faithfulness' in final_df.columns:
                multimodal_quality["Multimodal_Faithfulness"] = final_df['multimodal_faithfulness'].mean()
            if 'multimodal_relevance' in final_df.columns:
                multimodal_quality["Multimodal_Relevance"] = final_df['multimodal_relevance'].mean()

        # Add VLM-based multimodal metrics
        visual_items = final_df[final_df['is_visual'] == True]
        if not visual_items.empty:
            if 'vlm_faithfulness_score' in final_df.columns:
                vlm_faith_scores = visual_items['vlm_faithfulness_score'].dropna()
                if len(vlm_faith_scores) > 0:
                    multimodal_quality["VLM_Faithfulness_Score"] = vlm_faith_scores.mean()
            if 'vlm_overall_score' in final_df.columns:
                vlm_overall_scores = visual_items['vlm_overall_score'].dropna()
                if len(vlm_overall_scores) > 0:
                    multimodal_quality["VLM_Overall_Quality"] = vlm_overall_scores.mean()
            if 'vlm_accuracy' in final_df.columns:
                vlm_accuracy_scores = visual_items['vlm_accuracy'].dropna()
                if len(vlm_accuracy_scores) > 0:
                    multimodal_quality["VLM_Accuracy"] = vlm_accuracy_scores.mean()
            if 'vlm_completeness' in final_df.columns:
                vlm_completeness_scores = visual_items['vlm_completeness'].dropna()
                if len(vlm_completeness_scores) > 0:
                    multimodal_quality["VLM_Completeness"] = vlm_completeness_scores.mean()
            # Count items using visual info
            if 'vlm_uses_visual' in final_df.columns:
                uses_visual_count = visual_items['vlm_uses_visual'].sum()
                multimodal_quality["Items_Using_Visual_Info"] = int(uses_visual_count)
                multimodal_quality["Visual_Info_Usage_Rate"] = uses_visual_count / len(visual_items) if len(visual_items) > 0 else 0.0

        report = {
            "RAG_Quality": rag_quality,
            "Reasoning_Complexity": {
                "Avg_Reasoning_Score": final_df['reasoning_score'].mean(),
                "Avg_Hop_Count": final_df['hop_count'].mean(),
            },
            "Multimodal_Quality": multimodal_quality,
            "Dataset_Health": {
                "Semantic_Diversity": diversity_score,
                "Total_Samples": len(data)
            }
        }

        # Save detailed results
        final_df.to_csv(output_path.replace(".json", "_detailed.csv"), index=False)
        with open(output_path, "w") as f:
            json.dump(report, f, indent=4)

        print("\nEvaluation Complete. Summary:")
        print(json.dumps(report, indent=2))

        return final_df, report

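# [Editor's note - illustrative usage sketch, not part of the released file]
# Assuming GOOGLE_API_KEY (or OPENAI_API_KEY) is set and prompt.py / call_llm.py
# are importable, a run might look like the following; the file names are
# hypothetical:
#
#   raw = json.load(open("qa_multihop_pass.json"))
#   data = transform_qa_data(raw)
#   json.dump(data, open("qa_transformed.json", "w"))
#   evaluator = MultimodalFrameworkEvaluator(use_gemini=True)
#   df, report = evaluator.run_full_evaluation("qa_transformed.json",
#                                              output_path="eval_report.json")
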
def transform_qa_data(raw_data: List[Dict]) -> List[Dict]:
    """
    Transform qa_multihop_pass.json format to the format expected by the evaluator.

    Input format (from qa_multihop_pass.json):
    - chunk_id, original_chunk, final_context, context_chunks, context_status,
    - depth_reached, chunks_added, expert_persona, domain, question, answer,
    - relevance_score, difficulty_score, selection_status, selection_reason, verification_result

    Expected format:
    - question, answer, contexts (list), ground_truth, metadata, image_contexts, context_chunks
    """
    transformed = []
    for item in raw_data:
        # Extract context_chunks which contain both text content and image_path
        context_chunks = item.get("context_chunks", [])

        # Extract text contexts from context_chunks or fall back to final_context
        if context_chunks:
            contexts = [chunk.get("content", "") for chunk in context_chunks if chunk.get("content")]
        else:
            contexts = [item.get("final_context", item.get("original_chunk", ""))]

        # Extract image paths from context_chunks
        image_contexts = []
        for chunk in context_chunks:
            img_path = chunk.get("image_path")
            if img_path and img_path != "null":
                image_contexts.append(img_path)

        # Determine if this is a visual/multimodal item
        has_images = len(image_contexts) > 0
        item_type = "visual" if has_images else "text"

        transformed.append({
            "question": item.get("question", ""),
            "answer": item.get("answer", ""),
            "contexts": contexts,
            "ground_truth": item.get("answer", ""),  # Using answer as ground_truth (no human labels)
            "metadata": {
                "chunk_id": item.get("chunk_id"),
                "expert_persona": item.get("expert_persona"),
                "domain": item.get("domain"),
                "context_status": item.get("context_status"),
                "relevance_score": item.get("relevance_score"),
                "difficulty_score": item.get("difficulty_score"),
                "selection_status": item.get("selection_status"),
                "depth_reached": item.get("depth_reached"),
                "chunks_added": item.get("chunks_added", []),
                "type": item_type,
            },
            "image_contexts": image_contexts,
            "context_chunks": context_chunks,  # Keep full chunks for VLM calls
        })
    return transformed

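# [Editor's note - illustrative sketch, not part of the released file] On one
# hypothetical record, an input item with
# context_chunks = [{"content": "...", "image_path": "fig1.png"}] is transformed
# into {"question": ..., "answer": ..., "contexts": ["..."],
# "image_contexts": ["fig1.png"], "metadata": {..., "type": "visual"}, ...},
# while an item whose chunks all have image_path == null comes out typed "text".
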
def analyze_missing_information(raw_data: List[Dict]) -> Dict[str, Any]:
|
|
1053
|
+
"""
|
|
1054
|
+
Analyze which metrics cannot be fully evaluated and what information is missing.
|
|
1055
|
+
"""
|
|
1056
|
+
missing_info = {
|
|
1057
|
+
"metrics_status": {},
|
|
1058
|
+
"missing_fields": [],
|
|
1059
|
+
"recommendations": [],
|
|
1060
|
+
"image_stats": {}
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1063
|
+
# Check what's available
|
|
1064
|
+
sample = raw_data[0] if raw_data else {}
|
|
1065
|
+
available_fields = set(sample.keys())
|
|
1066
|
+
|
|
1067
|
+
# Count items with images in context_chunks
|
|
1068
|
+
items_with_images = 0
|
|
1069
|
+
total_images = 0
|
|
1070
|
+
for item in raw_data:
|
|
1071
|
+
context_chunks = item.get("context_chunks", [])
|
|
1072
|
+
item_has_image = False
|
|
1073
|
+
for chunk in context_chunks:
|
|
1074
|
+
img_path = chunk.get("image_path")
|
|
1075
|
+
if img_path and img_path != "null" and img_path is not None:
|
|
1076
|
+
total_images += 1
|
|
1077
|
+
item_has_image = True
|
|
1078
|
+
if item_has_image:
|
|
1079
|
+
items_with_images += 1
|
|
1080
|
+
|
|
1081
|
+
missing_info["image_stats"] = {
|
|
1082
|
+
"total_items": len(raw_data),
|
|
1083
|
+
"items_with_images": items_with_images,
|
|
1084
|
+
"total_images": total_images,
|
|
1085
|
+
"has_context_chunks": "context_chunks" in available_fields
|
|
1086
|
+
}
|
|
1087
|
+
|
|
1088
|
+
has_images = items_with_images > 0
|
|
1089
|
+
|
|
1090
|
+
# 1. RAGAS Standard Metrics
|
|
1091
|
+
missing_info["metrics_status"]["faithfulness"] = {
|
|
1092
|
+
"can_evaluate": True,
|
|
1093
|
+
"quality": "FULL",
|
|
1094
|
+
"notes": "Question, answer, and context available."
|
|
1095
|
+
}
|
|
1096
|
+
missing_info["metrics_status"]["answer_relevance"] = {
|
|
1097
|
+
"can_evaluate": True,
|
|
1098
|
+
"quality": "FULL",
|
|
1099
|
+
"notes": "Question and answer available."
|
|
1100
|
+
}
|
|
1101
|
+
missing_info["metrics_status"]["context_precision"] = {
|
|
1102
|
+
"can_evaluate": True,
|
|
1103
|
+
"quality": "APPROXIMATED",
|
|
1104
|
+
"notes": "No human-labeled ground_truth. Using generated answer as proxy.",
|
|
1105
|
+
"missing": ["Human-annotated ground truth answers"]
|
|
1106
|
+
}
|
|
1107
|
+
missing_info["metrics_status"]["context_recall"] = {
|
|
1108
|
+
"can_evaluate": True,
|
|
1109
|
+
"quality": "APPROXIMATED",
|
|
1110
|
+
"notes": "No human-labeled ground_truth. Using generated answer as proxy.",
|
|
1111
|
+
"missing": ["Human-annotated ground truth answers"]
|
|
1112
|
+
}
|
|
1113
|
+
|
|
1114
|
+
# 2. Multi-hop Reasoning Metric
|
|
1115
|
+
missing_info["metrics_status"]["multihop_reasoning"] = {
|
|
1116
|
+
"can_evaluate": True,
|
|
1117
|
+
"quality": "FULL",
|
|
1118
|
+
"notes": "Question, answer, and context available for LLM-as-a-Judge evaluation."
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
# 3. Visual Dependency Metric
|
|
1122
|
+
if has_images:
|
|
1123
|
+
missing_info["metrics_status"]["visual_dependency"] = {
|
|
1124
|
+
"can_evaluate": True,
|
|
1125
|
+
"quality": "FULL",
|
|
1126
|
+
"notes": f"Found {items_with_images} items with {total_images} images in context_chunks."
|
|
1127
|
+
}
|
|
1128
|
+
else:
|
|
1129
|
+
missing_info["metrics_status"]["visual_dependency"] = {
|
|
1130
|
+
"can_evaluate": False,
|
|
1131
|
+
"quality": "NOT_APPLICABLE",
|
|
1132
|
+
"notes": "No image data found in context_chunks.",
|
|
1133
|
+
"missing": [
|
|
1134
|
+
"context_chunks[].image_path: Image file paths in context chunks",
|
|
1135
|
+
"Ensure source chunks have 'artifact' fields with image references"
|
|
1136
|
+
]
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
# 4. Multimodal VLM Metrics (custom implementation)
|
|
1140
|
+
if has_images:
|
|
1141
|
+
missing_info["metrics_status"]["multimodal_faithfulness_vlm"] = {
|
|
1142
|
+
"can_evaluate": True,
|
|
1143
|
+
"quality": "FULL",
|
|
1144
|
+
"notes": f"VLM-based evaluation using {total_images} images from context_chunks."
|
|
1145
|
+
}
|
|
1146
|
+
missing_info["metrics_status"]["multimodal_answer_quality_vlm"] = {
|
|
1147
|
+
"can_evaluate": True,
|
|
1148
|
+
"quality": "FULL",
|
|
1149
|
+
"notes": "VLM-based answer quality evaluation with visual context."
|
|
1150
|
+
}
|
|
1151
|
+
else:
|
|
1152
|
+
missing_info["metrics_status"]["multimodal_faithfulness_vlm"] = {
|
|
1153
|
+
"can_evaluate": False,
|
|
1154
|
+
"quality": "NOT_APPLICABLE",
|
|
1155
|
+
"notes": "Requires image data in context_chunks.",
|
|
1156
|
+
"missing": ["context_chunks[].image_path: Image file paths"]
|
|
1157
|
+
}
|
|
1158
|
+
missing_info["metrics_status"]["multimodal_answer_quality_vlm"] = {
|
|
1159
|
+
"can_evaluate": False,
|
|
1160
|
+
"quality": "NOT_APPLICABLE",
|
|
1161
|
+
"notes": "Requires image data in context_chunks.",
|
|
1162
|
+
"missing": ["context_chunks[].image_path: Image file paths"]
|
|
1163
|
+
}
|
|
1164
|
+
|
|
1165
|
+
# 5. Semantic Diversity
|
|
1166
|
+
missing_info["metrics_status"]["semantic_diversity"] = {
|
|
1167
|
+
"can_evaluate": True,
|
|
1168
|
+
"quality": "FULL",
|
|
1169
|
+
"notes": "Questions available for embedding-based diversity calculation."
|
|
1170
|
+
}
|
|
1171
|
+
|
|
1172
|
+
# 6. Context Necessity (Anti-Parametric Bias)
|
|
1173
|
+
missing_info["metrics_status"]["context_necessity"] = {
|
|
1174
|
+
"can_evaluate": True,
|
|
1175
|
+
"quality": "FULL",
|
|
1176
|
+
"notes": "Question, answer, and context available for LLM-based necessity evaluation."
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
# 7. Domain Coverage
|
|
1180
|
+
missing_info["metrics_status"]["domain_coverage"] = {
|
|
1181
|
+
"can_evaluate": True,
|
|
1182
|
+
"quality": "FULL" if "chunks_added" in available_fields else "LIMITED",
|
|
1183
|
+
"notes": "Chunk references available for coverage calculation. Requires corpus chunks.json."
|
|
1184
|
+
}
|
|
1185
|
+
|
|
1186
|
+
# Recommendations
|
|
1187
|
+
recommendations = [
|
|
1188
|
+
"Add 'ground_truth' field with human-annotated answers for accurate context_precision/recall"
|
|
1189
|
+
]
|
|
1190
|
+
if not has_images:
|
|
1191
|
+
recommendations.append("Ensure source chunks have 'artifact' fields with image paths for multimodal metrics")
|
|
1192
|
+
recommendations.append("Re-run QA generation with updated context_retrieved.py to capture images")
|
|
1193
|
+
|
|
1194
|
+
missing_info["recommendations"] = recommendations
|
|
1195
|
+
|
|
1196
|
+
return missing_info
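# Shape of the returned report (an illustrative sketch of the keys populated
# above; the values shown are examples, not real output):
#   {
#     "metrics_status": {
#       "multihop_reasoning": {"can_evaluate": True, "quality": "FULL", "notes": "..."},
#       "visual_dependency": {"can_evaluate": False, "quality": "NOT_APPLICABLE", "missing": ["..."]},
#       ...
#     },
#     "recommendations": ["Add 'ground_truth' field with human-annotated answers ..."]
#   }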
|
|
1197
|
+
|
|
1198
|
+
|
|
1199
|
+
def identify_qa_subsets(raw_data: List[Dict]) -> Dict[str, List[Dict]]:
|
|
1200
|
+
"""
|
|
1201
|
+
Identify QA subsets: multihop, multimodal, and their intersection.
|
|
1202
|
+
|
|
1203
|
+
Returns:
|
|
1204
|
+
Dict with keys: 'all', 'multihop', 'multimodal', 'multihop_and_multimodal'
|
|
1205
|
+
"""
|
|
1206
|
+
all_qa = raw_data
|
|
1207
|
+
|
|
1208
|
+
# Multihop: hop_count > 0 (each hop adds one link between chunks)
|
|
1209
|
+
# hop_count = len(chunks_added) - 1
|
|
1210
|
+
multihop = [
|
|
1211
|
+
qa for qa in raw_data
|
|
1212
|
+
if qa.get('hop_count', 0) > 0
|
|
1213
|
+
]
|
|
1214
|
+
|
|
1215
|
+
# Multimodal: content mentions figures/tables/images OR has image artifacts
|
|
1216
|
+
multimodal = []
|
|
1217
|
+
multimodal_keywords = ['figure', 'diagram', 'table', 'image', 'chart', '![image]', 'block diagram']
|
|
1218
|
+
|
|
1219
|
+
for qa in raw_data:
|
|
1220
|
+
content = (qa.get('original_chunk', '') + ' ' + qa.get('final_context', '')).lower()
|
|
1221
|
+
# Check for visual keywords in content
|
|
1222
|
+
has_visual_content = any(kw in content for kw in multimodal_keywords)
|
|
1223
|
+
# Check for explicit image paths in context_chunks
|
|
1224
|
+
context_chunks = qa.get('context_chunks', [])
|
|
1225
|
+
has_image_path = any(
|
|
1226
|
+
chunk.get('image_path') and chunk.get('image_path') != 'null'
|
|
1227
|
+
for chunk in context_chunks
|
|
1228
|
+
)
|
|
1229
|
+
if has_visual_content or has_image_path:
|
|
1230
|
+
multimodal.append(qa)
|
|
1231
|
+
|
|
1232
|
+
# Intersection
|
|
1233
|
+
multihop_ids = set(id(qa) for qa in multihop)
|
|
1234
|
+
multihop_and_multimodal = [qa for qa in multimodal if id(qa) in multihop_ids]
|
|
1235
|
+
|
|
1236
|
+
return {
|
|
1237
|
+
'all': all_qa,
|
|
1238
|
+
'multihop': multihop,
|
|
1239
|
+
'multimodal': multimodal,
|
|
1240
|
+
'multihop_and_multimodal': multihop_and_multimodal
|
|
1241
|
+
}
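# Illustrative doctest-style usage (the toy QA items below are assumptions made
# up for the example, not real pipeline output):
#   >>> subsets = identify_qa_subsets([
#   ...     {"hop_count": 1, "chunks_added": ["c1", "c2"], "original_chunk": "see Figure 3", "final_context": ""},
#   ...     {"hop_count": 0, "chunks_added": ["c3"], "original_chunk": "plain prose", "final_context": ""},
#   ... ])
#   >>> len(subsets["multihop"]), len(subsets["multimodal"]), len(subsets["multihop_and_multimodal"])
#   (1, 1, 1)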
|
|
1242
|
+
|
|
1243
|
+
|
|
1244
|
+
def _count_tokens(text: str) -> int:
|
|
1245
|
+
"""
|
|
1246
|
+
Count tokens using tokenizer if available, otherwise use improved approximation.
|
|
1247
|
+
|
|
1248
|
+
Approximation: ~1.3 tokens per word for English (GPT-4 averages roughly 0.75 words per token).
|
|
1249
|
+
For better accuracy, uses tiktoken if available, otherwise word-based estimate.
|
|
1250
|
+
"""
|
|
1251
|
+
if not text:
|
|
1252
|
+
return 0
|
|
1253
|
+
|
|
1254
|
+
# Try tiktoken (fast, accurate for GPT models, handles long text without warnings)
|
|
1255
|
+
try:
|
|
1256
|
+
import tiktoken
|
|
1257
|
+
enc = tiktoken.get_encoding("cl100k_base") # GPT-4 tokenizer
|
|
1258
|
+
return len(enc.encode(text))
|
|
1259
|
+
except ImportError:
|
|
1260
|
+
pass
|
|
1261
|
+
|
|
1262
|
+
# Try transformers tokenizer with large model_max_length to avoid warnings
|
|
1263
|
+
try:
|
|
1264
|
+
from transformers import GPT2TokenizerFast
|
|
1265
|
+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", model_max_length=100000)
|
|
1266
|
+
return len(tokenizer.encode(text))
|
|
1267
|
+
except Exception:  # Exception already covers ImportError and tokenizer loading failures
|
|
1268
|
+
pass
|
|
1269
|
+
|
|
1270
|
+
# Fallback: improved word-based approximation
|
|
1271
|
+
# GPT-4 averages roughly 0.75 words per token (~1.3 tokens per word), though this varies by language
|
|
1272
|
+
# Using 1.3 tokens per word as conservative estimate (accounts for punctuation, etc.)
|
|
1273
|
+
words = len(text.split())
|
|
1274
|
+
return int(words * 1.3)
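# Quick sanity check of the fallback path (a sketch; real counts depend on
# whichever tokenizer is actually installed):
#   >>> _count_tokens("")        # empty input short-circuits to 0
#   0
#   With neither tiktoken nor transformers installed, a 100-word string is
#   estimated at int(100 * 1.3) == 130 tokens.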
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
def _count_pages_from_pdf(pdf_path: str) -> int:
|
|
1278
|
+
"""Count pages from PDF using pypdfium2 or pypdf."""
|
|
1279
|
+
try:
|
|
1280
|
+
import pypdfium2 as pdfium
|
|
1281
|
+
with open(pdf_path, 'rb') as f:
|
|
1282
|
+
pdf = pdfium.PdfDocument(f.read())
|
|
1283
|
+
return len(pdf)
|
|
1284
|
+
except ImportError:
|
|
1285
|
+
try:
|
|
1286
|
+
import pypdf
|
|
1287
|
+
with open(pdf_path, 'rb') as f:
|
|
1288
|
+
pdf_reader = pypdf.PdfReader(f)
|
|
1289
|
+
return len(pdf_reader.pages)
|
|
1290
|
+
except ImportError:
|
|
1291
|
+
pass
|
|
1292
|
+
except Exception:
|
|
1293
|
+
pass
|
|
1294
|
+
return 0
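# Usage sketch (the file name is hypothetical): _count_pages_from_pdf("data/documents/report.pdf")
# returns the page count, or 0 if neither pypdfium2 nor pypdf is installed or the file cannot be parsed.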
|
|
1295
|
+
|
|
1296
|
+
|
|
1297
|
+
def _count_pages_from_html_or_markdown(file_path: str) -> int:
|
|
1298
|
+
"""
|
|
1299
|
+
Estimate pages from HTML or Markdown files.
|
|
1300
|
+
Uses content length with reasonable assumptions:
|
|
1301
|
+
- Average page: ~2000-2500 words or ~12,000-15,000 characters
|
|
1302
|
+
- For HTML: strips tags first
|
|
1303
|
+
"""
|
|
1304
|
+
try:
|
|
1305
|
+
from pathlib import Path  # local import: pathlib is not imported at module level here
content = Path(file_path).read_text(encoding='utf-8')
|
|
1306
|
+
|
|
1307
|
+
# For HTML, strip tags to get actual content
|
|
1308
|
+
if file_path.lower().endswith('.html') or file_path.lower().endswith('.htm'):
|
|
1309
|
+
import re
|
|
1310
|
+
# Remove HTML tags
|
|
1311
|
+
content = re.sub(r'<[^>]+>', '', content)
|
|
1312
|
+
# Remove extra whitespace
|
|
1313
|
+
content = ' '.join(content.split())
|
|
1314
|
+
|
|
1315
|
+
# Estimate: ~2500 words per page or ~13,000 chars per page
|
|
1316
|
+
word_count = len(content.split())
|
|
1317
|
+
char_count = len(content)
|
|
1318
|
+
|
|
1319
|
+
# Use word-based estimate (more reliable)
|
|
1320
|
+
pages_from_words = max(1, word_count // 2500)
|
|
1321
|
+
# Use char-based estimate as backup
|
|
1322
|
+
pages_from_chars = max(1, char_count // 13000)
|
|
1323
|
+
|
|
1324
|
+
# Return the larger of the two estimates
|
|
1325
|
+
return max(pages_from_words, pages_from_chars)
|
|
1326
|
+
except Exception:
|
|
1327
|
+
return 0
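# Worked example with illustrative numbers: a Markdown file containing 10,000
# words and 70,000 characters gives
#   pages_from_words = max(1, 10000 // 2500) == 4
#   pages_from_chars = max(1, 70000 // 13000) == 5
# so the function reports max(4, 5) == 5 pages.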
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
def compute_corpus_and_dataset_stats(
|
|
1331
|
+
qa_data: List[Dict],
|
|
1332
|
+
corpus_chunks: List[Dict] = None,
|
|
1333
|
+
pdf_dir: str = None,
|
|
1334
|
+
markdown_dir: str = None
|
|
1335
|
+
) -> Dict[str, Any]:
|
|
1336
|
+
"""
|
|
1337
|
+
Compute comprehensive corpus and dataset statistics.
|
|
1338
|
+
|
|
1339
|
+
Computes:
|
|
1340
|
+
- Corpus stats: #chunks, #multimodal chunks, #tables, #images, #tokens
|
|
1341
|
+
- Context stats: Distribution by hop count
|
|
1342
|
+
- QA stats: By modality categories (Multimodal, Table, Table+Image) and hop counts
|
|
1343
|
+
|
|
1344
|
+
Args:
|
|
1345
|
+
qa_data: List of QA pairs
|
|
1346
|
+
corpus_chunks: List of corpus chunks (from chunks.json)
|
|
1347
|
+
pdf_dir: Path to original PDF files directory (for accurate page count using pypdfium2/pypdf)
|
|
1348
|
+
markdown_dir: Path to markdown/HTML files directory (fallback for page count if PDFs not available)
|
|
1349
|
+
|
|
1350
|
+
Returns:
|
|
1351
|
+
Dict with corpus_stats, context_stats, and qa_category_stats
|
|
1352
|
+
"""
|
|
1353
|
+
import re
|
|
1354
|
+
from pathlib import Path
|
|
1355
|
+
|
|
1356
|
+
stats = {
|
|
1357
|
+
"corpus_stats": {},
|
|
1358
|
+
"context_stats": {},
|
|
1359
|
+
"qa_category_stats": {}
|
|
1360
|
+
}
|
|
1361
|
+
|
|
1362
|
+
# ==========================================
|
|
1363
|
+
# 1. CORPUS STATS (from chunks.json)
|
|
1364
|
+
# ==========================================
|
|
1365
|
+
if corpus_chunks:
|
|
1366
|
+
total_chunks = len(corpus_chunks)
|
|
1367
|
+
|
|
1368
|
+
# Count by chunk type
|
|
1369
|
+
text_chunks = 0
|
|
1370
|
+
table_chunks = 0
|
|
1371
|
+
image_chunks = 0
|
|
1372
|
+
multimodal_chunks = 0 # Chunks with images (standalone or embedded)
|
|
1373
|
+
total_tokens = 0
|
|
1374
|
+
|
|
1375
|
+
for chunk in corpus_chunks:
|
|
1376
|
+
chunk_type = chunk.get('chunk_type', 'text').lower()
|
|
1377
|
+
content = chunk.get('content', '')
|
|
1378
|
+
artifact = chunk.get('artifact', 'None')
|
|
1379
|
+
|
|
1380
|
+
# Count tokens using proper tokenizer or improved approximation
|
|
1381
|
+
total_tokens += _count_tokens(content)
|
|
1382
|
+
|
|
1383
|
+
if chunk_type == 'table':
|
|
1384
|
+
table_chunks += 1
|
|
1385
|
+
elif chunk_type == 'standalone image':
|
|
1386
|
+
image_chunks += 1
|
|
1387
|
+
multimodal_chunks += 1
|
|
1388
|
+
else:
|
|
1389
|
+
text_chunks += 1
|
|
1390
|
+
|
|
1391
|
+
# Check for embedded images in content or artifact
|
|
1392
|
+
has_image = (
|
|
1393
|
+
artifact and artifact != 'None' and '![' in str(artifact)
|
|
1394
|
+
) or '![' in content
|
|
1395
|
+
|
|
1396
|
+
if has_image and chunk_type != 'standalone image':
|
|
1397
|
+
multimodal_chunks += 1
|
|
1398
|
+
|
|
1399
|
+
stats["corpus_stats"] = {
|
|
1400
|
+
"total_chunks": total_chunks,
|
|
1401
|
+
"text_chunks": text_chunks,
|
|
1402
|
+
"table_chunks": table_chunks,
|
|
1403
|
+
"image_chunks": image_chunks,
|
|
1404
|
+
"multimodal_chunks": multimodal_chunks,
|
|
1405
|
+
"total_tokens": total_tokens, # Renamed from total_tokens_approx
|
|
1406
|
+
"avg_tokens_per_chunk": round(total_tokens / total_chunks, 1) if total_chunks > 0 else 0
|
|
1407
|
+
}
|
|
1408
|
+
|
|
1409
|
+
# Count unique files
|
|
1410
|
+
unique_files = set(chunk.get('file_name', '') for chunk in corpus_chunks)
|
|
1411
|
+
stats["corpus_stats"]["num_source_files"] = len(unique_files)
|
|
1412
|
+
|
|
1413
|
+
# ==========================================
|
|
1414
|
+
# PAGE COUNTING (from PDFs if available, otherwise markdown/HTML)
|
|
1415
|
+
# ==========================================
|
|
1416
|
+
total_pages = 0
|
|
1417
|
+
pages_counted = 0
|
|
1418
|
+
|
|
1419
|
+
# First, try to count from original PDFs (most accurate)
|
|
1420
|
+
if pdf_dir and os.path.exists(pdf_dir):
|
|
1421
|
+
pdf_files = list(Path(pdf_dir).glob("*.pdf"))
|
|
1422
|
+
for pdf_file in pdf_files:
|
|
1423
|
+
page_count = _count_pages_from_pdf(str(pdf_file))
|
|
1424
|
+
if page_count > 0:
|
|
1425
|
+
total_pages += page_count
|
|
1426
|
+
pages_counted += 1
|
|
1427
|
+
|
|
1428
|
+
# If no PDFs found or PDF dir not provided, try markdown files
|
|
1429
|
+
if total_pages == 0 and markdown_dir and os.path.exists(markdown_dir):
|
|
1430
|
+
md_files = list(Path(markdown_dir).rglob("*.md"))
|
|
1431
|
+
html_files = list(Path(markdown_dir).rglob("*.html")) + list(Path(markdown_dir).rglob("*.htm"))
|
|
1432
|
+
|
|
1433
|
+
for md_file in md_files:
|
|
1434
|
+
page_count = _count_pages_from_html_or_markdown(str(md_file))
|
|
1435
|
+
total_pages += page_count
|
|
1436
|
+
|
|
1437
|
+
for html_file in html_files:
|
|
1438
|
+
page_count = _count_pages_from_html_or_markdown(str(html_file))
|
|
1439
|
+
total_pages += page_count
|
|
1440
|
+
|
|
1441
|
+
if total_pages > 0:
|
|
1442
|
+
stats["corpus_stats"]["total_pages"] = total_pages
|
|
1443
|
+
if pages_counted > 0:
|
|
1444
|
+
stats["corpus_stats"]["pages_counted_from_pdfs"] = pages_counted
|
|
1445
|
+
|
|
1446
|
+
# ==========================================
|
|
1447
|
+
# 2. CONTEXT STATS (from QA data)
|
|
1448
|
+
# ==========================================
|
|
1449
|
+
hop_distribution = Counter()
|
|
1450
|
+
context_sizes = [] # Number of chunks per context
|
|
1451
|
+
|
|
1452
|
+
for qa in qa_data:
|
|
1453
|
+
# Determine hop count from chunks_added
|
|
1454
|
+
chunks_added = qa.get('chunks_added', [])
|
|
1455
|
+
num_chunks = len(chunks_added) if isinstance(chunks_added, list) else 1
|
|
1456
|
+
context_sizes.append(num_chunks)
|
|
1457
|
+
|
|
1458
|
+
# Hop count = number of chunks - 1 (0-hop means single chunk)
|
|
1459
|
+
hop_count = max(0, num_chunks - 1)
|
|
1460
|
+
hop_distribution[hop_count] += 1
|
|
1461
|
+
|
|
1462
|
+
stats["context_stats"] = {
|
|
1463
|
+
"total_contexts": len(qa_data),
|
|
1464
|
+
"hop_distribution": dict(sorted(hop_distribution.items())),
|
|
1465
|
+
"avg_chunks_per_context": round(np.mean(context_sizes), 2) if context_sizes else 0,
|
|
1466
|
+
"max_chunks_in_context": max(context_sizes) if context_sizes else 0
|
|
1467
|
+
}
|
|
1468
|
+
|
|
1469
|
+
# Add summary counts
|
|
1470
|
+
for hop in range(max(hop_distribution.keys()) + 1 if hop_distribution else 0):
|
|
1471
|
+
stats["context_stats"][f"num_{hop}_hop_contexts"] = hop_distribution.get(hop, 0)
|
|
1472
|
+
|
|
1473
|
+
# ==========================================
|
|
1474
|
+
# 3. QA CATEGORY STATS
|
|
1475
|
+
# ==========================================
|
|
1476
|
+
# Categories based on context content:
|
|
1477
|
+
# - Multimodal: at least one image in context
|
|
1478
|
+
# - Table: at least one table in context
|
|
1479
|
+
# - Table+Image: both table and image in context
|
|
1480
|
+
|
|
1481
|
+
def context_has_table(qa: Dict) -> bool:
|
|
1482
|
+
"""Check if context contains a table."""
|
|
1483
|
+
context_chunks = qa.get('context_chunks', [])
|
|
1484
|
+
final_context = qa.get('final_context', '')
|
|
1485
|
+
|
|
1486
|
+
# Check in context_chunks
|
|
1487
|
+
for chunk in context_chunks:
|
|
1488
|
+
content = chunk.get('content', '')
|
|
1489
|
+
if '|' in content and '-|' in content: # Markdown table pattern
|
|
1490
|
+
return True
|
|
1491
|
+
|
|
1492
|
+
# Check in final_context
|
|
1493
|
+
if '|' in final_context and '-|' in final_context:
|
|
1494
|
+
return True
|
|
1495
|
+
|
|
1496
|
+
return False
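# The '|' / '-|' heuristic above targets the markdown header-separator row, e.g.:
#   | Metric | Score |
#   |--------|-------|
# Ordinary prose that merely uses '|' as a delimiter rarely contains the '-|' sequence.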
|
|
1497
|
+
|
|
1498
|
+
def context_has_image(qa: Dict) -> bool:
|
|
1499
|
+
"""Check if context contains an image."""
|
|
1500
|
+
context_chunks = qa.get('context_chunks', [])
|
|
1501
|
+
|
|
1502
|
+
for chunk in context_chunks:
|
|
1503
|
+
image_path = chunk.get('image_path')
|
|
1504
|
+
if image_path and image_path != 'null':  # truthiness already excludes None and empty strings
|
|
1505
|
+
return True
|
|
1506
|
+
content = chunk.get('content', '')
|
|
1507
|
+
if '![' in content: # Markdown image pattern
|
|
1508
|
+
return True
|
|
1509
|
+
|
|
1510
|
+
return False
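# context_has_image() keys off either an explicit image_path or the markdown image
# syntax, e.g. '![Figure 2](images/fig2.png)' (a hypothetical reference) embedded in chunk content.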
|
|
1511
|
+
|
|
1512
|
+
# Initialize counters
|
|
1513
|
+
total_qa = len(qa_data)
|
|
1514
|
+
multimodal_qa = []
|
|
1515
|
+
table_qa = []
|
|
1516
|
+
table_image_qa = []
|
|
1517
|
+
text_only_qa = []
|
|
1518
|
+
|
|
1519
|
+
# Per-hop counters for each category
|
|
1520
|
+
hop_multimodal = Counter()
|
|
1521
|
+
hop_table = Counter()
|
|
1522
|
+
hop_table_image = Counter()
|
|
1523
|
+
hop_text_only = Counter()
|
|
1524
|
+
hop_all = Counter()
|
|
1525
|
+
|
|
1526
|
+
for qa in qa_data:
|
|
1527
|
+
chunks_added = qa.get('chunks_added', [])
|
|
1528
|
+
num_chunks = len(chunks_added) if isinstance(chunks_added, list) else 1
|
|
1529
|
+
hop_count = max(0, num_chunks - 1)
|
|
1530
|
+
|
|
1531
|
+
has_table = context_has_table(qa)
|
|
1532
|
+
has_image = context_has_image(qa)
|
|
1533
|
+
|
|
1534
|
+
hop_all[hop_count] += 1
|
|
1535
|
+
|
|
1536
|
+
if has_table and has_image:
|
|
1537
|
+
table_image_qa.append(qa)
|
|
1538
|
+
hop_table_image[hop_count] += 1
|
|
1539
|
+
elif has_image:
|
|
1540
|
+
multimodal_qa.append(qa)
|
|
1541
|
+
hop_multimodal[hop_count] += 1
|
|
1542
|
+
elif has_table:
|
|
1543
|
+
table_qa.append(qa)
|
|
1544
|
+
hop_table[hop_count] += 1
|
|
1545
|
+
else:
|
|
1546
|
+
text_only_qa.append(qa)
|
|
1547
|
+
hop_text_only[hop_count] += 1
|
|
1548
|
+
|
|
1549
|
+
# Also count inclusive categories (for reporting)
|
|
1550
|
+
# Multimodal (any image): includes table_image
|
|
1551
|
+
multimodal_inclusive = [qa for qa in qa_data if context_has_image(qa)]
|
|
1552
|
+
# Table (any table): includes table_image
|
|
1553
|
+
table_inclusive = [qa for qa in qa_data if context_has_table(qa)]
|
|
1554
|
+
|
|
1555
|
+
stats["qa_category_stats"] = {
|
|
1556
|
+
"total_qa_pairs": total_qa,
|
|
1557
|
+
|
|
1558
|
+
# Exclusive categories (mutually exclusive)
|
|
1559
|
+
"text_only_qa": len(text_only_qa),
|
|
1560
|
+
"table_only_qa": len(table_qa),
|
|
1561
|
+
"image_only_qa": len(multimodal_qa),
|
|
1562
|
+
"table_and_image_qa": len(table_image_qa),
|
|
1563
|
+
|
|
1564
|
+
# Inclusive categories (overlapping)
|
|
1565
|
+
"multimodal_qa_inclusive": len(multimodal_inclusive), # Any QA with image
|
|
1566
|
+
"table_qa_inclusive": len(table_inclusive), # Any QA with table
|
|
1567
|
+
|
|
1568
|
+
# Hop distribution for all QA
|
|
1569
|
+
"qa_hop_distribution": dict(sorted(hop_all.items())),
|
|
1570
|
+
|
|
1571
|
+
# Hop distribution by category
|
|
1572
|
+
"text_only_by_hop": dict(sorted(hop_text_only.items())),
|
|
1573
|
+
"table_only_by_hop": dict(sorted(hop_table.items())),
|
|
1574
|
+
"image_only_by_hop": dict(sorted(hop_multimodal.items())),
|
|
1575
|
+
"table_and_image_by_hop": dict(sorted(hop_table_image.items()))
|
|
1576
|
+
}
|
|
1577
|
+
|
|
1578
|
+
# Add summary counts per hop
|
|
1579
|
+
max_hop = max(hop_all.keys()) if hop_all else 0
|
|
1580
|
+
for hop in range(max_hop + 1):
|
|
1581
|
+
stats["qa_category_stats"][f"num_{hop}_hop_qa"] = hop_all.get(hop, 0)
|
|
1582
|
+
|
|
1583
|
+
# Compute multimodal inclusive by hop
|
|
1584
|
+
hop_multimodal_inclusive = Counter()
|
|
1585
|
+
hop_table_inclusive = Counter()
|
|
1586
|
+
for qa in qa_data:
|
|
1587
|
+
chunks_added = qa.get('chunks_added', [])
|
|
1588
|
+
num_chunks = len(chunks_added) if isinstance(chunks_added, list) else 1
|
|
1589
|
+
hop_count = max(0, num_chunks - 1)
|
|
1590
|
+
|
|
1591
|
+
if context_has_image(qa):
|
|
1592
|
+
hop_multimodal_inclusive[hop_count] += 1
|
|
1593
|
+
if context_has_table(qa):
|
|
1594
|
+
hop_table_inclusive[hop_count] += 1
|
|
1595
|
+
|
|
1596
|
+
stats["qa_category_stats"]["multimodal_inclusive_by_hop"] = dict(sorted(hop_multimodal_inclusive.items()))
|
|
1597
|
+
stats["qa_category_stats"]["table_inclusive_by_hop"] = dict(sorted(hop_table_inclusive.items()))
|
|
1598
|
+
|
|
1599
|
+
return stats
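# Usage sketch (the paths are the illustrative defaults used elsewhere in this
# module, not requirements):
#   >>> with open("output/results/chunks.json") as f:
#   ...     chunks = json.load(f)
#   >>> stats = compute_corpus_and_dataset_stats(qa_data, corpus_chunks=chunks,
#   ...                                          pdf_dir="data/documents")
#   >>> stats["corpus_stats"]["total_chunks"] == len(chunks)
#   True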
|
|
1600
|
+
|
|
1601
|
+
|
|
1602
|
+
def run_subset_evaluation(
|
|
1603
|
+
qa_data: List[Dict],
|
|
1604
|
+
corpus_path: str = None,
|
|
1605
|
+
output_dir: str = None,
|
|
1606
|
+
sample_size: int = None,
|
|
1607
|
+
run_context_necessity: bool = True,
|
|
1608
|
+
pdf_dir: str = None,
|
|
1609
|
+
markdown_dir: str = None
|
|
1610
|
+
) -> Dict[str, Any]:
|
|
1611
|
+
"""
|
|
1612
|
+
Run comprehensive evaluation on QA dataset including new metrics.
|
|
1613
|
+
|
|
1614
|
+
Evaluates:
|
|
1615
|
+
- Corpus and Dataset Statistics (chunks, modalities, hop distributions)
|
|
1616
|
+
- Context Necessity (anti-parametric bias)
|
|
1617
|
+
- Domain Coverage (corpus coverage)
|
|
1618
|
+
- Subset statistics (multihop, multimodal counts)
|
|
1619
|
+
- Targeted metrics on subsets
|
|
1620
|
+
|
|
1621
|
+
Args:
|
|
1622
|
+
qa_data: List of QA pairs (raw format)
|
|
1623
|
+
corpus_path: Path to chunks.json for domain coverage
|
|
1624
|
+
output_dir: Output directory for reports
|
|
1625
|
+
sample_size: If set, sample this many items for expensive evaluations
|
|
1626
|
+
run_context_necessity: Whether to run context necessity (expensive)
|
|
1627
|
+
pdf_dir: Path to original PDF files directory (optional, for accurate page count)
|
|
1628
|
+
markdown_dir: Path to markdown/HTML files directory (optional, fallback for page count)
|
|
1629
|
+
|
|
1630
|
+
Returns:
|
|
1631
|
+
Dict with evaluation results
|
|
1632
|
+
"""
|
|
1633
|
+
import random
from pathlib import Path  # used for PDF globbing below; not imported at module level
|
|
1634
|
+
|
|
1635
|
+
results = {
|
|
1636
|
+
"corpus_stats": {},
|
|
1637
|
+
"context_stats": {},
|
|
1638
|
+
"qa_category_stats": {},
|
|
1639
|
+
"subset_statistics": {},
|
|
1640
|
+
"ragas_metrics": {}, # Faithfulness, Relevance, Precision, Recall
|
|
1641
|
+
"context_necessity": {},
|
|
1642
|
+
"domain_coverage": {},
|
|
1643
|
+
"multihop_metrics": {},
|
|
1644
|
+
"multimodal_metrics": {}
|
|
1645
|
+
}
|
|
1646
|
+
|
|
1647
|
+
# 0. Compute Corpus and Dataset Statistics
|
|
1648
|
+
print("\n" + "=" * 60)
|
|
1649
|
+
print("CORPUS AND DATASET STATISTICS")
|
|
1650
|
+
print("=" * 60)
|
|
1651
|
+
|
|
1652
|
+
corpus_chunks = None
|
|
1653
|
+
if corpus_path and os.path.exists(corpus_path):
|
|
1654
|
+
with open(corpus_path, 'r') as f:
|
|
1655
|
+
corpus_chunks = json.load(f)
|
|
1656
|
+
|
|
1657
|
+
# Auto-detect directories if not provided
|
|
1658
|
+
pdf_dir_auto = pdf_dir # Keep original if provided
|
|
1659
|
+
if corpus_path:
|
|
1660
|
+
base_dir = os.path.dirname(corpus_path)
|
|
1661
|
+
|
|
1662
|
+
# Try to find PDF directory (check parent directories)
|
|
1663
|
+
if not pdf_dir_auto:
|
|
1664
|
+
# Check config for input_pdf_dir
|
|
1665
|
+
try:
|
|
1666
|
+
from config_loader import get_paths_config
|
|
1667
|
+
paths_config = get_paths_config()
|
|
1668
|
+
potential_pdf_dir = paths_config.get('input_pdf_dir')
|
|
1669
|
+
if potential_pdf_dir and os.path.exists(potential_pdf_dir):
|
|
1670
|
+
pdf_dir_auto = potential_pdf_dir
|
|
1671
|
+
except Exception:  # config loader is optional; fall back to path guessing
|
|
1672
|
+
pass
|
|
1673
|
+
|
|
1674
|
+
# Fallback: check common locations relative to output_dir
|
|
1675
|
+
if not pdf_dir_auto:
|
|
1676
|
+
potential_pdf_dirs = [
|
|
1677
|
+
os.path.join(os.path.dirname(base_dir), "data"),
|
|
1678
|
+
os.path.join(base_dir, "..", "data"),
|
|
1679
|
+
"data/documents" # Default from config
|
|
1680
|
+
]
|
|
1681
|
+
for pd in potential_pdf_dirs:
|
|
1682
|
+
if os.path.exists(pd) and any(Path(pd).glob("*.pdf")):
|
|
1683
|
+
pdf_dir_auto = pd
|
|
1684
|
+
break
|
|
1685
|
+
|
|
1686
|
+
# Auto-detect markdown_dir if not provided
|
|
1687
|
+
if not markdown_dir:
|
|
1688
|
+
potential_md_dir = os.path.join(base_dir, "markdown")
|
|
1689
|
+
if os.path.exists(potential_md_dir):
|
|
1690
|
+
markdown_dir = potential_md_dir
|
|
1691
|
+
|
|
1692
|
+
comprehensive_stats = compute_corpus_and_dataset_stats(
|
|
1693
|
+
qa_data=qa_data,
|
|
1694
|
+
corpus_chunks=corpus_chunks,
|
|
1695
|
+
pdf_dir=pdf_dir_auto,  # use the auto-detected PDF directory when none was passed in
|
|
1696
|
+
markdown_dir=markdown_dir
|
|
1697
|
+
)
|
|
1698
|
+
|
|
1699
|
+
results["corpus_stats"] = comprehensive_stats["corpus_stats"]
|
|
1700
|
+
results["context_stats"] = comprehensive_stats["context_stats"]
|
|
1701
|
+
results["qa_category_stats"] = comprehensive_stats["qa_category_stats"]
|
|
1702
|
+
|
|
1703
|
+
# Print corpus stats
|
|
1704
|
+
cs = results["corpus_stats"]
|
|
1705
|
+
if cs:
|
|
1706
|
+
print(f"\n 📚 CORPUS STATS:")
|
|
1707
|
+
print(f" Total Chunks: {cs.get('total_chunks', 'N/A')}")
|
|
1708
|
+
print(f" Text Chunks: {cs.get('text_chunks', 'N/A')}")
|
|
1709
|
+
print(f" Table Chunks: {cs.get('table_chunks', 'N/A')}")
|
|
1710
|
+
print(f" Image Chunks: {cs.get('image_chunks', 'N/A')}")
|
|
1711
|
+
print(f" Multimodal Chunks: {cs.get('multimodal_chunks', 'N/A')}")
|
|
1712
|
+
print(f" Total Tokens: {cs.get('total_tokens', 'N/A'):,}")
|
|
1713
|
+
print(f" Source Files: {cs.get('num_source_files', 'N/A')}")
|
|
1714
|
+
if cs.get('total_pages'):
|
|
1715
|
+
page_source = "PDFs" if cs.get('pages_counted_from_pdfs') else "Markdown/HTML"
|
|
1716
|
+
print(f" Total Pages ({page_source}): {cs.get('total_pages')}")
|
|
1717
|
+
|
|
1718
|
+
# Print context stats
|
|
1719
|
+
ctx = results["context_stats"]
|
|
1720
|
+
print(f"\n 📋 CONTEXT STATS:")
|
|
1721
|
+
print(f" Total Contexts: {ctx.get('total_contexts', 'N/A')}")
|
|
1722
|
+
print(f" Avg Chunks/Context: {ctx.get('avg_chunks_per_context', 'N/A')}")
|
|
1723
|
+
print(f" Hop Distribution: {ctx.get('hop_distribution', {})}")
|
|
1724
|
+
|
|
1725
|
+
# Print QA category stats
|
|
1726
|
+
qa_cat = results["qa_category_stats"]
|
|
1727
|
+
print(f"\n 📊 QA CATEGORY STATS:")
|
|
1728
|
+
print(f" Total QA Pairs: {qa_cat.get('total_qa_pairs', 'N/A')}")
|
|
1729
|
+
print(f" Text-only QA: {qa_cat.get('text_only_qa', 'N/A')}")
|
|
1730
|
+
print(f" Table-only QA: {qa_cat.get('table_only_qa', 'N/A')}")
|
|
1731
|
+
print(f" Image-only QA: {qa_cat.get('image_only_qa', 'N/A')}")
|
|
1732
|
+
print(f" Table+Image QA: {qa_cat.get('table_and_image_qa', 'N/A')}")
|
|
1733
|
+
print(f" Multimodal QA (inclusive): {qa_cat.get('multimodal_qa_inclusive', 'N/A')}")
|
|
1734
|
+
print(f" Table QA (inclusive): {qa_cat.get('table_qa_inclusive', 'N/A')}")
|
|
1735
|
+
print(f" QA Hop Distribution: {qa_cat.get('qa_hop_distribution', {})}")
|
|
1736
|
+
|
|
1737
|
+
# 1. Identify subsets (legacy, kept for backwards compatibility)
|
|
1738
|
+
print("\n" + "=" * 60)
|
|
1739
|
+
print("SUBSET ANALYSIS")
|
|
1740
|
+
print("=" * 60)
|
|
1741
|
+
|
|
1742
|
+
subsets = identify_qa_subsets(qa_data)
|
|
1743
|
+
|
|
1744
|
+
results["subset_statistics"] = {
|
|
1745
|
+
"total_qa_pairs": len(subsets['all']),
|
|
1746
|
+
"multihop_count": len(subsets['multihop']),
|
|
1747
|
+
"multimodal_count": len(subsets['multimodal']),
|
|
1748
|
+
"multihop_and_multimodal_count": len(subsets['multihop_and_multimodal']),
|
|
1749
|
+
"single_hop_text_only": len(subsets['all']) - len(subsets['multihop']) - len(subsets['multimodal']) + len(subsets['multihop_and_multimodal'])
|
|
1750
|
+
}
|
|
1751
|
+
|
|
1752
|
+
print(f"\n Total QA pairs: {results['subset_statistics']['total_qa_pairs']}")
|
|
1753
|
+
print(f" Multihop QA pairs: {results['subset_statistics']['multihop_count']}")
|
|
1754
|
+
print(f" Multimodal QA pairs: {results['subset_statistics']['multimodal_count']}")
|
|
1755
|
+
print(f" Multihop AND Multimodal: {results['subset_statistics']['multihop_and_multimodal_count']}")
|
|
1756
|
+
|
|
1757
|
+
# 1.5. RAGAS Standard Metrics (Faithfulness, Relevance, Precision, Recall)
|
|
1758
|
+
print("\n" + "=" * 60)
|
|
1759
|
+
print("RAGAS STANDARD METRICS")
|
|
1760
|
+
print("=" * 60)
|
|
1761
|
+
|
|
1762
|
+
if RAGAS_AVAILABLE:
|
|
1763
|
+
try:
|
|
1764
|
+
evaluator = MultimodalFrameworkEvaluator()
|
|
1765
|
+
|
|
1766
|
+
# Transform data for RAGAS
|
|
1767
|
+
transformed_data = transform_qa_data(qa_data)
|
|
1768
|
+
|
|
1769
|
+
# Sample if needed for expensive RAGAS evaluation
|
|
1770
|
+
# RAGAS issues its LLM calls in parallel (up to 64 workers); cap the sample size to keep API costs manageable.
|
|
1771
|
+
RAGAS_MAX_SAMPLES = 50 # Reasonable sample for statistical significance
|
|
1772
|
+
eval_data = transformed_data
|
|
1773
|
+
ragas_sample_size = min(sample_size or RAGAS_MAX_SAMPLES, RAGAS_MAX_SAMPLES)
|
|
1774
|
+
if len(transformed_data) > ragas_sample_size:
|
|
1775
|
+
eval_data = random.sample(transformed_data, ragas_sample_size)
|
|
1776
|
+
print(f"\n 📊 Sampling {ragas_sample_size}/{len(transformed_data)} items for RAGAS...")
|
|
1777
|
+
|
|
1778
|
+
print(f"\n ⚡ Running RAGAS evaluation on {len(eval_data)} items (parallel, ~2-5 min)...")
|
|
1779
|
+
ragas_df = evaluator.evaluate_ragas_standard(eval_data)
|
|
1780
|
+
|
|
1781
|
+
# Extract metrics
|
|
1782
|
+
ragas_results = {
|
|
1783
|
+
"faithfulness": float(ragas_df['faithfulness'].mean()) if 'faithfulness' in ragas_df.columns else None,
|
|
1784
|
+
"answer_relevance": float(ragas_df['answer_relevancy'].mean()) if 'answer_relevancy' in ragas_df.columns else None,
|
|
1785
|
+
"context_precision": float(ragas_df['context_precision'].mean()) if 'context_precision' in ragas_df.columns else None,
|
|
1786
|
+
"context_recall": float(ragas_df['context_recall'].mean()) if 'context_recall' in ragas_df.columns else None,
|
|
1787
|
+
"items_evaluated": len(eval_data)
|
|
1788
|
+
}
|
|
1789
|
+
|
|
1790
|
+
# Add optional metrics if available
|
|
1791
|
+
if HAS_ENTITY_RECALL and 'context_entity_recall' in ragas_df.columns:
|
|
1792
|
+
ragas_results["context_entity_recall"] = float(ragas_df['context_entity_recall'].mean())
|
|
1793
|
+
if HAS_MULTIMODAL:
|
|
1794
|
+
if 'multimodal_faithfulness' in ragas_df.columns:
|
|
1795
|
+
ragas_results["multimodal_faithfulness"] = float(ragas_df['multimodal_faithfulness'].mean())
|
|
1796
|
+
if 'multimodal_relevance' in ragas_df.columns:
|
|
1797
|
+
ragas_results["multimodal_relevance"] = float(ragas_df['multimodal_relevance'].mean())
|
|
1798
|
+
|
|
1799
|
+
results["ragas_metrics"] = ragas_results
|
|
1800
|
+
|
|
1801
|
+
print(f"\n 📊 RAGAS Results:")
|
|
1802
|
+
print(f" Faithfulness: {ragas_results.get('faithfulness', 'N/A'):.3f}" if ragas_results.get('faithfulness') else " Faithfulness: N/A")
|
|
1803
|
+
print(f" Answer Relevance: {ragas_results.get('answer_relevance', 'N/A'):.3f}" if ragas_results.get('answer_relevance') else " Answer Relevance: N/A")
|
|
1804
|
+
print(f" Context Precision: {ragas_results.get('context_precision', 'N/A'):.3f}" if ragas_results.get('context_precision') else " Context Precision: N/A")
|
|
1805
|
+
print(f" Context Recall: {ragas_results.get('context_recall', 'N/A'):.3f}" if ragas_results.get('context_recall') else " Context Recall: N/A")
|
|
1806
|
+
|
|
1807
|
+
except Exception as e:
|
|
1808
|
+
print(f"\n ⚠️ RAGAS evaluation failed: {e}")
|
|
1809
|
+
results["ragas_metrics"] = {"error": str(e)}
|
|
1810
|
+
else:
|
|
1811
|
+
print("\n ⚠️ RAGAS not available. Install with: pip install ragas datasets")
|
|
1812
|
+
results["ragas_metrics"] = {"error": "RAGAS not installed"}
|
|
1813
|
+
|
|
1814
|
+
# 2. Domain Coverage (if corpus provided)
|
|
1815
|
+
if corpus_path and os.path.exists(corpus_path):
|
|
1816
|
+
print("\n" + "=" * 60)
|
|
1817
|
+
print("DOMAIN COVERAGE EVALUATION")
|
|
1818
|
+
print("=" * 60)
|
|
1819
|
+
|
|
1820
|
+
with open(corpus_path, 'r') as f:
|
|
1821
|
+
corpus_chunks = json.load(f)
|
|
1822
|
+
|
|
1823
|
+
evaluator = MultimodalFrameworkEvaluator()
|
|
1824
|
+
coverage = evaluator.evaluate_domain_coverage(qa_data, corpus_chunks)
|
|
1825
|
+
results["domain_coverage"] = coverage
|
|
1826
|
+
|
|
1827
|
+
print(f"\n Chunk Coverage: {coverage['chunk_coverage']*100:.1f}% ({coverage['chunks_covered']}/{coverage['chunks_total']})")
|
|
1828
|
+
print(f" File Coverage: {coverage['file_coverage']*100:.1f}%")
|
|
1829
|
+
print(f" Topic Divergence (JS): {coverage['topic_divergence_js']:.4f}")
|
|
1830
|
+
print(f" Uncovered Chunks: {coverage['uncovered_chunks']}")
|
|
1831
|
+
|
|
1832
|
+
print("\n Coverage by Chunk Type:")
|
|
1833
|
+
for ctype, stats in coverage['chunk_type_coverage'].items():
|
|
1834
|
+
print(f" {ctype}: {stats['coverage_rate']*100:.1f}% ({stats['covered']}/{stats['total']})")
|
|
1835
|
+
|
|
1836
|
+
print("\n Coverage by File:")
|
|
1837
|
+
for fname, stats in coverage['coverage_by_file'].items():
|
|
1838
|
+
print(f" {fname}: {stats['coverage_rate']*100:.1f}% ({stats['covered_chunks']}/{stats['total_chunks']})")
|
|
1839
|
+
|
|
1840
|
+
# 3. Context Necessity (sample if too large) - BATCH PROCESSING
|
|
1841
|
+
if run_context_necessity:
|
|
1842
|
+
print("\n" + "=" * 60)
|
|
1843
|
+
print("CONTEXT NECESSITY EVALUATION (Anti-Parametric Bias)")
|
|
1844
|
+
print("=" * 60)
|
|
1845
|
+
|
|
1846
|
+
evaluator = MultimodalFrameworkEvaluator()
|
|
1847
|
+
|
|
1848
|
+
# Sample if needed
|
|
1849
|
+
eval_data = qa_data
|
|
1850
|
+
if sample_size and len(qa_data) > sample_size:
|
|
1851
|
+
eval_data = random.sample(qa_data, sample_size)
|
|
1852
|
+
print(f"\n Sampling {sample_size} items for evaluation...")
|
|
1853
|
+
|
|
1854
|
+
# Prepare batch items
|
|
1855
|
+
batch_items = []
|
|
1856
|
+
for qa in eval_data:
|
|
1857
|
+
context = qa.get('final_context', qa.get('original_chunk', ''))
|
|
1858
|
+
batch_items.append({
|
|
1859
|
+
'question': qa['question'],
|
|
1860
|
+
'answer': qa['answer'],
|
|
1861
|
+
'context': context
|
|
1862
|
+
})
|
|
1863
|
+
|
|
1864
|
+
# Use batch evaluation if available
|
|
1865
|
+
if BATCH_AVAILABLE and len(batch_items) > 1:
|
|
1866
|
+
print(f"\n ⚡ Using batch processing for {len(batch_items)} items...")
|
|
1867
|
+
batch_results = evaluator.batch_evaluate_context_necessity(batch_items)
|
|
1868
|
+
|
|
1869
|
+
necessity_scores = [r['context_necessity_score'] for r in batch_results]
|
|
1870
|
+
without_context_correct = sum(1 for r in batch_results if r.get('without_context_correct'))
|
|
1871
|
+
else:
|
|
1872
|
+
# Fallback to sequential processing
|
|
1873
|
+
necessity_scores = []
|
|
1874
|
+
without_context_correct = 0
|
|
1875
|
+
|
|
1876
|
+
for i, item in enumerate(batch_items):
|
|
1877
|
+
if i % 10 == 0:
|
|
1878
|
+
print(f" Processing {i+1}/{len(batch_items)}...")
|
|
1879
|
+
|
|
1880
|
+
result = evaluator.evaluate_context_necessity(
|
|
1881
|
+
item['question'], item['answer'], item['context']
|
|
1882
|
+
)
|
|
1883
|
+
necessity_scores.append(result['context_necessity_score'])
|
|
1884
|
+
if result.get('without_context_correct'):
|
|
1885
|
+
without_context_correct += 1
|
|
1886
|
+
|
|
1887
|
+
avg_necessity = np.mean(necessity_scores) if necessity_scores else 0.0
|
|
1888
|
+
results["context_necessity"] = {
|
|
1889
|
+
"avg_context_necessity_score": float(avg_necessity),
|
|
1890
|
+
"items_evaluated": len(eval_data),
|
|
1891
|
+
"items_answerable_without_context": without_context_correct,
|
|
1892
|
+
"parametric_leakage_rate": without_context_correct / len(eval_data) if eval_data else 0.0,
|
|
1893
|
+
"score_distribution": {
|
|
1894
|
+
"high_necessity (0.8-1.0)": sum(1 for s in necessity_scores if s >= 0.8),
|
|
1895
|
+
"moderate_necessity (0.5-0.8)": sum(1 for s in necessity_scores if 0.5 <= s < 0.8),
|
|
1896
|
+
"low_necessity (0.0-0.5)": sum(1 for s in necessity_scores if s < 0.5)
|
|
1897
|
+
}
|
|
1898
|
+
}
|
|
1899
|
+
|
|
1900
|
+
print(f"\n Average Context Necessity Score: {avg_necessity:.3f}")
|
|
1901
|
+
print(f" Items answerable without context: {without_context_correct}/{len(eval_data)} ({without_context_correct/len(eval_data)*100:.1f}%)")
|
|
1902
|
+
print(f" Score Distribution:")
|
|
1903
|
+
for k, v in results["context_necessity"]["score_distribution"].items():
|
|
1904
|
+
print(f" {k}: {v} ({v/len(eval_data)*100:.1f}%)")
|
|
1905
|
+
|
|
1906
|
+
# 4. Multihop-specific metrics - BATCH PROCESSING
|
|
1907
|
+
if subsets['multihop']:
|
|
1908
|
+
print("\n" + "=" * 60)
|
|
1909
|
+
print("MULTIHOP METRICS (on multihop subset)")
|
|
1910
|
+
print("=" * 60)
|
|
1911
|
+
|
|
1912
|
+
evaluator = MultimodalFrameworkEvaluator()
|
|
1913
|
+
|
|
1914
|
+
# Prepare batch items
|
|
1915
|
+
batch_items = []
|
|
1916
|
+
for qa in subsets['multihop']:
|
|
1917
|
+
contexts = [qa.get('final_context', qa.get('original_chunk', ''))]
|
|
1918
|
+
batch_items.append({
|
|
1919
|
+
'question': qa['question'],
|
|
1920
|
+
'answer': qa['answer'],
|
|
1921
|
+
'contexts': contexts
|
|
1922
|
+
})
|
|
1923
|
+
|
|
1924
|
+
# Use batch evaluation if available
|
|
1925
|
+
if BATCH_AVAILABLE and len(batch_items) > 1:
|
|
1926
|
+
print(f"\n ⚡ Using batch processing for {len(batch_items)} multihop items...")
|
|
1927
|
+
batch_results = evaluator.batch_evaluate_multihop_reasoning(batch_items)
|
|
1928
|
+
|
|
1929
|
+
hop_counts = [int(r.get('hop_count', 1)) for r in batch_results]
|
|
1930
|
+
reasoning_scores = [float(r.get('reasoning_score', 0)) for r in batch_results]
|
|
1931
|
+
bridge_entities = [r['bridge_entity'] for r in batch_results
|
|
1932
|
+
if r.get('bridge_entity') and r['bridge_entity'] != 'None']
|
|
1933
|
+
else:
|
|
1934
|
+
# Fallback to sequential
|
|
1935
|
+
hop_counts = []
|
|
1936
|
+
reasoning_scores = []
|
|
1937
|
+
bridge_entities = []
|
|
1938
|
+
|
|
1939
|
+
for i, item in enumerate(batch_items):
|
|
1940
|
+
if i % 5 == 0:
|
|
1941
|
+
print(f" Processing {i+1}/{len(batch_items)}...")
|
|
1942
|
+
|
|
1943
|
+
result = evaluator.evaluate_multihop_reasoning(
|
|
1944
|
+
item['question'], item['answer'], item['contexts']
|
|
1945
|
+
)
|
|
1946
|
+
hop_counts.append(int(result.get('hop_count', 1)))
|
|
1947
|
+
reasoning_scores.append(float(result.get('reasoning_score', 0)))
|
|
1948
|
+
if result.get('bridge_entity') and result['bridge_entity'] != 'None':
|
|
1949
|
+
bridge_entities.append(result['bridge_entity'])
|
|
1950
|
+
|
|
1951
|
+
results["multihop_metrics"] = {
|
|
1952
|
+
"items_evaluated": len(subsets['multihop']),
|
|
1953
|
+
"avg_hop_count": float(np.mean(hop_counts)) if hop_counts else 0.0,
|
|
1954
|
+
"avg_reasoning_score": float(np.mean(reasoning_scores)) if reasoning_scores else 0.0,
|
|
1955
|
+
"hop_distribution": dict(Counter(hop_counts)),
|
|
1956
|
+
"items_with_bridge_entity": len(bridge_entities),
|
|
1957
|
+
"sample_bridge_entities": bridge_entities[:10]
|
|
1958
|
+
}
|
|
1959
|
+
|
|
1960
|
+
print(f"\n Items evaluated: {len(subsets['multihop'])}")
|
|
1961
|
+
print(f" Average Hop Count: {results['multihop_metrics']['avg_hop_count']:.2f}")
|
|
1962
|
+
print(f" Average Reasoning Score: {results['multihop_metrics']['avg_reasoning_score']:.3f}")
|
|
1963
|
+
print(f" Hop Distribution: {results['multihop_metrics']['hop_distribution']}")
|
|
1964
|
+
|
|
1965
|
+
# 5. Multimodal-specific metrics
|
|
1966
|
+
if subsets['multimodal']:
|
|
1967
|
+
print("\n" + "=" * 60)
|
|
1968
|
+
print("MULTIMODAL METRICS (on multimodal subset)")
|
|
1969
|
+
print("=" * 60)
|
|
1970
|
+
|
|
1971
|
+
evaluator = MultimodalFrameworkEvaluator()
|
|
1972
|
+
visual_dependency_scores = []
|
|
1973
|
+
|
|
1974
|
+
for i, qa in enumerate(subsets['multimodal']):
|
|
1975
|
+
if i % 10 == 0:
|
|
1976
|
+
print(f" Processing {i+1}/{len(subsets['multimodal'])}...")
|
|
1977
|
+
|
|
1978
|
+
# Visual dependency test (text-only blind test)
|
|
1979
|
+
contexts = [qa.get('final_context', qa.get('original_chunk', ''))]
|
|
1980
|
+
score = evaluator.evaluate_visual_dependency(qa['question'], contexts)
|
|
1981
|
+
visual_dependency_scores.append(score)
|
|
1982
|
+
|
|
1983
|
+
results["multimodal_metrics"] = {
|
|
1984
|
+
"items_evaluated": len(subsets['multimodal']),
|
|
1985
|
+
"avg_visual_dependency": float(np.mean(visual_dependency_scores)) if visual_dependency_scores else 0.0,
|
|
1986
|
+
"items_requiring_visual": sum(1 for s in visual_dependency_scores if s > 0.5),
|
|
1987
|
+
"visual_necessity_rate": sum(1 for s in visual_dependency_scores if s > 0.5) / len(visual_dependency_scores) if visual_dependency_scores else 0.0
|
|
1988
|
+
}
|
|
1989
|
+
|
|
1990
|
+
print(f"\n Items evaluated: {len(subsets['multimodal'])}")
|
|
1991
|
+
print(f" Average Visual Dependency: {results['multimodal_metrics']['avg_visual_dependency']:.3f}")
|
|
1992
|
+
print(f" Items requiring visual info: {results['multimodal_metrics']['items_requiring_visual']}/{len(subsets['multimodal'])}")
|
|
1993
|
+
|
|
1994
|
+
# =========================================================================
|
|
1995
|
+
# FINAL SUMMARY - Key Metrics from MiRAGE Paper (Table 2)
|
|
1996
|
+
# =========================================================================
|
|
1997
|
+
print("\n" + "=" * 70)
|
|
1998
|
+
print("📊 MiRAGE EVALUATION SUMMARY (Paper Table 2 Metrics)")
|
|
1999
|
+
print("=" * 70)
|
|
2000
|
+
|
|
2001
|
+
# Extract metrics
|
|
2002
|
+
faith = results.get("ragas_metrics", {}).get("faithfulness")
|
|
2003
|
+
rel = results.get("ragas_metrics", {}).get("answer_relevance")
|
|
2004
|
+
ctx_prec = results.get("ragas_metrics", {}).get("context_precision")
|
|
2005
|
+
ctx_rec = results.get("ragas_metrics", {}).get("context_recall")
|
|
2006
|
+
|
|
2007
|
+
# Hop count from context_stats (avg_chunks_per_context - 1)
|
|
2008
|
+
avg_hops = results.get("context_stats", {}).get("avg_chunks_per_context", 1) - 1
|
|
2009
|
+
if avg_hops < 0:
|
|
2010
|
+
avg_hops = 0
|
|
2011
|
+
|
|
2012
|
+
# Reasoning score from multihop_metrics
|
|
2013
|
+
s_reason = results.get("multihop_metrics", {}).get("avg_reasoning_score")
|
|
2014
|
+
|
|
2015
|
+
# Visual grounding from multimodal_metrics
|
|
2016
|
+
vis_gr = results.get("multimodal_metrics", {}).get("avg_visual_dependency")
|
|
2017
|
+
|
|
2018
|
+
# JSD from domain_coverage
|
|
2019
|
+
jsd = results.get("domain_coverage", {}).get("topic_divergence_js")
|
|
2020
|
+
|
|
2021
|
+
# Context necessity (anti-parametric bias)
|
|
2022
|
+
ctx_nec = results.get("context_necessity", {}).get("avg_context_necessity_score")
|
|
2023
|
+
|
|
2024
|
+
# Helper to format metric values
|
|
2025
|
+
def fmt(val, decimals=3):
|
|
2026
|
+
if val is None:
|
|
2027
|
+
return "N/A".rjust(8)
|
|
2028
|
+
return f"{val:.{decimals}f}".rjust(8)
|
|
2029
|
+
|
|
2030
|
+
print("\n ┌─────────────────────────────────────────────────────────────┐")
|
|
2031
|
+
print(" │ CORE METRICS │")
|
|
2032
|
+
print(" ├─────────────────────────────────────────────────────────────┤")
|
|
2033
|
+
print(f" │ Faithfulness (Faith.) │ {fmt(faith)} │")
|
|
2034
|
+
print(f" │ Answer Relevance (Rel.) │ {fmt(rel)} │")
|
|
2035
|
+
print(f" │ Context Precision │ {fmt(ctx_prec)} │")
|
|
2036
|
+
print(f" │ Context Recall │ {fmt(ctx_rec)} │")
|
|
2037
|
+
print(" ├─────────────────────────────────────────────────────────────┤")
|
|
2038
|
+
print(" │ REASONING COMPLEXITY │")
|
|
2039
|
+
print(" ├─────────────────────────────────────────────────────────────┤")
|
|
2040
|
+
print(f" │ Avg Hops (H) │ {fmt(avg_hops, 2)} │")
|
|
2041
|
+
print(f" │ Reasoning Score (S_reason) │ {fmt(s_reason)} │")
|
|
2042
|
+
print(" ├─────────────────────────────────────────────────────────────┤")
|
|
2043
|
+
print(" │ MULTIMODAL & DOMAIN │")
|
|
2044
|
+
print(" ├─────────────────────────────────────────────────────────────┤")
|
|
2045
|
+
print(f" │ Visual Grounding (Vis. Gr.) │ {fmt(vis_gr)} │")
|
|
2046
|
+
print(f" │ Jensen-Shannon Div. (JSD) ↓ │ {fmt(jsd, 4)} │")
|
|
2047
|
+
print(f" │ Context Necessity │ {fmt(ctx_nec)} │")
|
|
2048
|
+
print(" └─────────────────────────────────────────────────────────────┘")
|
|
2049
|
+
|
|
2050
|
+
# Dataset summary
|
|
2051
|
+
total_qa = results.get("qa_category_stats", {}).get("total_qa_pairs", 0)
|
|
2052
|
+
mm_qa = results.get("qa_category_stats", {}).get("multimodal_qa_inclusive", 0)
|
|
2053
|
+
table_qa = results.get("qa_category_stats", {}).get("table_qa_inclusive", 0)
|
|
2054
|
+
|
|
2055
|
+
print(f"\n 📈 Dataset: {total_qa} QA pairs | {mm_qa} multimodal | {table_qa} with tables")
|
|
2056
|
+
print("=" * 70)
|
|
2057
|
+
|
|
2058
|
+
# Save results
|
|
2059
|
+
if output_dir:
|
|
2060
|
+
report_path = os.path.join(output_dir, "subset_evaluation_report.json")
|
|
2061
|
+
with open(report_path, 'w') as f:
|
|
2062
|
+
json.dump(results, f, indent=2, default=str)
|
|
2063
|
+
print(f"\n Report saved to: {report_path}")
|
|
2064
|
+
|
|
2065
|
+
return results
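# Minimal programmatic call (a sketch; the __main__ block below shows the CLI equivalent):
#   results = run_subset_evaluation(qa_data, corpus_path="output/results/chunks.json",
#                                   output_dir="output/results", sample_size=25,
#                                   run_context_necessity=False)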
|
|
2066
|
+
|
|
2067
|
+
|
|
2068
|
+
def main(json_path: str, output_dir: str = None):
|
|
2069
|
+
"""
|
|
2070
|
+
Main function to evaluate QA dataset quality.
|
|
2071
|
+
|
|
2072
|
+
Args:
|
|
2073
|
+
json_path: Path to the qa_multihop_pass.json file
|
|
2074
|
+
output_dir: Directory to save output reports (defaults to same dir as input)
|
|
2075
|
+
"""
|
|
2076
|
+
import os
|
|
2077
|
+
|
|
2078
|
+
if output_dir is None:
|
|
2079
|
+
output_dir = os.path.dirname(json_path)
|
|
2080
|
+
|
|
2081
|
+
print("=" * 60)
|
|
2082
|
+
print("QA DATASET QUALITY EVALUATION")
|
|
2083
|
+
print("=" * 60)
|
|
2084
|
+
|
|
2085
|
+
# 1. Load raw data
|
|
2086
|
+
print(f"\n[1/5] Loading dataset from: {json_path}")
|
|
2087
|
+
with open(json_path, 'r') as f:
|
|
2088
|
+
raw_data = json.load(f)
|
|
2089
|
+
print(f" Loaded {len(raw_data)} QA pairs")
|
|
2090
|
+
|
|
2091
|
+
# 2. Analyze missing information BEFORE transformation
|
|
2092
|
+
print("\n[2/5] Analyzing data completeness...")
|
|
2093
|
+
missing_info = analyze_missing_information(raw_data)
|
|
2094
|
+
|
|
2095
|
+
print("\n" + "-" * 60)
|
|
2096
|
+
print("METRICS EVALUATION STATUS:")
|
|
2097
|
+
print("-" * 60)
|
|
2098
|
+
for metric, status in missing_info["metrics_status"].items():
|
|
2099
|
+
symbol = "✓" if status["can_evaluate"] else "✗"
|
|
2100
|
+
quality = status["quality"]
|
|
2101
|
+
print(f" {symbol} {metric}: {quality}")
|
|
2102
|
+
if status.get("missing"):
|
|
2103
|
+
for m in status["missing"]:
|
|
2104
|
+
print(f" Missing: {m}")
|
|
2105
|
+
|
|
2106
|
+
print("\n" + "-" * 60)
|
|
2107
|
+
print("RECOMMENDATIONS:")
|
|
2108
|
+
print("-" * 60)
|
|
2109
|
+
for rec in missing_info["recommendations"]:
|
|
2110
|
+
print(f" • {rec}")
|
|
2111
|
+
|
|
2112
|
+
# 3. Transform data
|
|
2113
|
+
print("\n[3/5] Transforming data to evaluation format...")
|
|
2114
|
+
transformed_data = transform_qa_data(raw_data)
|
|
2115
|
+
|
|
2116
|
+
# 4. Run evaluation (only metrics that can be evaluated)
|
|
2117
|
+
print("\n[4/5] Running evaluation...")
|
|
2118
|
+
|
|
2119
|
+
try:
|
|
2120
|
+
evaluator = MultimodalFrameworkEvaluator()
|
|
2121
|
+
|
|
2122
|
+
# Save transformed data for the evaluator
|
|
2123
|
+
transformed_path = os.path.join(output_dir, "qa_transformed_for_eval.json")
|
|
2124
|
+
with open(transformed_path, 'w') as f:
|
|
2125
|
+
json.dump(transformed_data, f, indent=2)
|
|
2126
|
+
|
|
2127
|
+
output_path = os.path.join(output_dir, "eval_report.json")
|
|
2128
|
+
final_df, report = evaluator.run_full_evaluation(transformed_path, output_path)
|
|
2129
|
+
|
|
2130
|
+
# Add missing info analysis to report
|
|
2131
|
+
report["data_completeness"] = missing_info
|
|
2132
|
+
|
|
2133
|
+
# Save updated report
|
|
2134
|
+
with open(output_path, 'w') as f:
|
|
2135
|
+
json.dump(report, f, indent=4)
|
|
2136
|
+
|
|
2137
|
+
print(f"\n[5/5] Reports saved to:")
|
|
2138
|
+
print(f" - {output_path}")
|
|
2139
|
+
print(f" - {output_path.replace('.json', '_detailed.csv')}")
|
|
2140
|
+
|
|
2141
|
+
return final_df, report
|
|
2142
|
+
|
|
2143
|
+
except Exception as e:
|
|
2144
|
+
print(f"\n[ERROR] Evaluation failed: {e}")
|
|
2145
|
+
print("\nRunning basic statistics only...")
|
|
2146
|
+
|
|
2147
|
+
# Compute basic statistics without LLM calls
|
|
2148
|
+
basic_stats = {
|
|
2149
|
+
"total_samples": len(raw_data),
|
|
2150
|
+
"context_status_distribution": Counter(
|
|
2151
|
+
item.get("context_status", "UNKNOWN") for item in raw_data
|
|
2152
|
+
),
|
|
2153
|
+
"avg_relevance_score": np.mean([
|
|
2154
|
+
float(item.get("relevance_score", 0)) for item in raw_data
|
|
2155
|
+
if item.get("relevance_score")
|
|
2156
|
+
]),
|
|
2157
|
+
"avg_difficulty_score": np.mean([
|
|
2158
|
+
float(item.get("difficulty_score", 0)) for item in raw_data
|
|
2159
|
+
if item.get("difficulty_score")
|
|
2160
|
+
]),
|
|
2161
|
+
"domain_distribution": Counter(
|
|
2162
|
+
item.get("domain", "UNKNOWN") for item in raw_data
|
|
2163
|
+
),
|
|
2164
|
+
"data_completeness": missing_info
|
|
2165
|
+
}
|
|
2166
|
+
|
|
2167
|
+
output_path = os.path.join(output_dir, "eval_report_basic.json")
|
|
2168
|
+
with open(output_path, 'w') as f:
|
|
2169
|
+
# Convert Counter objects to dict for JSON serialization
|
|
2170
|
+
basic_stats["context_status_distribution"] = dict(basic_stats["context_status_distribution"])
|
|
2171
|
+
basic_stats["domain_distribution"] = dict(basic_stats["domain_distribution"])
|
|
2172
|
+
json.dump(basic_stats, f, indent=4)
|
|
2173
|
+
|
|
2174
|
+
print(f"\n[5/5] Basic report saved to: {output_path}")
|
|
2175
|
+
print("\nBasic Statistics:")
|
|
2176
|
+
print(json.dumps({k: v for k, v in basic_stats.items() if k != "data_completeness"}, indent=2))
|
|
2177
|
+
|
|
2178
|
+
return None, basic_stats
|
|
2179
|
+
|
|
2180
|
+
|
|
2181
|
+
if __name__ == "__main__":
|
|
2182
|
+
import argparse
|
|
2183
|
+
|
|
2184
|
+
# Load Gemini API key
|
|
2185
|
+
API_KEY_PATH = os.environ.get("GEMINI_API_KEY_PATH", os.path.expanduser("~/.config/gemini/api_key.txt"))
|
|
2186
|
+
with open(API_KEY_PATH, 'r') as f:
|
|
2187
|
+
os.environ["GOOGLE_API_KEY"] = f.read().strip()
|
|
2188
|
+
|
|
2189
|
+
# Default paths (override via command line arguments)
|
|
2190
|
+
DEFAULT_QA_PATH = "output/results/qa_deduplicated.json"
|
|
2191
|
+
DEFAULT_CORPUS_PATH = "output/results/chunks.json"
|
|
2192
|
+
|
|
2193
|
+
parser = argparse.ArgumentParser(description="Evaluate QA dataset quality")
|
|
2194
|
+
parser.add_argument("--qa-file", "-q", default=DEFAULT_QA_PATH, help="Path to QA JSON file")
|
|
2195
|
+
parser.add_argument("--corpus-file", "-c", default=DEFAULT_CORPUS_PATH, help="Path to corpus chunks.json")
|
|
2196
|
+
parser.add_argument("--output-dir", "-o", default=None, help="Output directory for reports")
|
|
2197
|
+
parser.add_argument("--sample-size", "-s", type=int, default=50, help="Sample size for expensive metrics")
|
|
2198
|
+
parser.add_argument("--skip-context-necessity", action="store_true", help="Skip context necessity evaluation")
|
|
2199
|
+
|
|
2200
|
+
args = parser.parse_args()
|
|
2201
|
+
|
|
2202
|
+
# Set output dir
|
|
2203
|
+
output_dir = args.output_dir or os.path.dirname(args.qa_file)
|
|
2204
|
+
|
|
2205
|
+
# Load QA data
|
|
2206
|
+
print(f"Loading QA data from: {args.qa_file}")
|
|
2207
|
+
with open(args.qa_file, 'r') as f:
|
|
2208
|
+
qa_data = json.load(f)
|
|
2209
|
+
print(f"Loaded {len(qa_data)} QA pairs")
|
|
2210
|
+
|
|
2211
|
+
# Run evaluation
|
|
2212
|
+
results = run_subset_evaluation(
|
|
2213
|
+
qa_data=qa_data,
|
|
2214
|
+
corpus_path=args.corpus_file,
|
|
2215
|
+
output_dir=output_dir,
|
|
2216
|
+
sample_size=args.sample_size,
|
|
2217
|
+
run_context_necessity=not args.skip_context_necessity
|
|
2218
|
+
)
|
|
2219
|
+
|
|
2220
|
+
print("\n" + "=" * 60)
|
|
2221
|
+
print("FINAL EVALUATION RESULTS")
|
|
2222
|
+
print("=" * 60)
|
|
2223
|
+
print(json.dumps(results, indent=2, default=str))
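# Example invocation, assuming this module is executed directly as a script
# (the flags correspond to the argparse options defined above):
#   python mirage/evaluation/metrics.py \
#       --qa-file output/results/qa_deduplicated.json \
#       --corpus-file output/results/chunks.json \
#       --sample-size 50 --skip-context-necessity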
|