mirage-benchmark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mirage-benchmark might be problematic.

@@ -0,0 +1,2223 @@
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ import pandas as pd
5
+ from typing import List, Dict, Any, Optional
6
+ from collections import Counter
7
+
8
+ # Import prompts for LLM-as-a-Judge metrics
9
+ try:
10
+ from prompt import PROMPTS_METRICS
11
+ except ImportError:
12
+ PROMPTS_METRICS = {}
13
+ print("Warning: Could not import PROMPTS_METRICS from prompt.py")
14
+
15
+ # Ragas Imports for Standard RAG Metrics
16
+ try:
17
+ from ragas import evaluate
18
+ from ragas.metrics import (
19
+ faithfulness,
20
+ answer_relevancy, # Note: RAGAS 0.4.x uses 'answer_relevancy' not 'answer_relevance'
21
+ context_precision,
22
+ context_recall,
23
+ )
24
+ from datasets import Dataset
25
+ RAGAS_AVAILABLE = True
26
+ print("✅ RAGAS metrics loaded successfully")
27
+ except ImportError as e:
28
+ RAGAS_AVAILABLE = False
29
+ print(f"Warning: 'ragas' or 'datasets' not installed. Error: {e}")
30
+
31
+ # Optional advanced metrics (may not be available in all ragas versions)
32
+ HAS_ENTITY_RECALL = False
33
+ HAS_NOISE_SENSITIVITY = False
34
+ HAS_MULTIMODAL = False
35
+
36
+ if RAGAS_AVAILABLE:
37
+ try:
38
+ from ragas.metrics import context_entity_recall
39
+ HAS_ENTITY_RECALL = True
40
+ except ImportError:
41
+ print("Info: context_entity_recall not available in this ragas version.")
42
+
43
+ try:
44
+ from ragas.metrics import noise_sensitivity_relevant
45
+ HAS_NOISE_SENSITIVITY = True
46
+ except ImportError:
47
+ print("Info: noise_sensitivity metrics not available in this ragas version.")
48
+
49
+ try:
50
+ from ragas.metrics import multimodal_faithfulness, multimodal_relevance
51
+ HAS_MULTIMODAL = True
52
+ except ImportError:
53
+ print("Info: multimodal metrics not available in this ragas version.")
54
+
55
+ # LangChain Imports
56
+ try:
57
+ from langchain_core.prompts import ChatPromptTemplate
58
+ from langchain_core.output_parsers import StrOutputParser
59
+ LANGCHAIN_AVAILABLE = True
60
+ except ImportError:
61
+ try:
62
+ from langchain.prompts import ChatPromptTemplate
63
+ from langchain.output_parsers import StrOutputParser
64
+ LANGCHAIN_AVAILABLE = True
65
+ except ImportError:
66
+ LANGCHAIN_AVAILABLE = False
67
+ print("Warning: 'langchain' not installed.")
68
+
69
+ # Output parsers (may be in different locations)
70
+ try:
71
+ from langchain.output_parsers import ResponseSchema, StructuredOutputParser
72
+ except ImportError:
73
+ try:
74
+ from langchain_community.output_parsers import ResponseSchema, StructuredOutputParser
75
+ except ImportError:
76
+ try:
77
+ from langchain_core.output_parsers import ResponseSchema, StructuredOutputParser
78
+ except ImportError:
79
+ # Define minimal fallbacks
80
+ ResponseSchema = None
81
+ StructuredOutputParser = None
82
+
83
+ # LangChain Google Gemini Imports
84
+ try:
85
+ from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
86
+ GEMINI_AVAILABLE = True
87
+ except ImportError:
88
+ GEMINI_AVAILABLE = False
89
+ print("Info: 'langchain-google-genai' not installed. Trying OpenAI...")
90
+
91
+ # LangChain OpenAI Imports (fallback)
92
+ try:
93
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
94
+ OPENAI_AVAILABLE = True
95
+ except ImportError:
96
+ OPENAI_AVAILABLE = False
97
+ print("Info: 'langchain_openai' not installed.")
98
+
99
+ # Data Science Imports
100
+ try:
101
+ from sklearn.metrics.pairwise import cosine_similarity
102
+ except ImportError:
103
+ print("Warning: 'scikit-learn' not installed.")
104
+
105
+ try:
106
+ from scipy.stats import entropy
107
+ SCIPY_AVAILABLE = True
108
+ except ImportError:
109
+ SCIPY_AVAILABLE = False
110
+ print("Warning: 'scipy' not installed. Domain coverage metrics may fail.")
111
+
112
+ # VLM Import for multimodal evaluation
113
+ try:
114
+ from call_llm import call_vlm_interweaved, batch_call_llm, batch_call_vlm_interweaved
115
+ VLM_AVAILABLE = True
116
+ BATCH_AVAILABLE = True
117
+ except ImportError:
118
+ VLM_AVAILABLE = False
119
+ BATCH_AVAILABLE = False
120
+ print("Warning: 'call_llm' module not available. VLM-based multimodal metrics will be skipped.")
121
+
122
+ class MultimodalFrameworkEvaluator:
123
+ def __init__(self, model_name=None, embedding_model=None, use_gemini=True):
124
+ """
125
+ Initialize the evaluator.
126
+ Args:
127
+ model_name: The LLM to use as a Judge (auto-detected if None)
128
+ embedding_model: Model for diversity calculations (auto-detected if None)
129
+ use_gemini: If True, prefer Gemini over OpenAI
130
+ """
131
+ # Determine which API to use
132
+ if use_gemini and GEMINI_AVAILABLE and os.getenv("GOOGLE_API_KEY"):
133
+ model_name = model_name or "gemini-2.0-flash"
134
+ embedding_model = embedding_model or "models/text-embedding-004"
135
+ print(f"Using Gemini API with model: {model_name}")
136
+ self.llm = ChatGoogleGenerativeAI(model=model_name, temperature=0)
137
+ self.embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
138
+ self.api_type = "gemini"
139
+ elif OPENAI_AVAILABLE and os.getenv("OPENAI_API_KEY"):
140
+ model_name = model_name or "gpt-4-turbo"
141
+ embedding_model = embedding_model or "text-embedding-3-small"
142
+ print(f"Using OpenAI API with model: {model_name}")
143
+ self.llm = ChatOpenAI(model=model_name, temperature=0)
144
+ self.embeddings = OpenAIEmbeddings(model=embedding_model)
145
+ self.api_type = "openai"
146
+ elif use_gemini and GEMINI_AVAILABLE:
147
+ # Try Gemini without env var check (will fail if key not set)
148
+ model_name = model_name or "gemini-2.0-flash"
149
+ embedding_model = embedding_model or "models/text-embedding-004"
150
+ print(f"Using Gemini API with model: {model_name}")
151
+ self.llm = ChatGoogleGenerativeAI(model=model_name, temperature=0)
152
+ self.embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
153
+ self.api_type = "gemini"
154
+ else:
155
+ raise RuntimeError("No API available. Install langchain-google-genai or langchain-openai and set API key.")
156
+
157
+ # Ragas metrics configuration (only if ragas available)
158
+ self.ragas_metrics = []
159
+ if RAGAS_AVAILABLE:
160
+ self.ragas_metrics = [
161
+ faithfulness,
162
+ answer_relevancy,
163
+ context_precision,
164
+ context_recall,
165
+ ]
166
+ # Add optional metrics if available
167
+ if HAS_ENTITY_RECALL:
168
+ self.ragas_metrics.append(context_entity_recall)
169
+ if HAS_NOISE_SENSITIVITY:
170
+ self.ragas_metrics.append(noise_sensitivity_relevant)
171
+ if HAS_MULTIMODAL:
172
+ self.ragas_metrics.extend([multimodal_faithfulness, multimodal_relevance])
173
+
174
+ def load_dataset(self, json_path: str) -> List[Dict]:
175
+ """
176
+ Loads the generated QA dataset.
177
+ Expected JSON format: List of dicts with keys:
178
+ ['question', 'answer', 'contexts', 'ground_truth' (optional), 'metadata']
179
+ """
180
+ with open(json_path, 'r') as f:
181
+ data = json.load(f)
182
+ return data
183
+
184
+ # ==========================================
185
+ # 1. STANDARD RAG METRICS (RAGAS)
186
+ # ==========================================
187
+ def evaluate_ragas_standard(self, data: List[Dict]) -> pd.DataFrame:
188
+ """
189
+ Evaluates Faithfulness, Relevance, Precision, and Recall using Ragas.
190
+ Uses the same LLM backend (Gemini/OpenAI) as configured in the evaluator.
191
+ """
192
+ print("--- Running Standard RAG Metrics (Ragas) ---")
193
+
194
+ # Import RAGAS LLM wrapper
195
+ try:
196
+ from ragas.llms import LangchainLLMWrapper
197
+ from ragas.embeddings import LangchainEmbeddingsWrapper
198
+ except ImportError:
199
+ # Older RAGAS versions
200
+ LangchainLLMWrapper = None
201
+ LangchainEmbeddingsWrapper = None
202
+
203
+ # RAGAS 0.4.x requires specific column names
204
+ # user_input (question), response (answer), retrieved_contexts, reference (ground_truth)
205
+ ragas_data = {
206
+ "user_input": [d.get('question', "") for d in data],
207
+ "response": [d.get('answer', "") for d in data],
208
+ "retrieved_contexts": [d.get('contexts', []) for d in data],
209
+ # Use answer as reference if not provided
210
+ "reference": [d.get('ground_truth', d.get('answer', "")) for d in data]
211
+ }
212
+
213
+ dataset = Dataset.from_dict(ragas_data)
214
+
215
+ # Wrap LLM and embeddings for RAGAS
216
+ import warnings
217
+ with warnings.catch_warnings():
218
+ warnings.simplefilter("ignore", DeprecationWarning)
219
+
220
+ # Configure parallel execution
221
+ try:
222
+ from ragas import RunConfig
223
+ # Increase parallelism for faster evaluation
224
+ # max_workers=64 for high throughput, timeout=300 for long contexts
225
+ run_config = RunConfig(
226
+ max_workers=64, # Parallel LLM calls
227
+ timeout=300, # 5 min timeout per call
228
+ max_retries=3, # Retry on failures
229
+ )
230
+ print(f" Using parallel execution with {run_config.max_workers} workers...")
231
+ except ImportError:
232
+ run_config = None
233
+
234
+ if LangchainLLMWrapper and LangchainEmbeddingsWrapper:
235
+ ragas_llm = LangchainLLMWrapper(self.llm)
236
+ ragas_embeddings = LangchainEmbeddingsWrapper(self.embeddings)
237
+
238
+ eval_kwargs = {
239
+ "dataset": dataset,
240
+ "metrics": self.ragas_metrics,
241
+ "llm": ragas_llm,
242
+ "embeddings": ragas_embeddings,
243
+ }
244
+ if run_config:
245
+ eval_kwargs["run_config"] = run_config
246
+
247
+ results = evaluate(**eval_kwargs)
248
+ else:
249
+ # Fallback for older RAGAS versions
250
+ results = evaluate(
251
+ dataset=dataset,
252
+ metrics=self.ragas_metrics,
253
+ llm=self.llm,
254
+ embeddings=self.embeddings,
255
+ )
256
+
257
+ return results.to_pandas()
258
+
259
+ # ==========================================
260
+ # 2. CUSTOM: REASONING COMPLEXITY (MULTI-HOP) - LLM-as-a-Judge
261
+ # ==========================================
262
+ def evaluate_multihop_reasoning(self, question: str, answer: str, contexts: List[str]):
263
+ """
264
+ Uses LLM-as-a-Judge to determine if a question is truly multi-hop.
265
+ Returns: Dict with hop_count (int), reasoning_score (float 0-1), bridge_entity (str)
266
+ """
267
+ import re
268
+
269
+ if "multihop_reasoning" not in PROMPTS_METRICS:
270
+ raise ValueError("PROMPTS_METRICS['multihop_reasoning'] not found in prompt.py")
271
+ prompt_template = PROMPTS_METRICS["multihop_reasoning"]
272
+ prompt = ChatPromptTemplate.from_template(prompt_template)
273
+ chain = prompt | self.llm
274
+
275
+ try:
276
+ response = chain.invoke({
277
+ "contexts": "\n".join(contexts),
278
+ "question": question,
279
+ "answer": answer
280
+ })
281
+ content = response.content.strip()
282
+
283
+ # Parse the response
284
+ hop_match = re.search(r'HOP_COUNT:\s*(\d+)', content)
285
+ score_match = re.search(r'REASONING_SCORE:\s*([\d.]+)', content)
286
+ bridge_match = re.search(r'BRIDGE_ENTITY:\s*(.+?)(?:\n|$)', content)
287
+
288
+ hop_count = int(hop_match.group(1)) if hop_match else 1
289
+ reasoning_score = float(score_match.group(1)) if score_match else 0.5
290
+ reasoning_score = min(1.0, max(0.0, reasoning_score))
291
+ bridge_entity = bridge_match.group(1).strip() if bridge_match else "None"
292
+
293
+ return {
294
+ "hop_count": hop_count,
295
+ "reasoning_score": reasoning_score,
296
+ "bridge_entity": bridge_entity
297
+ }
298
+ except Exception as e:
299
+ print(f"Error in multihop eval: {e}")
300
+ return {"hop_count": 1, "reasoning_score": 0.0, "bridge_entity": "Error"}
301
+
302
+ # ==========================================
303
+ # 3. CUSTOM: VISUAL DEPENDENCY (BLIND TEST) - LLM-as-a-Judge
304
+ # ==========================================
305
+ def evaluate_visual_dependency(self, question: str, text_contexts: List[str]):
306
+ """
307
+ The 'Blind Test': Can the question be answered using ONLY text contexts?
308
+ High Score (1.0) = Good for Multimodal (Model FAILED to answer without image).
309
+ Low Score (0.0) = Bad for Multimodal (Model could answer using text only).
310
+ """
311
+ if "visual_dependency" not in PROMPTS_METRICS:
312
+ raise ValueError("PROMPTS_METRICS['visual_dependency'] not found in prompt.py")
313
+ prompt_template = PROMPTS_METRICS["visual_dependency"]
314
+ prompt = ChatPromptTemplate.from_template(prompt_template)
315
+
316
+ chain = prompt | self.llm
317
+ try:
318
+ response = chain.invoke({"contexts": "\n".join(text_contexts), "question": question})
319
+ content = response.content.strip()
320
+
321
+ # If LLM says it's missing visual info, that's a PASS (1.0) for Visual Dependency
322
+ is_dependent = "MISSING_VISUAL" in content.upper()
323
+ return 1.0 if is_dependent else 0.0
324
+ except Exception as e:
325
+ print(f"Error in visual eval: {e}")
326
+ return 0.0
327
+
328
+ # ==========================================
329
+ # 4. CUSTOM: MULTIMODAL VLM METRICS
330
+ # ==========================================
331
+ def evaluate_multimodal_faithfulness_vlm(self, question: str, answer: str, context_chunks: List[Dict]) -> Dict:
332
+ """
333
+ VLM-based faithfulness evaluation: Does the answer faithfully represent
334
+ information from BOTH text and visual contexts?
335
+
336
+ Returns:
337
+ Dict with 'score' (0-1), 'text_supported', 'visual_supported', 'explanation'
338
+ """
339
+ if not VLM_AVAILABLE:
340
+ return {"score": 0.0, "text_supported": False, "visual_supported": False,
341
+ "explanation": "VLM not available"}
342
+
343
+ if "multimodal_faithfulness_vlm" not in PROMPTS_METRICS:
344
+ raise ValueError("PROMPTS_METRICS['multimodal_faithfulness_vlm'] not found in prompt.py")
345
+ prompt = PROMPTS_METRICS["multimodal_faithfulness_vlm"].format(question=question, answer=answer)
346
+
347
+ try:
348
+ response = call_vlm_interweaved(prompt, context_chunks)
349
+
350
+ # Parse response
351
+ text_supported = "TEXT_SUPPORTED: YES" in response.upper()
352
+ visual_supported = "VISUAL_SUPPORTED: YES" in response.upper()
353
+ visual_na = "VISUAL_SUPPORTED: NA" in response.upper()
354
+
355
+ # Extract score
356
+ import re
357
+ score_match = re.search(r'FAITHFULNESS_SCORE:\s*([\d.]+)', response)
358
+ score = float(score_match.group(1)) if score_match else 0.5
359
+ score = min(1.0, max(0.0, score))
360
+
361
+ # Extract explanation
362
+ exp_match = re.search(r'EXPLANATION:\s*(.+?)(?:\n|$)', response, re.DOTALL)
363
+ explanation = exp_match.group(1).strip() if exp_match else ""
364
+
365
+ return {
366
+ "score": score,
367
+ "text_supported": text_supported,
368
+ "visual_supported": visual_supported if not visual_na else None,
369
+ "explanation": explanation[:200]
370
+ }
371
+ except Exception as e:
372
+ print(f"Error in multimodal faithfulness eval: {e}")
373
+ return {"score": 0.0, "text_supported": False, "visual_supported": False,
374
+ "explanation": f"Error: {str(e)}"}
375
+
376
+ def evaluate_multimodal_answer_quality_vlm(self, question: str, answer: str, context_chunks: List[Dict]) -> Dict:
377
+ """
378
+ VLM-based answer quality evaluation considering multimodal context.
379
+
380
+ Returns:
381
+ Dict with 'completeness', 'accuracy', 'uses_visual_info', 'overall_score'
382
+ """
383
+ if not VLM_AVAILABLE:
384
+ return {"completeness": 0.0, "accuracy": 0.0, "uses_visual_info": False, "overall_score": 0.0}
385
+
386
+ if "multimodal_answer_quality_vlm" not in PROMPTS_METRICS:
387
+ raise ValueError("PROMPTS_METRICS['multimodal_answer_quality_vlm'] not found in prompt.py")
388
+ prompt = PROMPTS_METRICS["multimodal_answer_quality_vlm"].format(question=question, answer=answer)
389
+
390
+ try:
391
+ response = call_vlm_interweaved(prompt, context_chunks)
392
+
393
+ # Parse scores
394
+ import re
395
+ completeness = 0.5
396
+ accuracy = 0.5
397
+ overall = 0.5
398
+
399
+ comp_match = re.search(r'COMPLETENESS:\s*([\d.]+)', response)
400
+ if comp_match:
401
+ completeness = min(1.0, max(0.0, float(comp_match.group(1))))
402
+
403
+ acc_match = re.search(r'ACCURACY:\s*([\d.]+)', response)
404
+ if acc_match:
405
+ accuracy = min(1.0, max(0.0, float(acc_match.group(1))))
406
+
407
+ overall_match = re.search(r'OVERALL_SCORE:\s*([\d.]+)', response)
408
+ if overall_match:
409
+ overall = min(1.0, max(0.0, float(overall_match.group(1))))
410
+
411
+ uses_visual = "VISUAL_INFO_USED: YES" in response.upper()
412
+
413
+ return {
414
+ "completeness": completeness,
415
+ "accuracy": accuracy,
416
+ "uses_visual_info": uses_visual,
417
+ "overall_score": overall
418
+ }
419
+ except Exception as e:
420
+ print(f"Error in multimodal answer quality eval: {e}")
421
+ return {"completeness": 0.0, "accuracy": 0.0, "uses_visual_info": False, "overall_score": 0.0}
422
+
423
+ # ==========================================
424
+ # 5. CUSTOM: DATASET DIVERSITY
425
+ # ==========================================
426
+ def evaluate_semantic_diversity(self, questions: List[str]):
427
+ """
428
+ Calculates diversity based on cosine distance of question embeddings.
429
+ Returns: diversity_score (0-1, higher is better)
430
+ """
431
+ print("--- Calculating Semantic Diversity ---")
432
+ if not questions:
433
+ return 0.0
434
+
435
+ embeddings = self.embeddings.embed_documents(questions)
436
+ matrix = np.array(embeddings)
437
+
438
+ # Calculate cosine similarity matrix
439
+ sim_matrix = cosine_similarity(matrix)
440
+
441
+ # We want diversity (distance), so we look at 1 - average_similarity
442
+ # Exclude diagonal (self-similarity is always 1)
443
+ np.fill_diagonal(sim_matrix, np.nan)
444
+ avg_similarity = np.nanmean(sim_matrix)
445
+
446
+ diversity_score = 1 - avg_similarity
447
+ return diversity_score
448
+
449
+ # ==========================================
450
+ # 6. CUSTOM: CONTEXT NECESSITY (Anti-Parametric Bias)
451
+ # ==========================================
452
+ def evaluate_context_necessity(self, question: str, answer: str, context: str) -> Dict:
453
+ """
454
+ Measures if the question REQUIRES the context to be answered correctly.
455
+ Tests anti-parametric bias by checking if LLM can answer without context.
456
+
457
+ Objective: Ensure the question tests retrieval, not just parametric knowledge.
458
+
459
+ Mathematical Intuition: Maximizes information gain from context.
460
+ High score = context is necessary (good for RAG evaluation).
461
+ Low score = answerable from parametric knowledge (bad for RAG evaluation).
462
+
463
+ Input:
464
+ - question: The question string
465
+ - answer: The ground truth answer
466
+ - context: The provided context
467
+
468
+ Output:
469
+ Dict with:
470
+ - context_necessity_score: Float 0-1 (1 = context essential, 0 = not needed)
471
+ - without_context_correct: Boolean (did LLM answer correctly without context?)
472
+ - with_context_correct: Boolean (did LLM answer correctly with context?)
473
+ - explanation: String explaining the assessment
474
+
475
+ Interpretation:
476
+ - 0.8-1.0: Excellent - question strictly requires context
477
+ - 0.5-0.8: Moderate - context helps but partial answers possible
478
+ - 0.0-0.5: Poor - answerable from parametric knowledge
479
+ """
480
+ # Step 1: Ask LLM to answer WITHOUT context
481
+ if "context_necessity_without" not in PROMPTS_METRICS:
482
+ raise ValueError("PROMPTS_METRICS['context_necessity_without'] not found in prompt.py")
483
+ prompt_without_template = PROMPTS_METRICS["context_necessity_without"]
484
+ prompt_without = ChatPromptTemplate.from_template(prompt_without_template)
485
+
486
+ # Step 2: Ask LLM to verify answer WITH context
487
+ if "context_necessity_verify" not in PROMPTS_METRICS:
488
+ raise ValueError("PROMPTS_METRICS['context_necessity_verify'] not found in prompt.py")
489
+ prompt_verify_template = PROMPTS_METRICS["context_necessity_verify"]
490
+ prompt_verify = ChatPromptTemplate.from_template(prompt_verify_template)
491
+
492
+ try:
493
+ # Get answer without context
494
+ chain_without = prompt_without | self.llm
495
+ response_without = chain_without.invoke({"question": question})
496
+ answer_without_context = response_without.content.strip()
497
+
498
+ # Check if model refused to answer
499
+ refused = "CANNOT_ANSWER" in answer_without_context.upper()
500
+
501
+ if refused:
502
+ # Model couldn't answer without context - high context necessity
503
+ return {
504
+ "context_necessity_score": 1.0,
505
+ "without_context_correct": False,
506
+ "with_context_correct": True, # Assumed since we have the answer
507
+ "answer_without_context": answer_without_context[:200],
508
+ "explanation": "Model could not answer without context - context is essential"
509
+ }
510
+
511
+ # Verify if the answer without context matches ground truth
512
+ chain_verify = prompt_verify | self.llm
513
+ verify_response = chain_verify.invoke({
514
+ "ground_truth": answer,
515
+ "model_answer": answer_without_context
516
+ })
517
+ verify_content = verify_response.content.strip().upper()
518
+
519
+ if "MATCH: YES" in verify_content:
520
+ # Model answered correctly without context - low context necessity
521
+ return {
522
+ "context_necessity_score": 0.0,
523
+ "without_context_correct": True,
524
+ "with_context_correct": True,
525
+ "answer_without_context": answer_without_context[:200],
526
+ "explanation": "Model answered correctly without context - question may test parametric knowledge"
527
+ }
528
+ elif "MATCH: PARTIAL" in verify_content:
529
+ # Partial match - moderate context necessity
530
+ return {
531
+ "context_necessity_score": 0.5,
532
+ "without_context_correct": False,
533
+ "with_context_correct": True,
534
+ "answer_without_context": answer_without_context[:200],
535
+ "explanation": "Model partially answered without context - context adds value"
536
+ }
537
+ else:
538
+ # No match - high context necessity
539
+ return {
540
+ "context_necessity_score": 0.9,
541
+ "without_context_correct": False,
542
+ "with_context_correct": True,
543
+ "answer_without_context": answer_without_context[:200],
544
+ "explanation": "Model answered incorrectly without context - context is necessary"
545
+ }
546
+
547
+ except Exception as e:
548
+ print(f"Error in context necessity eval: {e}")
549
+ return {
550
+ "context_necessity_score": 0.5,
551
+ "without_context_correct": None,
552
+ "with_context_correct": None,
553
+ "answer_without_context": "",
554
+ "explanation": f"Error: {str(e)}"
555
+ }
556
+
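The branching above reduces to a small scoring rule; a standalone sketch of that mapping (an illustration, not the class method itself):

def necessity_score(answer_without_context: str, verify_output: str) -> float:
    # Sketch of the scoring rule: 1.0 = context essential, 0.0 = parametric knowledge suffices.
    if "CANNOT_ANSWER" in answer_without_context.upper():
        return 1.0
    verdict = verify_output.upper()
    if "MATCH: YES" in verdict:
        return 0.0
    if "MATCH: PARTIAL" in verdict:
        return 0.5
    return 0.9  # answered, but incorrectly, without context

print(necessity_score("Paris.", "MATCH: NO"))  # 0.9
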
557
+ def batch_evaluate_context_necessity(self, qa_items: List[Dict]) -> List[Dict]:
558
+ """
559
+ Batch evaluation of context necessity using concurrent API calls.
560
+
561
+ Args:
562
+ qa_items: List of dicts with 'question', 'answer', 'context' keys
563
+
564
+ Returns:
565
+ List of result dicts in same order
566
+ """
567
+ if not BATCH_AVAILABLE:
568
+ # Fallback to sequential
569
+ return [self.evaluate_context_necessity(
570
+ item['question'], item['answer'], item['context']
571
+ ) for item in qa_items]
572
+
573
+ if "context_necessity_without" not in PROMPTS_METRICS:
574
+ raise ValueError("PROMPTS_METRICS['context_necessity_without'] not found")
575
+ if "context_necessity_verify" not in PROMPTS_METRICS:
576
+ raise ValueError("PROMPTS_METRICS['context_necessity_verify'] not found")
577
+
578
+ prompt_without_template = PROMPTS_METRICS["context_necessity_without"]
579
+ prompt_verify_template = PROMPTS_METRICS["context_necessity_verify"]
580
+
581
+ # Phase 1: Batch "answer without context" calls
582
+ prompts_without = []
583
+ for item in qa_items:
584
+ prompt = prompt_without_template.replace("{question}", item['question'])
585
+ prompts_without.append(prompt)
586
+
587
+ print(f" ⚡ Phase 1: Batch answering {len(prompts_without)} questions without context...")
588
+ answers_without = batch_call_llm(prompts_without, show_progress=False)
589
+
590
+ # Phase 2: Batch verification calls for non-refused answers
591
+ verify_prompts = []
592
+ verify_indices = []
593
+ results = [None] * len(qa_items)
594
+
595
+ for i, (item, answer_without) in enumerate(zip(qa_items, answers_without)):
596
+ if answer_without.startswith("ERROR:"):
597
+ results[i] = {
598
+ "context_necessity_score": 0.5,
599
+ "without_context_correct": None,
600
+ "with_context_correct": None,
601
+ "answer_without_context": "",
602
+ "explanation": f"Error: {answer_without}"
603
+ }
604
+ elif "CANNOT_ANSWER" in answer_without.upper():
605
+ results[i] = {
606
+ "context_necessity_score": 1.0,
607
+ "without_context_correct": False,
608
+ "with_context_correct": True,
609
+ "answer_without_context": answer_without[:200],
610
+ "explanation": "Model could not answer without context - context is essential"
611
+ }
612
+ else:
613
+ # Need to verify
614
+ prompt = prompt_verify_template.replace(
615
+ "{ground_truth}", item['answer']
616
+ ).replace("{model_answer}", answer_without)
617
+ verify_prompts.append(prompt)
618
+ verify_indices.append(i)
619
+
620
+ if verify_prompts:
621
+ print(f" ⚡ Phase 2: Batch verifying {len(verify_prompts)} answers...")
622
+ verify_responses = batch_call_llm(verify_prompts, show_progress=False)
623
+
624
+ for idx, verify_content in zip(verify_indices, verify_responses):
625
+ answer_without = answers_without[idx]
626
+ verify_upper = verify_content.upper() if verify_content else ""
627
+
628
+ if "MATCH: YES" in verify_upper:
629
+ results[idx] = {
630
+ "context_necessity_score": 0.0,
631
+ "without_context_correct": True,
632
+ "with_context_correct": True,
633
+ "answer_without_context": answer_without[:200],
634
+ "explanation": "Model answered correctly without context - question may test parametric knowledge"
635
+ }
636
+ elif "MATCH: PARTIAL" in verify_upper:
637
+ results[idx] = {
638
+ "context_necessity_score": 0.5,
639
+ "without_context_correct": False,
640
+ "with_context_correct": True,
641
+ "answer_without_context": answer_without[:200],
642
+ "explanation": "Model partially answered without context - context adds value"
643
+ }
644
+ else:
645
+ results[idx] = {
646
+ "context_necessity_score": 0.9,
647
+ "without_context_correct": False,
648
+ "with_context_correct": True,
649
+ "answer_without_context": answer_without[:200],
650
+ "explanation": "Model answered incorrectly without context - context is necessary"
651
+ }
652
+
653
+ return results
654
+
655
+ def batch_evaluate_multihop_reasoning(self, qa_items: List[Dict]) -> List[Dict]:
656
+ """
657
+ Batch evaluation of multihop reasoning using concurrent API calls.
658
+
659
+ Args:
660
+ qa_items: List of dicts with 'question', 'answer', 'contexts' keys
661
+
662
+ Returns:
663
+ List of result dicts with hop_count, reasoning_score, bridge_entity
664
+ """
665
+ if not BATCH_AVAILABLE:
666
+ # Fallback to sequential
667
+ return [self.evaluate_multihop_reasoning(
668
+ item['question'], item['answer'], item['contexts']
669
+ ) for item in qa_items]
670
+
671
+ if "multihop_reasoning" not in PROMPTS_METRICS:
672
+ raise ValueError("PROMPTS_METRICS['multihop_reasoning'] not found")
673
+
674
+ prompt_template = PROMPTS_METRICS["multihop_reasoning"]
675
+
676
+ prompts = []
677
+ for item in qa_items:
678
+ contexts_str = "\n".join(item['contexts']) if isinstance(item['contexts'], list) else item['contexts']
679
+ prompt = prompt_template.replace(
680
+ "{contexts}", contexts_str
681
+ ).replace("{question}", item['question']).replace("{answer}", item['answer'])
682
+ prompts.append(prompt)
683
+
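For reference, a minimal record in the format load_dataset expects might look like the sketch below (field values are invented; only the key names come from the docstring above):

example_record = {
    "question": "Which interface carries the ADC samples to the DSP block?",
    "answer": "The serial interface.",
    "contexts": ["The ADC streams its samples over the serial interface to the DSP block."],
    "ground_truth": "The serial interface.",  # optional; the generated answer is reused when absent
    "metadata": {"type": "text"},
}
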
684
+ print(f" ⚡ Batch evaluating {len(prompts)} multihop reasoning questions...")
685
+ responses = batch_call_llm(prompts, show_progress=False)
686
+
687
+ results = []
688
+ import re
689
+ for response in responses:
690
+ if response.startswith("ERROR:"):
691
+ results.append({"hop_count": 1, "reasoning_score": 0.0, "bridge_entity": "Error"})
692
+ continue
693
+
694
+ hop_match = re.search(r'HOP_COUNT:\s*(\d+)', response)
695
+ score_match = re.search(r'REASONING_SCORE:\s*([\d.]+)', response)
696
+ bridge_match = re.search(r'BRIDGE_ENTITY:\s*(.+?)(?:\n|$)', response)
697
+
698
+ hop_count = int(hop_match.group(1)) if hop_match else 1
699
+ reasoning_score = float(score_match.group(1)) if score_match else 0.5
700
+ reasoning_score = min(1.0, max(0.0, reasoning_score))
701
+ bridge_entity = bridge_match.group(1).strip() if bridge_match else "None"
702
+
703
+ results.append({
704
+ "hop_count": hop_count,
705
+ "reasoning_score": reasoning_score,
706
+ "bridge_entity": bridge_entity
707
+ })
708
+
709
+ return results
710
+
711
+ # ==========================================
712
+ # 7. CUSTOM: DOMAIN COVERAGE
713
+ # ==========================================
714
+ def evaluate_domain_coverage(self, qa_data: List[Dict], corpus_chunks: List[Dict]) -> Dict:
715
+ """
716
+ Measures how well the QA dataset covers the source corpus.
717
+ Prevents sampling bias and ensures comprehensive evaluation.
718
+
719
+ Objective: Ensure QA dataset comprehensively tests knowledge across the corpus.
720
+
721
+ Mathematical Intuition: Minimizes Jensen-Shannon divergence between
722
+ topic distributions: min D_JS(P_topics(D) || P_topics(C))
723
+
724
+ Input:
725
+ - qa_data: List of QA pairs with chunk references
726
+ - corpus_chunks: List of all corpus chunks with metadata
727
+
728
+ Output:
729
+ Dict with:
730
+ - chunk_coverage: Float 0-1 (proportion of corpus chunks covered)
731
+ - file_coverage: Float 0-1 (proportion of source files covered)
732
+ - chunk_type_coverage: Dict (coverage by chunk type)
733
+ - topic_divergence: Float 0-1 (JS divergence, lower is better)
734
+ - uncovered_chunks: Int (number of chunks not referenced)
735
+ - coverage_by_file: Dict (coverage breakdown by file)
736
+
737
+ Interpretation:
738
+ - chunk_coverage 0.8+: Excellent corpus coverage
739
+ - chunk_coverage 0.5-0.8: Moderate coverage, some gaps
740
+ - chunk_coverage <0.5: Poor coverage, significant gaps
741
+ - topic_divergence <0.2: Good topic balance
742
+ - topic_divergence >0.5: Significant topic imbalance
743
+ """
744
+ # entropy is imported at module level from scipy.stats; fail fast if scipy is missing
+ if not SCIPY_AVAILABLE:
+ raise RuntimeError("scipy is required for domain coverage metrics (Jensen-Shannon divergence).")
745
+
746
+ # Build corpus index
747
+ corpus_index = {}
748
+ corpus_by_file = Counter()
749
+ corpus_by_type = Counter()
750
+
751
+ for chunk in corpus_chunks:
752
+ key = (chunk.get('file_name'), str(chunk.get('chunk_id')))
753
+ corpus_index[key] = chunk
754
+ corpus_by_file[chunk.get('file_name', 'unknown')] += 1
755
+ corpus_by_type[chunk.get('chunk_type', 'unknown')] += 1
756
+
757
+ # Track covered chunks
758
+ covered_chunks = set()
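As a sketch of the column mapping described above, a single QA record can be converted to the RAGAS 0.4.x schema like this (assumes the 'datasets' package; the record itself is invented):

from datasets import Dataset

record = {"question": "Q?", "answer": "A.", "contexts": ["ctx 1", "ctx 2"]}  # invented record
ragas_row = {
    "user_input": [record["question"]],                           # question  -> user_input
    "response": [record["answer"]],                               # answer    -> response
    "retrieved_contexts": [record["contexts"]],                   # contexts  -> retrieved_contexts
    "reference": [record.get("ground_truth", record["answer"])],  # ground_truth (or answer) -> reference
}
ds = Dataset.from_dict(ragas_row)
print(ds.column_names)  # ['user_input', 'response', 'retrieved_contexts', 'reference']
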
759
+ covered_by_file = Counter()
760
+ covered_by_type = Counter()
761
+
762
+ for qa in qa_data:
763
+ for chunk_ref in qa.get('chunks_added', []):
764
+ key = (chunk_ref.get('file_name'), str(chunk_ref.get('chunk_id')))
765
+ if key in corpus_index:
766
+ covered_chunks.add(key)
767
+ chunk_info = corpus_index[key]
768
+ covered_by_file[chunk_info.get('file_name', 'unknown')] += 1
769
+ covered_by_type[chunk_info.get('chunk_type', 'unknown')] += 1
770
+
771
+ # Calculate coverage metrics
772
+ total_corpus = len(corpus_chunks)
773
+ total_covered = len(covered_chunks)
774
+ chunk_coverage = total_covered / total_corpus if total_corpus > 0 else 0.0
775
+
776
+ # File coverage
777
+ files_in_corpus = set(corpus_by_file.keys())
778
+ files_covered = set(covered_by_file.keys())
779
+ file_coverage = len(files_covered) / len(files_in_corpus) if files_in_corpus else 0.0
780
+
781
+ # Coverage by file
782
+ coverage_by_file = {}
783
+ for file_name in files_in_corpus:
784
+ total_in_file = corpus_by_file[file_name]
785
+ covered_in_file = len([k for k in covered_chunks if k[0] == file_name])
786
+ coverage_by_file[file_name] = {
787
+ "total_chunks": total_in_file,
788
+ "covered_chunks": covered_in_file,
789
+ "coverage_rate": covered_in_file / total_in_file if total_in_file > 0 else 0.0
790
+ }
791
+
792
+ # Coverage by chunk type
793
+ chunk_type_coverage = {}
794
+ for chunk_type in corpus_by_type.keys():
795
+ total_of_type = corpus_by_type[chunk_type]
796
+ covered_of_type = sum(1 for k in covered_chunks
797
+ if corpus_index.get(k, {}).get('chunk_type') == chunk_type)
798
+ chunk_type_coverage[chunk_type] = {
799
+ "total": total_of_type,
800
+ "covered": covered_of_type,
801
+ "coverage_rate": covered_of_type / total_of_type if total_of_type > 0 else 0.0
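The parsing step above can be exercised standalone; a minimal sketch with an invented judge response (the regexes mirror the ones in the method):

import re

sample = "HOP_COUNT: 2\nREASONING_SCORE: 0.8\nBRIDGE_ENTITY: serial interface"  # invented judge output
hop_count = int(re.search(r'HOP_COUNT:\s*(\d+)', sample).group(1))              # -> 2
reasoning = float(re.search(r'REASONING_SCORE:\s*([\d.]+)', sample).group(1))   # -> 0.8
bridge = re.search(r'BRIDGE_ENTITY:\s*(.+?)(?:\n|$)', sample).group(1).strip()  # -> 'serial interface'
print(hop_count, min(1.0, max(0.0, reasoning)), bridge)
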
802
+ }
803
+
804
+ # Calculate Jensen-Shannon divergence for topic distribution
805
+ # Using file distribution as proxy for topic distribution
806
+ all_files = sorted(files_in_corpus)
807
+ corpus_dist = np.array([corpus_by_file.get(f, 0) for f in all_files], dtype=float)
808
+ corpus_dist = corpus_dist / corpus_dist.sum() if corpus_dist.sum() > 0 else corpus_dist
809
+
810
+ qa_file_counts = Counter()
811
+ for qa in qa_data:
812
+ for chunk_ref in qa.get('chunks_added', []):
813
+ qa_file_counts[chunk_ref.get('file_name')] += 1
814
+
815
+ qa_dist = np.array([qa_file_counts.get(f, 0) for f in all_files], dtype=float)
816
+ qa_dist = qa_dist / qa_dist.sum() if qa_dist.sum() > 0 else qa_dist
817
+
818
+ # Jensen-Shannon divergence (symmetric KL divergence)
819
+ # Add small epsilon to avoid log(0)
820
+ eps = 1e-10
821
+ corpus_dist = corpus_dist + eps
822
+ qa_dist = qa_dist + eps
823
+ corpus_dist = corpus_dist / corpus_dist.sum()
824
+ qa_dist = qa_dist / qa_dist.sum()
825
+
826
+ m = 0.5 * (corpus_dist + qa_dist)
827
+ js_divergence = 0.5 * (entropy(corpus_dist, m) + entropy(qa_dist, m))
828
+
829
+ return {
830
+ "chunk_coverage": chunk_coverage,
831
+ "file_coverage": file_coverage,
832
+ "chunks_covered": total_covered,
833
+ "chunks_total": total_corpus,
834
+ "uncovered_chunks": total_corpus - total_covered,
835
+ "topic_divergence_js": float(js_divergence),
836
+ "chunk_type_coverage": chunk_type_coverage,
837
+ "coverage_by_file": coverage_by_file
838
+ }
839
+
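A small numeric sketch of the Jensen-Shannon computation used above, with invented file distributions:

import numpy as np
from scipy.stats import entropy

corpus_dist = np.array([0.5, 0.3, 0.2])  # invented corpus file/topic distribution
qa_dist = np.array([0.7, 0.2, 0.1])      # invented QA coverage distribution

m = 0.5 * (corpus_dist + qa_dist)
js_divergence = 0.5 * (entropy(corpus_dist, m) + entropy(qa_dist, m))
print(round(float(js_divergence), 4))  # 0 = identical distributions; ln(2) ≈ 0.693 is the maximum
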
840
+ # ==========================================
841
+ # MAIN PIPELINE
842
+ # ==========================================
843
+ def run_full_evaluation(self, json_path: str, output_path: str = "eval_report.json"):
844
+ """
845
+ Runs the full suite of metrics on the provided JSON dataset.
846
+ """
847
+ data = self.load_dataset(json_path)
848
+ metrics_log = []
849
+
850
+ print(f"Starting evaluation for {len(data)} items...")
851
+
852
+ # 1. Run Standard Metrics (Batch)
853
+ # Note: Ragas requires 'ground_truth' for some metrics.
854
+ # If your dataset is purely synthetic without human labels,
855
+ # context_precision/recall might be approximations based on generated answers.
856
+ ragas_df = self.evaluate_ragas_standard(data)
857
+
858
+ # 2. Run Custom Agentic Metrics (Iterative)
859
+ questions = [d['question'] for d in data]
860
+
861
+ for i, item in enumerate(data):
862
+ if i % 5 == 0:
863
+ print(f"Processing item {i+1}/{len(data)}...")
864
+
865
+ # A. Multi-hop Evaluation
866
+ mh_res = self.evaluate_multihop_reasoning(
867
+ item['question'],
868
+ item['answer'],
869
+ item['contexts']
870
+ )
871
+
872
+ # B. Visual Dependency (Only for items marked as visual/multimodal)
873
+ # Checks metadata type or if image_contexts exist
874
+ is_visual = (
875
+ item.get('metadata', {}).get('type') in ['visual', 'chart', 'table'] or
876
+ len(item.get('image_contexts', [])) > 0
877
+ )
878
+
879
+ vis_score = 0.0
880
+ if is_visual:
881
+ # For the blind test, we pass ONLY text contexts, excluding image descriptions
882
+ vis_score = self.evaluate_visual_dependency(item['question'], item['contexts'])
883
+
884
+ # C. VLM-based Multimodal Metrics (only if images available)
885
+ context_chunks = item.get('context_chunks', [])
886
+ vlm_faithfulness = None
887
+ vlm_answer_quality = None
888
+
889
+ if is_visual and VLM_AVAILABLE and context_chunks:
890
+ print(f" Running VLM multimodal evaluation for item {i+1}...")
891
+ vlm_faithfulness = self.evaluate_multimodal_faithfulness_vlm(
892
+ item['question'], item['answer'], context_chunks
893
+ )
894
+ vlm_answer_quality = self.evaluate_multimodal_answer_quality_vlm(
895
+ item['question'], item['answer'], context_chunks
896
+ )
897
+
898
+ metrics_log.append({
899
+ "hop_count": mh_res['hop_count'],
900
+ "reasoning_score": mh_res['reasoning_score'],
901
+ "bridge_entity": mh_res['bridge_entity'],
902
+ "visual_dependency": vis_score if is_visual else None,
903
+ "is_visual": is_visual,
904
+ # VLM Multimodal metrics
905
+ "vlm_faithfulness_score": vlm_faithfulness['score'] if vlm_faithfulness else None,
906
+ "vlm_text_supported": vlm_faithfulness['text_supported'] if vlm_faithfulness else None,
907
+ "vlm_visual_supported": vlm_faithfulness['visual_supported'] if vlm_faithfulness else None,
908
+ "vlm_completeness": vlm_answer_quality['completeness'] if vlm_answer_quality else None,
909
+ "vlm_accuracy": vlm_answer_quality['accuracy'] if vlm_answer_quality else None,
910
+ "vlm_uses_visual": vlm_answer_quality['uses_visual_info'] if vlm_answer_quality else None,
911
+ "vlm_overall_score": vlm_answer_quality['overall_score'] if vlm_answer_quality else None,
912
+ })
913
+
914
+ # 3. Diversity Evaluation
915
+ diversity_score = self.evaluate_semantic_diversity(questions)
916
+
917
+ # 4. Aggregate Results
918
+ custom_df = pd.DataFrame(metrics_log)
919
+ final_df = pd.concat([ragas_df, custom_df], axis=1)
920
+
921
+ # Calculate Summary Statistics
922
+ rag_quality = {
923
+ "Faithfulness": final_df['faithfulness'].mean(),
924
+ "Answer_Relevance": final_df['answer_relevancy'].mean() if 'answer_relevancy' in final_df.columns else final_df.get('answer_relevance', pd.Series([0])).mean(),
925
+ "Context_Precision": final_df['context_precision'].mean(),
926
+ "Context_Recall": final_df['context_recall'].mean(),
927
+ }
928
+
929
+ # Add optional metrics if they were computed
930
+ if HAS_ENTITY_RECALL and 'context_entity_recall' in final_df.columns:
931
+ rag_quality["Context_Entity_Recall"] = final_df['context_entity_recall'].mean()
932
+ if HAS_NOISE_SENSITIVITY and 'noise_sensitivity_relevant' in final_df.columns:
933
+ rag_quality["Noise_Sensitivity"] = final_df['noise_sensitivity_relevant'].mean()
934
+
935
+ multimodal_quality = {
936
+ "Visual_Necessity_Rate": final_df[final_df['is_visual'] == True]['visual_dependency'].mean()
937
+ if not final_df[final_df['is_visual'] == True].empty else 0.0
938
+ }
939
+
940
+ # Add multimodal ragas metrics if available
941
+ if HAS_MULTIMODAL:
942
+ if 'multimodal_faithfulness' in final_df.columns:
943
+ multimodal_quality["Multimodal_Faithfulness"] = final_df['multimodal_faithfulness'].mean()
944
+ if 'multimodal_relevance' in final_df.columns:
945
+ multimodal_quality["Multimodal_Relevance"] = final_df['multimodal_relevance'].mean()
946
+
947
+ # Add VLM-based multimodal metrics
948
+ visual_items = final_df[final_df['is_visual'] == True]
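A self-contained numeric sketch of the same diversity computation, using made-up embedding vectors instead of embedding-API calls:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

matrix = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]])  # three toy question embeddings
sim_matrix = cosine_similarity(matrix)
np.fill_diagonal(sim_matrix, np.nan)    # ignore self-similarity
diversity = 1 - np.nanmean(sim_matrix)  # higher = more diverse questions
print(round(float(diversity), 3))       # ≈ 0.529 for these toy vectors
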
949
+ if not visual_items.empty:
950
+ if 'vlm_faithfulness_score' in final_df.columns:
951
+ vlm_faith_scores = visual_items['vlm_faithfulness_score'].dropna()
952
+ if len(vlm_faith_scores) > 0:
953
+ multimodal_quality["VLM_Faithfulness_Score"] = vlm_faith_scores.mean()
954
+ if 'vlm_overall_score' in final_df.columns:
955
+ vlm_overall_scores = visual_items['vlm_overall_score'].dropna()
956
+ if len(vlm_overall_scores) > 0:
957
+ multimodal_quality["VLM_Overall_Quality"] = vlm_overall_scores.mean()
958
+ if 'vlm_accuracy' in final_df.columns:
959
+ vlm_accuracy_scores = visual_items['vlm_accuracy'].dropna()
960
+ if len(vlm_accuracy_scores) > 0:
961
+ multimodal_quality["VLM_Accuracy"] = vlm_accuracy_scores.mean()
962
+ if 'vlm_completeness' in final_df.columns:
963
+ vlm_completeness_scores = visual_items['vlm_completeness'].dropna()
964
+ if len(vlm_completeness_scores) > 0:
965
+ multimodal_quality["VLM_Completeness"] = vlm_completeness_scores.mean()
966
+ # Count items using visual info
967
+ if 'vlm_uses_visual' in final_df.columns:
968
+ uses_visual_count = visual_items['vlm_uses_visual'].sum()
969
+ multimodal_quality["Items_Using_Visual_Info"] = int(uses_visual_count)
970
+ multimodal_quality["Visual_Info_Usage_Rate"] = uses_visual_count / len(visual_items) if len(visual_items) > 0 else 0.0
971
+
972
+ report = {
973
+ "RAG_Quality": rag_quality,
974
+ "Reasoning_Complexity": {
975
+ "Avg_Reasoning_Score": final_df['reasoning_score'].mean(),
976
+ "Avg_Hop_Count": final_df['hop_count'].mean(),
977
+ },
978
+ "Multimodal_Quality": multimodal_quality,
979
+ "Dataset_Health": {
980
+ "Semantic_Diversity": diversity_score,
981
+ "Total_Samples": len(data)
982
+ }
983
+ }
984
+
985
+ # Save detailed results
986
+ final_df.to_csv(output_path.replace(".json", "_detailed.csv"), index=False)
987
+ with open(output_path, "w") as f:
988
+ json.dump(report, f, indent=4)
989
+
990
+ print("\nEvaluation Complete. Summary:")
991
+ print(json.dumps(report, indent=2))
992
+
993
+ return final_df, report
994
+
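A minimal end-to-end usage sketch, assuming an API key is set (GOOGLE_API_KEY or OPENAI_API_KEY) and that 'qa_dataset.json' is a placeholder path to a file in the format load_dataset expects:

evaluator = MultimodalFrameworkEvaluator(use_gemini=True)
detailed_df, summary = evaluator.run_full_evaluation(
    json_path="qa_dataset.json",      # placeholder input path
    output_path="eval_report.json",   # summary JSON; a *_detailed.csv is written alongside
)
print(summary["RAG_Quality"])
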
995
+ def transform_qa_data(raw_data: List[Dict]) -> List[Dict]:
996
+ """
997
+ Transform qa_multihop_pass.json format to the format expected by the evaluator.
998
+
999
+ Input format (from qa_multihop_pass.json):
1000
+ - chunk_id, original_chunk, final_context, context_chunks, context_status,
1001
+ - depth_reached, chunks_added, expert_persona, domain, question, answer,
1002
+ - relevance_score, difficulty_score, selection_status, selection_reason, verification_result
1003
+
1004
+ Expected format:
1005
+ - question, answer, contexts (list), ground_truth, metadata, image_contexts, context_chunks
1006
+ """
1007
+ transformed = []
1008
+ for item in raw_data:
1009
+ # Extract context_chunks which contain both text content and image_path
1010
+ context_chunks = item.get("context_chunks", [])
1011
+
1012
+ # Extract text contexts from context_chunks or fall back to final_context
1013
+ if context_chunks:
1014
+ contexts = [chunk.get("content", "") for chunk in context_chunks if chunk.get("content")]
1015
+ else:
1016
+ contexts = [item.get("final_context", item.get("original_chunk", ""))]
1017
+
1018
+ # Extract image paths from context_chunks
1019
+ image_contexts = []
1020
+ for chunk in context_chunks:
1021
+ img_path = chunk.get("image_path")
1022
+ if img_path and img_path != "null":
1023
+ image_contexts.append(img_path)
1024
+
1025
+ # Determine if this is a visual/multimodal item
1026
+ has_images = len(image_contexts) > 0
1027
+ item_type = "visual" if has_images else "text"
1028
+
1029
+ transformed.append({
1030
+ "question": item.get("question", ""),
1031
+ "answer": item.get("answer", ""),
1032
+ "contexts": contexts,
1033
+ "ground_truth": item.get("answer", ""), # Using answer as ground_truth (no human labels)
1034
+ "metadata": {
1035
+ "chunk_id": item.get("chunk_id"),
1036
+ "expert_persona": item.get("expert_persona"),
1037
+ "domain": item.get("domain"),
1038
+ "context_status": item.get("context_status"),
1039
+ "relevance_score": item.get("relevance_score"),
1040
+ "difficulty_score": item.get("difficulty_score"),
1041
+ "selection_status": item.get("selection_status"),
1042
+ "depth_reached": item.get("depth_reached"),
1043
+ "chunks_added": item.get("chunks_added", []),
1044
+ "type": item_type,
1045
+ },
1046
+ "image_contexts": image_contexts,
1047
+ "context_chunks": context_chunks, # Keep full chunks for VLM calls
1048
+ })
1049
+ return transformed
1050
+
1051
+
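A sketch of the transformation applied to a single invented raw record:

raw_item = {  # invented record in the qa_multihop_pass.json shape
    "chunk_id": 7,
    "question": "What does the figure show?",
    "answer": "The block diagram of the pipeline.",
    "final_context": "The figure shows the block diagram of the processing pipeline.",
    "context_chunks": [{"content": "The figure shows the block diagram...", "image_path": "figs/fig3.png"}],
    "chunks_added": [],
    "domain": "hardware",
}
item = transform_qa_data([raw_item])[0]
print(item["metadata"]["type"], item["image_contexts"])  # visual ['figs/fig3.png']
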
1052
+ def analyze_missing_information(raw_data: List[Dict]) -> Dict[str, Any]:
1053
+ """
1054
+ Analyze which metrics cannot be fully evaluated and what information is missing.
1055
+ """
1056
+ missing_info = {
1057
+ "metrics_status": {},
1058
+ "missing_fields": [],
1059
+ "recommendations": [],
1060
+ "image_stats": {}
1061
+ }
1062
+
1063
+ # Check what's available
1064
+ sample = raw_data[0] if raw_data else {}
1065
+ available_fields = set(sample.keys())
1066
+
1067
+ # Count items with images in context_chunks
1068
+ items_with_images = 0
1069
+ total_images = 0
1070
+ for item in raw_data:
1071
+ context_chunks = item.get("context_chunks", [])
1072
+ item_has_image = False
1073
+ for chunk in context_chunks:
1074
+ img_path = chunk.get("image_path")
1075
+ if img_path and img_path != "null" and img_path is not None:
1076
+ total_images += 1
1077
+ item_has_image = True
1078
+ if item_has_image:
1079
+ items_with_images += 1
1080
+
1081
+ missing_info["image_stats"] = {
1082
+ "total_items": len(raw_data),
1083
+ "items_with_images": items_with_images,
1084
+ "total_images": total_images,
1085
+ "has_context_chunks": "context_chunks" in available_fields
1086
+ }
1087
+
1088
+ has_images = items_with_images > 0
1089
+
1090
+ # 1. RAGAS Standard Metrics
1091
+ missing_info["metrics_status"]["faithfulness"] = {
1092
+ "can_evaluate": True,
1093
+ "quality": "FULL",
1094
+ "notes": "Question, answer, and context available."
1095
+ }
1096
+ missing_info["metrics_status"]["answer_relevance"] = {
1097
+ "can_evaluate": True,
1098
+ "quality": "FULL",
1099
+ "notes": "Question and answer available."
1100
+ }
1101
+ missing_info["metrics_status"]["context_precision"] = {
1102
+ "can_evaluate": True,
1103
+ "quality": "APPROXIMATED",
1104
+ "notes": "No human-labeled ground_truth. Using generated answer as proxy.",
1105
+ "missing": ["Human-annotated ground truth answers"]
1106
+ }
1107
+ missing_info["metrics_status"]["context_recall"] = {
1108
+ "can_evaluate": True,
1109
+ "quality": "APPROXIMATED",
1110
+ "notes": "No human-labeled ground_truth. Using generated answer as proxy.",
1111
+ "missing": ["Human-annotated ground truth answers"]
1112
+ }
1113
+
1114
+ # 2. Multi-hop Reasoning Metric
1115
+ missing_info["metrics_status"]["multihop_reasoning"] = {
1116
+ "can_evaluate": True,
1117
+ "quality": "FULL",
1118
+ "notes": "Question, answer, and context available for LLM-as-a-Judge evaluation."
1119
+ }
1120
+
1121
+ # 3. Visual Dependency Metric
1122
+ if has_images:
1123
+ missing_info["metrics_status"]["visual_dependency"] = {
1124
+ "can_evaluate": True,
1125
+ "quality": "FULL",
1126
+ "notes": f"Found {items_with_images} items with {total_images} images in context_chunks."
1127
+ }
1128
+ else:
1129
+ missing_info["metrics_status"]["visual_dependency"] = {
1130
+ "can_evaluate": False,
1131
+ "quality": "NOT_APPLICABLE",
1132
+ "notes": "No image data found in context_chunks.",
1133
+ "missing": [
1134
+ "context_chunks[].image_path: Image file paths in context chunks",
1135
+ "Ensure source chunks have 'artifact' fields with image references"
1136
+ ]
1137
+ }
1138
+
1139
+ # 4. Multimodal VLM Metrics (custom implementation)
1140
+ if has_images:
1141
+ missing_info["metrics_status"]["multimodal_faithfulness_vlm"] = {
1142
+ "can_evaluate": True,
1143
+ "quality": "FULL",
1144
+ "notes": f"VLM-based evaluation using {total_images} images from context_chunks."
1145
+ }
1146
+ missing_info["metrics_status"]["multimodal_answer_quality_vlm"] = {
1147
+ "can_evaluate": True,
1148
+ "quality": "FULL",
1149
+ "notes": "VLM-based answer quality evaluation with visual context."
1150
+ }
1151
+ else:
1152
+ missing_info["metrics_status"]["multimodal_faithfulness_vlm"] = {
1153
+ "can_evaluate": False,
1154
+ "quality": "NOT_APPLICABLE",
1155
+ "notes": "Requires image data in context_chunks.",
1156
+ "missing": ["context_chunks[].image_path: Image file paths"]
1157
+ }
1158
+ missing_info["metrics_status"]["multimodal_answer_quality_vlm"] = {
1159
+ "can_evaluate": False,
1160
+ "quality": "NOT_APPLICABLE",
1161
+ "notes": "Requires image data in context_chunks.",
1162
+ "missing": ["context_chunks[].image_path: Image file paths"]
1163
+ }
1164
+
1165
+ # 5. Semantic Diversity
1166
+ missing_info["metrics_status"]["semantic_diversity"] = {
1167
+ "can_evaluate": True,
1168
+ "quality": "FULL",
1169
+ "notes": "Questions available for embedding-based diversity calculation."
1170
+ }
1171
+
1172
+ # 6. Context Necessity (Anti-Parametric Bias)
1173
+ missing_info["metrics_status"]["context_necessity"] = {
1174
+ "can_evaluate": True,
1175
+ "quality": "FULL",
1176
+ "notes": "Question, answer, and context available for LLM-based necessity evaluation."
1177
+ }
1178
+
1179
+ # 7. Domain Coverage
1180
+ missing_info["metrics_status"]["domain_coverage"] = {
1181
+ "can_evaluate": True,
1182
+ "quality": "FULL" if "chunks_added" in available_fields else "LIMITED",
1183
+ "notes": "Chunk references available for coverage calculation. Requires corpus chunks.json."
1184
+ }
1185
+
1186
+ # Recommendations
1187
+ recommendations = [
1188
+ "Add 'ground_truth' field with human-annotated answers for accurate context_precision/recall"
1189
+ ]
1190
+ if not has_images:
1191
+ recommendations.append("Ensure source chunks have 'artifact' fields with image paths for multimodal metrics")
1192
+ recommendations.append("Re-run QA generation with updated context_retrieved.py to capture images")
1193
+
1194
+ missing_info["recommendations"] = recommendations
1195
+
1196
+ return missing_info
1197
+
1198
+
1199
+ def identify_qa_subsets(raw_data: List[Dict]) -> Dict[str, List[Dict]]:
1200
+ """
1201
+ Identify QA subsets: multihop, multimodal, and their intersection.
1202
+
1203
+ Returns:
1204
+ Dict with keys: 'all', 'multihop', 'multimodal', 'multihop_and_multimodal'
1205
+ """
1206
+ all_qa = raw_data
1207
+
1208
+ # Multihop: hop_count > 0 (each hop adds one link between chunks)
1209
+ # hop_count = len(chunks_added) - 1
1210
+ multihop = [
1211
+ qa for qa in raw_data
1212
+ if qa.get('hop_count', len(qa.get('chunks_added', [])) - 1) > 0  # fall back to chunks_added when hop_count is absent
1213
+ ]
1214
+
1215
+ # Multimodal: content mentions figures/tables/images OR has image artifacts
1216
+ multimodal = []
1217
+ multimodal_keywords = ['figure', 'diagram', 'table', 'image', 'chart', '![image]', 'block diagram']
1218
+
1219
+ for qa in raw_data:
1220
+ content = (qa.get('original_chunk', '') + ' ' + qa.get('final_context', '')).lower()
1221
+ # Check for visual keywords in content
1222
+ has_visual_content = any(kw in content for kw in multimodal_keywords)
1223
+ # Check for explicit image paths in context_chunks
1224
+ context_chunks = qa.get('context_chunks', [])
1225
+ has_image_path = any(
1226
+ chunk.get('image_path') and chunk.get('image_path') != 'null'
1227
+ for chunk in context_chunks
1228
+ )
1229
+ if has_visual_content or has_image_path:
1230
+ multimodal.append(qa)
1231
+
1232
+ # Intersection
1233
+ multihop_ids = set(id(qa) for qa in multihop)
1234
+ multihop_and_multimodal = [qa for qa in multimodal if id(qa) in multihop_ids]
1235
+
1236
+ return {
1237
+ 'all': all_qa,
1238
+ 'multihop': multihop,
1239
+ 'multimodal': multimodal,
1240
+ 'multihop_and_multimodal': multihop_and_multimodal
1241
+ }
1242
+
1243
+
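Usage sketch, assuming 'qa_multihop_pass.json' is a placeholder path to the raw QA file:

import json

with open("qa_multihop_pass.json") as f:  # placeholder path
    raw_data = json.load(f)

subsets = identify_qa_subsets(raw_data)
for name, items in subsets.items():
    print(f"{name}: {len(items)} QA pairs")
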
1244
+ def _count_tokens(text: str) -> int:
1245
+ """
1246
+ Count tokens using tokenizer if available, otherwise use improved approximation.
1247
+
1248
+ Approximation: ~1.3 tokens per word for English (GPT-4 averages roughly 0.75 words per token).
1249
+ For better accuracy, uses tiktoken if available, otherwise word-based estimate.
1250
+ """
1251
+ if not text:
1252
+ return 0
1253
+
1254
+ # Try tiktoken (fast, accurate for GPT models, handles long text without warnings)
1255
+ try:
1256
+ import tiktoken
1257
+ enc = tiktoken.get_encoding("cl100k_base") # GPT-4 tokenizer
1258
+ return len(enc.encode(text))
1259
+ except ImportError:
1260
+ pass
1261
+
1262
+ # Try transformers tokenizer with large model_max_length to avoid warnings
1263
+ try:
1264
+ from transformers import GPT2TokenizerFast
1265
+ tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", model_max_length=100000)
1266
+ return len(tokenizer.encode(text))
1267
+ except Exception:  # Exception already covers ImportError; also catches tokenizer loading errors
1268
+ pass
1269
+
1270
+ # Fallback: improved word-based approximation
1271
+ # GPT-4 average: ~0.75 words per token (~1.3 tokens per word), but varies by language
1272
+ # Using 1.3 tokens per word as conservative estimate (accounts for punctuation, etc.)
1273
+ words = len(text.split())
1274
+ return int(words * 1.3)
1275
+
1276
+
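A quick sketch of the fallback approximation (if tiktoken is installed, _count_tokens returns the exact cl100k_base count instead):

text = "Synthetic multimodal QA benchmarks need careful token accounting."
print(_count_tokens(text))           # tokenizer count if available, else the word-based estimate
print(int(len(text.split()) * 1.3))  # pure word-based fallback: 8 words -> 10
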
1277
+ def _count_pages_from_pdf(pdf_path: str) -> int:
1278
+ """Count pages from PDF using pypdfium2 or pypdf."""
1279
+ try:
1280
+ import pypdfium2 as pdfium
1281
+ with open(pdf_path, 'rb') as f:
1282
+ pdf = pdfium.PdfDocument(f.read())
1283
+ return len(pdf)
1284
+ except ImportError:
1285
+ try:
1286
+ import pypdf
1287
+ with open(pdf_path, 'rb') as f:
1288
+ pdf_reader = pypdf.PdfReader(f)
1289
+ return len(pdf_reader.pages)
1290
+ except ImportError:
1291
+ pass
1292
+ except Exception:
1293
+ pass
1294
+ return 0
1295
+
1296
+
1297
+ def _count_pages_from_html_or_markdown(file_path: str) -> int:
1298
+ """
1299
+ Estimate pages from HTML or Markdown files.
1300
+ Uses content length with reasonable assumptions:
1301
+ - Average page: ~2000-2500 words or ~12,000-15,000 characters
1302
+ - For HTML: strips tags first
1303
+ """
1304
+ try:
1305
+ from pathlib import Path  # Path is not imported at module level, so import it here
+ content = Path(file_path).read_text(encoding='utf-8')
1306
+
1307
+ # For HTML, strip tags to get actual content
1308
+ if file_path.lower().endswith('.html') or file_path.lower().endswith('.htm'):
1309
+ import re
1310
+ # Remove HTML tags
1311
+ content = re.sub(r'<[^>]+>', '', content)
1312
+ # Remove extra whitespace
1313
+ content = ' '.join(content.split())
1314
+
1315
+ # Estimate: ~2500 words per page or ~13,000 chars per page
1316
+ word_count = len(content.split())
1317
+ char_count = len(content)
1318
+
1319
+ # Use word-based estimate (more reliable)
1320
+ pages_from_words = max(1, word_count // 2500)
1321
+ # Use char-based estimate as backup
1322
+ pages_from_chars = max(1, char_count // 13000)
1323
+
1324
+ # Return the larger of the two estimates
1325
+ return max(pages_from_words, pages_from_chars)
1326
+ except Exception:
1327
+ return 0
1328
+
1329
+
1330
+ def compute_corpus_and_dataset_stats(
1331
+ qa_data: List[Dict],
1332
+ corpus_chunks: List[Dict] = None,
1333
+ pdf_dir: str = None,
1334
+ markdown_dir: str = None
1335
+ ) -> Dict[str, Any]:
1336
+ """
1337
+ Compute comprehensive corpus and dataset statistics.
1338
+
1339
+ Computes:
1340
+ - Corpus stats: #chunks, #multimodal chunks, #tables, #images, #tokens
1341
+ - Context stats: Distribution by hop count
1342
+ - QA stats: By modality categories (Multimodal, Table, Table+Image) and hop counts
1343
+
1344
+ Args:
1345
+ qa_data: List of QA pairs
1346
+ corpus_chunks: List of corpus chunks (from chunks.json)
1347
+ pdf_dir: Path to original PDF files directory (for accurate page count using pypdfium2/pypdf)
1348
+ markdown_dir: Path to markdown/HTML files directory (fallback for page count if PDFs not available)
1349
+
1350
+ Returns:
1351
+ Dict with corpus_stats, context_stats, and qa_category_stats
1352
+ """
1353
+ import re
1354
+ from pathlib import Path
1355
+
1356
+ stats = {
1357
+ "corpus_stats": {},
1358
+ "context_stats": {},
1359
+ "qa_category_stats": {}
1360
+ }
1361
+
1362
+ # ==========================================
1363
+ # 1. CORPUS STATS (from chunks.json)
1364
+ # ==========================================
1365
+ if corpus_chunks:
1366
+ total_chunks = len(corpus_chunks)
1367
+
1368
+ # Count by chunk type
1369
+ text_chunks = 0
1370
+ table_chunks = 0
1371
+ image_chunks = 0
1372
+ multimodal_chunks = 0 # Chunks with images (standalone or embedded)
1373
+ total_tokens = 0
1374
+
1375
+ for chunk in corpus_chunks:
1376
+ chunk_type = chunk.get('chunk_type', 'text').lower()
1377
+ content = chunk.get('content', '')
1378
+ artifact = chunk.get('artifact', 'None')
1379
+
1380
+ # Count tokens using proper tokenizer or improved approximation
1381
+ total_tokens += _count_tokens(content)
1382
+
1383
+ if chunk_type == 'table':
1384
+ table_chunks += 1
1385
+ elif chunk_type == 'standalone image':
1386
+ image_chunks += 1
1387
+ multimodal_chunks += 1
1388
+ else:
1389
+ text_chunks += 1
1390
+
1391
+ # Check for embedded images in content or artifact
1392
+ has_image = (
1393
+ artifact and artifact != 'None' and '![' in str(artifact)
1394
+ ) or '![' in content
1395
+
1396
+ if has_image and chunk_type != 'standalone image':
1397
+ multimodal_chunks += 1
1398
+
1399
+ stats["corpus_stats"] = {
1400
+ "total_chunks": total_chunks,
1401
+ "text_chunks": text_chunks,
1402
+ "table_chunks": table_chunks,
1403
+ "image_chunks": image_chunks,
1404
+ "multimodal_chunks": multimodal_chunks,
1405
+ "total_tokens": total_tokens, # Renamed from total_tokens_approx
1406
+ "avg_tokens_per_chunk": round(total_tokens / total_chunks, 1) if total_chunks > 0 else 0
1407
+ }
1408
+
1409
+ # Count unique files
1410
+ unique_files = set(chunk.get('file_name', '') for chunk in corpus_chunks)
1411
+ stats["corpus_stats"]["num_source_files"] = len(unique_files)
1412
+
1413
+ # ==========================================
1414
+ # PAGE COUNTING (from PDFs if available, otherwise markdown/HTML)
1415
+ # ==========================================
1416
+ total_pages = 0
1417
+ pages_counted = 0
1418
+
1419
+ # First, try to count from original PDFs (most accurate)
1420
+ if pdf_dir and os.path.exists(pdf_dir):
1421
+ pdf_files = list(Path(pdf_dir).glob("*.pdf"))
1422
+ for pdf_file in pdf_files:
1423
+ page_count = _count_pages_from_pdf(str(pdf_file))
1424
+ if page_count > 0:
1425
+ total_pages += page_count
1426
+ pages_counted += 1
1427
+
1428
+ # If no PDFs found or PDF dir not provided, try markdown files
1429
+ if total_pages == 0 and markdown_dir and os.path.exists(markdown_dir):
1430
+ md_files = list(Path(markdown_dir).rglob("*.md"))
1431
+ html_files = list(Path(markdown_dir).rglob("*.html")) + list(Path(markdown_dir).rglob("*.htm"))
1432
+
1433
+ for md_file in md_files:
1434
+ page_count = _count_pages_from_html_or_markdown(str(md_file))
1435
+ total_pages += page_count
1436
+
1437
+ for html_file in html_files:
1438
+ page_count = _count_pages_from_html_or_markdown(str(html_file))
1439
+ total_pages += page_count
1440
+
1441
+ if total_pages > 0:
1442
+ stats["corpus_stats"]["total_pages"] = total_pages
1443
+ if pages_counted > 0:
1444
+ stats["corpus_stats"]["pages_counted_from_pdfs"] = pages_counted
1445
+
1446
+ # ==========================================
1447
+ # 2. CONTEXT STATS (from QA data)
1448
+ # ==========================================
1449
+ hop_distribution = Counter()
1450
+ context_sizes = [] # Number of chunks per context
1451
+
1452
+ for qa in qa_data:
1453
+ # Determine hop count from chunks_added
1454
+ chunks_added = qa.get('chunks_added', [])
1455
+ num_chunks = len(chunks_added) if isinstance(chunks_added, list) else 1
1456
+ context_sizes.append(num_chunks)
1457
+
1458
+ # Hop count = number of chunks - 1 (0-hop means single chunk)
1459
+ hop_count = max(0, num_chunks - 1)
1460
+ hop_distribution[hop_count] += 1
1461
+
1462
+ stats["context_stats"] = {
1463
+ "total_contexts": len(qa_data),
1464
+ "hop_distribution": dict(sorted(hop_distribution.items())),
1465
+ "avg_chunks_per_context": round(np.mean(context_sizes), 2) if context_sizes else 0,
1466
+ "max_chunks_in_context": max(context_sizes) if context_sizes else 0
1467
+ }
1468
+
1469
+ # Add summary counts
1470
+ for hop in range(max(hop_distribution.keys()) + 1 if hop_distribution else 0):
1471
+ stats["context_stats"][f"num_{hop}_hop_contexts"] = hop_distribution.get(hop, 0)
1472
+
1473
+ # ==========================================
1474
+ # 3. QA CATEGORY STATS
1475
+ # ==========================================
1476
+ # Categories based on context content:
1477
+ # - Multimodal: at least one image in context
1478
+ # - Table: at least one table in context
1479
+ # - Table+Image: both table and image in context
1480
+
1481
+ def context_has_table(qa: Dict) -> bool:
1482
+ """Check if context contains a table."""
1483
+ context_chunks = qa.get('context_chunks', [])
1484
+ final_context = qa.get('final_context', '')
1485
+
1486
+ # Check in context_chunks
1487
+ for chunk in context_chunks:
1488
+ content = chunk.get('content', '')
1489
+ if '|' in content and '-|' in content: # Markdown table pattern
1490
+ return True
1491
+
1492
+ # Check in final_context
1493
+ if '|' in final_context and '-|' in final_context:
1494
+ return True
1495
+
1496
+ return False
1497
+
1498
+ def context_has_image(qa: Dict) -> bool:
1499
+ """Check if context contains an image."""
1500
+ context_chunks = qa.get('context_chunks', [])
1501
+
1502
+ for chunk in context_chunks:
1503
+ image_path = chunk.get('image_path')
1504
+ if image_path and image_path != 'null':  # truthy check already excludes None
1505
+ return True
1506
+ content = chunk.get('content', '')
1507
+ if '![' in content: # Markdown image pattern
1508
+ return True
1509
+
1510
+ return False
1511
+
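+ # Examples of the patterns the helpers above detect (illustrative only):
+ # a markdown table row such as "| metric | value |" followed by "|---|---|"
+ # counts as a table; an inline image reference such as "![fig 3](img/fig3.png)"
+ # or a non-null chunk 'image_path' counts as an image.
+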
1512
+ # Initialize counters
1513
+ total_qa = len(qa_data)
1514
+ multimodal_qa = []
1515
+ table_qa = []
1516
+ table_image_qa = []
1517
+ text_only_qa = []
1518
+
1519
+ # Per-hop counters for each category
1520
+ hop_multimodal = Counter()
1521
+ hop_table = Counter()
1522
+ hop_table_image = Counter()
1523
+ hop_text_only = Counter()
1524
+ hop_all = Counter()
1525
+
1526
+ for qa in qa_data:
1527
+ chunks_added = qa.get('chunks_added', [])
1528
+ num_chunks = len(chunks_added) if isinstance(chunks_added, list) else 1
1529
+ hop_count = max(0, num_chunks - 1)
1530
+
1531
+ has_table = context_has_table(qa)
1532
+ has_image = context_has_image(qa)
1533
+
1534
+ hop_all[hop_count] += 1
1535
+
1536
+ if has_table and has_image:
1537
+ table_image_qa.append(qa)
1538
+ hop_table_image[hop_count] += 1
1539
+ elif has_image:
1540
+ multimodal_qa.append(qa)
1541
+ hop_multimodal[hop_count] += 1
1542
+ elif has_table:
1543
+ table_qa.append(qa)
1544
+ hop_table[hop_count] += 1
1545
+ else:
1546
+ text_only_qa.append(qa)
1547
+ hop_text_only[hop_count] += 1
1548
+
1549
+ # Also count inclusive categories (for reporting)
1550
+ # Multimodal (any image): includes table_image
1551
+ multimodal_inclusive = [qa for qa in qa_data if context_has_image(qa)]
1552
+ # Table (any table): includes table_image
1553
+ table_inclusive = [qa for qa in qa_data if context_has_table(qa)]
1554
+
1555
+ stats["qa_category_stats"] = {
1556
+ "total_qa_pairs": total_qa,
1557
+
1558
+ # Exclusive categories (mutually exclusive)
1559
+ "text_only_qa": len(text_only_qa),
1560
+ "table_only_qa": len(table_qa),
1561
+ "image_only_qa": len(multimodal_qa),
1562
+ "table_and_image_qa": len(table_image_qa),
1563
+
1564
+ # Inclusive categories (overlapping)
1565
+ "multimodal_qa_inclusive": len(multimodal_inclusive), # Any QA with image
1566
+ "table_qa_inclusive": len(table_inclusive), # Any QA with table
1567
+
1568
+ # Hop distribution for all QA
1569
+ "qa_hop_distribution": dict(sorted(hop_all.items())),
1570
+
1571
+ # Hop distribution by category
1572
+ "text_only_by_hop": dict(sorted(hop_text_only.items())),
1573
+ "table_only_by_hop": dict(sorted(hop_table.items())),
1574
+ "image_only_by_hop": dict(sorted(hop_multimodal.items())),
1575
+ "table_and_image_by_hop": dict(sorted(hop_table_image.items()))
1576
+ }
1577
+
1578
+ # Add summary counts per hop
1579
+ max_hop = max(hop_all.keys()) if hop_all else 0
1580
+ for hop in range(max_hop + 1):
1581
+ stats["qa_category_stats"][f"num_{hop}_hop_qa"] = hop_all.get(hop, 0)
1582
+
1583
+ # Compute multimodal inclusive by hop
1584
+ hop_multimodal_inclusive = Counter()
1585
+ hop_table_inclusive = Counter()
1586
+ for qa in qa_data:
1587
+ chunks_added = qa.get('chunks_added', [])
1588
+ num_chunks = len(chunks_added) if isinstance(chunks_added, list) else 1
1589
+ hop_count = max(0, num_chunks - 1)
1590
+
1591
+ if context_has_image(qa):
1592
+ hop_multimodal_inclusive[hop_count] += 1
1593
+ if context_has_table(qa):
1594
+ hop_table_inclusive[hop_count] += 1
1595
+
1596
+ stats["qa_category_stats"]["multimodal_inclusive_by_hop"] = dict(sorted(hop_multimodal_inclusive.items()))
1597
+ stats["qa_category_stats"]["table_inclusive_by_hop"] = dict(sorted(hop_table_inclusive.items()))
1598
+
1599
+ return stats
1600
+
1601
+
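+ # Illustrative usage sketch of compute_corpus_and_dataset_stats (a minimal
+ # example; the helper below is hypothetical and not called by the pipeline,
+ # and its default paths simply mirror the CLI defaults further down):
+ def _example_compute_stats(chunks_path="output/results/chunks.json",
+                            qa_path="output/results/qa_deduplicated.json"):
+     """Load a corpus and QA file and print the corpus-level statistics."""
+     with open(chunks_path, 'r') as f:
+         example_chunks = json.load(f)
+     with open(qa_path, 'r') as f:
+         example_qa = json.load(f)
+     example_stats = compute_corpus_and_dataset_stats(
+         qa_data=example_qa,
+         corpus_chunks=example_chunks,
+     )
+     print(json.dumps(example_stats["corpus_stats"], indent=2))
+     return example_stats
+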
1602
+ def run_subset_evaluation(
1603
+ qa_data: List[Dict],
1604
+ corpus_path: str = None,
1605
+ output_dir: str = None,
1606
+ sample_size: int = None,
1607
+ run_context_necessity: bool = True,
1608
+ pdf_dir: str = None,
1609
+ markdown_dir: str = None
1610
+ ) -> Dict[str, Any]:
1611
+ """
1612
+ Run comprehensive evaluation on QA dataset including new metrics.
1613
+
1614
+ Evaluates:
1615
+ - Corpus and Dataset Statistics (chunks, modalities, hop distributions)
1616
+ - Context Necessity (anti-parametric bias)
1617
+ - Domain Coverage (corpus coverage)
1618
+ - Subset statistics (multihop, multimodal counts)
1619
+ - Targeted metrics on subsets
1620
+
1621
+ Args:
1622
+ qa_data: List of QA pairs (raw format)
1623
+ corpus_path: Path to chunks.json for domain coverage
1624
+ output_dir: Output directory for reports
1625
+ sample_size: If set, sample this many items for expensive evaluations
1626
+ run_context_necessity: Whether to run context necessity (expensive)
1627
+ pdf_dir: Path to original PDF files directory (optional, for accurate page count)
1628
+ markdown_dir: Path to markdown/HTML files directory (optional, fallback for page count)
1629
+
1630
+ Returns:
1631
+ Dict with evaluation results
1632
+ """
1633
+ import random
+ from pathlib import Path  # used below when auto-detecting the PDF directory
1634
+
1635
+ results = {
1636
+ "corpus_stats": {},
1637
+ "context_stats": {},
1638
+ "qa_category_stats": {},
1639
+ "subset_statistics": {},
1640
+ "ragas_metrics": {}, # Faithfulness, Relevance, Precision, Recall
1641
+ "context_necessity": {},
1642
+ "domain_coverage": {},
1643
+ "multihop_metrics": {},
1644
+ "multimodal_metrics": {}
1645
+ }
1646
+
1647
+ # 0. Compute Corpus and Dataset Statistics
1648
+ print("\n" + "=" * 60)
1649
+ print("CORPUS AND DATASET STATISTICS")
1650
+ print("=" * 60)
1651
+
1652
+ corpus_chunks = None
1653
+ if corpus_path and os.path.exists(corpus_path):
1654
+ with open(corpus_path, 'r') as f:
1655
+ corpus_chunks = json.load(f)
1656
+
1657
+ # Auto-detect directories if not provided
1658
+ pdf_dir_auto = pdf_dir # Keep original if provided
1659
+ if corpus_path:
1660
+ base_dir = os.path.dirname(corpus_path)
1661
+
1662
+ # Try to find PDF directory (check parent directories)
1663
+ if not pdf_dir_auto:
1664
+ # Check config for input_pdf_dir
1665
+ try:
1666
+ from config_loader import get_paths_config
1667
+ paths_config = get_paths_config()
1668
+ potential_pdf_dir = paths_config.get('input_pdf_dir')
1669
+ if potential_pdf_dir and os.path.exists(potential_pdf_dir):
1670
+ pdf_dir_auto = potential_pdf_dir
1671
+ except Exception:
1672
+ pass
1673
+
1674
+ # Fallback: check common locations relative to output_dir
1675
+ if not pdf_dir_auto:
1676
+ potential_pdf_dirs = [
1677
+ os.path.join(os.path.dirname(base_dir), "data"),
1678
+ os.path.join(base_dir, "..", "data"),
1679
+ "data/documents" # Default from config
1680
+ ]
1681
+ for candidate_dir in potential_pdf_dirs:  # 'candidate_dir' avoids shadowing the pandas alias 'pd'
1682
+ if os.path.exists(candidate_dir) and any(Path(candidate_dir).glob("*.pdf")):
1683
+ pdf_dir_auto = candidate_dir
1684
+ break
1685
+
1686
+ # Auto-detect markdown_dir if not provided
1687
+ if not markdown_dir:
1688
+ potential_md_dir = os.path.join(base_dir, "markdown")
1689
+ if os.path.exists(potential_md_dir):
1690
+ markdown_dir = potential_md_dir
1691
+
1692
+ comprehensive_stats = compute_corpus_and_dataset_stats(
1693
+ qa_data=qa_data,
1694
+ corpus_chunks=corpus_chunks,
1695
+ pdf_dir=pdf_dir_auto,  # use the auto-detected PDF directory (falls back to the caller-provided value)
1696
+ markdown_dir=markdown_dir
1697
+ )
1698
+
1699
+ results["corpus_stats"] = comprehensive_stats["corpus_stats"]
1700
+ results["context_stats"] = comprehensive_stats["context_stats"]
1701
+ results["qa_category_stats"] = comprehensive_stats["qa_category_stats"]
1702
+
1703
+ # Print corpus stats
1704
+ cs = results["corpus_stats"]
1705
+ if cs:
1706
+ print(f"\n 📚 CORPUS STATS:")
1707
+ print(f" Total Chunks: {cs.get('total_chunks', 'N/A')}")
1708
+ print(f" Text Chunks: {cs.get('text_chunks', 'N/A')}")
1709
+ print(f" Table Chunks: {cs.get('table_chunks', 'N/A')}")
1710
+ print(f" Image Chunks: {cs.get('image_chunks', 'N/A')}")
1711
+ print(f" Multimodal Chunks: {cs.get('multimodal_chunks', 'N/A')}")
1712
+ print(f" Total Tokens: {cs.get('total_tokens', 'N/A'):,}")
1713
+ print(f" Source Files: {cs.get('num_source_files', 'N/A')}")
1714
+ if cs.get('total_pages'):
1715
+ page_source = "PDFs" if cs.get('pages_counted_from_pdfs') else "Markdown/HTML"
1716
+ print(f" Total Pages ({page_source}): {cs.get('total_pages')}")
1717
+
1718
+ # Print context stats
1719
+ ctx = results["context_stats"]
1720
+ print(f"\n 📋 CONTEXT STATS:")
1721
+ print(f" Total Contexts: {ctx.get('total_contexts', 'N/A')}")
1722
+ print(f" Avg Chunks/Context: {ctx.get('avg_chunks_per_context', 'N/A')}")
1723
+ print(f" Hop Distribution: {ctx.get('hop_distribution', {})}")
1724
+
1725
+ # Print QA category stats
1726
+ qa_cat = results["qa_category_stats"]
1727
+ print(f"\n 📊 QA CATEGORY STATS:")
1728
+ print(f" Total QA Pairs: {qa_cat.get('total_qa_pairs', 'N/A')}")
1729
+ print(f" Text-only QA: {qa_cat.get('text_only_qa', 'N/A')}")
1730
+ print(f" Table-only QA: {qa_cat.get('table_only_qa', 'N/A')}")
1731
+ print(f" Image-only QA: {qa_cat.get('image_only_qa', 'N/A')}")
1732
+ print(f" Table+Image QA: {qa_cat.get('table_and_image_qa', 'N/A')}")
1733
+ print(f" Multimodal QA (inclusive): {qa_cat.get('multimodal_qa_inclusive', 'N/A')}")
1734
+ print(f" Table QA (inclusive): {qa_cat.get('table_qa_inclusive', 'N/A')}")
1735
+ print(f" QA Hop Distribution: {qa_cat.get('qa_hop_distribution', {})}")
1736
+
1737
+ # 1. Identify subsets (legacy, kept for backwards compatibility)
1738
+ print("\n" + "=" * 60)
1739
+ print("SUBSET ANALYSIS")
1740
+ print("=" * 60)
1741
+
1742
+ subsets = identify_qa_subsets(qa_data)
1743
+
1744
+ results["subset_statistics"] = {
1745
+ "total_qa_pairs": len(subsets['all']),
1746
+ "multihop_count": len(subsets['multihop']),
1747
+ "multimodal_count": len(subsets['multimodal']),
1748
+ "multihop_and_multimodal_count": len(subsets['multihop_and_multimodal']),
1749
+ "single_hop_text_only": len(subsets['all']) - len(subsets['multihop']) - len(subsets['multimodal']) + len(subsets['multihop_and_multimodal'])
1750
+ }
1751
+
1752
+ print(f"\n Total QA pairs: {results['subset_statistics']['total_qa_pairs']}")
1753
+ print(f" Multihop QA pairs: {results['subset_statistics']['multihop_count']}")
1754
+ print(f" Multimodal QA pairs: {results['subset_statistics']['multimodal_count']}")
1755
+ print(f" Multihop AND Multimodal: {results['subset_statistics']['multihop_and_multimodal_count']}")
1756
+
1757
+ # 1.5. RAGAS Standard Metrics (Faithfulness, Relevance, Precision, Recall)
1758
+ print("\n" + "=" * 60)
1759
+ print("RAGAS STANDARD METRICS")
1760
+ print("=" * 60)
1761
+
1762
+ if RAGAS_AVAILABLE:
1763
+ try:
1764
+ evaluator = MultimodalFrameworkEvaluator()
1765
+
1766
+ # Transform data for RAGAS
1767
+ transformed_data = transform_qa_data(qa_data)
1768
+
1769
+ # Sample if needed for expensive RAGAS evaluation
1770
+ # RAGAS issues many LLM calls in parallel; cap the sample size to keep API costs bounded.
1771
+ RAGAS_MAX_SAMPLES = 50  # upper bound on items sent to RAGAS per run
1772
+ eval_data = transformed_data
1773
+ ragas_sample_size = min(sample_size or RAGAS_MAX_SAMPLES, RAGAS_MAX_SAMPLES)
1774
+ if len(transformed_data) > ragas_sample_size:
1775
+ eval_data = random.sample(transformed_data, ragas_sample_size)
1776
+ print(f"\n 📊 Sampling {ragas_sample_size}/{len(transformed_data)} items for RAGAS...")
1777
+
1778
+ print(f"\n ⚡ Running RAGAS evaluation on {len(eval_data)} items (parallel, ~2-5 min)...")
1779
+ ragas_df = evaluator.evaluate_ragas_standard(eval_data)
1780
+
1781
+ # Extract metrics
1782
+ ragas_results = {
1783
+ "faithfulness": float(ragas_df['faithfulness'].mean()) if 'faithfulness' in ragas_df.columns else None,
1784
+ "answer_relevance": float(ragas_df['answer_relevancy'].mean()) if 'answer_relevancy' in ragas_df.columns else None,
1785
+ "context_precision": float(ragas_df['context_precision'].mean()) if 'context_precision' in ragas_df.columns else None,
1786
+ "context_recall": float(ragas_df['context_recall'].mean()) if 'context_recall' in ragas_df.columns else None,
1787
+ "items_evaluated": len(eval_data)
1788
+ }
1789
+
1790
+ # Add optional metrics if available
1791
+ if HAS_ENTITY_RECALL and 'context_entity_recall' in ragas_df.columns:
1792
+ ragas_results["context_entity_recall"] = float(ragas_df['context_entity_recall'].mean())
1793
+ if HAS_MULTIMODAL:
1794
+ if 'multimodal_faithfulness' in ragas_df.columns:
1795
+ ragas_results["multimodal_faithfulness"] = float(ragas_df['multimodal_faithfulness'].mean())
1796
+ if 'multimodal_relevance' in ragas_df.columns:
1797
+ ragas_results["multimodal_relevance"] = float(ragas_df['multimodal_relevance'].mean())
1798
+
1799
+ results["ragas_metrics"] = ragas_results
1800
+
1801
+ print(f"\n 📊 RAGAS Results:")
1802
+ print(f" Faithfulness: {ragas_results.get('faithfulness', 'N/A'):.3f}" if ragas_results.get('faithfulness') else " Faithfulness: N/A")
1803
+ print(f" Answer Relevance: {ragas_results.get('answer_relevance', 'N/A'):.3f}" if ragas_results.get('answer_relevance') else " Answer Relevance: N/A")
1804
+ print(f" Context Precision: {ragas_results.get('context_precision', 'N/A'):.3f}" if ragas_results.get('context_precision') else " Context Precision: N/A")
1805
+ print(f" Context Recall: {ragas_results.get('context_recall', 'N/A'):.3f}" if ragas_results.get('context_recall') else " Context Recall: N/A")
1806
+
1807
+ except Exception as e:
1808
+ print(f"\n ⚠️ RAGAS evaluation failed: {e}")
1809
+ results["ragas_metrics"] = {"error": str(e)}
1810
+ else:
1811
+ print("\n ⚠️ RAGAS not available. Install with: pip install ragas datasets")
1812
+ results["ragas_metrics"] = {"error": "RAGAS not installed"}
1813
+
1814
+ # 2. Domain Coverage (if corpus provided)
1815
+ if corpus_path and os.path.exists(corpus_path):
1816
+ print("\n" + "=" * 60)
1817
+ print("DOMAIN COVERAGE EVALUATION")
1818
+ print("=" * 60)
1819
+
1820
+ with open(corpus_path, 'r') as f:
1821
+ corpus_chunks = json.load(f)
1822
+
1823
+ evaluator = MultimodalFrameworkEvaluator()
1824
+ coverage = evaluator.evaluate_domain_coverage(qa_data, corpus_chunks)
1825
+ results["domain_coverage"] = coverage
1826
+
1827
+ print(f"\n Chunk Coverage: {coverage['chunk_coverage']*100:.1f}% ({coverage['chunks_covered']}/{coverage['chunks_total']})")
1828
+ print(f" File Coverage: {coverage['file_coverage']*100:.1f}%")
1829
+ print(f" Topic Divergence (JS): {coverage['topic_divergence_js']:.4f}")
1830
+ print(f" Uncovered Chunks: {coverage['uncovered_chunks']}")
1831
+
1832
+ print("\n Coverage by Chunk Type:")
1833
+ for ctype, stats in coverage['chunk_type_coverage'].items():
1834
+ print(f" {ctype}: {stats['coverage_rate']*100:.1f}% ({stats['covered']}/{stats['total']})")
1835
+
1836
+ print("\n Coverage by File:")
1837
+ for fname, stats in coverage['coverage_by_file'].items():
1838
+ print(f" {fname}: {stats['coverage_rate']*100:.1f}% ({stats['covered_chunks']}/{stats['total_chunks']})")
1839
+
1840
+ # 3. Context Necessity (sample if too large) - BATCH PROCESSING
1841
+ if run_context_necessity:
1842
+ print("\n" + "=" * 60)
1843
+ print("CONTEXT NECESSITY EVALUATION (Anti-Parametric Bias)")
1844
+ print("=" * 60)
1845
+
1846
+ evaluator = MultimodalFrameworkEvaluator()
1847
+
1848
+ # Sample if needed
1849
+ eval_data = qa_data
1850
+ if sample_size and len(qa_data) > sample_size:
1851
+ eval_data = random.sample(qa_data, sample_size)
1852
+ print(f"\n Sampling {sample_size} items for evaluation...")
1853
+
1854
+ # Prepare batch items
1855
+ batch_items = []
1856
+ for qa in eval_data:
1857
+ context = qa.get('final_context', qa.get('original_chunk', ''))
1858
+ batch_items.append({
1859
+ 'question': qa['question'],
1860
+ 'answer': qa['answer'],
1861
+ 'context': context
1862
+ })
1863
+
1864
+ # Use batch evaluation if available
1865
+ if BATCH_AVAILABLE and len(batch_items) > 1:
1866
+ print(f"\n ⚡ Using batch processing for {len(batch_items)} items...")
1867
+ batch_results = evaluator.batch_evaluate_context_necessity(batch_items)
1868
+
1869
+ necessity_scores = [r['context_necessity_score'] for r in batch_results]
1870
+ without_context_correct = sum(1 for r in batch_results if r.get('without_context_correct'))
1871
+ else:
1872
+ # Fallback to sequential processing
1873
+ necessity_scores = []
1874
+ without_context_correct = 0
1875
+
1876
+ for i, item in enumerate(batch_items):
1877
+ if i % 10 == 0:
1878
+ print(f" Processing {i+1}/{len(batch_items)}...")
1879
+
1880
+ result = evaluator.evaluate_context_necessity(
1881
+ item['question'], item['answer'], item['context']
1882
+ )
1883
+ necessity_scores.append(result['context_necessity_score'])
1884
+ if result.get('without_context_correct'):
1885
+ without_context_correct += 1
1886
+
1887
+ avg_necessity = np.mean(necessity_scores) if necessity_scores else 0.0
1888
+ results["context_necessity"] = {
1889
+ "avg_context_necessity_score": float(avg_necessity),
1890
+ "items_evaluated": len(eval_data),
1891
+ "items_answerable_without_context": without_context_correct,
1892
+ "parametric_leakage_rate": without_context_correct / len(eval_data) if eval_data else 0.0,
1893
+ "score_distribution": {
1894
+ "high_necessity (0.8-1.0)": sum(1 for s in necessity_scores if s >= 0.8),
1895
+ "moderate_necessity (0.5-0.8)": sum(1 for s in necessity_scores if 0.5 <= s < 0.8),
1896
+ "low_necessity (0.0-0.5)": sum(1 for s in necessity_scores if s < 0.5)
1897
+ }
1898
+ }
1899
+
1900
+ print(f"\n Average Context Necessity Score: {avg_necessity:.3f}")
1901
+ print(f" Items answerable without context: {without_context_correct}/{len(eval_data)} ({without_context_correct/len(eval_data)*100:.1f}%)")
1902
+ print(f" Score Distribution:")
1903
+ for k, v in results["context_necessity"]["score_distribution"].items():
1904
+ print(f" {k}: {v} ({v/len(eval_data)*100:.1f}%)")
1905
+
1906
+ # 4. Multihop-specific metrics - BATCH PROCESSING
1907
+ if subsets['multihop']:
1908
+ print("\n" + "=" * 60)
1909
+ print("MULTIHOP METRICS (on multihop subset)")
1910
+ print("=" * 60)
1911
+
1912
+ evaluator = MultimodalFrameworkEvaluator()
1913
+
1914
+ # Prepare batch items
1915
+ batch_items = []
1916
+ for qa in subsets['multihop']:
1917
+ contexts = [qa.get('final_context', qa.get('original_chunk', ''))]
1918
+ batch_items.append({
1919
+ 'question': qa['question'],
1920
+ 'answer': qa['answer'],
1921
+ 'contexts': contexts
1922
+ })
1923
+
1924
+ # Use batch evaluation if available
1925
+ if BATCH_AVAILABLE and len(batch_items) > 1:
1926
+ print(f"\n ⚡ Using batch processing for {len(batch_items)} multihop items...")
1927
+ batch_results = evaluator.batch_evaluate_multihop_reasoning(batch_items)
1928
+
1929
+ hop_counts = [int(r.get('hop_count', 1)) for r in batch_results]
1930
+ reasoning_scores = [float(r.get('reasoning_score', 0)) for r in batch_results]
1931
+ bridge_entities = [r['bridge_entity'] for r in batch_results
1932
+ if r.get('bridge_entity') and r['bridge_entity'] != 'None']
1933
+ else:
1934
+ # Fallback to sequential
1935
+ hop_counts = []
1936
+ reasoning_scores = []
1937
+ bridge_entities = []
1938
+
1939
+ for i, item in enumerate(batch_items):
1940
+ if i % 5 == 0:
1941
+ print(f" Processing {i+1}/{len(batch_items)}...")
1942
+
1943
+ result = evaluator.evaluate_multihop_reasoning(
1944
+ item['question'], item['answer'], item['contexts']
1945
+ )
1946
+ hop_counts.append(int(result.get('hop_count', 1)))
1947
+ reasoning_scores.append(float(result.get('reasoning_score', 0)))
1948
+ if result.get('bridge_entity') and result['bridge_entity'] != 'None':
1949
+ bridge_entities.append(result['bridge_entity'])
1950
+
1951
+ results["multihop_metrics"] = {
1952
+ "items_evaluated": len(subsets['multihop']),
1953
+ "avg_hop_count": float(np.mean(hop_counts)) if hop_counts else 0.0,
1954
+ "avg_reasoning_score": float(np.mean(reasoning_scores)) if reasoning_scores else 0.0,
1955
+ "hop_distribution": dict(Counter(hop_counts)),
1956
+ "items_with_bridge_entity": len(bridge_entities),
1957
+ "sample_bridge_entities": bridge_entities[:10]
1958
+ }
1959
+
1960
+ print(f"\n Items evaluated: {len(subsets['multihop'])}")
1961
+ print(f" Average Hop Count: {results['multihop_metrics']['avg_hop_count']:.2f}")
1962
+ print(f" Average Reasoning Score: {results['multihop_metrics']['avg_reasoning_score']:.3f}")
1963
+ print(f" Hop Distribution: {results['multihop_metrics']['hop_distribution']}")
1964
+
1965
+ # 5. Multimodal-specific metrics
1966
+ if subsets['multimodal']:
1967
+ print("\n" + "=" * 60)
1968
+ print("MULTIMODAL METRICS (on multimodal subset)")
1969
+ print("=" * 60)
1970
+
1971
+ evaluator = MultimodalFrameworkEvaluator()
1972
+ visual_dependency_scores = []
1973
+
1974
+ for i, qa in enumerate(subsets['multimodal']):
1975
+ if i % 10 == 0:
1976
+ print(f" Processing {i+1}/{len(subsets['multimodal'])}...")
1977
+
1978
+ # Visual dependency test (text-only blind test)
1979
+ contexts = [qa.get('final_context', qa.get('original_chunk', ''))]
1980
+ score = evaluator.evaluate_visual_dependency(qa['question'], contexts)
1981
+ visual_dependency_scores.append(score)
1982
+
1983
+ results["multimodal_metrics"] = {
1984
+ "items_evaluated": len(subsets['multimodal']),
1985
+ "avg_visual_dependency": float(np.mean(visual_dependency_scores)) if visual_dependency_scores else 0.0,
1986
+ "items_requiring_visual": sum(1 for s in visual_dependency_scores if s > 0.5),
1987
+ "visual_necessity_rate": sum(1 for s in visual_dependency_scores if s > 0.5) / len(visual_dependency_scores) if visual_dependency_scores else 0.0
1988
+ }
1989
+
1990
+ print(f"\n Items evaluated: {len(subsets['multimodal'])}")
1991
+ print(f" Average Visual Dependency: {results['multimodal_metrics']['avg_visual_dependency']:.3f}")
1992
+ print(f" Items requiring visual info: {results['multimodal_metrics']['items_requiring_visual']}/{len(subsets['multimodal'])}")
1993
+
1994
+ # =========================================================================
1995
+ # FINAL SUMMARY - Key Metrics from MiRAGE Paper (Table 2)
1996
+ # =========================================================================
1997
+ print("\n" + "=" * 70)
1998
+ print("📊 MiRAGE EVALUATION SUMMARY (Paper Table 2 Metrics)")
1999
+ print("=" * 70)
2000
+
2001
+ # Extract metrics
2002
+ faith = results.get("ragas_metrics", {}).get("faithfulness")
2003
+ rel = results.get("ragas_metrics", {}).get("answer_relevance")
2004
+ ctx_prec = results.get("ragas_metrics", {}).get("context_precision")
2005
+ ctx_rec = results.get("ragas_metrics", {}).get("context_recall")
2006
+
2007
+ # Hop count from context_stats (avg_chunks_per_context - 1)
2008
+ avg_hops = results.get("context_stats", {}).get("avg_chunks_per_context", 1) - 1
2009
+ if avg_hops < 0:
2010
+ avg_hops = 0
2011
+
2012
+ # Reasoning score from multihop_metrics
2013
+ s_reason = results.get("multihop_metrics", {}).get("avg_reasoning_score")
2014
+
2015
+ # Visual grounding from multimodal_metrics
2016
+ vis_gr = results.get("multimodal_metrics", {}).get("avg_visual_dependency")
2017
+
2018
+ # JSD from domain_coverage
2019
+ jsd = results.get("domain_coverage", {}).get("topic_divergence_js")
2020
+
2021
+ # Context necessity (anti-parametric bias)
2022
+ ctx_nec = results.get("context_necessity", {}).get("avg_context_necessity_score")
2023
+
2024
+ # Helper to format metric values
2025
+ def fmt(val, decimals=3):
2026
+ if val is None:
2027
+ return "N/A".rjust(8)
2028
+ return f"{val:.{decimals}f}".rjust(8)
2029
+
2030
+ print("\n ┌─────────────────────────────────────────────────────────────┐")
2031
+ print(" │ CORE METRICS │")
2032
+ print(" ├─────────────────────────────────────────────────────────────┤")
2033
+ print(f" │ Faithfulness (Faith.) │ {fmt(faith)} │")
2034
+ print(f" │ Answer Relevance (Rel.) │ {fmt(rel)} │")
2035
+ print(f" │ Context Precision │ {fmt(ctx_prec)} │")
2036
+ print(f" │ Context Recall │ {fmt(ctx_rec)} │")
2037
+ print(" ├─────────────────────────────────────────────────────────────┤")
2038
+ print(" │ REASONING COMPLEXITY │")
2039
+ print(" ├─────────────────────────────────────────────────────────────┤")
2040
+ print(f" │ Avg Hops (H) │ {fmt(avg_hops, 2)} │")
2041
+ print(f" │ Reasoning Score (S_reason) │ {fmt(s_reason)} │")
2042
+ print(" ├─────────────────────────────────────────────────────────────┤")
2043
+ print(" │ MULTIMODAL & DOMAIN │")
2044
+ print(" ├─────────────────────────────────────────────────────────────┤")
2045
+ print(f" │ Visual Grounding (Vis. Gr.) │ {fmt(vis_gr)} │")
2046
+ print(f" │ Jensen-Shannon Div. (JSD) ↓ │ {fmt(jsd, 4)} │")
2047
+ print(f" │ Context Necessity │ {fmt(ctx_nec)} │")
2048
+ print(" └─────────────────────────────────────────────────────────────┘")
2049
+
2050
+ # Dataset summary
2051
+ total_qa = results.get("qa_category_stats", {}).get("total_qa_pairs", 0)
2052
+ mm_qa = results.get("qa_category_stats", {}).get("multimodal_qa_inclusive", 0)
2053
+ table_qa = results.get("qa_category_stats", {}).get("table_qa_inclusive", 0)
2054
+
2055
+ print(f"\n 📈 Dataset: {total_qa} QA pairs | {mm_qa} multimodal | {table_qa} with tables")
2056
+ print("=" * 70)
2057
+
2058
+ # Save results
2059
+ if output_dir:
2060
+ report_path = os.path.join(output_dir, "subset_evaluation_report.json")
2061
+ with open(report_path, 'w') as f:
2062
+ json.dump(results, f, indent=2, default=str)
2063
+ print(f"\n Report saved to: {report_path}")
2064
+
2065
+ return results
2066
+
2067
+
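+ # Illustrative usage sketch of run_subset_evaluation (a minimal example; the
+ # helper below is hypothetical and not called by the CLI entry point, and its
+ # default paths mirror the defaults used there):
+ def _example_run_subset_evaluation(qa_path="output/results/qa_deduplicated.json",
+                                    corpus_path="output/results/chunks.json"):
+     """Run the subset evaluation on a small sample and return the results dict."""
+     with open(qa_path, 'r') as f:
+         example_qa = json.load(f)
+     return run_subset_evaluation(
+         qa_data=example_qa,
+         corpus_path=corpus_path,
+         output_dir=os.path.dirname(qa_path),
+         sample_size=10,               # keep LLM-judged metrics cheap
+         run_context_necessity=False,  # skip the most expensive step
+     )
+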
2068
+ def main(json_path: str, output_dir: str = None):
2069
+ """
2070
+ Main function to evaluate QA dataset quality.
2071
+
2072
+ Args:
2073
+ json_path: Path to the qa_multihop_pass.json file
2074
+ output_dir: Directory to save output reports (defaults to same dir as input)
2075
+ """
2076
+ import os
2077
+
2078
+ if output_dir is None:
2079
+ output_dir = os.path.dirname(json_path)
2080
+
2081
+ print("=" * 60)
2082
+ print("QA DATASET QUALITY EVALUATION")
2083
+ print("=" * 60)
2084
+
2085
+ # 1. Load raw data
2086
+ print(f"\n[1/5] Loading dataset from: {json_path}")
2087
+ with open(json_path, 'r') as f:
2088
+ raw_data = json.load(f)
2089
+ print(f" Loaded {len(raw_data)} QA pairs")
2090
+
2091
+ # 2. Analyze missing information BEFORE transformation
2092
+ print("\n[2/5] Analyzing data completeness...")
2093
+ missing_info = analyze_missing_information(raw_data)
2094
+
2095
+ print("\n" + "-" * 60)
2096
+ print("METRICS EVALUATION STATUS:")
2097
+ print("-" * 60)
2098
+ for metric, status in missing_info["metrics_status"].items():
2099
+ symbol = "✓" if status["can_evaluate"] else "✗"
2100
+ quality = status["quality"]
2101
+ print(f" {symbol} {metric}: {quality}")
2102
+ if status.get("missing"):
2103
+ for m in status["missing"]:
2104
+ print(f" Missing: {m}")
2105
+
2106
+ print("\n" + "-" * 60)
2107
+ print("RECOMMENDATIONS:")
2108
+ print("-" * 60)
2109
+ for rec in missing_info["recommendations"]:
2110
+ print(f" • {rec}")
2111
+
2112
+ # 3. Transform data
2113
+ print("\n[3/5] Transforming data to evaluation format...")
2114
+ transformed_data = transform_qa_data(raw_data)
2115
+
2116
+ # 4. Run evaluation (only metrics that can be evaluated)
2117
+ print("\n[4/5] Running evaluation...")
2118
+
2119
+ try:
2120
+ evaluator = MultimodalFrameworkEvaluator()
2121
+
2122
+ # Save transformed data for the evaluator
2123
+ transformed_path = os.path.join(output_dir, "qa_transformed_for_eval.json")
2124
+ with open(transformed_path, 'w') as f:
2125
+ json.dump(transformed_data, f, indent=2)
2126
+
2127
+ output_path = os.path.join(output_dir, "eval_report.json")
2128
+ final_df, report = evaluator.run_full_evaluation(transformed_path, output_path)
2129
+
2130
+ # Add missing info analysis to report
2131
+ report["data_completeness"] = missing_info
2132
+
2133
+ # Save updated report
2134
+ with open(output_path, 'w') as f:
2135
+ json.dump(report, f, indent=4)
2136
+
2137
+ print(f"\n[5/5] Reports saved to:")
2138
+ print(f" - {output_path}")
2139
+ print(f" - {output_path.replace('.json', '_detailed.csv')}")
2140
+
2141
+ return final_df, report
2142
+
2143
+ except Exception as e:
2144
+ print(f"\n[ERROR] Evaluation failed: {e}")
2145
+ print("\nRunning basic statistics only...")
2146
+
2147
+ # Compute basic statistics without LLM calls
2148
+ basic_stats = {
2149
+ "total_samples": len(raw_data),
2150
+ "context_status_distribution": Counter(
2151
+ item.get("context_status", "UNKNOWN") for item in raw_data
2152
+ ),
2153
+ "avg_relevance_score": np.mean([
2154
+ float(item.get("relevance_score", 0)) for item in raw_data
2155
+ if item.get("relevance_score")
2156
+ ]),
2157
+ "avg_difficulty_score": np.mean([
2158
+ float(item.get("difficulty_score", 0)) for item in raw_data
2159
+ if item.get("difficulty_score")
2160
+ ]),
2161
+ "domain_distribution": Counter(
2162
+ item.get("domain", "UNKNOWN") for item in raw_data
2163
+ ),
2164
+ "data_completeness": missing_info
2165
+ }
2166
+
2167
+ output_path = os.path.join(output_dir, "eval_report_basic.json")
2168
+ with open(output_path, 'w') as f:
2169
+ # Convert Counter objects to dict for JSON serialization
2170
+ basic_stats["context_status_distribution"] = dict(basic_stats["context_status_distribution"])
2171
+ basic_stats["domain_distribution"] = dict(basic_stats["domain_distribution"])
2172
+ json.dump(basic_stats, f, indent=4)
2173
+
2174
+ print(f"\n[5/5] Basic report saved to: {output_path}")
2175
+ print("\nBasic Statistics:")
2176
+ print(json.dumps({k: v for k, v in basic_stats.items() if k != "data_completeness"}, indent=2))
2177
+
2178
+ return None, basic_stats
2179
+
2180
+
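+ # Illustrative programmatic use of main() (a sketch; _example_run_main is
+ # hypothetical, the input path is a placeholder, and GOOGLE_API_KEY must
+ # already be configured because the evaluator issues LLM calls):
+ def _example_run_main(qa_path="output/results/qa_multihop_pass.json"):
+     """Run the full quality evaluation and print the completeness recommendations."""
+     final_df, report = main(qa_path)
+     for rec in report.get("data_completeness", {}).get("recommendations", []):
+         print(f"- {rec}")
+     return final_df, report
+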
2181
+ if __name__ == "__main__":
2182
+ import argparse
2183
+
2184
+ # Load the Gemini API key: prefer an existing GOOGLE_API_KEY, otherwise read the key file
2185
+ API_KEY_PATH = os.environ.get("GEMINI_API_KEY_PATH", os.path.expanduser("~/.config/gemini/api_key.txt"))
+ if not os.environ.get("GOOGLE_API_KEY") and os.path.exists(API_KEY_PATH):
2186
+     with open(API_KEY_PATH, 'r') as f:
2187
+         os.environ["GOOGLE_API_KEY"] = f.read().strip()
2188
+
2189
+ # Default paths (override via command line arguments)
2190
+ DEFAULT_QA_PATH = "output/results/qa_deduplicated.json"
2191
+ DEFAULT_CORPUS_PATH = "output/results/chunks.json"
2192
+
2193
+ parser = argparse.ArgumentParser(description="Evaluate QA dataset quality")
2194
+ parser.add_argument("--qa-file", "-q", default=DEFAULT_QA_PATH, help="Path to QA JSON file")
2195
+ parser.add_argument("--corpus-file", "-c", default=DEFAULT_CORPUS_PATH, help="Path to corpus chunks.json")
2196
+ parser.add_argument("--output-dir", "-o", default=None, help="Output directory for reports")
2197
+ parser.add_argument("--sample-size", "-s", type=int, default=50, help="Sample size for expensive metrics")
2198
+ parser.add_argument("--skip-context-necessity", action="store_true", help="Skip context necessity evaluation")
2199
+
2200
+ args = parser.parse_args()
2201
+
2202
+ # Set output dir
2203
+ output_dir = args.output_dir or os.path.dirname(args.qa_file)
2204
+
2205
+ # Load QA data
2206
+ print(f"Loading QA data from: {args.qa_file}")
2207
+ with open(args.qa_file, 'r') as f:
2208
+ qa_data = json.load(f)
2209
+ print(f"Loaded {len(qa_data)} QA pairs")
2210
+
2211
+ # Run evaluation
2212
+ results = run_subset_evaluation(
2213
+ qa_data=qa_data,
2214
+ corpus_path=args.corpus_file,
2215
+ output_dir=output_dir,
2216
+ sample_size=args.sample_size,
2217
+ run_context_necessity=not args.skip_context_necessity
2218
+ )
2219
+
2220
+ print("\n" + "=" * 60)
2221
+ print("FINAL EVALUATION RESULTS")
2222
+ print("=" * 60)
2223
+ print(json.dumps(results, indent=2, default=str))