mirage-benchmark 1.0.4__py3-none-any.whl


@@ -0,0 +1,2172 @@
1
+ """
2
+ Optimized Metrics Evaluation with Minimal LLM Calls
3
+
4
+ Harmonized with metrics.py - same metric names, optimized implementation.
5
+
6
+ Strategy:
7
+ - 1 PREPARATION call per QA pair: Extract claims + reverse questions
8
+ - 1 EVALUATION call per metric: Batch verify all claims/contexts together
9
+ - Total: 4-6 LLM calls per QA vs 10-20+ in standard RAGAS
10
+
11
+ METRICS IMPLEMENTED (same as metrics.py):
12
+ 1. Faithfulness - Answer claims supported by context?
13
+ 2. Answer Relevancy - Answer addresses question?
14
+ 3. Context Precision - Retrieved chunks relevant and well-ranked?
15
+ 4. Context Recall - Context contains reference info? (skip for dataset creation)
16
+ 5. Multimodal Faithfulness - Answer grounded in text+images?
17
+ 6. Multimodal Relevance - Answer uses multimodal context?
18
+ 7. Context Necessity - Requires context to answer? (anti-parametric bias)
19
+ 8. Semantic Diversity - Questions diverse?
20
+ 9. Domain Coverage - Corpus coverage?
21
+ 10. Multihop Reasoning - Multi-step reasoning quality?
22
+ 11. Visual Dependency - Needs image to answer?
23
+
24
+ Usage modes:
25
+ - Dataset Creation: faithfulness, answer_relevancy, context_necessity
26
+ - RAG Evaluation: All metrics
27
+ """
28
+
29
+ import os
30
+ import json
31
+ import re
32
+ import logging
33
+ from typing import List, Dict, Any, Optional, Tuple
34
+ from dataclasses import dataclass, field
35
+ from concurrent.futures import ThreadPoolExecutor, as_completed
36
+ from collections import Counter
37
+ import numpy as np
38
+
39
+ # Imports
40
+ try:
41
+ from call_llm import call_llm, batch_call_llm, call_vlm_interweaved, batch_call_vlm_interweaved, API_KEY
42
+ LLM_AVAILABLE = True
43
+ except ImportError:
44
+ LLM_AVAILABLE = False
45
+ API_KEY = None
46
+ print("Warning: call_llm not available")
47
+
48
+ try:
49
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
50
+ import google.generativeai as genai
51
+ GEMINI_AVAILABLE = True
52
+ except ImportError:
53
+ GEMINI_AVAILABLE = False
54
+
55
+ # Fallback: sentence-transformers for local embeddings
56
+ try:
57
+ from sentence_transformers import SentenceTransformer
58
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
59
+ except ImportError:
60
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
61
+
62
+ try:
63
+ from sklearn.metrics.pairwise import cosine_similarity
64
+ SKLEARN_AVAILABLE = True
65
+ except ImportError:
66
+ SKLEARN_AVAILABLE = False
67
+
68
+
69
+ class LocalEmbeddingWrapper:
70
+ """Wrapper to make sentence-transformers compatible with langchain embeddings interface."""
71
+ def __init__(self, model_name: str = "BAAI/bge-m3"):
72
+ self.model = SentenceTransformer(model_name, trust_remote_code=True)
73
+ print(f"āœ… Loaded local embedding model: {model_name}")
74
+
75
+ def embed_query(self, text: str) -> List[float]:
76
+ """Embed a single query text."""
77
+ return self.model.encode(text, normalize_embeddings=True).tolist()
78
+
79
+ def embed_documents(self, texts: List[str]) -> List[List[float]]:
80
+ """Embed multiple documents."""
81
+ return self.model.encode(texts, normalize_embeddings=True).tolist()
82
+
83
+ # Import prompts from prompt.py
84
+ try:
85
+ from prompt import PROMPTS_METRICS_OPT
86
+ PROMPTS_AVAILABLE = True
87
+ except ImportError:
88
+ PROMPTS_AVAILABLE = False
89
+ PROMPTS_METRICS_OPT = {}
90
+ print("Warning: PROMPTS_METRICS_OPT not available from prompt.py")
91
+
92
+
93
+ # ============================================================================
94
+ # HELPER FUNCTIONS
95
+ # ============================================================================
96
+
97
+ def extract_image_path_from_content(content: str, file_name: str = None) -> Optional[str]:
98
+ """Extract image path from markdown content (e.g., ![Image](path))
99
+
100
+ Args:
101
+ content: The markdown content string
102
+ file_name: The source document name (used for directory structure)
103
+
104
+ Returns:
105
+ First valid image path found, or None
106
+ """
107
+ if not content:
108
+ return None
109
+
110
+ # Find all markdown image references: ![alt](path)
111
+ matches = re.findall(r'!\[[^\]]*\]\(([^)]+)\)', content)
112
+
113
+ if not matches:
114
+ return None
115
+
116
+ # Use first match
117
+ rel_path = matches[0]
118
+
119
+ # Try to construct absolute path (override via OUTPUT_DIR/markdown)
120
+ IMAGE_BASE_DIR = "output/results/markdown"
121
+
122
+ if file_name and file_name != 'unknown':
123
+ abs_path = f"{IMAGE_BASE_DIR}/{file_name}/{rel_path}"
124
+ else:
125
+ # Any other relative path (including 'ref_artifacts/...') resolves directly
126
+ # against the base directory.
127
+ abs_path = f"{IMAGE_BASE_DIR}/{rel_path}"
129
+
130
+ # Check if file exists
131
+ if os.path.exists(abs_path):
132
+ return abs_path
133
+
134
+ # Even if file doesn't exist, return the path if it looks valid
135
+ # (for detection purposes - file might be in different location)
136
+ return abs_path if rel_path else None
137
+
138
+
139
+ def has_image_in_chunk(chunk: Dict) -> bool:
140
+ """Check if a chunk has an image (via image_path or content)
141
+
142
+ Args:
143
+ chunk: Chunk dictionary with 'image_path' and/or 'content'
144
+
145
+ Returns:
146
+ True if chunk has an image reference
147
+ """
148
+ if not isinstance(chunk, dict):
149
+ return False
150
+
151
+ # Check image_path field first
152
+ image_path = chunk.get('image_path')
153
+ if image_path and image_path != 'null':
154
+ return True
155
+
156
+ # Check content for markdown image references
157
+ content = chunk.get('content', '')
158
+ if content and re.search(r'!\[[^\]]*\]\([^)]+\)', content):
159
+ return True
160
+
161
+ return False
162
+
163
+
164
+ # ============================================================================
165
+ # PROMPTS FOR OPTIMIZED METRICS (from prompt.py or fallback)
166
+ # ============================================================================
167
+
168
+ # Use imported prompts or define fallbacks
169
+ PROMPT_PREPARE_QA = PROMPTS_METRICS_OPT.get("prepare_qa", """You are a QA analysis assistant. Analyze the following QA pair and extract information needed for evaluation.
170
+
171
+ QUESTION: {question}
172
+
173
+ ANSWER: {answer}
174
+
175
+ REFERENCE (Ground Truth): {reference}
176
+
177
+ TASKS:
178
+ 1. Extract ALL atomic claims/statements from the ANSWER (factual assertions that can be verified)
179
+ 2. Extract ALL atomic claims/statements from the REFERENCE
180
+ 3. Generate {num_reverse_questions} diverse questions that the ANSWER could plausibly answer
181
+
182
+ OUTPUT FORMAT (use exactly this format):
183
+ ANSWER_CLAIMS:
184
+ - [claim 1]
185
+ - [claim 2]
186
+ ...
187
+
188
+ REFERENCE_CLAIMS:
189
+ - [claim 1]
190
+ - [claim 2]
191
+ ...
192
+
193
+ REVERSE_QUESTIONS:
194
+ - [question 1]
195
+ - [question 2]
196
+ - [question 3]
197
+ """)
198
+
199
+ PROMPT_FAITHFULNESS = PROMPTS_METRICS_OPT.get("faithfulness", """You are a faithfulness evaluator. Determine if each claim from the answer can be inferred from the given context.
200
+
201
+ CONTEXT:
202
+ {context}
203
+
204
+ CLAIMS TO VERIFY:
205
+ {claims}
206
+
207
+ For each claim, respond with ONLY "SUPPORTED" or "NOT_SUPPORTED".
208
+
209
+ OUTPUT FORMAT (one per line, in order):
210
+ CLAIM_1: SUPPORTED/NOT_SUPPORTED
211
+ CLAIM_2: SUPPORTED/NOT_SUPPORTED
212
+ ...
213
+ """)
214
+
215
+ PROMPT_CONTEXT_RECALL = PROMPTS_METRICS_OPT.get("context_recall", """You are a context recall evaluator. Determine if each claim from the reference/ground truth can be attributed to the retrieved context.
216
+
217
+ CONTEXT:
218
+ {context}
219
+
220
+ REFERENCE CLAIMS TO VERIFY:
221
+ {claims}
222
+
223
+ For each claim, respond with ONLY "ATTRIBUTED" or "NOT_ATTRIBUTED".
224
+
225
+ OUTPUT FORMAT (one per line, in order):
226
+ CLAIM_1: ATTRIBUTED/NOT_ATTRIBUTED
227
+ CLAIM_2: ATTRIBUTED/NOT_ATTRIBUTED
228
+ ...
229
+ """)
230
+
231
+ PROMPT_CONTEXT_PRECISION = PROMPTS_METRICS_OPT.get("context_precision", """You are a context precision evaluator. For each context chunk, determine if it is RELEVANT or NOT_RELEVANT to answering the question, given the reference answer.
232
+
233
+ QUESTION: {question}
234
+ REFERENCE ANSWER: {reference}
235
+
236
+ CONTEXT CHUNKS:
237
+ {contexts}
238
+
239
+ For each context chunk, respond with ONLY "RELEVANT" or "NOT_RELEVANT".
240
+
241
+ OUTPUT FORMAT (one per line, in order):
242
+ CHUNK_1: RELEVANT/NOT_RELEVANT
243
+ CHUNK_2: RELEVANT/NOT_RELEVANT
244
+ ...
245
+ """)
246
+
247
+ PROMPT_MULTIMODAL_FAITHFULNESS = PROMPTS_METRICS_OPT.get("multimodal_faithfulness", """You are a multimodal faithfulness evaluator. Verify if EACH claim from the answer can be inferred from the provided context (text AND/OR images).
248
+
249
+ QUESTION: {question}
250
+
251
+ ANSWER: {answer}
252
+
253
+ CLAIMS TO VERIFY:
254
+ {claims}
255
+
256
+ For EACH claim, determine:
257
+ 1. Is it SUPPORTED or NOT_SUPPORTED by the context?
258
+ 2. If supported, is it from TEXT, IMAGE, or BOTH?
259
+
260
+ OUTPUT FORMAT (one per line, in order):
261
+ CLAIM_1: SUPPORTED/NOT_SUPPORTED | SOURCE: TEXT/IMAGE/BOTH/NONE
262
+ CLAIM_2: SUPPORTED/NOT_SUPPORTED | SOURCE: TEXT/IMAGE/BOTH/NONE
263
+ ...
264
+
265
+ SUMMARY:
266
+ TEXT_GROUNDED: YES/NO
267
+ VISUAL_GROUNDED: YES/NO/NA
268
+ SUPPORTED_COUNT: [number]
269
+ TOTAL_CLAIMS: [number]
270
+ """)
271
+
272
+ PROMPT_MULTIMODAL_RELEVANCE = PROMPTS_METRICS_OPT.get("multimodal_relevance", """You are a multimodal relevance evaluator. Generate {num_questions} questions that the given answer could plausibly be answering, then evaluate relevance.
273
+
274
+ ANSWER: {answer}
275
+
276
+ CONTEXT: (text and images provided below)
277
+
278
+ TASK:
279
+ 1. Generate {num_questions} diverse questions that this answer could address
280
+ 2. For each generated question, indicate if it uses TEXT context, IMAGE context, or BOTH
281
+
282
+ OUTPUT FORMAT:
283
+ GENERATED_QUESTIONS:
284
+ Q1: [question] | USES: TEXT/IMAGE/BOTH
285
+ Q2: [question] | USES: TEXT/IMAGE/BOTH
286
+ Q3: [question] | USES: TEXT/IMAGE/BOTH
287
+
288
+ CONTEXT_UTILIZATION:
289
+ USES_TEXT: YES/NO
290
+ USES_IMAGES: YES/NO/NA
291
+ RELEVANCE_SCORE: [0.0-1.0]
292
+ """)
293
+
294
+ PROMPT_CONTEXT_NECESSITY_WITHOUT = PROMPTS_METRICS_OPT.get("context_necessity_without", """You are an expert assistant. Answer the following question using ONLY your general knowledge. Do NOT make up specific facts.
295
+
296
+ If you cannot answer confidently without additional context, respond with: CANNOT_ANSWER
297
+
298
+ QUESTION: {question}
299
+
300
+ YOUR ANSWER:""")
301
+
302
+ PROMPT_CONTEXT_NECESSITY_VERIFY = PROMPTS_METRICS_OPT.get("context_necessity_verify", """Compare the model's answer to the ground truth answer.
303
+
304
+ GROUND TRUTH: {ground_truth}
305
+
306
+ MODEL ANSWER: {model_answer}
307
+
308
+ Respond with exactly one of:
309
+ - MATCH: YES (if model answer is correct and complete)
310
+ - MATCH: PARTIAL (if model answer is partially correct)
311
+ - MATCH: NO (if model answer is incorrect or missing key information)
312
+
313
+ YOUR VERDICT:""")
314
+
315
+ PROMPT_MULTIHOP_REASONING = PROMPTS_METRICS_OPT.get("multihop_reasoning", """Analyze if answering this question requires multi-hop reasoning (combining information from multiple sources).
316
+
317
+ CONTEXTS:
318
+ {contexts}
319
+
320
+ QUESTION: {question}
321
+
322
+ ANSWER: {answer}
323
+
324
+ Evaluate:
325
+ 1. HOP_COUNT: How many distinct pieces of information must be combined? (1 = single fact, 2+ = multi-hop)
326
+ 2. REASONING_SCORE: How complex is the reasoning? (0.0 = trivial, 1.0 = complex multi-step)
327
+ 3. BRIDGE_ENTITY: What entity/concept connects the information pieces? (or "None")
328
+
329
+ OUTPUT FORMAT:
330
+ HOP_COUNT: [number]
331
+ REASONING_SCORE: [0.0-1.0]
332
+ BRIDGE_ENTITY: [entity or None]
333
+ EXPLANATION: [brief explanation]
334
+ """)
335
+
336
+ PROMPT_VISUAL_DEPENDENCY = PROMPTS_METRICS_OPT.get("visual_dependency", """You are given ONLY text context (no images). Determine if you can fully answer the question.
337
+
338
+ TEXT CONTEXT:
339
+ {contexts}
340
+
341
+ QUESTION: {question}
342
+
343
+ If you can answer completely using ONLY the text above, provide your answer.
344
+ If you CANNOT answer because visual information (figures, diagrams, images) is missing, respond with: MISSING_VISUAL
345
+
346
+ YOUR RESPONSE:""")
347
+
348
+
349
+ # ============================================================================
350
+ # DATA CLASSES
351
+ # ============================================================================
352
+
353
+ @dataclass
354
+ class PreparedQA:
355
+ """Prepared QA pair with extracted claims and reverse questions"""
356
+ question: str
357
+ answer: str
358
+ reference: str
359
+ contexts: List[str]
360
+ context_chunks: List[Dict] = field(default_factory=list)
361
+
362
+ # Extracted by preparation call
363
+ answer_claims: List[str] = field(default_factory=list)
364
+ reference_claims: List[str] = field(default_factory=list)
365
+ reverse_questions: List[str] = field(default_factory=list)
366
+ concept_hops_question: str = ""
367
+
368
+ # Metric scores (filled after evaluation)
369
+ faithfulness_score: float = 0.0
370
+ context_recall_score: float = 0.0
371
+ context_precision_score: float = 0.0
372
+ answer_relevancy_score: float = 0.0
373
+ multimodal_faithfulness_score: float = 0.0
374
+ multimodal_relevance_score: float = 0.0
375
+
376
+ # Detailed results
377
+ faithfulness_details: Dict = field(default_factory=dict)
378
+ context_recall_details: Dict = field(default_factory=dict)
379
+ context_precision_details: Dict = field(default_factory=dict)
380
+ multimodal_details: Dict = field(default_factory=dict)
381
+
382
+
383
+ # ============================================================================
384
+ # OPTIMIZED METRICS EVALUATOR
385
+ # ============================================================================
386
+
387
+ class OptimizedMetricsEvaluator:
388
+ """
389
+ Evaluates RAGAS-style metrics with minimal LLM calls.
390
+
391
+ Call Pattern per QA pair:
392
+ 1. prepare_qa() - 1 LLM call to extract claims + generate reverse questions
393
+ 2. evaluate_faithfulness() - 1 LLM call to verify all answer claims
394
+ 3. evaluate_context_recall() - 1 LLM call to verify all reference claims
395
+ 4. evaluate_context_precision() - 1 LLM call to evaluate all contexts
396
+ 5. evaluate_answer_relevancy() - 0 LLM calls (uses embeddings only)
397
+ 6. evaluate_multimodal_faithfulness() - 1 VLM call (if images present)
398
+ 7. evaluate_multimodal_relevance() - 1 VLM call (if images present)
399
+
400
+ Total: 4-6 LLM/VLM calls per QA pair (vs 10-20+ with default RAGAS)
401
+ """
402
+
403
+ def __init__(self,
404
+ model_name: str = "gemini-2.0-flash",
405
+ embedding_model: str = "models/text-embedding-004",
406
+ num_reverse_questions: int = 3,
407
+ max_workers: int = 8,
408
+ enable_multimodal: bool = True):
409
+ self.model_name = model_name
410
+ self.embedding_model = embedding_model
411
+ self.num_reverse_questions = num_reverse_questions
412
+ self.max_workers = max_workers
413
+ self.enable_multimodal = enable_multimodal
414
+
415
+ # Initialize embeddings with fallback
416
+ self.embeddings = None
417
+ self.embedding_type = None
418
+
419
+ # Try Gemini embeddings first
420
+ if GEMINI_AVAILABLE and API_KEY:
421
+ try:
422
+ self.embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model, google_api_key=API_KEY)
423
+ self.embedding_type = "gemini"
424
+ print(f"āœ… Using Gemini embeddings: {embedding_model}")
425
+ except Exception as e:
426
+ print(f"Warning: Failed to initialize Gemini embeddings: {e}")
427
+
428
+ # Fallback to sentence-transformers (local embeddings)
429
+ if self.embeddings is None and SENTENCE_TRANSFORMERS_AVAILABLE:
430
+ try:
431
+ self.embeddings = LocalEmbeddingWrapper("BAAI/bge-m3")
432
+ self.embedding_type = "local"
433
+ except Exception as e:
434
+ print(f"Warning: Failed to initialize local embeddings: {e}")
435
+
436
+ # No embeddings available
437
+ if self.embeddings is None:
438
+ if not GEMINI_AVAILABLE and not SENTENCE_TRANSFORMERS_AVAILABLE:
439
+ print("Warning: No embedding model available (install langchain-google-genai or sentence-transformers)")
440
+ print(" answer_relevancy and semantic_diversity will be skipped")
441
+ elif not GEMINI_AVAILABLE:
442
+ print("Warning: langchain-google-genai not installed, no fallback available")
443
+ elif not API_KEY:
444
+ print("Warning: API key not available for embeddings")
445
+
446
+ # ========================================================================
447
+ # STEP 1: PREPARATION (1 LLM call per QA)
448
+ # ========================================================================
449
+
450
+ def prepare_qa(self, question: str, answer: str, reference: str,
451
+ contexts: List[str], context_chunks: List[Dict] = None) -> PreparedQA:
452
+ """
453
+ Prepare QA pair by extracting claims and generating reverse questions.
454
+ 1 LLM call.
455
+ """
456
+ prepared = PreparedQA(
457
+ question=question,
458
+ answer=answer,
459
+ reference=reference,
460
+ contexts=contexts,
461
+ context_chunks=context_chunks or []
462
+ )
463
+
464
+ prompt = PROMPT_PREPARE_QA.format(
465
+ question=question,
466
+ answer=answer,
467
+ reference=reference,
468
+ num_reverse_questions=self.num_reverse_questions
469
+ )
470
+
471
+ try:
472
+ response = call_llm(prompt)
473
+
474
+ # Parse answer claims
475
+ answer_claims_match = re.search(
476
+ r'ANSWER_CLAIMS:\s*\n(.*?)(?=REFERENCE_CLAIMS:|$)',
477
+ response, re.DOTALL
478
+ )
479
+ if answer_claims_match:
480
+ claims_text = answer_claims_match.group(1)
481
+ prepared.answer_claims = [
482
+ c.strip().lstrip('- ').strip()
483
+ for c in claims_text.strip().split('\n')
484
+ if c.strip() and c.strip() != '-'
485
+ ]
486
+
487
+ # Parse reference claims
488
+ ref_claims_match = re.search(
489
+ r'REFERENCE_CLAIMS:\s*\n(.*?)(?=REVERSE_QUESTIONS:|$)',
490
+ response, re.DOTALL
491
+ )
492
+ if ref_claims_match:
493
+ claims_text = ref_claims_match.group(1)
494
+ prepared.reference_claims = [
495
+ c.strip().lstrip('- ').strip()
496
+ for c in claims_text.strip().split('\n')
497
+ if c.strip() and c.strip() != '-'
498
+ ]
499
+
500
+ # Parse concept hops for the question (only present if the prepare prompt emits a
+ # CONCEPT_HOPS_QUESTION section; the inline fallback prompt above does not request
+ # one, so this may remain empty)
501
+ concept_hops_match = re.search(
502
+ r'CONCEPT_HOPS_QUESTION:\s*\n(.*?)(?=REVERSE_QUESTIONS:|$)',
503
+ response, re.DOTALL
504
+ )
505
+ if concept_hops_match:
506
+ prepared.concept_hops_question = concept_hops_match.group(1).strip()
507
+
508
+ # Parse reverse questions
509
+ rev_q_match = re.search(
510
+ r'REVERSE_QUESTIONS:\s*\n(.*?)$',
511
+ response, re.DOTALL
512
+ )
513
+ if rev_q_match:
514
+ q_text = rev_q_match.group(1)
515
+ prepared.reverse_questions = [
516
+ q.strip().lstrip('- ').strip()
517
+ for q in q_text.strip().split('\n')
518
+ if q.strip() and q.strip() != '-'
519
+ ][:self.num_reverse_questions]
520
+
521
+ except Exception as e:
522
+ print(f"Error in prepare_qa: {e}")
523
+ # Fallback: use answer/reference as single claim
524
+ prepared.answer_claims = [answer] if answer else []
525
+ prepared.reference_claims = [reference] if reference else []
526
+ prepared.reverse_questions = [question]
527
+
528
+ return prepared
529
+
530
+ # ========================================================================
531
+ # STEP 2: METRIC EVALUATION (1 LLM call each)
532
+ # ========================================================================
533
+
534
+ def evaluate_faithfulness(self, prepared: PreparedQA) -> float:
535
+ """
536
+ Evaluate faithfulness: Do answer claims follow from context?
537
+ 1 LLM call to verify ALL claims at once.
538
+ """
539
+ if not prepared.answer_claims:
540
+ prepared.faithfulness_score = 1.0
541
+ prepared.faithfulness_details = {"claims": [], "supported": 0, "total": 0}
542
+ return 1.0
543
+
544
+ # Format claims for prompt
545
+ claims_text = "\n".join([
546
+ f"CLAIM_{i+1}: {claim}"
547
+ for i, claim in enumerate(prepared.answer_claims)
548
+ ])
549
+
550
+ context_text = "\n\n".join(prepared.contexts)
551
+
552
+ prompt = PROMPT_FAITHFULNESS.format(
553
+ context=context_text,
554
+ claims=claims_text
555
+ )
556
+
557
+ try:
558
+ response = call_llm(prompt)
559
+
560
+ # Parse results
561
+ supported_count = 0
562
+ claim_results = []
563
+
564
+ for i, claim in enumerate(prepared.answer_claims):
565
+ pattern = rf'CLAIM_{i+1}:\s*(SUPPORTED|NOT_SUPPORTED)'
566
+ match = re.search(pattern, response, re.IGNORECASE)
567
+ is_supported = bool(match) and 'NOT' not in match.group(1).upper()
568
+
569
+ # Also check for simple line-by-line format
570
+ if not match:
571
+ lines = response.strip().split('\n')
572
+ if i < len(lines):
573
+ is_supported = 'SUPPORTED' in lines[i].upper() and 'NOT' not in lines[i].upper()
574
+
575
+ claim_results.append({
576
+ "claim": claim,
577
+ "supported": is_supported
578
+ })
579
+ if is_supported:
580
+ supported_count += 1
581
+
582
+ score = supported_count / len(prepared.answer_claims) if prepared.answer_claims else 1.0
583
+ prepared.faithfulness_score = score
584
+ prepared.faithfulness_details = {
585
+ "claims": claim_results,
586
+ "supported": supported_count,
587
+ "total": len(prepared.answer_claims)
588
+ }
589
+ return score
590
+
591
+ except Exception as e:
592
+ print(f"Error in evaluate_faithfulness: {e}")
593
+ prepared.faithfulness_score = 0.0
594
+ return 0.0
595
+
596
+ def evaluate_context_recall(self, prepared: PreparedQA) -> float:
597
+ """
598
+ Evaluate context recall: Can reference claims be attributed to context?
599
+ 1 LLM call to verify ALL reference claims at once.
600
+ """
601
+ if not prepared.reference_claims:
602
+ prepared.context_recall_score = 1.0
603
+ prepared.context_recall_details = {"claims": [], "attributed": 0, "total": 0}
604
+ return 1.0
605
+
606
+ claims_text = "\n".join([
607
+ f"CLAIM_{i+1}: {claim}"
608
+ for i, claim in enumerate(prepared.reference_claims)
609
+ ])
610
+
611
+ context_text = "\n\n".join(prepared.contexts)
612
+
613
+ prompt = PROMPT_CONTEXT_RECALL.format(
614
+ context=context_text,
615
+ claims=claims_text
616
+ )
617
+
618
+ try:
619
+ response = call_llm(prompt)
620
+
621
+ attributed_count = 0
622
+ claim_results = []
623
+
624
+ for i, claim in enumerate(prepared.reference_claims):
625
+ pattern = rf'CLAIM_{i+1}:\s*(ATTRIBUTED|NOT_ATTRIBUTED)'
626
+ match = re.search(pattern, response, re.IGNORECASE)
627
+ is_attributed = bool(match) and 'NOT' not in match.group(1).upper()
628
+
629
+ if not match:
630
+ lines = response.strip().split('\n')
631
+ if i < len(lines):
632
+ is_attributed = 'ATTRIBUTED' in lines[i].upper() and 'NOT' not in lines[i].upper()
633
+
634
+ claim_results.append({
635
+ "claim": claim,
636
+ "attributed": is_attributed
637
+ })
638
+ if is_attributed:
639
+ attributed_count += 1
640
+
641
+ score = attributed_count / len(prepared.reference_claims) if prepared.reference_claims else 1.0
642
+ prepared.context_recall_score = score
643
+ prepared.context_recall_details = {
644
+ "claims": claim_results,
645
+ "attributed": attributed_count,
646
+ "total": len(prepared.reference_claims)
647
+ }
648
+ return score
649
+
650
+ except Exception as e:
651
+ print(f"Error in evaluate_context_recall: {e}")
652
+ prepared.context_recall_score = 0.0
653
+ return 0.0
654
+
655
+ def evaluate_context_precision(self, prepared: PreparedQA) -> float:
656
+ """
657
+ Evaluate context precision: Are contexts relevant and well-ranked?
658
+ 1 LLM call to evaluate ALL contexts at once.
659
+ Uses mean precision@k formula.
660
+ """
661
+ if not prepared.contexts:
662
+ prepared.context_precision_score = 0.0
663
+ prepared.context_precision_details = {"contexts": [], "precision_at_k": []}
664
+ return 0.0
665
+
666
+ contexts_text = "\n\n".join([
667
+ f"CHUNK_{i+1}:\n{ctx}"
668
+ for i, ctx in enumerate(prepared.contexts)
669
+ ])
670
+
671
+ prompt = PROMPT_CONTEXT_PRECISION.format(
672
+ question=prepared.question,
673
+ reference=prepared.reference,
674
+ contexts=contexts_text
675
+ )
676
+
677
+ try:
678
+ response = call_llm(prompt)
679
+
680
+ relevance = []
681
+ context_results = []
682
+
683
+ for i, ctx in enumerate(prepared.contexts):
684
+ pattern = rf'CHUNK_{i+1}:\s*(RELEVANT|NOT_RELEVANT)'
685
+ match = re.search(pattern, response, re.IGNORECASE)
686
+ is_relevant = bool(match) and 'NOT' not in match.group(1).upper()
687
+
688
+ if not match:
689
+ lines = response.strip().split('\n')
690
+ if i < len(lines):
691
+ is_relevant = 'RELEVANT' in lines[i].upper() and 'NOT' not in lines[i].upper()
692
+
693
+ relevance.append(1 if is_relevant else 0)
694
+ context_results.append({
695
+ "context_idx": i,
696
+ "relevant": is_relevant
697
+ })
698
+
699
+ # Calculate mean precision@k
700
+ precision_at_k = []
701
+ relevant_so_far = 0
702
+ for k, rel in enumerate(relevance, 1):
703
+ relevant_so_far += rel
704
+ if rel: # Only count precision at positions where item is relevant
705
+ precision_at_k.append(relevant_so_far / k)
706
+
707
+ score = np.mean(precision_at_k) if precision_at_k else 0.0
708
+ prepared.context_precision_score = score
709
+ prepared.context_precision_details = {
710
+ "contexts": context_results,
711
+ "precision_at_k": precision_at_k,
712
+ "relevance_binary": relevance
713
+ }
714
+ return score
715
+
716
+ except Exception as e:
717
+ print(f"Error in evaluate_context_precision: {e}")
718
+ prepared.context_precision_score = 0.0
719
+ return 0.0
720
+
721
+ def evaluate_answer_relevancy(self, prepared: PreparedQA) -> float:
722
+ """
723
+ Evaluate answer relevancy using reverse questions.
724
+ 0 LLM calls - uses embeddings only (reverse questions already generated).
725
+ """
726
+ if not self.embeddings or not prepared.reverse_questions:
727
+ prepared.answer_relevancy_score = 0.0
728
+ return 0.0
729
+
730
+ try:
731
+ # Get embeddings for original question and reverse questions
732
+ original_embedding = self.embeddings.embed_query(prepared.question)
733
+ reverse_embeddings = [
734
+ self.embeddings.embed_query(q)
735
+ for q in prepared.reverse_questions
736
+ ]
737
+
738
+ # Calculate cosine similarities
739
+ similarities = []
740
+ for rev_emb in reverse_embeddings:
741
+ sim = cosine_similarity(
742
+ [original_embedding],
743
+ [rev_emb]
744
+ )[0][0]
745
+ similarities.append(sim)
746
+
747
+ score = float(np.mean(similarities))
748
+ prepared.answer_relevancy_score = max(0.0, min(1.0, score))
749
+ return prepared.answer_relevancy_score
750
+
751
+ except Exception as e:
752
+ print(f"Error in evaluate_answer_relevancy: {e}")
753
+ prepared.answer_relevancy_score = 0.0
754
+ return 0.0
755
+
756
+ # ========================================================================
757
+ # MULTIMODAL METRICS (1 VLM call each)
758
+ # ========================================================================
759
+
760
+ def evaluate_multimodal_faithfulness(self, prepared: PreparedQA) -> float:
761
+ """
762
+ Evaluate multimodal faithfulness using VLM.
763
+ 1 VLM call - verifies ALL claims at once against text+image context.
764
+
765
+ Same pattern as text faithfulness:
766
+ - Pass all claims in one prompt
767
+ - VLM returns SUPPORTED/NOT_SUPPORTED for each
768
+ - Compute score as supported_count / total_claims
769
+ """
770
+ if not self.enable_multimodal or not prepared.context_chunks:
771
+ prepared.multimodal_faithfulness_score = 0.0
772
+ return 0.0
773
+
774
+ # Check if there are any images
775
+ has_images = any(
776
+ chunk.get('image_path') and chunk.get('image_path') != 'null'
777
+ for chunk in prepared.context_chunks
778
+ )
779
+
780
+ if not has_images:
781
+ # Fall back to text-only faithfulness
782
+ prepared.multimodal_faithfulness_score = prepared.faithfulness_score
783
+ return prepared.faithfulness_score
784
+
785
+ # Use extracted claims, or fall back to answer as single claim
786
+ claims = prepared.answer_claims if prepared.answer_claims else [prepared.answer]
787
+
788
+ claims_text = "\n".join([
789
+ f"CLAIM_{i+1}: {claim}"
790
+ for i, claim in enumerate(claims)
791
+ ])
792
+
793
+ prompt = PROMPT_MULTIMODAL_FAITHFULNESS.format(
794
+ question=prepared.question,
795
+ answer=prepared.answer,
796
+ claims=claims_text
797
+ )
798
+
799
+ try:
800
+ response = call_vlm_interweaved(prompt, prepared.context_chunks)
801
+
802
+ # Parse each claim result (same pattern as text faithfulness)
803
+ supported_count = 0
804
+ claim_results = []
805
+ text_sources = 0
806
+ image_sources = 0
807
+
808
+ for i, claim in enumerate(claims):
809
+ pattern = rf'CLAIM_{i+1}:\s*(SUPPORTED|NOT_SUPPORTED)\s*\|\s*SOURCE:\s*(TEXT|IMAGE|BOTH|NONE)'
810
+ match = re.search(pattern, response, re.IGNORECASE)
811
+
812
+ if match:
813
+ is_supported = 'NOT' not in match.group(1).upper()
814
+ source = match.group(2).upper()
815
+ else:
816
+ # Fallback: simpler pattern
817
+ simple_pattern = rf'CLAIM_{i+1}:\s*(SUPPORTED|NOT_SUPPORTED)'
818
+ simple_match = re.search(simple_pattern, response, re.IGNORECASE)
819
+ is_supported = bool(simple_match) and 'NOT' not in simple_match.group(1).upper()
820
+ source = "UNKNOWN"
821
+
822
+ claim_results.append({
823
+ "claim": claim,
824
+ "supported": is_supported,
825
+ "source": source
826
+ })
827
+
828
+ if is_supported:
829
+ supported_count += 1
830
+ if source in ['TEXT', 'BOTH']:
831
+ text_sources += 1
832
+ if source in ['IMAGE', 'BOTH']:
833
+ image_sources += 1
834
+
835
+ # Compute score: supported_count / total_claims
836
+ score = supported_count / len(claims) if claims else 1.0
837
+
838
+ # Parse summary details
839
+ text_grounded = text_sources > 0 or 'TEXT_GROUNDED: YES' in response.upper()
840
+ visual_grounded = image_sources > 0 or 'VISUAL_GROUNDED: YES' in response.upper()
841
+
842
+ prepared.multimodal_faithfulness_score = score
843
+ prepared.multimodal_details['faithfulness'] = {
844
+ 'score': score,
845
+ 'supported_count': supported_count,
846
+ 'total_claims': len(claims),
847
+ 'text_grounded': text_grounded,
848
+ 'visual_grounded': visual_grounded,
849
+ 'text_sources': text_sources,
850
+ 'image_sources': image_sources,
851
+ 'claim_results': claim_results
852
+ }
853
+ return score
854
+
855
+ except Exception as e:
856
+ print(f"Error in evaluate_multimodal_faithfulness: {e}")
857
+ prepared.multimodal_faithfulness_score = 0.0
858
+ return 0.0
859
+
860
+ def evaluate_multimodal_relevance(self, prepared: PreparedQA) -> float:
861
+ """
862
+ Evaluate multimodal relevance using VLM.
863
+ 1 VLM call - generates reverse questions and evaluates context usage.
864
+
865
+ Same pattern as answer_relevancy but with VLM for multimodal context:
866
+ - Generate reverse questions from answer
867
+ - Check if questions use text/image context
868
+ - Compute relevance score
869
+ """
870
+ if not self.enable_multimodal or not prepared.context_chunks:
871
+ prepared.multimodal_relevance_score = 0.0
872
+ return 0.0
873
+
874
+ has_images = any(
875
+ chunk.get('image_path') and chunk.get('image_path') != 'null'
876
+ for chunk in prepared.context_chunks
877
+ )
878
+
879
+ if not has_images:
880
+ prepared.multimodal_relevance_score = prepared.answer_relevancy_score
881
+ return prepared.answer_relevancy_score
882
+
883
+ prompt = PROMPT_MULTIMODAL_RELEVANCE.format(
884
+ answer=prepared.answer,
885
+ num_questions=self.num_reverse_questions
886
+ )
887
+
888
+ try:
889
+ response = call_vlm_interweaved(prompt, prepared.context_chunks)
890
+
891
+ # Parse generated questions
892
+ generated_questions = []
893
+ uses_text_count = 0
894
+ uses_image_count = 0
895
+
896
+ for i in range(1, self.num_reverse_questions + 1):
897
+ pattern = rf'Q{i}:\s*(.+?)\s*\|\s*USES:\s*(TEXT|IMAGE|BOTH)'
898
+ match = re.search(pattern, response, re.IGNORECASE)
899
+
900
+ if match:
901
+ question = match.group(1).strip()
902
+ source = match.group(2).upper()
903
+ generated_questions.append({
904
+ 'question': question,
905
+ 'uses': source
906
+ })
907
+ if source in ['TEXT', 'BOTH']:
908
+ uses_text_count += 1
909
+ if source in ['IMAGE', 'BOTH']:
910
+ uses_image_count += 1
911
+
912
+ # Parse relevance score from VLM
913
+ score_match = re.search(r'RELEVANCE_SCORE:\s*([\d.]+)', response)
914
+ vlm_score = float(score_match.group(1)) if score_match else 0.5
915
+ vlm_score = max(0.0, min(1.0, vlm_score))
916
+
917
+ # If we have embeddings and generated questions, compute embedding-based score
918
+ embedding_score = 0.0
919
+ if self.embeddings and generated_questions:
920
+ try:
921
+ original_emb = self.embeddings.embed_query(prepared.question)
922
+ similarities = []
923
+ for gq in generated_questions:
924
+ q_emb = self.embeddings.embed_query(gq['question'])
925
+ sim = cosine_similarity([original_emb], [q_emb])[0][0]
926
+ similarities.append(sim)
927
+ embedding_score = float(np.mean(similarities)) if similarities else 0.0
928
+ except Exception:
929
+ embedding_score = 0.0
930
+
931
+ # Combine VLM score and embedding score (if available)
932
+ if embedding_score > 0:
933
+ score = 0.5 * vlm_score + 0.5 * embedding_score
934
+ else:
935
+ score = vlm_score
936
+
937
+ uses_text = uses_text_count > 0 or 'USES_TEXT: YES' in response.upper()
938
+ uses_images = uses_image_count > 0 or 'USES_IMAGES: YES' in response.upper()
939
+
940
+ prepared.multimodal_relevance_score = score
941
+ prepared.multimodal_details['relevance'] = {
942
+ 'score': score,
943
+ 'vlm_score': vlm_score,
944
+ 'embedding_score': embedding_score,
945
+ 'uses_text': uses_text,
946
+ 'uses_images': uses_images,
947
+ 'generated_questions': generated_questions
948
+ }
949
+ return score
950
+
951
+ except Exception as e:
952
+ print(f"Error in evaluate_multimodal_relevance: {e}")
953
+ prepared.multimodal_relevance_score = 0.0
954
+ return 0.0
955
+
956
+ # ========================================================================
957
+ # ADDITIONAL METRICS (harmonized with metrics.py)
958
+ # ========================================================================
959
+
960
+ def evaluate_context_necessity(self, question: str, answer: str, context: str) -> Dict:
961
+ """
962
+ Measures if the question REQUIRES context to answer (anti-parametric bias).
963
+
964
+ High score (1.0) = Context is essential (good for RAG testing)
965
+ Low score (0.0) = Answerable from parametric knowledge (bad for RAG testing)
966
+
967
+ 1 LLM call to answer without context + 1 LLM call to verify.
968
+ """
969
+ try:
970
+ # Step 1: Try to answer WITHOUT context
971
+ prompt_without = PROMPT_CONTEXT_NECESSITY_WITHOUT.format(question=question)
972
+ answer_without = call_llm(prompt_without)
973
+
974
+ # Check if model refused
975
+ if "CANNOT_ANSWER" in answer_without.upper():
976
+ return {
977
+ "context_necessity_score": 1.0,
978
+ "without_context_correct": False,
979
+ "with_context_correct": True,
980
+ "answer_without_context": answer_without[:200],
981
+ "explanation": "Model could not answer without context - context is essential"
982
+ }
983
+
984
+ # Step 2: Verify if answer without context is correct
985
+ prompt_verify = PROMPT_CONTEXT_NECESSITY_VERIFY.format(
986
+ ground_truth=answer,
987
+ model_answer=answer_without
988
+ )
989
+ verify_response = call_llm(prompt_verify)
990
+ verify_upper = verify_response.upper()
991
+
992
+ if "MATCH: YES" in verify_upper:
993
+ return {
994
+ "context_necessity_score": 0.0,
995
+ "without_context_correct": True,
996
+ "with_context_correct": True,
997
+ "answer_without_context": answer_without[:200],
998
+ "explanation": "Model answered correctly without context - question tests parametric knowledge"
999
+ }
1000
+ elif "MATCH: PARTIAL" in verify_upper:
1001
+ return {
1002
+ "context_necessity_score": 0.5,
1003
+ "without_context_correct": False,
1004
+ "with_context_correct": True,
1005
+ "answer_without_context": answer_without[:200],
1006
+ "explanation": "Model partially answered without context - context adds value"
1007
+ }
1008
+ else:
1009
+ return {
1010
+ "context_necessity_score": 0.9,
1011
+ "without_context_correct": False,
1012
+ "with_context_correct": True,
1013
+ "answer_without_context": answer_without[:200],
1014
+ "explanation": "Model answered incorrectly without context - context is necessary"
1015
+ }
1016
+
1017
+ except Exception as e:
1018
+ logging.error(f"Error in context_necessity: {e}")
1019
+ return {
1020
+ "context_necessity_score": 0.5,
1021
+ "without_context_correct": None,
1022
+ "with_context_correct": None,
1023
+ "answer_without_context": "",
1024
+ "explanation": f"Error: {str(e)}"
1025
+ }
1026
+
1027
+ def batch_evaluate_context_necessity(self, qa_items: List[Dict]) -> List[Dict]:
1028
+ """Batch evaluation of context necessity using parallel calls."""
1029
+ # Phase 1: Batch "answer without context"
1030
+ prompts_without = [
1031
+ PROMPT_CONTEXT_NECESSITY_WITHOUT.format(question=item['question'])
1032
+ for item in qa_items
1033
+ ]
1034
+
1035
+ print(f" ⚔ Phase 1: Answering {len(prompts_without)} questions without context...")
1036
+ answers_without = batch_call_llm(prompts_without, show_progress=False)
1037
+
1038
+ # Phase 2: Batch verification
1039
+ verify_prompts = []
1040
+ verify_indices = []
1041
+ results = [None] * len(qa_items)
1042
+
1043
+ for i, (item, answer_without) in enumerate(zip(qa_items, answers_without)):
1044
+ if answer_without.startswith("ERROR:"):
1045
+ results[i] = {
1046
+ "context_necessity_score": 0.5,
1047
+ "without_context_correct": None,
1048
+ "explanation": f"Error: {answer_without}"
1049
+ }
1050
+ elif "CANNOT_ANSWER" in answer_without.upper():
1051
+ results[i] = {
1052
+ "context_necessity_score": 1.0,
1053
+ "without_context_correct": False,
1054
+ "answer_without_context": answer_without[:200],
1055
+ "explanation": "Model could not answer without context"
1056
+ }
1057
+ else:
1058
+ verify_prompts.append(PROMPT_CONTEXT_NECESSITY_VERIFY.format(
1059
+ ground_truth=item['answer'],
1060
+ model_answer=answer_without
1061
+ ))
1062
+ verify_indices.append(i)
1063
+
1064
+ if verify_prompts:
1065
+ print(f" ⚔ Phase 2: Verifying {len(verify_prompts)} answers...")
1066
+ verify_responses = batch_call_llm(verify_prompts, show_progress=False)
1067
+
1068
+ for idx, verify_content in zip(verify_indices, verify_responses):
1069
+ answer_without = answers_without[idx]
1070
+ verify_upper = verify_content.upper() if verify_content else ""
1071
+
1072
+ if "MATCH: YES" in verify_upper:
1073
+ results[idx] = {
1074
+ "context_necessity_score": 0.0,
1075
+ "without_context_correct": True,
1076
+ "answer_without_context": answer_without[:200],
1077
+ "explanation": "Answered correctly without context"
1078
+ }
1079
+ elif "MATCH: PARTIAL" in verify_upper:
1080
+ results[idx] = {
1081
+ "context_necessity_score": 0.5,
1082
+ "without_context_correct": False,
1083
+ "answer_without_context": answer_without[:200],
1084
+ "explanation": "Partially answered without context"
1085
+ }
1086
+ else:
1087
+ results[idx] = {
1088
+ "context_necessity_score": 0.9,
1089
+ "without_context_correct": False,
1090
+ "answer_without_context": answer_without[:200],
1091
+ "explanation": "Answered incorrectly without context"
1092
+ }
1093
+
1094
+ return results
1095
+
1096
+ def evaluate_multihop_reasoning(self, question: str, answer: str, contexts: List[str]) -> Dict:
1097
+ """
1098
+ Evaluate if QA requires multi-hop reasoning (combining multiple facts).
1099
+
1100
+ Returns: hop_count, reasoning_score, bridge_entity
1101
+ """
1102
+ contexts_str = "\n".join(contexts) if isinstance(contexts, list) else contexts
1103
+
1104
+ prompt = PROMPT_MULTIHOP_REASONING.format(
1105
+ contexts=contexts_str,
1106
+ question=question,
1107
+ answer=answer
1108
+ )
1109
+
1110
+ try:
1111
+ response = call_llm(prompt)
1112
+
1113
+ hop_match = re.search(r'HOP_COUNT:\s*(\d+)', response)
1114
+ score_match = re.search(r'REASONING_SCORE:\s*([\d.]+)', response)
1115
+ bridge_match = re.search(r'BRIDGE_ENTITY:\s*(.+?)(?:\n|$)', response)
1116
+
1117
+ hop_count = int(hop_match.group(1)) if hop_match else 1
1118
+ reasoning_score = float(score_match.group(1)) if score_match else 0.5
1119
+ reasoning_score = min(1.0, max(0.0, reasoning_score))
1120
+ bridge_entity = bridge_match.group(1).strip() if bridge_match else "None"
1121
+
1122
+ return {
1123
+ "hop_count": hop_count,
1124
+ "reasoning_score": reasoning_score,
1125
+ "bridge_entity": bridge_entity
1126
+ }
1127
+ except Exception as e:
1128
+ logging.error(f"Error in multihop_reasoning: {e}")
1129
+ return {"hop_count": 1, "reasoning_score": 0.0, "bridge_entity": "Error"}
1130
+
1131
+ def batch_evaluate_multihop_reasoning(self, qa_items: List[Dict]) -> List[Dict]:
1132
+ """Batch evaluation of multihop reasoning."""
1133
+ prompts = []
1134
+ for item in qa_items:
1135
+ contexts_str = "\n".join(item['contexts']) if isinstance(item['contexts'], list) else item['contexts']
1136
+ prompts.append(PROMPT_MULTIHOP_REASONING.format(
1137
+ contexts=contexts_str,
1138
+ question=item['question'],
1139
+ answer=item['answer']
1140
+ ))
1141
+
1142
+ print(f" ⚔ Batch evaluating {len(prompts)} multihop questions...")
1143
+ responses = batch_call_llm(prompts, show_progress=False)
1144
+
1145
+ results = []
1146
+ for response in responses:
1147
+ if response.startswith("ERROR:"):
1148
+ results.append({"hop_count": 1, "reasoning_score": 0.0, "bridge_entity": "Error"})
1149
+ continue
1150
+
1151
+ hop_match = re.search(r'HOP_COUNT:\s*(\d+)', response)
1152
+ score_match = re.search(r'REASONING_SCORE:\s*([\d.]+)', response)
1153
+ bridge_match = re.search(r'BRIDGE_ENTITY:\s*(.+?)(?:\n|$)', response)
1154
+
1155
+ results.append({
1156
+ "hop_count": int(hop_match.group(1)) if hop_match else 1,
1157
+ "reasoning_score": min(1.0, max(0.0, float(score_match.group(1)))) if score_match else 0.5,
1158
+ "bridge_entity": bridge_match.group(1).strip() if bridge_match else "None"
1159
+ })
1160
+
1161
+ return results
1162
+
1163
+ def evaluate_visual_dependency(self, question: str, text_contexts: List[str],
1164
+ has_image_description: bool = True) -> float:
1165
+ """
1166
+ Evaluate if visual content is essential for answering the question.
1167
+
1168
+ Returns:
1169
+ 1.0 = Strongly requires visual (image references, diagrams, visual elements)
1170
+ 0.5 = Partially visual (enhanced by images but answerable from text)
1171
+ 0.0 = Text-only sufficient
1172
+ """
1173
+ contexts_str = "\n".join(text_contexts) if isinstance(text_contexts, list) else text_contexts
1174
+
1175
+ # Enhanced prompt that checks for visual references in question/answer
1176
+ prompt = f"""Analyze if this question REQUIRES visual information (images, diagrams, figures) to answer properly.
1177
+
1178
+ CONTEXT:
1179
+ {contexts_str}
1180
+
1181
+ QUESTION: {question}
1182
+
1183
+ Evaluate:
1184
+ 1. Does the question ask about visual elements (shapes, colors, layouts, diagrams, figures)?
1185
+ 2. Would seeing an image provide essential information not captured in text?
1186
+ 3. Does the context describe visual content that needs to be seen to understand?
1187
+
1188
+ RESPOND WITH ONE OF:
1189
+ - VISUAL_ESSENTIAL: Question cannot be properly answered without seeing visual content
1190
+ - VISUAL_HELPFUL: Visual content enhances understanding but text is sufficient
1191
+ - TEXT_SUFFICIENT: Can be fully answered from text alone
1192
+
1193
+ YOUR VERDICT:"""
1194
+
1195
+ try:
1196
+ response = call_llm(prompt)
1197
+ response_upper = response.upper()
1198
+
1199
+ if "VISUAL_ESSENTIAL" in response_upper:
1200
+ return 1.0
1201
+ elif "VISUAL_HELPFUL" in response_upper:
1202
+ return 0.5
1203
+ else:
1204
+ return 0.0
1205
+ except Exception as e:
1206
+ logging.error(f"Error in visual_dependency: {e}")
1207
+ return 0.0
1208
+
1209
+ def evaluate_semantic_diversity(self, questions: List[str]) -> float:
1210
+ """
1211
+ Calculate diversity of questions using embedding similarity.
1212
+
1213
+ High score = Diverse questions (good)
1214
+ Low score = Repetitive questions (bad)
1215
+ """
1216
+ if not self.embeddings or not questions or len(questions) < 2:
1217
+ return 0.0
1218
+
1219
+ try:
1220
+ embeddings = [self.embeddings.embed_query(q) for q in questions]
1221
+ matrix = np.array(embeddings)
1222
+
1223
+ sim_matrix = cosine_similarity(matrix)
1224
+ np.fill_diagonal(sim_matrix, np.nan)
1225
+ avg_similarity = np.nanmean(sim_matrix)
1226
+
1227
+ diversity_score = 1 - avg_similarity
1228
+ return max(0.0, min(1.0, diversity_score))
1229
+ except Exception as e:
1230
+ logging.error(f"Error in semantic_diversity: {e}")
1231
+ return 0.0
1232
+
1233
+ def evaluate_domain_coverage(self, qa_data: List[Dict], corpus_chunks: List[Dict]) -> Dict:
1234
+ """
1235
+ Measure how well QA dataset covers the source corpus.
1236
+
1237
+ Returns: chunk_coverage, file_coverage, topic_divergence
1238
+ """
1239
+ try:
1240
+ from scipy.stats import entropy
1241
+ except ImportError:
1242
+ return {"error": "scipy not installed"}
1243
+
1244
+ # Build corpus index
1245
+ corpus_index = {}
1246
+ corpus_by_file = Counter()
1247
+
1248
+ for chunk in corpus_chunks:
1249
+ key = (chunk.get('file_name'), str(chunk.get('chunk_id')))
1250
+ corpus_index[key] = chunk
1251
+ corpus_by_file[chunk.get('file_name', 'unknown')] += 1
1252
+
1253
+ # Track covered chunks
1254
+ covered_chunks = set()
1255
+ qa_file_counts = Counter()
1256
+
1257
+ for qa in qa_data:
1258
+ for chunk_ref in qa.get('chunks_added', []):
1259
+ key = (chunk_ref.get('file_name'), str(chunk_ref.get('chunk_id')))
1260
+ if key in corpus_index:
1261
+ covered_chunks.add(key)
1262
+ qa_file_counts[chunk_ref.get('file_name')] += 1
1263
+
1264
+ # Calculate metrics
1265
+ total_corpus = len(corpus_chunks)
1266
+ total_covered = len(covered_chunks)
1267
+ chunk_coverage = total_covered / total_corpus if total_corpus > 0 else 0.0
1268
+
1269
+ files_in_corpus = set(corpus_by_file.keys())
1270
+ files_covered = set(f for f, _ in covered_chunks)
1271
+ file_coverage = len(files_covered) / len(files_in_corpus) if files_in_corpus else 0.0
1272
+
1273
+ # Jensen-Shannon divergence
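+ # JSD(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M) with M = 0.5 * (P + Q);
+ # scipy.stats.entropy(p, q) computes KL(p || q), so the expression below is the
+ # standard JSD (in nats). 0.0 means the QA set mirrors the corpus file distribution.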
1274
+ all_files = sorted(files_in_corpus)
1275
+ corpus_dist = np.array([corpus_by_file.get(f, 0) for f in all_files], dtype=float)
1276
+ corpus_dist = corpus_dist / corpus_dist.sum() if corpus_dist.sum() > 0 else corpus_dist
1277
+
1278
+ qa_dist = np.array([qa_file_counts.get(f, 0) for f in all_files], dtype=float)
1279
+ qa_dist = qa_dist / qa_dist.sum() if qa_dist.sum() > 0 else qa_dist
1280
+
1281
+ eps = 1e-10
1282
+ corpus_dist = (corpus_dist + eps) / (corpus_dist + eps).sum()
1283
+ qa_dist = (qa_dist + eps) / (qa_dist + eps).sum()
1284
+
1285
+ m = 0.5 * (corpus_dist + qa_dist)
1286
+ js_divergence = 0.5 * (entropy(corpus_dist, m) + entropy(qa_dist, m))
1287
+
1288
+ return {
1289
+ "chunk_coverage": chunk_coverage,
1290
+ "file_coverage": file_coverage,
1291
+ "chunks_covered": total_covered,
1292
+ "chunks_total": total_corpus,
1293
+ "uncovered_chunks": total_corpus - total_covered,
1294
+ "topic_divergence_js": float(js_divergence)
1295
+ }
1296
+
1297
+ # ========================================================================
1298
+ # FULL EVALUATION PIPELINE
1299
+ # ========================================================================
1300
+
1301
+ def evaluate_single(self, qa_item: Dict, enable_multimodal: bool = None) -> Dict:
1302
+ """
1303
+ Evaluate a single QA pair with all metrics.
1304
+
1305
+ Total LLM calls: 4-6 per QA pair
1306
+ - 1 preparation call
1307
+ - 1 faithfulness call
1308
+ - 1 context_recall call
1309
+ - 1 context_precision call
1310
+ - 0 answer_relevancy call (embeddings only)
1311
+ - 1 multimodal_faithfulness call (if images)
1312
+ - 1 multimodal_relevance call (if images)
1313
+ """
1314
+ if enable_multimodal is None:
1315
+ enable_multimodal = self.enable_multimodal
1316
+
1317
+ # Extract fields
1318
+ question = qa_item.get('question', '')
1319
+ answer = qa_item.get('answer', '')
1320
+ reference = qa_item.get('ground_truth', qa_item.get('answer', ''))
1321
+ contexts = qa_item.get('contexts', [])
1322
+ context_chunks = qa_item.get('context_chunks', [])
1323
+
1324
+ # Step 1: Prepare (1 call)
1325
+ prepared = self.prepare_qa(question, answer, reference, contexts, context_chunks)
1326
+
1327
+ # Step 2: Evaluate each metric (1 call each)
1328
+ self.evaluate_faithfulness(prepared)
1329
+ self.evaluate_context_recall(prepared)
1330
+ self.evaluate_context_precision(prepared)
1331
+ self.evaluate_answer_relevancy(prepared)
1332
+
1333
+ # Step 3: Multimodal metrics (1-2 VLM calls if images present)
1334
+ if enable_multimodal and context_chunks:
1335
+ self.evaluate_multimodal_faithfulness(prepared)
1336
+ self.evaluate_multimodal_relevance(prepared)
1337
+
1338
+ # Extract metadata from qa_item
1339
+ metadata = qa_item.get('metadata', {})
1340
+
1341
+ # Determine is_multihop and is_multimodal
1342
+ hop_count = metadata.get('hop_count', 0)
1343
+ is_multihop = hop_count > 0
1344
+
1345
+ # Check for multimodal (has images in context)
1346
+ is_multimodal = False
1347
+ for chunk in context_chunks:
1348
+ if has_image_in_chunk(chunk):
1349
+ is_multimodal = True
1350
+ break
1351
+
1352
+ # Return results with complete metadata
1353
+ return {
1354
+ 'question': question,
1355
+ 'answer': answer,
1356
+ 'faithfulness': prepared.faithfulness_score,
1357
+ 'context_recall': prepared.context_recall_score,
1358
+ 'context_precision': prepared.context_precision_score,
1359
+ 'answer_relevancy': prepared.answer_relevancy_score,
1360
+ 'multimodal_faithfulness': prepared.multimodal_faithfulness_score,
1361
+ 'multimodal_relevance': prepared.multimodal_relevance_score,
1362
+ # Additional fields for visualization/analysis
1363
+ 'hop_count': hop_count,
1364
+ 'is_multihop': is_multihop,
1365
+ 'is_multimodal': is_multimodal,
1366
+ 'chunk_id': metadata.get('chunk_id'),
1367
+ 'context_status': metadata.get('context_status'),
1368
+ 'depth_reached': metadata.get('depth_reached'),
1369
+ # Dataset metadata
1370
+ 'expert_persona': qa_item.get('expert_persona') or metadata.get('expert_persona'),
1371
+ 'domain': qa_item.get('domain') or metadata.get('domain'),
1372
+ 'concept_hops_question': prepared.concept_hops_question,
1373
+ 'details': {
1374
+ 'answer_claims': prepared.answer_claims,
1375
+ 'reference_claims': prepared.reference_claims,
1376
+ 'reverse_questions': prepared.reverse_questions,
1377
+ 'faithfulness': prepared.faithfulness_details,
1378
+ 'context_recall': prepared.context_recall_details,
1379
+ 'context_precision': prepared.context_precision_details,
1380
+ 'multimodal': prepared.multimodal_details
1381
+ }
1382
+ }
1383
+
1384
+ def evaluate_batch(self, qa_data: List[Dict],
1385
+ enable_multimodal: bool = None,
1386
+ show_progress: bool = True) -> List[Dict]:
1387
+ """
1388
+ Evaluate a batch of QA pairs with parallel processing.
1389
+ """
1390
+ if enable_multimodal is None:
1391
+ enable_multimodal = self.enable_multimodal
1392
+
1393
+ results = []
1394
+ total = len(qa_data)
1395
+
1396
+ # Use ThreadPoolExecutor for parallel evaluation
1397
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
1398
+ futures = {
1399
+ executor.submit(self.evaluate_single, qa, enable_multimodal): i
1400
+ for i, qa in enumerate(qa_data)
1401
+ }
1402
+
1403
+ completed = 0
1404
+ for future in as_completed(futures):
1405
+ idx = futures[future]
1406
+ try:
1407
+ result = future.result()
1408
+ results.append((idx, result))
1409
+ except Exception as e:
1410
+ print(f"Error evaluating item {idx}: {e}")
1411
+ results.append((idx, {
1412
+ 'question': qa_data[idx].get('question', ''),
1413
+ 'faithfulness': 0.0,
1414
+ 'context_recall': 0.0,
1415
+ 'context_precision': 0.0,
1416
+ 'answer_relevancy': 0.0,
1417
+ 'multimodal_faithfulness': 0.0,
1418
+ 'multimodal_relevance': 0.0,
1419
+ 'error': str(e)
1420
+ }))
1421
+
1422
+ completed += 1
1423
+ if show_progress and completed % 10 == 0:
1424
+ print(f" Evaluated {completed}/{total} QA pairs...")
1425
+
1426
+ # Sort by original index
1427
+ results.sort(key=lambda x: x[0])
1428
+ return [r[1] for r in results]
1429
+
1430
+ def _count_concept_hops(self, concept_hops_str: str) -> int:
1431
+ """Count the number of concept hops from concept hops string.
1432
+
1433
+ Concept hops format: concept1 --> concept2 --> concept3
1434
+ Number of hops = number of arrows (-->) = number of concepts - 1
1435
+ """
1436
+ if not concept_hops_str or not concept_hops_str.strip():
1437
+ return 0
1438
+ # Count occurrences of arrow pattern
1439
+ return concept_hops_str.count('-->')
1440
+
1441
+ def compute_aggregate_scores(self, results: List[Dict]) -> Dict:
1442
+ """Compute aggregate scores from batch results."""
1443
+ if not results:
1444
+ return {}
1445
+
1446
+ valid_results = [r for r in results if 'error' not in r]
1447
+
1448
+ scores = {
1449
+ 'faithfulness': np.mean([r['faithfulness'] for r in valid_results]),
1450
+ 'context_recall': np.mean([r['context_recall'] for r in valid_results]),
1451
+ 'context_precision': np.mean([r['context_precision'] for r in valid_results]),
1452
+ 'answer_relevancy': np.mean([r['answer_relevancy'] for r in valid_results]),
1453
+ 'multimodal_faithfulness': np.mean([r['multimodal_faithfulness'] for r in valid_results]),
1454
+ 'multimodal_relevance': np.mean([r['multimodal_relevance'] for r in valid_results]),
1455
+ 'total_evaluated': len(valid_results),
1456
+ 'total_errors': len(results) - len(valid_results)
1457
+ }
1458
+
1459
+ # Calculate average concept hops for questions
1460
+ concept_hops_list = []
1461
+ for r in valid_results:
1462
+ hops_str = r.get('concept_hops_question', '')
1463
+ if hops_str:
1464
+ hops_count = self._count_concept_hops(hops_str)
1465
+ concept_hops_list.append(hops_count)
1466
+
1467
+ if concept_hops_list:
1468
+ scores['avg_concept_hops_question'] = float(np.mean(concept_hops_list))
1469
+ scores['concept_hops_question_count'] = len(concept_hops_list)
1470
+ else:
1471
+ scores['avg_concept_hops_question'] = 0.0
1472
+ scores['concept_hops_question_count'] = 0
1473
+
1474
+ # Average multimodal scores only over items that scored above zero (zero means no
+ # context chunks, an evaluation error, or nothing supported)
1475
+ mm_faithfulness = [r['multimodal_faithfulness'] for r in valid_results if r['multimodal_faithfulness'] > 0]
1476
+ mm_relevance = [r['multimodal_relevance'] for r in valid_results if r['multimodal_relevance'] > 0]
1477
+
1478
+ if mm_faithfulness:
1479
+ scores['multimodal_faithfulness_valid'] = np.mean(mm_faithfulness)
1480
+ scores['multimodal_items_count'] = len(mm_faithfulness)
1481
+
1482
+ if mm_relevance:
1483
+ scores['multimodal_relevance_valid'] = np.mean(mm_relevance)
1484
+
1485
+ return scores
1486
+
1487
+
1488
+ # ============================================================================
1489
+ # CONVENIENCE FUNCTION
1490
+ # ============================================================================
1491
+
1492
+ def run_dataset_qa_evaluation(qa_data: List[Dict],
1493
+ output_path: str = None,
1494
+ enable_multimodal: bool = True,
1495
+ max_workers: int = 8) -> Dict:
1496
+ """
1497
+ Quality assurance evaluation for GOLD-STANDARD DATASET CREATION.
1498
+
1499
+ Only runs metrics that matter for dataset creation:
1500
+ - Faithfulness: Are gold answers grounded in source context?
1501
+ - Answer Relevancy: Do answers address the questions?
1502
+ - Context Precision: Are all source chunks relevant? (optional)
1503
+ - Multimodal metrics: For image-containing contexts
1504
+
1505
+ SKIPS Context Recall (redundant when answer = reference)
1506
+
1507
+ Args:
1508
+ qa_data: List of QA dicts with question, answer, contexts
1509
+ output_path: Optional path to save results
1510
+ enable_multimodal: Whether to run VLM metrics for image contexts
1511
+ max_workers: Number of parallel workers
1512
+
1513
+ Returns:
1514
+ Dict with quality scores and flagged issues
1515
+ """
1516
+ print("=" * 60)
1517
+ print("DATASET QUALITY ASSURANCE EVALUATION")
1518
+ print("=" * 60)
1519
+ print(f" Total QA pairs: {len(qa_data)}")
1520
+ print(f" Mode: Gold-standard dataset creation")
1521
+ print(f" Key metrics: Faithfulness, Answer Relevancy")
1522
+ print()
1523
+
1524
+ evaluator = OptimizedMetricsEvaluator(
1525
+ enable_multimodal=enable_multimodal,
1526
+ max_workers=max_workers
1527
+ )
1528
+
1529
+ results = []
1530
+ issues = []
1531
+
1532
+ for i, qa in enumerate(qa_data):
1533
+ if i % 10 == 0:
1534
+ print(f" Evaluating {i+1}/{len(qa_data)}...")
1535
+
1536
+ # Prepare
1537
+ question = qa.get('question', '')
1538
+ answer = qa.get('answer', '')
1539
+ contexts = qa.get('contexts', [])
1540
+ context_chunks = qa.get('context_chunks', [])
1541
+
1542
+ prepared = evaluator.prepare_qa(question, answer, answer, contexts, context_chunks) # answer is passed again as the reference (gold) answer for dataset-creation QA
1543
+
1544
+ # Only run critical metrics for dataset creation
1545
+ evaluator.evaluate_faithfulness(prepared)
1546
+ evaluator.evaluate_answer_relevancy(prepared)
1547
+ evaluator.evaluate_context_precision(prepared)
1548
+
1549
+ # Multimodal if applicable
1550
+ if enable_multimodal and context_chunks:
1551
+ evaluator.evaluate_multimodal_faithfulness(prepared)
1552
+
1553
+ result = {
1554
+ 'idx': i,
1555
+ 'question': question[:100],
1556
+ 'faithfulness': prepared.faithfulness_score,
1557
+ 'answer_relevancy': prepared.answer_relevancy_score,
1558
+ 'context_precision': prepared.context_precision_score,
1559
+ 'multimodal_faithfulness': prepared.multimodal_faithfulness_score,
1560
+ 'concept_hops_question': prepared.concept_hops_question,
1561
+ }
1562
+ results.append(result)
1563
+
1564
+ # Flag potential issues
1565
+ if prepared.faithfulness_score < 0.8:
1566
+ issues.append({
1567
+ 'idx': i,
1568
+ 'issue': 'LOW_FAITHFULNESS',
1569
+ 'score': prepared.faithfulness_score,
1570
+ 'question': question[:100],
1571
+ 'unsupported_claims': [
1572
+ c['claim'] for c in prepared.faithfulness_details.get('claims', [])
1573
+ if not c.get('supported', True)
1574
+ ]
1575
+ })
1576
+
1577
+ if prepared.answer_relevancy_score < 0.7:
1578
+ issues.append({
1579
+ 'idx': i,
1580
+ 'issue': 'LOW_RELEVANCY',
1581
+ 'score': prepared.answer_relevancy_score,
1582
+ 'question': question[:100],
1583
+ })
1584
+
1585
+ # Compute aggregates
1586
+ avg_faithfulness = np.mean([r['faithfulness'] for r in results])
1587
+ avg_relevancy = np.mean([r['answer_relevancy'] for r in results])
1588
+ avg_precision = np.mean([r['context_precision'] for r in results])
1589
+
1590
+ # Compute average concept hops
1591
+ concept_hops_list = []
1592
+ for r in results:
1593
+ hops_str = r.get('concept_hops_question', '')
1594
+ if hops_str:
1595
+ hops_count = evaluator._count_concept_hops(hops_str)
1596
+ concept_hops_list.append(hops_count)
1597
+ avg_concept_hops = np.mean(concept_hops_list) if concept_hops_list else 0.0
1598
+
1599
+ print("\n" + "=" * 60)
1600
+ print("DATASET QUALITY SUMMARY")
1601
+ print("=" * 60)
1602
+ print(f" šŸ“Š Average Faithfulness: {avg_faithfulness:.3f}")
1603
+ print(f" šŸ“Š Average Answer Relevancy: {avg_relevancy:.3f}")
1604
+ print(f" šŸ“Š Average Context Precision: {avg_precision:.3f}")
1605
+ if concept_hops_list:
1606
+ print(f" šŸ“Š Average Concept Hops: {avg_concept_hops:.2f} ({len(concept_hops_list)} questions)")
1607
+ print(f"\n āš ļø Issues found: {len(issues)}")
1608
+
1609
+ if issues:
1610
+ print("\n Issues breakdown:")
1611
+ low_faith = [i for i in issues if i['issue'] == 'LOW_FAITHFULNESS']
1612
+ low_rel = [i for i in issues if i['issue'] == 'LOW_RELEVANCY']
1613
+ print(f" - Low faithfulness: {len(low_faith)} QA pairs (answer not grounded)")
1614
+ print(f" - Low relevancy: {len(low_rel)} QA pairs (Q&A mismatch)")
1615
+
1616
+ output = {
1617
+ 'mode': 'dataset_creation_qa',
1618
+ 'total_qa_pairs': len(qa_data),
1619
+ 'aggregate_scores': {
1620
+ 'faithfulness': float(avg_faithfulness),
1621
+ 'answer_relevancy': float(avg_relevancy),
1622
+ 'context_precision': float(avg_precision),
1623
+ 'avg_concept_hops_question': float(avg_concept_hops),
1624
+ 'concept_hops_question_count': len(concept_hops_list),
1625
+ },
1626
+ 'issues': issues,
1627
+ 'detailed_results': results
1628
+ }
1629
+
1630
+ if output_path:
1631
+ with open(output_path, 'w') as f:
1632
+ json.dump(output, f, indent=2)
1633
+ print(f"\nšŸ’¾ Results saved to: {output_path}")
1634
+
1635
+ return output
1636
+
1637
+
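+ # Example (illustrative; requires a working LLM backend, so it is left commented
+ # out). Input dicts follow the format documented above; the QA content and the
+ # output file name are made up.
+ #
+ # qa_data = [{
+ # 'question': 'What is the boiling point of water at sea level?',
+ # 'answer': 'Water boils at 100 degrees Celsius at sea level.',
+ # 'contexts': ['At standard atmospheric pressure, water boils at 100 C.'],
+ # 'context_chunks': [],
+ # }]
+ # report = run_dataset_qa_evaluation(qa_data, output_path='dataset_qa_report.json')
+ # print(report['aggregate_scores']['faithfulness'], len(report['issues']))
+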
1638
+ def run_optimized_evaluation(qa_data: List[Dict],
1639
+ output_path: Optional[str] = None,
1640
+ enable_multimodal: bool = True,
1641
+ max_workers: int = 8) -> Dict:
1642
+ """
1643
+ Run optimized RAGAS-style evaluation on QA data.
1644
+
1645
+ Args:
1646
+ qa_data: List of QA dicts with question, answer, contexts, etc.
1647
+ output_path: Optional path to save detailed results
1648
+ enable_multimodal: Whether to run multimodal metrics
1649
+ max_workers: Number of parallel workers
1650
+
1651
+ Returns:
1652
+ Dict with aggregate scores and detailed results
1653
+ """
1654
+ print("=" * 60)
1655
+ print("OPTIMIZED METRICS EVALUATION")
1656
+ print("=" * 60)
1657
+ print(f" Total QA pairs: {len(qa_data)}")
1658
+ print(f" Multimodal: {'Enabled' if enable_multimodal else 'Disabled'}")
1659
+ print(f" Max workers: {max_workers}")
1660
+ print()
1661
+
1662
+ evaluator = OptimizedMetricsEvaluator(
1663
+ enable_multimodal=enable_multimodal,
1664
+ max_workers=max_workers
1665
+ )
1666
+
1667
+ print("šŸ“Š Evaluating metrics (4-6 LLM calls per QA pair)...")
1668
+ results = evaluator.evaluate_batch(qa_data, show_progress=True)
1669
+
1670
+ print("\nšŸ“ˆ Computing aggregate scores...")
1671
+ aggregate = evaluator.compute_aggregate_scores(results)
1672
+
1673
+ print("\n" + "=" * 60)
1674
+ print("RESULTS SUMMARY")
1675
+ print("=" * 60)
1676
+ print(f" Faithfulness: {aggregate.get('faithfulness', 0):.3f}")
1677
+ print(f" Context Recall: {aggregate.get('context_recall', 0):.3f}")
1678
+ print(f" Context Precision: {aggregate.get('context_precision', 0):.3f}")
1679
+ print(f" Answer Relevancy: {aggregate.get('answer_relevancy', 0):.3f}")
1680
+ if aggregate.get('multimodal_items_count', 0) > 0:
1681
+ print(f" Multimodal Faithfulness: {aggregate.get('multimodal_faithfulness_valid', 0):.3f} ({aggregate['multimodal_items_count']} items)")
1682
+ print(f" Multimodal Relevance: {aggregate.get('multimodal_relevance_valid', 0):.3f}")
1683
+ if aggregate.get('concept_hops_question_count', 0) > 0:
1684
+ print(f" Avg Concept Hops: {aggregate.get('avg_concept_hops_question', 0):.2f} ({aggregate['concept_hops_question_count']} questions)")
1685
+ print(f"\n Total evaluated: {aggregate.get('total_evaluated', 0)}")
1686
+ print(f" Errors: {aggregate.get('total_errors', 0)}")
1687
+
1688
+ output = {
1689
+ 'aggregate_scores': aggregate,
1690
+ 'detailed_results': results
1691
+ }
1692
+
1693
+ if output_path:
1694
+ with open(output_path, 'w') as f:
1695
+ json.dump(output, f, indent=2)
1696
+ print(f"\nšŸ’¾ Results saved to: {output_path}")
1697
+
1698
+ return output
1699
+
1700
+
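+ # Example (illustrative, commented out because it issues real LLM calls), assuming
+ # a list shaped like the sample_qa defined in the __main__ block at the bottom of
+ # this file. Unlike run_dataset_qa_evaluation() above, this runs the full metric set.
+ #
+ # out = run_optimized_evaluation(sample_qa, enable_multimodal=False, max_workers=4)
+ # print(out['aggregate_scores']['answer_relevancy'])
+ # print(out['aggregate_scores'].get('total_errors', 0))
+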
1701
+ # ============================================================================
1702
+ # INTEGRATION WITH MAIN PIPELINE
1703
+ # ============================================================================
1704
+
1705
+ def transform_qa_for_evaluation(raw_data: List[Dict]) -> List[Dict]:
1706
+ """
1707
+ Transform qa_multihop_pass.json / qa_deduplicated.json format to evaluation format.
1708
+
1709
+ Input format (from pipeline):
1710
+ - chunk_id, original_chunk, final_context, context_chunks, context_status,
1711
+ - depth_reached, chunks_added, expert_persona, domain, question, answer, etc.
1712
+
1713
+ Output format (for evaluation):
1714
+ - question, answer, contexts (list), ground_truth, context_chunks
1715
+ """
1716
+ transformed = []
1717
+ for item in raw_data:
1718
+ context_chunks = item.get("context_chunks", [])
1719
+
1720
+ # Extract text contexts
1721
+ if context_chunks:
1722
+ contexts = [chunk.get("content", "") for chunk in context_chunks if chunk.get("content")]
1723
+ else:
1724
+ contexts = [item.get("final_context", item.get("original_chunk", ""))]
1725
+
1726
+ transformed.append({
1727
+ "question": item.get("question", ""),
1728
+ "answer": item.get("answer", ""),
1729
+ "contexts": contexts,
1730
+ "ground_truth": item.get("answer", ""), # Use answer as ground_truth
1731
+ "context_chunks": context_chunks, # Keep for multimodal evaluation
1732
+ "expert_persona": item.get("expert_persona"), # Pass through for reporting
1733
+ "domain": item.get("domain"), # Pass through for reporting
1734
+ "metadata": {
1735
+ "chunk_id": item.get("chunk_id"),
1736
+ "hop_count": item.get("hop_count", 0),
1737
+ "context_status": item.get("context_status"),
1738
+ "depth_reached": item.get("depth_reached"),
1739
+ "expert_persona": item.get("expert_persona"),
1740
+ "domain": item.get("domain"),
1741
+ }
1742
+ })
1743
+ return transformed
1744
+
1745
+
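+ # Example (illustrative): one pipeline-format item and the shape it is mapped to.
+ # Field values are invented; only the keys matter.
+ #
+ # raw_item = {
+ # 'chunk_id': 'c1', 'question': 'Q?', 'answer': 'A.',
+ # 'final_context': 'Some context text.', 'context_chunks': [],
+ # 'hop_count': 1, 'domain': 'chemistry', 'expert_persona': 'analytical chemist',
+ # }
+ # eval_item = transform_qa_for_evaluation([raw_item])[0]
+ # # eval_item['contexts'] == ['Some context text.'] (falls back to final_context)
+ # # eval_item['ground_truth'] == 'A.' (answer doubles as the reference)
+ # # eval_item['metadata']['hop_count'] == 1
+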
1746
+ def run_optimized_pipeline_evaluation(
1747
+ qa_file: str,
1748
+ output_dir: Optional[str] = None,
1749
+ corpus_path: Optional[str] = None,
1750
+ enable_multimodal: bool = True,
1751
+ max_workers: int = 8,
1752
+ sample_size: Optional[int] = None,
1753
+ run_context_necessity: bool = True
1754
+ ) -> Dict:
1755
+ """
1756
+ Comprehensive evaluation with ALL metrics (harmonized with metrics.py).
1757
+
1758
+ Drop-in replacement for metrics.run_subset_evaluation() with optimized implementation.
1759
+
1760
+ METRICS EVALUATED:
1761
+ 1. Faithfulness - Answer grounded in context?
1762
+ 2. Answer Relevancy - Answer addresses question?
1763
+ 3. Context Precision - Retrieved chunks relevant?
1764
+ 4. Context Recall - Context contains reference info?
1765
+ 5. Multimodal Faithfulness - Answer grounded in text+images?
1766
+ 6. Multimodal Relevance - Answer uses multimodal context?
1767
+ 7. Context Necessity - Requires context to answer?
1768
+ 8. Semantic Diversity - Questions diverse?
1769
+ 9. Domain Coverage - Corpus coverage?
1770
+ 10. Multihop Reasoning - Multi-step reasoning quality?
1771
+ 11. Visual Dependency - Needs image to answer?
1772
+
1773
+ Args:
1774
+ qa_file: Path to qa_deduplicated.json or qa_multihop_pass.json
1775
+ output_dir: Directory to save results
1776
+ corpus_path: Path to chunks.json for domain coverage
1777
+ enable_multimodal: Whether to run VLM-based multimodal metrics
1778
+ max_workers: Parallel workers for evaluation
1779
+ sample_size: Limit evaluation to N samples (None = all)
1780
+ run_context_necessity: Whether to run context necessity (expensive)
1781
+
1782
+ Returns:
1783
+ Dict with all metrics, aggregate scores, and detailed results
1784
+ """
1785
+ import os
1786
+ import random
1787
+
1788
+ # Load QA data
1789
+ print(f"šŸ“‚ Loading QA data from {qa_file}...")
1790
+ with open(qa_file, 'r', encoding='utf-8') as f:
1791
+ raw_data = json.load(f)
1792
+
1793
+ original_count = len(raw_data)
1794
+
1795
+ # Extract expert_persona and domain from QA data (should be consistent across all)
1796
+ expert_persona = None
1797
+ domain = None
1798
+ if raw_data:
1799
+ expert_persona = raw_data[0].get('expert_persona')
1800
+ domain = raw_data[0].get('domain')
1801
+
1802
+ # Sample if requested
1803
+ if sample_size and sample_size < len(raw_data):
1804
+ raw_data = random.sample(raw_data, sample_size)
1805
+ print(f"šŸ“Š Sampled {sample_size}/{original_count} QA pairs for evaluation")
1806
+
1807
+ print(f"āœ… Loaded {len(raw_data)} QA pairs")
1808
+ if expert_persona:
1809
+ print(f" Expert Persona: {expert_persona}")
1810
+ if domain:
1811
+ print(f" Domain: {domain}")
1812
+
1813
+ # Initialize results
1814
+ results = {
1815
+ 'ragas_metrics': {},
1816
+ 'context_necessity': {},
1817
+ 'domain_coverage': {},
1818
+ 'multihop_metrics': {},
1819
+ 'multimodal_metrics': {},
1820
+ 'dataset_health': {},
1821
+ 'subset_statistics': {},
1822
+ 'dataset_info': {
1823
+ 'expert_persona': expert_persona,
1824
+ 'domain': domain,
1825
+ 'total_qa_pairs_generated': original_count,
1826
+ 'total_qa_pairs_evaluated': len(raw_data),
1827
+ 'sampled': sample_size is not None and sample_size < original_count
1828
+ }
1829
+ }
1830
+
1831
+ # Transform to evaluation format
1832
+ qa_data = transform_qa_for_evaluation(raw_data)
1833
+
1834
+ # Initialize evaluator
1835
+ evaluator = OptimizedMetricsEvaluator(
1836
+ enable_multimodal=enable_multimodal,
1837
+ max_workers=max_workers
1838
+ )
1839
+
1840
+ # ==========================================
1841
+ # 1. CORE RAGAS METRICS (optimized batch)
1842
+ # ==========================================
1843
+ print("\n" + "=" * 60)
1844
+ print("RAGAS-STYLE METRICS (Optimized)")
1845
+ print("=" * 60)
1846
+
1847
+ print(f"šŸ“Š Evaluating {len(qa_data)} QA pairs (4-6 LLM calls each)...")
1848
+ batch_results = evaluator.evaluate_batch(qa_data, show_progress=True)
1849
+ aggregate = evaluator.compute_aggregate_scores(batch_results)
1850
+
1851
+ results['ragas_metrics'] = {
1852
+ 'faithfulness': float(aggregate.get('faithfulness', 0)),
1853
+ 'answer_relevancy': float(aggregate.get('answer_relevancy', 0)),
1854
+ 'context_precision': float(aggregate.get('context_precision', 0)),
1855
+ 'context_recall': float(aggregate.get('context_recall', 0)),
1856
+ 'items_evaluated': aggregate.get('total_evaluated', 0)
1857
+ }
1858
+
1859
+ if aggregate.get('multimodal_items_count', 0) > 0:
1860
+ results['ragas_metrics']['multimodal_faithfulness'] = float(aggregate.get('multimodal_faithfulness_valid', 0))
1861
+ results['ragas_metrics']['multimodal_relevance'] = float(aggregate.get('multimodal_relevance_valid', 0))
1862
+ results['ragas_metrics']['multimodal_items'] = aggregate.get('multimodal_items_count', 0)
1863
+
1864
+ # Store aggregate scores including concept hops
1865
+ results['aggregate_scores'] = {
1866
+ 'avg_concept_hops_question': float(aggregate.get('avg_concept_hops_question', 0)),
1867
+ 'concept_hops_question_count': aggregate.get('concept_hops_question_count', 0)
1868
+ }
1869
+
1870
+ # Store initial batch results (will be enriched with additional metrics later)
1871
+ results['detailed_results'] = batch_results
1872
+
1873
+ # ==========================================
1874
+ # 2. SUBSET STATISTICS (with intersection)
1875
+ # ==========================================
1876
+ multihop_count = 0
1877
+ multimodal_count = 0
1878
+ both_count = 0
1879
+
1880
+ for item in raw_data:
1881
+ hop_count = item.get('hop_count', 0)
1882
+ chunks_added = item.get('chunks_added', [])
1883
+ is_multihop = hop_count > 0 or (isinstance(chunks_added, list) and len(chunks_added) > 1)
1884
+
1885
+ is_multimodal = any(
1886
+ has_image_in_chunk(c) for c in item.get('context_chunks', [])
1887
+ )
1888
+
1889
+ if is_multihop:
1890
+ multihop_count += 1
1891
+ if is_multimodal:
1892
+ multimodal_count += 1
1893
+ if is_multihop and is_multimodal:
1894
+ both_count += 1
1895
+
1896
+ results['subset_statistics'] = {
1897
+ 'total_qa_pairs': len(raw_data),
1898
+ 'multihop_count': multihop_count,
1899
+ 'multimodal_count': multimodal_count,
1900
+ 'multihop_multimodal_count': both_count,
1901
+ 'multihop_only_count': multihop_count - both_count,
1902
+ 'multimodal_only_count': multimodal_count - both_count,
1903
+ 'text_only_count': len(raw_data) - multihop_count - multimodal_count + both_count, # inclusion-exclusion: the overlap was subtracted twice, add it back once
1904
+ 'avg_hop_count': float(np.mean([item.get('hop_count', 0) for item in raw_data]))
1905
+ }
1906
+
1907
+ # ==========================================
1908
+ # 3. CONTEXT NECESSITY (anti-parametric bias)
1909
+ # ==========================================
1910
+ if run_context_necessity:
1911
+ print("\n" + "=" * 60)
1912
+ print("CONTEXT NECESSITY (Anti-Parametric Bias)")
1913
+ print("=" * 60)
1914
+
1915
+ # Prepare batch items
1916
+ batch_items = []
1917
+ for qa in raw_data:
1918
+ context = qa.get('final_context', qa.get('original_chunk', ''))
1919
+ batch_items.append({
1920
+ 'question': qa['question'],
1921
+ 'answer': qa['answer'],
1922
+ 'context': context
1923
+ })
1924
+
1925
+ cn_results = evaluator.batch_evaluate_context_necessity(batch_items)
1926
+ necessity_scores = [r['context_necessity_score'] for r in cn_results]
1927
+
1928
+ # Merge context_necessity into detailed_results
1929
+ for i, cn_result in enumerate(cn_results):
1930
+ if i < len(results['detailed_results']):
1931
+ results['detailed_results'][i]['context_necessity_score'] = cn_result['context_necessity_score']
1932
+ results['detailed_results'][i]['without_context_correct'] = cn_result.get('without_context_correct')
1933
+
1934
+ results['context_necessity'] = {
1935
+ 'avg_context_necessity_score': float(np.mean(necessity_scores)),
1936
+ 'items_evaluated': len(necessity_scores),
1937
+ 'items_answerable_without_context': sum(1 for r in cn_results if r.get('without_context_correct')),
1938
+ 'score_distribution': {
1939
+ 'high (0.8-1.0)': sum(1 for s in necessity_scores if s >= 0.8),
1940
+ 'moderate (0.5-0.8)': sum(1 for s in necessity_scores if 0.5 <= s < 0.8),
1941
+ 'low (0.0-0.5)': sum(1 for s in necessity_scores if s < 0.5)
1942
+ }
1943
+ }
1944
+
1945
+ print(f" Average Context Necessity: {results['context_necessity']['avg_context_necessity_score']:.3f}")
1946
+
1947
+ # ==========================================
1948
+ # 4. MULTIHOP REASONING (on multihop subset)
1949
+ # ==========================================
1950
+ multihop_items = [qa for qa in raw_data if qa.get('hop_count', 0) > 0]
1951
+ if multihop_items:
1952
+ print("\n" + "=" * 60)
1953
+ print(f"MULTIHOP REASONING ({len(multihop_items)} items)")
1954
+ print("=" * 60)
1955
+
1956
+ mh_batch = [{
1957
+ 'question': qa['question'],
1958
+ 'answer': qa['answer'],
1959
+ 'contexts': [qa.get('final_context', qa.get('original_chunk', ''))]
1960
+ } for qa in multihop_items]
1961
+
1962
+ mh_results = evaluator.batch_evaluate_multihop_reasoning(mh_batch)
1963
+
1964
+ # Merge multihop reasoning into detailed_results
1965
+ # Match multihop results back to detailed_results by question text; order is preserved on both sides
1966
+ mh_questions = set(qa['question'] for qa in multihop_items)
1967
+ mh_idx = 0
1968
+ for i, detail in enumerate(results['detailed_results']):
1969
+ if detail.get('question') in mh_questions and mh_idx < len(mh_results):
1970
+ detail['reasoning_score'] = mh_results[mh_idx]['reasoning_score']
1971
+ detail['bridge_entity'] = mh_results[mh_idx].get('bridge_entity', 'None')
1972
+ detail['llm_hop_count'] = mh_results[mh_idx]['hop_count']
1973
+ mh_idx += 1
1974
+
1975
+ results['multihop_metrics'] = {
1976
+ 'items_evaluated': len(multihop_items),
1977
+ 'avg_hop_count': float(np.mean([r['hop_count'] for r in mh_results])),
1978
+ 'avg_reasoning_score': float(np.mean([r['reasoning_score'] for r in mh_results])),
1979
+ 'hop_distribution': dict(Counter([r['hop_count'] for r in mh_results]))
1980
+ }
1981
+
1982
+ print(f" Avg Hop Count: {results['multihop_metrics']['avg_hop_count']:.2f}")
1983
+ print(f" Avg Reasoning Score: {results['multihop_metrics']['avg_reasoning_score']:.3f}")
1984
+
1985
+ # ==========================================
1986
+ # 5. MULTIMODAL METRICS (on multimodal subset)
1987
+ # ==========================================
1988
+ multimodal_items = [qa for qa in raw_data
1989
+ if any(c.get('image_path') and c.get('image_path') != 'null'
1990
+ for c in qa.get('context_chunks', []))]
1991
+ if multimodal_items:
1992
+ print("\n" + "=" * 60)
1993
+ print(f"VISUAL DEPENDENCY ({len(multimodal_items)} items)")
1994
+ print("=" * 60)
1995
+
1996
+ visual_scores = []
1997
+ mm_questions = []
1998
+ for qa in multimodal_items:
1999
+ contexts = [qa.get('final_context', qa.get('original_chunk', ''))]
2000
+ score = evaluator.evaluate_visual_dependency(qa['question'], contexts)
2001
+ visual_scores.append(score)
2002
+ mm_questions.append(qa['question'])
2003
+
2004
+ # Merge visual dependency into detailed_results
2005
+ mm_question_set = set(mm_questions)
2006
+ mm_idx = 0
2007
+ for i, detail in enumerate(results['detailed_results']):
2008
+ if detail.get('question') in mm_question_set and mm_idx < len(visual_scores):
2009
+ detail['visual_dependency_score'] = visual_scores[mm_idx]
2010
+ mm_idx += 1
2011
+
2012
+ results['multimodal_metrics'] = {
2013
+ 'items_evaluated': len(multimodal_items),
2014
+ 'avg_visual_dependency': float(np.mean(visual_scores)),
2015
+ 'items_visual_essential': sum(1 for s in visual_scores if s >= 1.0),
2016
+ 'items_visual_helpful': sum(1 for s in visual_scores if 0.0 < s < 1.0),
2017
+ 'items_text_sufficient': sum(1 for s in visual_scores if s == 0.0)
2018
+ }
2019
+
2020
+ print(f" Avg Visual Dependency: {results['multimodal_metrics']['avg_visual_dependency']:.3f}")
2021
+
2022
+ # ==========================================
2023
+ # 6. DOMAIN COVERAGE (if corpus provided)
2024
+ # ==========================================
2025
+ if corpus_path and os.path.exists(corpus_path):
2026
+ print("\n" + "=" * 60)
2027
+ print("DOMAIN COVERAGE")
2028
+ print("=" * 60)
2029
+
2030
+ with open(corpus_path, 'r') as f:
2031
+ corpus_chunks = json.load(f)
2032
+
2033
+ coverage = evaluator.evaluate_domain_coverage(raw_data, corpus_chunks)
2034
+ results['domain_coverage'] = coverage
2035
+
2036
+ print(f" Chunk Coverage: {coverage['chunk_coverage']*100:.1f}%")
2037
+ print(f" File Coverage: {coverage['file_coverage']*100:.1f}%")
2038
+
2039
+ # ==========================================
2040
+ # 7. SEMANTIC DIVERSITY
2041
+ # ==========================================
2042
+ print("\n" + "=" * 60)
2043
+ print("SEMANTIC DIVERSITY")
2044
+ print("=" * 60)
2045
+
2046
+ questions = [qa.get('question', '') for qa in raw_data]
2047
+ diversity = evaluator.evaluate_semantic_diversity(questions)
2048
+ results['dataset_health'] = {
2049
+ 'semantic_diversity': float(diversity),
2050
+ 'total_samples': len(raw_data)
2051
+ }
2052
+
2053
+ print(f" Semantic Diversity: {diversity:.3f}")
2054
+
2055
+ # ==========================================
2056
+ # FINAL SUMMARY
2057
+ # ==========================================
2058
+ print("\n" + "=" * 70)
2059
+ print("šŸ“Š EVALUATION SUMMARY (Harmonized Metrics)")
2060
+ print("=" * 70)
2061
+
2062
+ # Dataset Info
2063
+ ds_info = results.get('dataset_info', {})
2064
+ if ds_info.get('expert_persona') or ds_info.get('domain'):
2065
+ print(f"\n Dataset Info:")
2066
+ if ds_info.get('expert_persona'):
2067
+ print(f" Expert Persona: {ds_info['expert_persona']}")
2068
+ if ds_info.get('domain'):
2069
+ print(f" Domain: {ds_info['domain']}")
2070
+ print(f" QA Pairs Generated: {ds_info.get('total_qa_pairs_generated', 0)}")
2071
+ print(f" QA Pairs Evaluated: {ds_info.get('total_qa_pairs_evaluated', 0)}")
2072
+ if ds_info.get('sampled'):
2073
+ print(f" āš ļø Sampled for evaluation")
2074
+
2075
+ rm = results['ragas_metrics']
2076
+ print(f"\n RAGAS Metrics:")
2077
+ print(f" Faithfulness: {rm.get('faithfulness', 0):.3f}")
2078
+ print(f" Answer Relevancy: {rm.get('answer_relevancy', 0):.3f}")
2079
+ print(f" Context Precision: {rm.get('context_precision', 0):.3f}")
2080
+ print(f" Context Recall: {rm.get('context_recall', 0):.3f}")
2081
+
2082
+ if results.get('context_necessity'):
2083
+ print(f"\n Context Necessity: {results['context_necessity'].get('avg_context_necessity_score', 0):.3f}")
2084
+
2085
+ if results.get('multihop_metrics'):
2086
+ print(f"\n Multihop Reasoning:")
2087
+ print(f" Avg Hops: {results['multihop_metrics'].get('avg_hop_count', 0):.2f}")
2088
+ print(f" Reasoning Score: {results['multihop_metrics'].get('avg_reasoning_score', 0):.3f}")
2089
+
2090
+ if results.get('multimodal_metrics'):
2091
+ print(f"\n Multimodal:")
2092
+ print(f" Visual Dependency: {results['multimodal_metrics'].get('avg_visual_dependency', 0):.3f}")
2093
+
2094
+ print(f"\n Dataset Health:")
2095
+ print(f" Semantic Diversity: {results['dataset_health'].get('semantic_diversity', 0):.3f}")
2096
+ print(f" Total Samples: {results['subset_statistics'].get('total_qa_pairs', 0)}")
2097
+ print("=" * 70)
2098
+
2099
+ # Save full results
2100
+ if output_dir:
2101
+ os.makedirs(output_dir, exist_ok=True)
2102
+ report_path = os.path.join(output_dir, "subset_evaluation_report.json") # Same name as metrics.py
2103
+
2104
+ def convert_numpy(obj):
2105
+ """Convert numpy types to Python types for JSON serialization"""
2106
+ if isinstance(obj, np.floating):
2107
+ return float(obj)
2108
+ elif isinstance(obj, np.integer):
2109
+ return int(obj)
2110
+ elif isinstance(obj, np.ndarray):
2111
+ return obj.tolist()
2112
+ elif isinstance(obj, dict):
2113
+ return {k: convert_numpy(v) for k, v in obj.items()}
2114
+ elif isinstance(obj, list):
2115
+ return [convert_numpy(i) for i in obj]
2116
+ return obj
2117
+
2118
+ with open(report_path, 'w') as f:
2119
+ json.dump(convert_numpy(results), f, indent=2)
2120
+ print(f"\nšŸ’¾ Report saved to: {report_path}")
2121
+
2122
+ return results
2123
+
2124
+
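+ # Example (illustrative): consuming the report written by
+ # run_optimized_pipeline_evaluation(). The directory name below is hypothetical;
+ # the report file name matches report_path above.
+ #
+ # results = run_optimized_pipeline_evaluation(
+ # qa_file='qa_deduplicated.json', output_dir='eval_out',
+ # corpus_path='chunks.json', sample_size=50)
+ # with open('eval_out/subset_evaluation_report.json') as f:
+ # report = json.load(f)
+ # print(report['ragas_metrics']['faithfulness'])
+ # print(report['subset_statistics']['multihop_count'])
+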
2125
+ # ============================================================================
2126
+ # MAIN (for standalone/CLI usage)
2127
+ # ============================================================================
2128
+
2129
+ if __name__ == "__main__":
2130
+ import sys
2131
+ import argparse
2132
+
2133
+ parser = argparse.ArgumentParser(description="Run optimized metrics evaluation")
2134
+ parser.add_argument("qa_file", nargs="?", help="Path to QA JSON file")
2135
+ parser.add_argument("-o", "--output-dir", default=".", help="Output directory")
2136
+ parser.add_argument("-c", "--corpus", default=None, help="Path to chunks.json for domain coverage")
2137
+ parser.add_argument("-s", "--sample-size", type=int, default=None, help="Sample size for evaluation")
2138
+ parser.add_argument("--no-multimodal", action="store_true", help="Disable multimodal metrics")
2139
+ parser.add_argument("--no-context-necessity", action="store_true", help="Skip context necessity evaluation")
2140
+
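+ # Illustrative CLI invocations (the module file name is whatever this script is
+ # saved as; file paths are placeholders):
+ # python optimized_metrics.py qa_deduplicated.json -o results/ -c chunks.json -s 100
+ # python optimized_metrics.py qa_multihop_pass.json --no-multimodal --no-context-necessity
+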
2141
+ args = parser.parse_args()
2142
+
2143
+ if args.qa_file:
2144
+ # Run on provided QA file
2145
+ results = run_optimized_pipeline_evaluation(
2146
+ qa_file=args.qa_file,
2147
+ output_dir=args.output_dir,
2148
+ corpus_path=args.corpus,
2149
+ enable_multimodal=not args.no_multimodal,
2150
+ max_workers=8,
2151
+ sample_size=args.sample_size,
2152
+ run_context_necessity=not args.no_context_necessity
2153
+ )
2154
+ else:
2155
+ # Test with sample data
2156
+ print("No QA file provided. Running with sample data...")
2157
+ sample_qa = [
2158
+ {
2159
+ "question": "What is the capital of France?",
2160
+ "answer": "Paris is the capital of France. It is also the largest city in France.",
2161
+ "contexts": [
2162
+ "Paris is the capital and most populous city of France.",
2163
+ "France is a country in Western Europe."
2164
+ ],
2165
+ "ground_truth": "Paris is the capital of France."
2166
+ }
2167
+ ]
2168
+
2169
+ results = run_optimized_evaluation(sample_qa, enable_multimodal=False)
2170
+ print("\nDetailed result:")
2171
+ print(json.dumps(results['detailed_results'][0], indent=2))
2172
+