mirage_benchmark-1.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mirage-benchmark might be problematic.

@@ -0,0 +1,149 @@
+
+from typing import List, Dict, Any, Tuple
+import logging
+import re
+from mirage.core.llm import call_vlm_with_multiple_images
+from mirage.core.prompts import PROMPTS
+
+class LLMReranker:
+    """
+    Uses an LLM to evaluate, rank, and merge QA pairs.
+    """
+    def __init__(self, model_name="gpt-oss", expert_persona: str = None,
+                 domain: str = None):
+        self.model_name = model_name
+        self.expert_persona = expert_persona
+        self.domain = domain
+        self.tuple_delimiter = PROMPTS.get("DEFAULT_TUPLE_DELIMITER", "<|#|>")
+        self.completion_delimiter = PROMPTS.get("DEFAULT_COMPLETION_DELIMITER", "<|#|>END<|#|>")
+
+    def _parse_qa_pairs(self, response_text: str) -> List[Dict[str, str]]:
+        """Parses QA pairs from the |#| delimited format."""
+        qa_pairs = []
+        try:
+            # Remove completion delimiter if present
+            if self.completion_delimiter in response_text:
+                response_text = response_text.split(self.completion_delimiter)[0].strip()
+
+            # Remove START delimiter if present
+            start_delimiter = self.tuple_delimiter + "START" + self.tuple_delimiter
+            if response_text.startswith(start_delimiter):
+                response_text = response_text[len(start_delimiter):].strip()
+
+            lines = [line.strip() for line in response_text.split('\n') if line.strip()]
+
+            for line in lines:
+                # Skip NEXT lines
+                next_delimiter = self.tuple_delimiter + "NEXT" + self.tuple_delimiter
+                if line == next_delimiter:
+                    continue
+
+                # Check for Question delimiter pattern
+                if line.startswith("Question" + self.tuple_delimiter) or line.startswith("question" + self.tuple_delimiter):
+                    parts = line.split(self.tuple_delimiter)
+                    # Expected: Question<|#|>Q<|#|>Answer<|#|>A...
+                    if len(parts) >= 4 and parts[0].lower() == "question" and parts[2].lower() == "answer":
+                        question = parts[1].strip()
+                        answer = parts[3].strip()
+                        if question and answer:
+                            qa_pairs.append({"question": question, "answer": answer})
+
+            return qa_pairs
+        except Exception as e:
+            logging.error(f"Error parsing QA pairs: {e}")
+            return []
+
+    def rank_cluster(self, cluster_candidates: List[Dict]) -> List[Dict]:
+        """
+        Step 3: Order/Rank the QA pairs in the cluster.
+        """
+        if not cluster_candidates or len(cluster_candidates) < 2:
+            return cluster_candidates
+
+        candidates_text = ""
+        for idx, item in enumerate(cluster_candidates, 1):
+            candidates_text += f"\n--- Candidate {idx} ---\n"
+            candidates_text += f"Question: {item.get('question', '')}\n"
+            candidates_text += f"Answer: {item.get('answer', '')}\n"
+
+        prompt_template = PROMPTS.get("deduplication_rank", "")
+        if not prompt_template:
+            logging.warning("deduplication_rank prompt not found.")
+            return cluster_candidates
+
+        prompt = prompt_template.format(
+            candidates_text=candidates_text,
+            expert_persona=self.expert_persona,
+            domain=self.domain
+        )
+
+        try:
+            response = call_vlm_with_multiple_images(prompt, [])
+            ordered_pairs = self._parse_qa_pairs(response)
+
+            if not ordered_pairs:
+                logging.warning("LLM returned empty ranking, using original order.")
+                return cluster_candidates
+
+            # Attach metadata from original if possible (heuristic matching) or just return new order
+            # Since we are just reordering, we can try to match back to original items to keep metadata,
+            # but for deduplication, text is primary.
+            # For simplicity, we return the parsed pairs as the ordered list.
+            return ordered_pairs
+
+        except Exception as e:
+            logging.error(f"Error in LLM ranking: {e}")
+            return cluster_candidates
+
+    def deduplicate_and_merge(self, cluster_candidates: List[Dict]) -> List[Dict]:
+        """
+        Step 5: Deduplicate and merge based on the ordered cluster.
+        """
+        if not cluster_candidates:
+            return []
+
+        # First, Rank/Order them
+        ordered_candidates = self.rank_cluster(cluster_candidates)
+
+        # Prepare text for merge prompt
+        candidates_text = ""
+        for idx, item in enumerate(ordered_candidates, 1):
+            candidates_text += f"\n--- Candidate {idx} ---\n"
+            candidates_text += f"Question: {item.get('question', '')}\n"
+            candidates_text += f"Answer: {item.get('answer', '')}\n"
+
+        prompt_template = PROMPTS.get("deduplication_merge", "")
+        if not prompt_template:
+            logging.warning("deduplication_merge prompt not found.")
+            return cluster_candidates[:1]  # Fallback
+
+        prompt = prompt_template.format(
+            candidates_text=candidates_text,
+            expert_persona=self.expert_persona,
+            domain=self.domain
+        )
+
+        try:
+            response = call_vlm_with_multiple_images(prompt, [])
+            merged_pairs = self._parse_qa_pairs(response)
+
+            if not merged_pairs:
+                logging.warning("LLM returned empty merge, returning first candidate.")
+                return cluster_candidates[:1]
+
+            # Propagate metadata from the first original candidate to all new pairs
+            # (approximate, since we might have merged multiple)
+            base_metadata = cluster_candidates[0].copy()
+            final_results = []
+            for pair in merged_pairs:
+                new_item = base_metadata.copy()
+                new_item["question"] = pair["question"]
+                new_item["answer"] = pair["answer"]
+                new_item["merged_from_count"] = len(cluster_candidates)
+                final_results.append(new_item)
+
+            return final_results
+
+        except Exception as e:
+            logging.error(f"Error in LLM deduplication: {e}")
+            return cluster_candidates[:1]
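
For context, LLMReranker expects the model to reply in a tuple-delimited format (Question<|#|>...<|#|>Answer<|#|>... per pair, optionally wrapped in <|#|>START<|#|> / <|#|>END<|#|> markers and separated by <|#|>NEXT<|#|> lines), and deduplicate_and_merge copies the first candidate's metadata plus a merged_from_count field onto every merged pair. The sketch below is not part of the release: the cluster data is hypothetical, the default delimiters are assumed, and the module path of LLMReranker is not shown in this diff.

# Hypothetical usage sketch; LLMReranker's import path is not part of this diff.
reranker = LLMReranker(model_name="gpt-oss",
                       expert_persona="domain expert",  # assumed value
                       domain="example domain")         # assumed value

# Shape of a response that _parse_qa_pairs accepts (default delimiters assumed):
sample_response = (
    "<|#|>START<|#|>\n"
    "Question<|#|>Q1 text<|#|>Answer<|#|>A1 text\n"
    "<|#|>NEXT<|#|>\n"
    "Question<|#|>Q2 text<|#|>Answer<|#|>A2 text\n"
    "<|#|>END<|#|>"
)
pairs = reranker._parse_qa_pairs(sample_response)
# -> [{"question": "Q1 text", "answer": "A1 text"},
#     {"question": "Q2 text", "answer": "A2 text"}]

# A cluster of near-duplicate QA candidates (hypothetical data); merged results
# inherit "source" from the first candidate and gain "merged_from_count".
cluster = [
    {"question": "Q text", "answer": "A text", "source": "doc_1"},
    {"question": "Q text, reworded", "answer": "A text, reworded", "source": "doc_2"},
]
merged = reranker.deduplicate_and_merge(cluster)  # calls the configured LLM backend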
@@ -0,0 +1,26 @@
+"""
+Evaluation module for MiRAGE - Metrics and dataset evaluation.
+
+Imports are lazy to avoid loading optional dependencies at import time.
+"""
+
+_LAZY_IMPORTS = {
+    # Basic metrics
+    "evaluate_qa_dataset": ("metrics", "evaluate_qa_dataset"),
+    "compute_faithfulness": ("metrics", "compute_faithfulness"),
+    "compute_relevancy": ("metrics", "compute_relevancy"),
+    # Optimized evaluation
+    "OptimizedEvaluator": ("metrics_optimized", "OptimizedEvaluator"),
+    "evaluate_subset": ("metrics_optimized", "evaluate_subset"),
+    "generate_evaluation_report": ("metrics_optimized", "generate_evaluation_report"),
+}
+
+
+def __getattr__(name):
+    """Lazy import to avoid loading optional dependencies at import time."""
+    if name in _LAZY_IMPORTS:
+        module_name, attr_name = _LAZY_IMPORTS[name]
+        import importlib
+        module = importlib.import_module(f"mirage.evaluation.{module_name}")
+        return getattr(module, attr_name)
+    raise AttributeError(f"module 'mirage.evaluation' has no attribute '{name}'")
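
This __init__ module uses a module-level __getattr__ (PEP 562), so the submodules listed in _LAZY_IMPORTS are imported only when one of their names is first accessed. A short sketch of the resulting behavior, assuming the package and the optional dependencies of its metrics modules are installed:

# Importing the package does not yet import metrics or metrics_optimized.
import mirage.evaluation

# First access triggers __getattr__, which imports
# mirage.evaluation.metrics_optimized and returns the requested attribute.
# ("from mirage.evaluation import OptimizedEvaluator" goes through the same hook.)
evaluator_cls = mirage.evaluation.OptimizedEvaluator

# Names not listed in _LAZY_IMPORTS raise AttributeError as usual.
try:
    mirage.evaluation.not_a_real_name
except AttributeError as exc:
    print(exc)  # module 'mirage.evaluation' has no attribute 'not_a_real_name'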