mirage_benchmark-1.0.4-py3-none-any.whl
This diff shows the content of publicly released package versions, and the changes between them, as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release: this version of mirage-benchmark might be problematic.
- mirage/__init__.py +83 -0
- mirage/cli.py +150 -0
- mirage/core/__init__.py +52 -0
- mirage/core/config.py +248 -0
- mirage/core/llm.py +1745 -0
- mirage/core/prompts.py +884 -0
- mirage/embeddings/__init__.py +31 -0
- mirage/embeddings/models.py +512 -0
- mirage/embeddings/rerankers_multimodal.py +766 -0
- mirage/embeddings/rerankers_text.py +149 -0
- mirage/evaluation/__init__.py +26 -0
- mirage/evaluation/metrics.py +2223 -0
- mirage/evaluation/metrics_optimized.py +2172 -0
- mirage/pipeline/__init__.py +45 -0
- mirage/pipeline/chunker.py +545 -0
- mirage/pipeline/context.py +1003 -0
- mirage/pipeline/deduplication.py +491 -0
- mirage/pipeline/domain.py +514 -0
- mirage/pipeline/pdf_processor.py +598 -0
- mirage/pipeline/qa_generator.py +798 -0
- mirage/utils/__init__.py +31 -0
- mirage/utils/ablation.py +360 -0
- mirage/utils/preflight.py +663 -0
- mirage/utils/stats.py +626 -0
- mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
- mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
- mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
- mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
- mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
- mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0
mirage/embeddings/rerankers_text.py
@@ -0,0 +1,149 @@
+
+from typing import List, Dict, Any, Tuple
+import logging
+import re
+from mirage.core.llm import call_vlm_with_multiple_images
+from mirage.core.prompts import PROMPTS
+
+class LLMReranker:
+    """
+    Uses an LLM to evaluate, rank, and merge QA pairs.
+    """
+    def __init__(self, model_name="gpt-oss", expert_persona: str = None,
+                 domain: str = None):
+        self.model_name = model_name
+        self.expert_persona = expert_persona
+        self.domain = domain
+        self.tuple_delimiter = PROMPTS.get("DEFAULT_TUPLE_DELIMITER", "<|#|>")
+        self.completion_delimiter = PROMPTS.get("DEFAULT_COMPLETION_DELIMITER", "<|#|>END<|#|>")
+
+    def _parse_qa_pairs(self, response_text: str) -> List[Dict[str, str]]:
+        """Parses QA pairs from the |#| delimited format."""
+        qa_pairs = []
+        try:
+            # Remove completion delimiter if present
+            if self.completion_delimiter in response_text:
+                response_text = response_text.split(self.completion_delimiter)[0].strip()
+
+            # Remove START delimiter if present
+            start_delimiter = self.tuple_delimiter + "START" + self.tuple_delimiter
+            if response_text.startswith(start_delimiter):
+                response_text = response_text[len(start_delimiter):].strip()
+
+            lines = [line.strip() for line in response_text.split('\n') if line.strip()]
+
+            for line in lines:
+                # Skip NEXT lines
+                next_delimiter = self.tuple_delimiter + "NEXT" + self.tuple_delimiter
+                if line == next_delimiter:
+                    continue
+
+                # Check for Question delimiter pattern
+                if line.startswith("Question" + self.tuple_delimiter) or line.startswith("question" + self.tuple_delimiter):
+                    parts = line.split(self.tuple_delimiter)
+                    # Expected: Question<|#|>Q<|#|>Answer<|#|>A...
+                    if len(parts) >= 4 and parts[0].lower() == "question" and parts[2].lower() == "answer":
+                        question = parts[1].strip()
+                        answer = parts[3].strip()
+                        if question and answer:
+                            qa_pairs.append({"question": question, "answer": answer})
+
+            return qa_pairs
+        except Exception as e:
+            logging.error(f"Error parsing QA pairs: {e}")
+            return []
+
+    def rank_cluster(self, cluster_candidates: List[Dict]) -> List[Dict]:
+        """
+        Step 3: Order/Rank the QA pairs in the cluster.
+        """
+        if not cluster_candidates or len(cluster_candidates) < 2:
+            return cluster_candidates
+
+        candidates_text = ""
+        for idx, item in enumerate(cluster_candidates, 1):
+            candidates_text += f"\n--- Candidate {idx} ---\n"
+            candidates_text += f"Question: {item.get('question', '')}\n"
+            candidates_text += f"Answer: {item.get('answer', '')}\n"
+
+        prompt_template = PROMPTS.get("deduplication_rank", "")
+        if not prompt_template:
+            logging.warning("deduplication_rank prompt not found.")
+            return cluster_candidates
+
+        prompt = prompt_template.format(
+            candidates_text=candidates_text,
+            expert_persona=self.expert_persona,
+            domain=self.domain
+        )
+
+        try:
+            response = call_vlm_with_multiple_images(prompt, [])
+            ordered_pairs = self._parse_qa_pairs(response)
+
+            if not ordered_pairs:
+                logging.warning("LLM returned empty ranking, using original order.")
+                return cluster_candidates
+
+            # Attach metadata from original if possible (heuristic matching) or just return new order
+            # Since we are just reordering, we can try to match back to original items to keep metadata,
+            # but for deduplication, text is primary.
+            # For simplicity, we return the parsed pairs as the ordered list.
+            return ordered_pairs
+
+        except Exception as e:
+            logging.error(f"Error in LLM ranking: {e}")
+            return cluster_candidates
+
+    def deduplicate_and_merge(self, cluster_candidates: List[Dict]) -> List[Dict]:
+        """
+        Step 5: Deduplicate and merge based on the ordered cluster.
+        """
+        if not cluster_candidates:
+            return []
+
+        # First, Rank/Order them
+        ordered_candidates = self.rank_cluster(cluster_candidates)
+
+        # Prepare text for merge prompt
+        candidates_text = ""
+        for idx, item in enumerate(ordered_candidates, 1):
+            candidates_text += f"\n--- Candidate {idx} ---\n"
+            candidates_text += f"Question: {item.get('question', '')}\n"
+            candidates_text += f"Answer: {item.get('answer', '')}\n"
+
+        prompt_template = PROMPTS.get("deduplication_merge", "")
+        if not prompt_template:
+            logging.warning("deduplication_merge prompt not found.")
+            return cluster_candidates[:1]  # Fallback
+
+        prompt = prompt_template.format(
+            candidates_text=candidates_text,
+            expert_persona=self.expert_persona,
+            domain=self.domain
+        )
+
+        try:
+            response = call_vlm_with_multiple_images(prompt, [])
+            merged_pairs = self._parse_qa_pairs(response)
+
+            if not merged_pairs:
+                logging.warning("LLM returned empty merge, returning first candidate.")
+                return cluster_candidates[:1]
+
+            # Propagate metadata from the first original candidate to all new pairs
+            # (approximate, since we might have merged multiple)
+            base_metadata = cluster_candidates[0].copy()
+            final_results = []
+            for pair in merged_pairs:
+                new_item = base_metadata.copy()
+                new_item["question"] = pair["question"]
+                new_item["answer"] = pair["answer"]
+                new_item["merged_from_count"] = len(cluster_candidates)
+                final_results.append(new_item)
+
+            return final_results
+
+        except Exception as e:
+            logging.error(f"Error in LLM deduplication: {e}")
+            return cluster_candidates[:1]
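To make the delimiter protocol concrete, here is a minimal usage sketch (not part of the wheel) of the response format _parse_qa_pairs accepts. It assumes the default <|#|> fallbacks from __init__ and installs no additional behaviour; the question/answer text is invented for illustration.

# Hypothetical usage sketch -- illustration only, not part of mirage-benchmark.
from mirage.embeddings.rerankers_text import LLMReranker

reranker = LLMReranker(model_name="gpt-oss", expert_persona="domain expert", domain="example domain")
d = reranker.tuple_delimiter  # "<|#|>" under the defaults shown above

# An LLM response in the expected tuple-delimited format:
sample_response = (
    f"{d}START{d}\n"
    f"Question{d}What does the chart compare?{d}Answer{d}Two baseline models.\n"
    f"{d}NEXT{d}\n"
    f"Question{d}Which model scores higher?{d}Answer{d}The second baseline.\n"
    f"{reranker.completion_delimiter}"
)

pairs = reranker._parse_qa_pairs(sample_response)
# -> [{"question": "What does the chart compare?", "answer": "Two baseline models."},
#     {"question": "Which model scores higher?", "answer": "The second baseline."}]

# deduplicate_and_merge(candidates) runs this same parser on live model output
# (via call_vlm_with_multiple_images) and copies metadata from the first candidate.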
mirage/evaluation/__init__.py
@@ -0,0 +1,26 @@
+"""
+Evaluation module for MiRAGE - Metrics and dataset evaluation.
+
+Imports are lazy to avoid loading optional dependencies at import time.
+"""
+
+_LAZY_IMPORTS = {
+    # Basic metrics
+    "evaluate_qa_dataset": ("metrics", "evaluate_qa_dataset"),
+    "compute_faithfulness": ("metrics", "compute_faithfulness"),
+    "compute_relevancy": ("metrics", "compute_relevancy"),
+    # Optimized evaluation
+    "OptimizedEvaluator": ("metrics_optimized", "OptimizedEvaluator"),
+    "evaluate_subset": ("metrics_optimized", "evaluate_subset"),
+    "generate_evaluation_report": ("metrics_optimized", "generate_evaluation_report"),
+}
+
+
+def __getattr__(name):
+    """Lazy import to avoid loading optional dependencies at import time."""
+    if name in _LAZY_IMPORTS:
+        module_name, attr_name = _LAZY_IMPORTS[name]
+        import importlib
+        module = importlib.import_module(f"mirage.evaluation.{module_name}")
+        return getattr(module, attr_name)
+    raise AttributeError(f"module 'mirage.evaluation' has no attribute '{name}'")
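For reference, a minimal sketch (not part of the package) of how this PEP 562-style module __getattr__ behaves once the wheel is installed; the attribute names come from the _LAZY_IMPORTS table above.

# Hypothetical usage sketch -- illustration only.
import mirage.evaluation as evaluation

# No heavy metrics dependencies are imported yet; the submodule is loaded
# only when one of the registered names is first accessed.
evaluator_cls = evaluation.OptimizedEvaluator  # imports mirage.evaluation.metrics_optimized

# Unregistered names fall through to the explicit AttributeError.
try:
    evaluation.not_a_metric
except AttributeError as exc:
    print(exc)  # module 'mirage.evaluation' has no attribute 'not_a_metric'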