mirage-benchmark 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mirage-benchmark might be problematic. Click here for more details.
- mirage/__init__.py +83 -0
- mirage/cli.py +150 -0
- mirage/core/__init__.py +52 -0
- mirage/core/config.py +248 -0
- mirage/core/llm.py +1745 -0
- mirage/core/prompts.py +884 -0
- mirage/embeddings/__init__.py +31 -0
- mirage/embeddings/models.py +512 -0
- mirage/embeddings/rerankers_multimodal.py +766 -0
- mirage/embeddings/rerankers_text.py +149 -0
- mirage/evaluation/__init__.py +26 -0
- mirage/evaluation/metrics.py +2223 -0
- mirage/evaluation/metrics_optimized.py +2172 -0
- mirage/pipeline/__init__.py +45 -0
- mirage/pipeline/chunker.py +545 -0
- mirage/pipeline/context.py +1003 -0
- mirage/pipeline/deduplication.py +491 -0
- mirage/pipeline/domain.py +514 -0
- mirage/pipeline/pdf_processor.py +598 -0
- mirage/pipeline/qa_generator.py +798 -0
- mirage/utils/__init__.py +31 -0
- mirage/utils/ablation.py +360 -0
- mirage/utils/preflight.py +663 -0
- mirage/utils/stats.py +626 -0
- mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
- mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
- mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
- mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
- mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
- mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0
mirage/utils/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utilities module for MiRAGE - Preflight checks, statistics, and ablation studies.
|
|
3
|
+
|
|
4
|
+
Imports are lazy to avoid loading optional dependencies at import time.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Registry of lazily exported names. Each public name maps to a
# ``(submodule, attribute)`` pair; the module-level ``__getattr__`` hook looks
# names up here and imports the submodule only when a name is requested.
# Every entry exports the attribute under its own name, so the table is
# generated per submodule.
_LAZY_IMPORTS = {
    # Preflight
    **{
        exported: ("preflight", exported)
        for exported in (
            "run_preflight_checks",
            "check_gpu_availability",
            "check_api_connectivity",
            "PreflightChecker",
        )
    },
    # Statistics
    **{
        exported: ("stats", exported)
        for exported in (
            "compute_dataset_stats",
            "print_dataset_stats",
            "compute_qa_category_stats",
            "print_qa_category_stats",
        )
    },
    # Ablation
    **{
        exported: ("ablation", exported)
        for exported in (
            "run_ablation_study",
            "AblationConfig",
        )
    },
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __getattr__(name):
    """Resolve a lazily exported attribute of ``mirage.utils``.

    Python calls this module-level hook when normal attribute lookup on the
    package fails. Names registered in ``_LAZY_IMPORTS`` are fetched from
    their submodule, which is imported on demand so optional dependencies are
    not loaded when the package itself is imported.

    Args:
        name: Attribute name being looked up on the package.

    Returns:
        The attribute of the mapped submodule.

    Raises:
        AttributeError: If ``name`` is not registered in ``_LAZY_IMPORTS``.
    """
    # Unknown names fail fast, before any import machinery is touched.
    if name not in _LAZY_IMPORTS:
        raise AttributeError(f"module 'mirage.utils' has no attribute '{name}'")

    from importlib import import_module

    submodule, attribute = _LAZY_IMPORTS[name]
    return getattr(import_module(f"mirage.utils.{submodule}"), attribute)
|
mirage/utils/ablation.py
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Unified Ablation Study Runner
|
|
4
|
+
|
|
5
|
+
Runs the full MiRAGE pipeline for:
|
|
6
|
+
1. Baseline (all features enabled)
|
|
7
|
+
2. Each ablation configuration (one at a time)
|
|
8
|
+
|
|
9
|
+
Results are saved to separate directories under the base output path.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python run_ablation_study.py [--config config.yaml] [--skip-baseline] [--only ABLATION_NAME]
|
|
13
|
+
|
|
14
|
+
Examples:
|
|
15
|
+
python run_ablation_study.py # Run all (baseline + all ablations)
|
|
16
|
+
python run_ablation_study.py --skip-baseline # Run only ablations
|
|
17
|
+
python run_ablation_study.py --only no_verifier # Run only one ablation
|
|
18
|
+
python run_ablation_study.py --only baseline # Run only baseline
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
import sys
|
|
23
|
+
import yaml
|
|
24
|
+
import shutil
|
|
25
|
+
import argparse
|
|
26
|
+
import subprocess
|
|
27
|
+
from datetime import datetime
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Dict, Any, List, Optional
|
|
30
|
+
|
|
31
|
+
# Ablation configurations to test
|
|
32
|
+
# Each mode is a dict with a short ``name``, a human-readable ``description``
# and the dotted-path ``config_changes`` applied for that run. Apart from the
# baseline (no changes), every mode switches on exactly one
# ``ablation.<flag>.enabled`` setting, so the list is generated from a compact
# (name, description, flag) table.
ABLATION_MODES = [
    {
        "name": mode_name,
        "description": mode_description,
        "config_changes": (
            {f"ablation.{flag}.enabled": True} if flag is not None else {}
        ),
    }
    for mode_name, mode_description, flag in (
        ("baseline",
         "Full MiRAGE framework (all features enabled)",
         None),  # No changes for baseline
        ("no_multihop",
         "Disable Multihop Context Optimization Loop",
         "disable_multihop_context"),
        ("no_verifier",
         "Disable Verifier Agent",
         "disable_verifier"),
        ("no_persona",
         "Disable Domain/Persona Injection",
         "disable_persona"),
        ("fixed_chunking",
         "Use Fixed-Length Chunking (2048 tokens)",
         "fixed_chunking"),
        ("description_only",
         "Multimodal: Description Only (no raw images)",
         "description_only"),
        ("image_only",
         "Multimodal: Image Only (no generated descriptions)",
         "image_only"),
    )
]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_config(config_path: str) -> Dict[str, Any]:
    """Load a YAML configuration file.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The parsed document. An empty file, which ``yaml.safe_load`` parses
        to ``None``, yields an empty dict so that callers can always use
        mapping methods on the result.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would mis-decode non-ASCII configs on some systems.
    with open(config_path, 'r', encoding='utf-8') as f:
        loaded = yaml.safe_load(f)
    return {} if loaded is None else loaded
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def save_config(config: Dict[str, Any], config_path: str):
    """Write ``config`` to ``config_path`` as YAML.

    The file is overwritten. The document is emitted in block style with the
    keys kept in the order they appear in ``config``.

    Args:
        config: Configuration mapping to serialize.
        config_path: Destination file path.
    """
    dump_options = {"default_flow_style": False, "sort_keys": False}
    with open(config_path, 'w') as stream:
        yaml.dump(config, stream, **dump_options)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def set_nested_value(config: Dict, key_path: str, value: Any):
    """Set a nested dictionary value using dot notation.

    ``config`` is modified in place. Intermediate levels named in
    ``key_path`` are created when missing. An intermediate entry that exists
    but is not a dict (for example a YAML key with an empty value, which
    parses to ``None``) is replaced by a new dict, so the assignment does not
    fail on it.

    Args:
        config: Dictionary to modify.
        key_path: Dot-separated path such as ``"ablation.image_only.enabled"``.
        value: Value stored under the final key of the path.
    """
    *parent_keys, leaf_key = key_path.split('.')
    node = config
    for key in parent_keys:
        child = node.get(key)
        if not isinstance(child, dict):
            # Missing or scalar/None placeholder: descend into a fresh dict.
            child = {}
            node[key] = child
        node = child
    node[leaf_key] = value
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def reset_all_ablations(config: Dict) -> Dict:
    """Reset all ablation settings to disabled.

    Ensures ``config['ablation']`` holds an entry for every known ablation
    and sets each entry's ``enabled`` flag to ``False``. Other keys inside an
    existing entry are left untouched. A missing or non-dict ``ablation``
    section, or a non-dict entry (for example an empty YAML key parsed as
    ``None``), is replaced by a new dict so the reset does not fail on it.

    Args:
        config: Configuration dictionary; modified in place.

    Returns:
        The same ``config`` object.
    """
    ablation = config.get('ablation')
    if not isinstance(ablation, dict):
        ablation = config['ablation'] = {}

    # Reset each ablation to disabled
    for ablation_key in ('disable_multihop_context', 'disable_verifier',
                         'disable_persona', 'fixed_chunking',
                         'description_only', 'image_only'):
        settings = ablation.get(ablation_key)
        if not isinstance(settings, dict):
            settings = ablation[ablation_key] = {}
        settings['enabled'] = False

    return config
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def apply_ablation_config(config: Dict, ablation_mode: Dict) -> Dict:
    """Configure ``config`` for a single ablation mode.

    All ablation switches are first reset to disabled, then every entry of
    the mode's ``config_changes`` mapping (dotted key path -> value) is
    written into the configuration. A mode without ``config_changes`` only
    gets the reset.

    Args:
        config: Configuration dictionary; modified in place.
        ablation_mode: Mode definition, optionally holding ``config_changes``.

    Returns:
        The configuration returned by ``reset_all_ablations`` with the
        mode's changes applied.
    """
    updated = reset_all_ablations(config)

    overrides = ablation_mode.get('config_changes', {})
    for dotted_key in overrides:
        set_nested_value(updated, dotted_key, overrides[dotted_key])

    return updated
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def get_output_dir(base_output_dir: str, ablation_name: str) -> str:
    """Return the output directory used for one ablation run.

    Args:
        base_output_dir: Directory under which all runs are stored.
        ablation_name: Name of the ablation mode; used as the subdirectory.

    Returns:
        ``base_output_dir`` joined with ``ablation_name``.
    """
    run_dir = os.path.join(base_output_dir, ablation_name)
    return run_dir
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def run_pipeline(config_path: str, ablation_name: str) -> bool:
    """Run the main pipeline in a subprocess and return its success status.

    The pipeline is started as ``main.py`` with the current interpreter,
    using the directory that contains this module as working directory.

    Args:
        config_path: Path of the configuration prepared for this run. It is
            not passed to the subprocess.
            NOTE(review): presumably ``main.py`` locates the config file on
            its own; confirm, and confirm that ``main.py`` exists next to
            this module in the installed package.
        ablation_name: Label used in the progress messages.

    Returns:
        ``True`` if the subprocess exited with code 0; ``False`` if it exited
        with a non-zero code or could not be run.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"🚀 Running: {ablation_name}")
    print(f"{banner}\n")

    try:
        # Run main.py
        result = subprocess.run(
            [sys.executable, 'main.py'],
            cwd=os.path.dirname(os.path.abspath(__file__)),
            check=False,
        )
    except Exception as e:
        # Deliberately broad: a run that cannot be launched is reported as a
        # failed run so the caller can continue with the remaining modes.
        print(f"\n❌ {ablation_name}: ERROR - {e}")
        return False

    if result.returncode == 0:
        print(f"\n✅ {ablation_name}: COMPLETED SUCCESSFULLY")
        return True

    print(f"\n❌ {ablation_name}: FAILED (exit code {result.returncode})")
    return False
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def create_summary_report(results: List[Dict], output_dir: str):
    """Create a summary report of all ablation runs.

    Writes ``ablation_study_summary.txt`` into ``output_dir`` (created if
    needed) and prints the report location. The report lists the total,
    successful and failed run counts followed by one section per run.

    Args:
        results: One dict per run with the keys ``name``, ``description``,
            ``output_dir`` and ``success``; ``duration`` is optional and
            reported as ``N/A`` when absent.
        output_dir: Directory that receives the report file.
    """
    report_path = os.path.join(output_dir, "ablation_study_summary.txt")

    successful = sum(1 for r in results if r['success'])
    total = len(results)
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    lines = [
        heavy_rule + "\n",
        "ABLATION STUDY SUMMARY\n",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        heavy_rule + "\n\n",
        f"Total Runs: {total}\n",
        f"Successful: {successful}\n",
        f"Failed: {total - successful}\n\n",
        light_rule + "\n",
        "INDIVIDUAL RESULTS\n",
        light_rule + "\n\n",
    ]
    for result in results:
        status = "✅ SUCCESS" if result['success'] else "❌ FAILED"
        lines.append(f"{result['name']:20} {status}\n")
        lines.append(f" Description: {result['description']}\n")
        lines.append(f" Output Dir: {result['output_dir']}\n")
        lines.append(f" Duration: {result.get('duration', 'N/A')}\n\n")

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # The status markers are non-ASCII, so the encoding is fixed to UTF-8;
    # the locale default cannot encode them on every platform.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

    print(f"\n📄 Summary report saved: {report_path}")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _build_arg_parser():
    """Create the command-line parser for the ablation study runner."""
    parser = argparse.ArgumentParser(
        description="Run unified ablation study for MiRAGE pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python run_ablation_study.py # Run all
python run_ablation_study.py --skip-baseline # Skip baseline
python run_ablation_study.py --only no_verifier # Run specific ablation
python run_ablation_study.py --only baseline # Run baseline only
python run_ablation_study.py --list # List available modes
"""
    )
    parser.add_argument('--config', default='config.yaml',
                        help='Path to config file (default: config.yaml)')
    parser.add_argument('--skip-baseline', action='store_true',
                        help='Skip the baseline run')
    parser.add_argument('--only', type=str, default=None,
                        help='Run only a specific ablation mode')
    parser.add_argument('--list', action='store_true',
                        help='List available ablation modes and exit')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be run without executing')
    return parser


def main():
    """Command-line entry point of the ablation study.

    For every selected mode the config file is rewritten with that mode's
    ablation settings and its own output directory, then the pipeline is
    run. The original config file is backed up before the first run and
    restored afterwards, including when a run raises or the user interrupts
    the study. A summary report is written when at least one run finished.

    ``--list`` and ``--dry-run`` only print information; they do not touch
    the config file and leave no backup behind.

    Returns:
        ``0`` when every executed run succeeded (also for ``--list`` and
        ``--dry-run``). ``1`` when the mode name is unknown, the config file
        is missing, any run failed, or the study was interrupted.
    """
    import copy  # only needed here; kept local to the entry point

    args = _build_arg_parser().parse_args()

    # List available modes
    if args.list:
        print("\nAvailable Ablation Modes:")
        print("-" * 50)
        for mode in ABLATION_MODES:
            print(f" {mode['name']:20} - {mode['description']}")
        print()
        return 0

    # Validate --only argument
    if args.only:
        valid_names = [m['name'] for m in ABLATION_MODES]
        if args.only not in valid_names:
            print(f"❌ Error: Unknown ablation mode '{args.only}'")
            print(f" Valid modes: {', '.join(valid_names)}")
            return 1

    config_path = args.config
    if not os.path.exists(config_path):
        print(f"❌ Error: Config file not found: {config_path}")
        return 1

    print("=" * 70)
    print("🔬 UNIFIED ABLATION STUDY")
    print("=" * 70)
    print(f"Config: {config_path}")
    print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # An empty config file parses to None; treat it as an empty mapping.
    original_config = load_config(config_path) or {}

    # Get base output directory (a null ``paths`` section counts as absent)
    base_output_dir = (original_config.get('paths') or {}).get('output_dir', 'output')
    print(f"Base Output Dir: {base_output_dir}")

    # Determine which modes to run
    if args.only:
        modes_to_run = [m for m in ABLATION_MODES if m['name'] == args.only]
    elif args.skip_baseline:
        modes_to_run = [m for m in ABLATION_MODES if m['name'] != 'baseline']
    else:
        modes_to_run = list(ABLATION_MODES)

    print(f"\nModes to run: {[m['name'] for m in modes_to_run]}")
    print("-" * 70)

    if args.dry_run:
        print("\n🔍 DRY RUN - Would execute the following:")
        for mode in modes_to_run:
            output_dir = get_output_dir(base_output_dir, mode['name'])
            print(f"\n [{mode['name']}]")
            print(f" Description: {mode['description']}")
            print(f" Output: {output_dir}")
            print(f" Config changes: {mode['config_changes']}")
        print("\n✅ Dry run complete. Use without --dry-run to execute.")
        return 0

    # Back up the config only once something will really be executed, so
    # that informational invocations leave no stray backup file behind.
    backup_path = config_path + '.backup'
    shutil.copy2(config_path, backup_path)
    print(f"Config backed up: {backup_path}")

    # Run each ablation mode
    results = []
    interrupted = False

    try:
        for mode in modes_to_run:
            start_time = datetime.now()

            # Create output directory for this ablation
            output_dir = get_output_dir(base_output_dir, mode['name'])
            os.makedirs(output_dir, exist_ok=True)

            # Start every run from a pristine copy of the original settings;
            # the file on disk holds the previous run's modifications.
            config = apply_ablation_config(copy.deepcopy(original_config), mode)

            # Update output directory
            if not isinstance(config.get('paths'), dict):
                config['paths'] = {}
            config['paths']['output_dir'] = output_dir

            # Save modified config
            save_config(config, config_path)

            print(f"\n📁 Output: {output_dir}")
            print(f"📋 Mode: {mode['description']}")

            # Run the pipeline
            success = run_pipeline(config_path, mode['name'])

            end_time = datetime.now()
            duration = str(end_time - start_time).split('.')[0]  # Remove microseconds

            results.append({
                'name': mode['name'],
                'description': mode['description'],
                'output_dir': output_dir,
                'success': success,
                'duration': duration
            })

    except KeyboardInterrupt:
        interrupted = True
        print("\n\n⚠️ Ablation study interrupted by user")

    finally:
        # Restore original config
        print(f"\n{'='*70}")
        print("Restoring original configuration...")
        shutil.copy2(backup_path, config_path)
        os.remove(backup_path)
        print("✅ Original config restored")

    # Generate summary report
    if results:
        create_summary_report(results, base_output_dir)

    # Print final summary
    print(f"\n{'='*70}")
    print("ABLATION STUDY COMPLETE")
    print(f"{'='*70}")

    successful = sum(1 for r in results if r['success'])
    print(f"\nResults: {successful}/{len(results)} successful")

    for r in results:
        status = "✅" if r['success'] else "❌"
        print(f" {status} {r['name']:20} ({r['duration']})")

    print(f"\nResults saved in: {base_output_dir}/")
    if results:
        # The report only exists when at least one run finished.
        print(f"Summary report: {base_output_dir}/ablation_study_summary.txt")

    # An interrupted study is never a success, even if no run had finished
    # yet (``all`` of an empty result list would otherwise report 0).
    if interrupted:
        return 1
    return 0 if all(r['success'] for r in results) else 1
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
360
|
+
|