mirage-benchmark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mirage-benchmark might be problematic. Click here for more details.

mirage/__init__.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ MiRAGE: Multimodal Multihop RAG Evaluation Dataset Generator
3
+
4
+ A multi-agent framework for generating high-quality, multimodal, multihop
5
+ question-answer datasets for evaluating Retrieval-Augmented Generation (RAG) systems.
6
+ """
7
+
8
+ __version__ = "1.0.4"
9
+ __author__ = "MiRAGE Authors"
10
+
11
+
12
def __getattr__(name):
    """Resolve the package's public names lazily, on first attribute access.

    Nothing below is imported when ``mirage`` itself is imported, so
    ``from mirage import __version__`` works without a config file. The
    submodule that owns ``name`` is imported only when ``name`` is requested.

    Args:
        name: Attribute being looked up on the ``mirage`` package.

    Returns:
        The attribute of the same name taken from its owning submodule.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    if name in ("call_llm_simple", "call_vlm_interweaved", "call_vlm_with_multiple_images",
                "batch_call_vlm_interweaved", "setup_logging", "BACKEND",
                "LLM_MODEL_NAME", "VLM_MODEL_NAME"):
        # Core LLM/VLM helpers and backend constants.
        from mirage.core import llm as provider
    elif name in ("load_config", "get_config_value"):
        # Configuration helpers.
        from mirage.core import config as provider
    elif name in ("get_best_embedding_model", "NomicVLEmbed"):
        # Embedding models.
        from mirage.embeddings import models as provider
    elif name == "generate_qa_for_chunk":
        from mirage.pipeline import qa_generator as provider
    elif name == "build_complete_context":
        from mirage.pipeline import context as provider
    elif name == "fetch_domain_and_role":
        from mirage.pipeline import domain as provider
    elif name == "deduplicate_qa_pairs":
        from mirage.pipeline import deduplication as provider
    elif name == "run_preflight_checks":
        from mirage.utils import preflight as provider
    else:
        raise AttributeError(f"module 'mirage' has no attribute '{name}'")

    # Every exported name is spelled identically inside its owning module.
    return getattr(provider, name)
55
+
56
+
57
# Public API of the package. Everything except the two version attributes is
# resolved lazily by the module-level __getattr__.
__all__ = [
    # Version info
    "__version__", "__author__",
    # Core LLM functions (lazy loaded)
    "call_llm_simple", "call_vlm_interweaved", "call_vlm_with_multiple_images",
    "batch_call_vlm_interweaved", "setup_logging",
    "BACKEND", "LLM_MODEL_NAME", "VLM_MODEL_NAME",
    # Config
    "load_config", "get_config_value",
    # Embeddings
    "get_best_embedding_model", "NomicVLEmbed",
    # Pipeline
    "generate_qa_for_chunk", "build_complete_context",
    "fetch_domain_and_role", "deduplicate_qa_pairs",
    # Utils
    "run_preflight_checks",
]
mirage/cli.py ADDED
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ MiRAGE Command Line Interface
4
+
5
+ Usage:
6
+ mirage # Run full pipeline
7
+ mirage --preflight # Run preflight checks only
8
+ mirage --config my.yaml # Use custom config file
9
+ mirage-preflight # Run preflight checks (shortcut)
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ import argparse
15
+ import logging
16
+ import multiprocessing as mp
17
+
18
+
19
def parse_args(argv=None):
    """Parse command line arguments for the MiRAGE CLI.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which is what the CLI entry
            point relies on.

    Returns:
        argparse.Namespace with ``config``, ``preflight``, ``skip_preflight``,
        ``input``, ``output`` and ``verbose`` attributes.
    """
    # Report the real package version instead of a hard-coded string that
    # drifts from mirage.__version__. Importing it is cheap because the
    # package resolves its heavier attributes lazily.
    try:
        from mirage import __version__ as package_version
    except ImportError:
        # Only reachable when this module runs outside the mirage package.
        package_version = "unknown"

    parser = argparse.ArgumentParser(
        description="MiRAGE: Multimodal Multihop RAG Evaluation Dataset Generator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--config", "-c",
        type=str,
        default="config.yaml",
        help="Path to configuration file (default: config.yaml)",
    )
    parser.add_argument(
        "--preflight",
        action="store_true",
        help="Run preflight checks only",
    )
    parser.add_argument(
        "--skip-preflight",
        action="store_true",
        help="Skip preflight checks",
    )
    parser.add_argument(
        "--input", "-i",
        type=str,
        help="Input directory with documents (overrides config)",
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        help="Output directory for results (overrides config)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {package_version}",
    )
    return parser.parse_args(argv)
62
+
63
+
64
def main():
    """Main entry point for MiRAGE CLI.

    Parses the command line, configures logging, logs the active backend and
    model names, optionally runs the preflight checks, loads the
    configuration, resolves and validates the input/output directories and
    then imports the pipeline modules.

    Exits the process with status 1 when the preflight checks fail, the
    configuration file cannot be found, or the input directory is missing.
    With ``--preflight`` it exits with 0/1 according to the check result.
    """
    args = parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

    # Import after parsing to speed up --help
    # (setup_logging is imported here but not used in this function.)
    from mirage.core.llm import setup_logging, BACKEND, LLM_MODEL_NAME, VLM_MODEL_NAME
    from mirage.utils.preflight import run_preflight_checks
    from mirage.core.config import load_config

    logger.info("=" * 60)
    logger.info("MiRAGE: Multimodal Multihop RAG Evaluation Dataset Generator")
    logger.info("=" * 60)
    logger.info(f"Backend: {BACKEND}")
    logger.info(f"LLM Model: {LLM_MODEL_NAME}")
    logger.info(f"VLM Model: {VLM_MODEL_NAME}")

    # Run preflight checks only
    if args.preflight:
        logger.info("\nRunning preflight checks...")
        success = run_preflight_checks()
        sys.exit(0 if success else 1)

    # Run preflight checks before pipeline
    if not args.skip_preflight:
        logger.info("\nRunning preflight checks...")
        if not run_preflight_checks():
            logger.error("Preflight checks failed. Fix issues above or use --skip-preflight to bypass.")
            sys.exit(1)
        logger.info("Preflight checks passed!\n")

    # Load configuration
    try:
        # An empty YAML document parses to None; treat it as "no settings"
        # so the lookups below fall back to their defaults instead of
        # raising AttributeError.
        config = load_config(args.config) or {}
    except FileNotFoundError:
        logger.error(f"Configuration file not found: {args.config}")
        logger.info("Create config.yaml from config.yaml.example:")
        logger.info(" cp config.yaml.example config.yaml")
        sys.exit(1)

    # `paths:` with no value also parses to None, hence `or {}`.
    paths = config.get('paths') or {}
    input_dir = args.input or paths.get('input_pdf_dir', 'data/documents')
    output_dir = args.output or paths.get('output_dir', 'output/results')

    logger.info(f"Input directory: {input_dir}")
    logger.info(f"Output directory: {output_dir}")

    # Validate input directory
    if not os.path.exists(input_dir):
        logger.error(f"Input directory does not exist: {input_dir}")
        logger.info("Add your documents to the data/documents/ folder")
        sys.exit(1)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Run pipeline
    logger.info("\nStarting MiRAGE pipeline...")
    logger.info("See README.md for detailed pipeline documentation.\n")

    # Import pipeline modules
    from mirage.pipeline.pdf_processor import process_directory as process_pdfs
    from mirage.pipeline.chunker import process_markdown_directory
    from mirage.pipeline.domain import fetch_domain_and_role
    from mirage.pipeline.qa_generator import run_qa_generation
    from mirage.pipeline.deduplication import deduplicate_qa_dataset

    # Execute pipeline steps
    # (The actual implementation would go here)
    # NOTE(review): no pipeline step is invoked yet, so the completion
    # messages below are logged without any results having been produced.

    logger.info("\n" + "=" * 60)
    logger.info("Pipeline complete!")
    logger.info("=" * 60)
    logger.info(f"Results saved to: {output_dir}")


if __name__ == "__main__":
    # Use spawn method for multiprocessing (required for CUDA)
    # NOTE(review): this only runs for `python cli.py` / `python -m`; an
    # installed console script that calls main() directly would skip it.
    # Confirm against the packaging entry points.
    mp.set_start_method('spawn', force=True)
    main()
mirage/core/__init__.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ Core module for MiRAGE - LLM/VLM interfaces, prompts, and configuration.
3
+
4
+ Imports are lazy to allow the package to be imported without a config file.
5
+ """
6
+
7
+
8
def __getattr__(name):
    """Resolve ``mirage.core`` attributes lazily to avoid import-time config loading.

    Args:
        name: Attribute being looked up on the ``mirage.core`` package.

    Returns:
        The attribute of the same name from the ``llm``, ``prompts`` or
        ``config`` submodule, whichever exports it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    if name in ("call_llm_simple", "call_vlm_interweaved", "call_vlm_with_multiple_images",
                "batch_call_vlm_interweaved", "setup_logging", "test_llm_connection",
                "test_vlm_connection", "BACKEND", "LLM_MODEL_NAME", "VLM_MODEL_NAME",
                "GEMINI_RPM", "GEMINI_BURST"):
        # LLM/VLM helpers and backend constants.
        from mirage.core import llm as provider
    elif name in ("PROMPTS", "PROMPTS_CHUNK"):
        # Prompt collections.
        from mirage.core import prompts as provider
    elif name in ("load_config", "get_config_value", "ConfigLoader"):
        # Configuration helpers.
        from mirage.core import config as provider
    else:
        raise AttributeError(f"module 'mirage.core' has no attribute '{name}'")

    return getattr(provider, name)
29
+
30
+
31
# Public API of mirage.core. Every name is resolved lazily by the
# module-level __getattr__.
__all__ = [
    # LLM functions
    "call_llm_simple", "call_vlm_interweaved", "call_vlm_with_multiple_images",
    "batch_call_vlm_interweaved", "setup_logging",
    "test_llm_connection", "test_vlm_connection",
    "BACKEND", "LLM_MODEL_NAME", "VLM_MODEL_NAME",
    "GEMINI_RPM", "GEMINI_BURST",
    # Prompts
    "PROMPTS", "PROMPTS_CHUNK",
    # Config
    "load_config", "get_config_value", "ConfigLoader",
]
mirage/core/config.py ADDED
@@ -0,0 +1,248 @@
1
+ """
2
+ Configuration loader for the QA Dataset Generation Pipeline.
3
+ Loads settings from config.yaml and provides easy access to all modules.
4
+ """
5
+
6
+ import os
7
+ import yaml
8
+ from pathlib import Path
9
+ from typing import Dict, Any, Optional
10
+
11
+ # Find config.yaml relative to this file
12
+ _CONFIG_PATH = Path(__file__).parent / "config.yaml"
13
+ _config_cache: Optional[Dict[str, Any]] = None
14
+
15
+
16
def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """Load configuration from a YAML file, caching the default lookup.

    Resolution order:
      1. ``config_path`` when given, otherwise ``config.yaml`` next to this
         module.
      2. If that file does not exist, ``config.yaml`` in the current working
         directory.
      3. If neither exists, the built-in defaults from
         ``_get_default_config()``. This allows the package to be imported
         without a config file.

    Only calls made without ``config_path`` read from and populate the
    module-level cache. Built-in defaults are never cached.

    Args:
        config_path: Optional path to a YAML configuration file.

    Returns:
        The parsed configuration. An empty YAML document yields ``{}``
        rather than ``None``, so callers can always use mapping methods.
    """
    global _config_cache

    if config_path is None and _config_cache is not None:
        return _config_cache

    path = Path(config_path) if config_path else _CONFIG_PATH

    if not path.exists():
        # Try workspace root config.yaml before giving up on files.
        workspace_config = Path.cwd() / "config.yaml"
        if not workspace_config.exists():
            # Return default configuration - allows import without config file
            return _get_default_config()
        path = workspace_config

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        config = yaml.safe_load(f) or {}

    if config_path is None:
        _config_cache = config

    return config
46
+
47
+
48
def _get_default_config() -> Dict[str, Any]:
    """Build the fallback configuration used when no config file is found.

    A fresh dictionary is assembled on every call. The active backend is
    read from the ``LLM_BACKEND`` environment variable and defaults to
    ``'GEMINI'``.
    """
    backend_section = {
        'active': os.environ.get('LLM_BACKEND', 'GEMINI'),
        'gemini': {
            'llm_model': 'gemini-2.0-flash',
            'vlm_model': 'gemini-2.0-flash',
        },
        'openai': {
            'llm_model': 'gpt-4o-mini',
            'vlm_model': 'gpt-4o',
        },
        'ollama': {
            'base_url': 'http://localhost:11434',
            'llm_model': 'llama3',
            'vlm_model': 'llava',
        },
    }
    rate_limiting_section = {'requests_per_minute': 60, 'burst_size': 15}
    paths_section = {'input_pdf_dir': 'data/documents', 'output_dir': 'output'}
    parallel_section = {'num_workers': 3, 'qa_max_workers': 6, 'dedup_max_workers': 4}
    qa_generation_section = {'num_qa_pairs': 100, 'type': 'multihop'}

    defaults: Dict[str, Any] = {}
    defaults['backend'] = backend_section
    defaults['rate_limiting'] = rate_limiting_section
    defaults['paths'] = paths_section
    defaults['parallel'] = parallel_section
    defaults['qa_generation'] = qa_generation_section
    return defaults
89
+
90
+
91
def get_backend_config() -> Dict[str, Any]:
    """Return the settings of the active backend plus its upper-cased name.

    The active backend is read from ``config['backend']['active']``. Its
    settings are looked up under the lower-cased name and merged on top of
    a ``'name'`` entry, so a ``'name'`` key in the settings takes precedence.
    """
    backend_section = load_config()['backend']
    active = backend_section['active']

    settings = backend_section.get(active.lower(), {})

    merged: Dict[str, Any] = {'name': active.upper()}
    merged.update(settings)
    return merged
101
+
102
+
103
def get_api_key(backend_name: Optional[str] = None) -> str:
    """Load the API key for the specified or active backend.

    The key is read from the file named by the backend's ``api_key_path``
    setting. A leading ``~`` in that path is expanded to the user's home
    directory.

    Args:
        backend_name: Backend whose key to load (case-insensitive). When
            ``None``, the backend in ``config['backend']['active']`` is used.

    Returns:
        The stripped contents of the key file, or ``""`` when no
        ``api_key_path`` is configured or the file does not exist.
    """
    config = load_config()

    if backend_name is None:
        backend_name = config['backend']['active']
    backend_name = backend_name.lower()

    backend_config = config['backend'].get(backend_name, {})
    api_key_path = backend_config.get('api_key_path')

    if not api_key_path:
        return ""

    # Paths such as "~/.keys/gemini.txt" are otherwise reported as missing.
    key_file = os.path.expanduser(api_key_path)

    try:
        with open(key_file, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        # Best effort: a missing key file is reported, not raised.
        print(f"⚠️ API key file not found: {api_key_path}")
        return ""
124
+
125
+
126
def get_rate_limit_config() -> Dict[str, int]:
    """Return the ``rate_limiting`` section, or 60 RPM / burst 15 if it is absent."""
    fallback = {'requests_per_minute': 60, 'burst_size': 15}
    return load_config().get('rate_limiting', fallback)
133
+
134
+
135
def get_parallel_config() -> Dict[str, Any]:
    """Return the ``parallel`` section, or built-in worker/GPU defaults if it is absent."""
    fallback = {
        'num_workers': 3,
        'available_gpus': [0, 1, 2],
        'qa_max_workers': 6,
        'dedup_max_workers': 4,
    }
    return load_config().get('parallel', fallback)
144
+
145
+
146
def get_retrieval_config() -> Dict[str, Any]:
    """Return the ``retrieval`` section of the configuration (``{}`` if absent)."""
    return load_config().get('retrieval', {})
150
+
151
+
152
def get_embedding_config() -> Dict[str, Any]:
    """Return the ``embedding`` section of the configuration (``{}`` if absent)."""
    return load_config().get('embedding', {})
156
+
157
+
158
def get_paths_config() -> Dict[str, Any]:
    """Return the ``paths`` (input/output locations) section (``{}`` if absent)."""
    return load_config().get('paths', {})
162
+
163
+
164
def get_processing_config() -> Dict[str, Any]:
    """Return the ``processing`` (limits) section of the configuration (``{}`` if absent)."""
    return load_config().get('processing', {})
168
+
169
+
170
def get_evaluation_config() -> Dict[str, Any]:
    """Return the ``evaluation`` section of the configuration (``{}`` if absent)."""
    return load_config().get('evaluation', {})
174
+
175
+
176
def get_domain_expert_config() -> Dict[str, Any]:
    """Return the ``domain_expert`` (domain / expert persona) section.

    Returns:
        The configured section. When it is absent, a fallback with
        'expert_persona' and 'domain' set to None, 'use_multimodal_embeddings'
        set to True and 'output_dir' set to 'trials/domain_analysis'.
    """
    fallback = {
        'expert_persona': None,
        'domain': None,
        'use_multimodal_embeddings': True,
        'output_dir': 'trials/domain_analysis',
    }
    return load_config().get('domain_expert', fallback)
190
+
191
+
192
def get_qa_correction_config() -> Dict[str, Any]:
    """Return the ``qa_correction`` section.

    Returns:
        The configured section. When it is absent, a fallback with
        'enabled' set to True and 'max_attempts' set to 1.
    """
    fallback = {'enabled': True, 'max_attempts': 1}
    return load_config().get('qa_correction', fallback)
203
+
204
+
205
def get_qa_generation_config() -> Dict[str, Any]:
    """Return the ``qa_generation`` (QA generation control) section.

    Returns:
        Dict with:
        - 'num_qa_pairs': Target number of QA pairs (None = no limit)
        - 'type': Type of QA to generate ('multihop', 'multimodal', 'text', 'mix')

        When the section is absent, the fallback is 1000 pairs of type
        'multihop'.
    """
    fallback = {'num_qa_pairs': 1000, 'type': 'multihop'}
    return load_config().get('qa_generation', fallback)
218
+
219
+
220
+ # Convenience function to print current config
221
def print_config_summary():
    """Print a summary of the current configuration to stdout.

    Shows the active backend and its model names, the rate limits, the
    parallel worker counts and the QA generation targets. Each value falls
    back to a default when its key is missing from its section.
    """
    backend = get_backend_config()
    rate_limit = get_rate_limit_config()
    parallel = get_parallel_config()
    qa_gen = get_qa_generation_config()

    print("=" * 60)
    print("📋 CONFIGURATION SUMMARY")
    print("=" * 60)
    print(f"Backend: {backend['name']}")
    print(f" LLM Model: {backend.get('llm_model', 'N/A')}")
    print(f" VLM Model: {backend.get('vlm_model', 'N/A')}")
    print("Rate Limiting:")
    print(f" RPM: {rate_limit.get('requests_per_minute', 60)}")
    print(f" Burst: {rate_limit.get('burst_size', 15)}")
    print("Parallel Processing:")
    print(f" QA Workers: {parallel.get('qa_max_workers', 6)}")
    print(f" Dedup Workers: {parallel.get('dedup_max_workers', 4)}")
    print("QA Generation:")
    print(f" Target Pairs: {qa_gen.get('num_qa_pairs', 1000)}")
    print(f" Type: {qa_gen.get('type', 'multihop')}")
    print("=" * 60)


if __name__ == "__main__":
    print_config_summary()