PyPI - mirage-benchmark - Versions diffs - 1.0.4__py3-none-any.whl - Mend

mirage-benchmark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mirage-benchmark might be problematic. Click here for more details.

Files changed (30) hide show

mirage/__init__.py +83 -0
mirage/cli.py +150 -0
mirage/core/__init__.py +52 -0
mirage/core/config.py +248 -0
mirage/core/llm.py +1745 -0
mirage/core/prompts.py +884 -0
mirage/embeddings/__init__.py +31 -0
mirage/embeddings/models.py +512 -0
mirage/embeddings/rerankers_multimodal.py +766 -0
mirage/embeddings/rerankers_text.py +149 -0
mirage/evaluation/__init__.py +26 -0
mirage/evaluation/metrics.py +2223 -0
mirage/evaluation/metrics_optimized.py +2172 -0
mirage/pipeline/__init__.py +45 -0
mirage/pipeline/chunker.py +545 -0
mirage/pipeline/context.py +1003 -0
mirage/pipeline/deduplication.py +491 -0
mirage/pipeline/domain.py +514 -0
mirage/pipeline/pdf_processor.py +598 -0
mirage/pipeline/qa_generator.py +798 -0
mirage/utils/__init__.py +31 -0
mirage/utils/ablation.py +360 -0
mirage/utils/preflight.py +663 -0
mirage/utils/stats.py +626 -0
mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0

mirage/__init__.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""
+MiRAGE: Multimodal Multihop RAG Evaluation Dataset Generator
+A multi-agent framework for generating high-quality, multimodal, multihop
+question-answer datasets for evaluating Retrieval-Augmented Generation (RAG) systems.
+"""
+# Package metadata. Defined directly in the package module so that
+# `from mirage import __version__` does not need any submodule import.
+__version__ = "1.0.4"
+__author__ = "MiRAGE Authors"
+def __getattr__(name):
+    """Lazy import of submodules to avoid import-time config loading.
+    This allows `from mirage import __version__` to work without a config file,
+    while still providing convenient access to submodules when needed.
+    """
+    # Core LLM functions - lazy import
+    if name in ("call_llm_simple", "call_vlm_interweaved", "call_vlm_with_multiple_images",
+                "batch_call_vlm_interweaved", "setup_logging", "BACKEND",
+                "LLM_MODEL_NAME", "VLM_MODEL_NAME"):
+        from mirage.core import llm
+        return getattr(llm, name)
+    # Config functions
+    if name in ("load_config", "get_config_value"):
+        from mirage.core import config
+        return getattr(config, name)
+    # Embeddings
+    if name in ("get_best_embedding_model", "NomicVLEmbed"):
+        from mirage.embeddings import models
+        return getattr(models, name)
+    # Pipeline functions
+    if name == "generate_qa_for_chunk":
+        from mirage.pipeline import qa_generator
+        return qa_generator.generate_qa_for_chunk
+    if name == "build_complete_context":
+        from mirage.pipeline import context
+        return context.build_complete_context
+    if name == "fetch_domain_and_role":
+        from mirage.pipeline import domain
+        return domain.fetch_domain_and_role
+    if name == "deduplicate_qa_pairs":
+        from mirage.pipeline import deduplication
+        return deduplication.deduplicate_qa_pairs
+    # Utils
+    if name == "run_preflight_checks":
+        from mirage.utils import preflight
+        return preflight.run_preflight_checks
+    raise AttributeError(f"module 'mirage' has no attribute '{name}'")
+# Public API of the package. Apart from the version info, every name listed
+# here is resolved on demand by the module-level __getattr__, so the two must
+# be kept in sync when a name is added or removed.
+__all__ = [
+    # Version info
+    "__version__",
+    "__author__",
+    # Core LLM functions (lazy loaded)
+    "call_llm_simple",
+    "call_vlm_interweaved",
+    "call_vlm_with_multiple_images",
+    "batch_call_vlm_interweaved",
+    "setup_logging",
+    "BACKEND",
+    "LLM_MODEL_NAME",
+    "VLM_MODEL_NAME",
+    # Config
+    "load_config",
+    "get_config_value",
+    # Embeddings
+    "get_best_embedding_model",
+    "NomicVLEmbed",
+    # Pipeline
+    "generate_qa_for_chunk",
+    "build_complete_context",
+    "fetch_domain_and_role",
+    "deduplicate_qa_pairs",
+    # Utils
+    "run_preflight_checks",
+]

mirage/cli.py ADDED Viewed

@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+MiRAGE Command Line Interface
+Usage:
+    mirage                      # Run full pipeline
+    mirage --preflight          # Run preflight checks only
+    mirage --config my.yaml     # Use custom config file
+    mirage-preflight            # Run preflight checks (shortcut)
+"""
+import os
+import sys
+import argparse
+import logging
+import multiprocessing as mp
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="MiRAGE: Multimodal Multihop RAG Evaluation Dataset Generator",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--config", "-c",
+        type=str,
+        default="config.yaml",
+        help="Path to configuration file (default: config.yaml)"
+    )
+    parser.add_argument(
+        "--preflight",
+        action="store_true",
+        help="Run preflight checks only"
+    )
+    parser.add_argument(
+        "--skip-preflight",
+        action="store_true",
+        help="Skip preflight checks"
+    )
+    parser.add_argument(
+        "--input", "-i",
+        type=str,
+        help="Input directory with documents (overrides config)"
+    )
+    parser.add_argument(
+        "--output", "-o",
+        type=str,
+        help="Output directory for results (overrides config)"
+    )
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="Enable verbose output"
+    )
+    parser.add_argument(
+        "--version",
+        action="version",
+        version="%(prog)s 1.0.0"
+    )
+    return parser.parse_args()
+def main():
+    """Main entry point for MiRAGE CLI."""
+    args = parse_args()
+    # Setup logging
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig(
+        level=log_level,
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    logger = logging.getLogger(__name__)
+    # Import after parsing to speed up --help
+    from mirage.core.llm import setup_logging, BACKEND, LLM_MODEL_NAME, VLM_MODEL_NAME
+    from mirage.utils.preflight import run_preflight_checks
+    from mirage.core.config import load_config
+    logger.info("=" * 60)
+    logger.info("MiRAGE: Multimodal Multihop RAG Evaluation Dataset Generator")
+    logger.info("=" * 60)
+    logger.info(f"Backend: {BACKEND}")
+    logger.info(f"LLM Model: {LLM_MODEL_NAME}")
+    logger.info(f"VLM Model: {VLM_MODEL_NAME}")
+    # Run preflight checks only
+    if args.preflight:
+        logger.info("\nRunning preflight checks...")
+        success = run_preflight_checks()
+        sys.exit(0 if success else 1)
+    # Run preflight checks before pipeline
+    if not args.skip_preflight:
+        logger.info("\nRunning preflight checks...")
+        if not run_preflight_checks():
+            logger.error("Preflight checks failed. Fix issues above or use --skip-preflight to bypass.")
+            sys.exit(1)
+        logger.info("Preflight checks passed!\n")
+    # Load configuration
+    try:
+        config = load_config(args.config)
+    except FileNotFoundError:
+        logger.error(f"Configuration file not found: {args.config}")
+        logger.info("Create config.yaml from config.yaml.example:")
+        logger.info("  cp config.yaml.example config.yaml")
+        sys.exit(1)
+    paths = config.get('paths', {})
+    input_dir = args.input or paths.get('input_pdf_dir', 'data/documents')
+    output_dir = args.output or paths.get('output_dir', 'output/results')
+    logger.info(f"Input directory: {input_dir}")
+    logger.info(f"Output directory: {output_dir}")
+    # Validate input directory
+    if not os.path.exists(input_dir):
+        logger.error(f"Input directory does not exist: {input_dir}")
+        logger.info("Add your documents to the data/documents/ folder")
+        sys.exit(1)
+    # Create output directory
+    os.makedirs(output_dir, exist_ok=True)
+    # Run pipeline
+    logger.info("\nStarting MiRAGE pipeline...")
+    logger.info("See README.md for detailed pipeline documentation.\n")
+    # Import pipeline modules
+    from mirage.pipeline.pdf_processor import process_directory as process_pdfs
+    from mirage.pipeline.chunker import process_markdown_directory
+    from mirage.pipeline.domain import fetch_domain_and_role
+    from mirage.pipeline.qa_generator import run_qa_generation
+    from mirage.pipeline.deduplication import deduplicate_qa_dataset
+    # Execute pipeline steps
+    # (The actual implementation would go here)
+    logger.info("\n" + "=" * 60)
+    logger.info("Pipeline complete!")
+    logger.info("=" * 60)
+    logger.info(f"Results saved to: {output_dir}")
+if __name__ == "__main__":
+    # Use spawn method for multiprocessing (required for CUDA)
+    # NOTE(review): this runs only when the file is executed as a script; code
+    # that calls main() directly does not get the spawn start method set here -
+    # confirm that is intended for the installed console commands.
+    mp.set_start_method('spawn', force=True)
+    main()

mirage/core/__init__.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""
+Core module for MiRAGE - LLM/VLM interfaces, prompts, and configuration.
+Imports are lazy to allow the package to be imported without a config file.
+"""
+def __getattr__(name):
+    """Lazy import to avoid import-time config loading."""
+    # LLM functions
+    if name in ("call_llm_simple", "call_vlm_interweaved", "call_vlm_with_multiple_images",
+                "batch_call_vlm_interweaved", "setup_logging", "test_llm_connection",
+                "test_vlm_connection", "BACKEND", "LLM_MODEL_NAME", "VLM_MODEL_NAME",
+                "GEMINI_RPM", "GEMINI_BURST"):
+        from mirage.core import llm
+        return getattr(llm, name)
+    # Prompts
+    if name in ("PROMPTS", "PROMPTS_CHUNK"):
+        from mirage.core import prompts
+        return getattr(prompts, name)
+    # Config
+    if name in ("load_config", "get_config_value", "ConfigLoader"):
+        from mirage.core import config
+        return getattr(config, name)
+    raise AttributeError(f"module 'mirage.core' has no attribute '{name}'")
+# Public names of mirage.core. All of them are resolved on demand by the
+# module-level __getattr__, so this list must match the names it recognizes.
+__all__ = [
+    # LLM functions
+    "call_llm_simple",
+    "call_vlm_interweaved",
+    "call_vlm_with_multiple_images",
+    "batch_call_vlm_interweaved",
+    "setup_logging",
+    "test_llm_connection",
+    "test_vlm_connection",
+    "BACKEND",
+    "LLM_MODEL_NAME",
+    "VLM_MODEL_NAME",
+    "GEMINI_RPM",
+    "GEMINI_BURST",
+    # Prompts
+    "PROMPTS",
+    "PROMPTS_CHUNK",
+    # Config
+    "load_config",
+    "get_config_value",
+    "ConfigLoader",
+]

mirage/core/config.py ADDED Viewed

@@ -0,0 +1,248 @@
+"""
+Configuration loader for the QA Dataset Generation Pipeline.
+Loads settings from config.yaml and provides easy access to all modules.
+"""
+import os
+import yaml
+from pathlib import Path
+from typing import Dict, Any, Optional
+# Find config.yaml relative to this file
+_CONFIG_PATH = Path(__file__).parent / "config.yaml"
+# Parsed configuration remembered by load_config() for calls made without an
+# explicit path; None until such a call has read a file.
+_config_cache: Optional[Dict[str, Any]] = None
+def load_config(config_path: str = None) -> Dict[str, Any]:
+    """Load configuration from YAML file with caching.
+    Returns default configuration if config file not found.
+    This allows the package to be imported without a config file.
+    """
+    global _config_cache
+    if _config_cache is not None and config_path is None:
+        return _config_cache
+    path = Path(config_path) if config_path else _CONFIG_PATH
+    # If config file doesn't exist, return defaults
+    if not path.exists():
+        # Try workspace root config.yaml
+        workspace_config = Path.cwd() / "config.yaml"
+        if workspace_config.exists():
+            path = workspace_config
+        else:
+            # Return default configuration - allows import without config file
+            return _get_default_config()
+    with open(path, 'r') as f:
+        config = yaml.safe_load(f)
+    if config_path is None:
+        _config_cache = config
+    return config
+def _get_default_config() -> Dict[str, Any]:
+    """Return default configuration when no config file is available.
+    This enables the package to be imported and basic operations to work
+    without requiring a config.yaml file upfront.
+    """
+    return {
+        'backend': {
+            'active': os.environ.get('LLM_BACKEND', 'GEMINI'),
+            'gemini': {
+                'llm_model': 'gemini-2.0-flash',
+                'vlm_model': 'gemini-2.0-flash',
+            },
+            'openai': {
+                'llm_model': 'gpt-4o-mini',
+                'vlm_model': 'gpt-4o',
+            },
+            'ollama': {
+                'base_url': 'http://localhost:11434',
+                'llm_model': 'llama3',
+                'vlm_model': 'llava',
+            }
+        },
+        'rate_limiting': {
+            'requests_per_minute': 60,
+            'burst_size': 15
+        },
+        'paths': {
+            'input_pdf_dir': 'data/documents',
+            'output_dir': 'output'
+        },
+        'parallel': {
+            'num_workers': 3,
+            'qa_max_workers': 6,
+            'dedup_max_workers': 4
+        },
+        'qa_generation': {
+            'num_qa_pairs': 100,
+            'type': 'multihop'
+        }
+    }
+def get_backend_config() -> Dict[str, Any]:
+    """Get the active backend configuration."""
+    config = load_config()
+    backend_name = config['backend']['active'].lower()
+    backend_config = config['backend'].get(backend_name, {})
+    return {
+        'name': config['backend']['active'].upper(),
+        **backend_config
+    }
+def get_api_key(backend_name: str = None) -> str:
+    """Load API key for the specified or active backend."""
+    config = load_config()
+    if backend_name is None:
+        backend_name = config['backend']['active'].lower()
+    else:
+        backend_name = backend_name.lower()
+    backend_config = config['backend'].get(backend_name, {})
+    api_key_path = backend_config.get('api_key_path')
+    if not api_key_path:
+        return ""
+    try:
+        with open(api_key_path, 'r') as f:
+            return f.read().strip()
+    except FileNotFoundError:
+        print(f"⚠️ API key file not found: {api_key_path}")
+        return ""
+def get_rate_limit_config() -> Dict[str, int]:
+    """Get rate limiting configuration."""
+    config = load_config()
+    return config.get('rate_limiting', {
+        'requests_per_minute': 60,
+        'burst_size': 15
+    })
+def get_parallel_config() -> Dict[str, Any]:
+    """Get parallel processing configuration."""
+    config = load_config()
+    return config.get('parallel', {
+        'num_workers': 3,
+        'available_gpus': [0, 1, 2],
+        'qa_max_workers': 6,
+        'dedup_max_workers': 4
+    })
+def get_retrieval_config() -> Dict[str, Any]:
+    """Get context retrieval configuration."""
+    config = load_config()
+    return config.get('retrieval', {})
+def get_embedding_config() -> Dict[str, Any]:
+    """Get embedding configuration."""
+    config = load_config()
+    return config.get('embedding', {})
+def get_paths_config() -> Dict[str, Any]:
+    """Get input/output paths configuration."""
+    config = load_config()
+    return config.get('paths', {})
+def get_processing_config() -> Dict[str, Any]:
+    """Get processing limits configuration."""
+    config = load_config()
+    return config.get('processing', {})
+def get_evaluation_config() -> Dict[str, Any]:
+    """Get evaluation configuration."""
+    config = load_config()
+    return config.get('evaluation', {})
+def get_domain_expert_config() -> Dict[str, Any]:
+    """Get domain/expert persona configuration.
+    Returns:
+        Dict with 'expert_persona', 'domain' (may be None if auto-detect),
+        and other settings like 'use_multimodal_embeddings', 'output_dir'
+    """
+    config = load_config()
+    return config.get('domain_expert', {
+        'expert_persona': None,
+        'domain': None,
+        'use_multimodal_embeddings': True,
+        'output_dir': 'trials/domain_analysis'
+    })
+def get_qa_correction_config() -> Dict[str, Any]:
+    """Get QA correction configuration.
+    Returns:
+        Dict with 'enabled' (bool), 'max_attempts' (int)
+    """
+    config = load_config()
+    return config.get('qa_correction', {
+        'enabled': True,
+        'max_attempts': 1
+    })
+def get_qa_generation_config() -> Dict[str, Any]:
+    """Get QA generation control configuration.
+    Returns:
+        Dict with:
+        - 'num_qa_pairs': Target number of QA pairs (None = no limit)
+        - 'type': Type of QA to generate ('multihop', 'multimodal', 'text', 'mix')
+    """
+    config = load_config()
+    return config.get('qa_generation', {
+        'num_qa_pairs': 1000,
+        'type': 'multihop'
+    })
+# Convenience function to print current config
+def print_config_summary():
+    """Print a summary of the current configuration."""
+    config = load_config()
+    backend = get_backend_config()
+    rate_limit = get_rate_limit_config()
+    parallel = get_parallel_config()
+    qa_gen = get_qa_generation_config()
+    print("=" * 60)
+    print("📋 CONFIGURATION SUMMARY")
+    print("=" * 60)
+    print(f"Backend: {backend['name']}")
+    print(f"  LLM Model: {backend.get('llm_model', 'N/A')}")
+    print(f"  VLM Model: {backend.get('vlm_model', 'N/A')}")
+    print(f"Rate Limiting:")
+    print(f"  RPM: {rate_limit.get('requests_per_minute', 60)}")
+    print(f"  Burst: {rate_limit.get('burst_size', 15)}")
+    print(f"Parallel Processing:")
+    print(f"  QA Workers: {parallel.get('qa_max_workers', 6)}")
+    print(f"  Dedup Workers: {parallel.get('dedup_max_workers', 4)}")
+    print(f"QA Generation:")
+    print(f"  Target Pairs: {qa_gen.get('num_qa_pairs', 1000)}")
+    print(f"  Type: {qa_gen.get('type', 'multihop')}")
+    print("=" * 60)
+# Running this module directly prints the configuration summary.
+if __name__ == "__main__":
+    print_config_summary()