solace-agent-mesh 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of solace-agent-mesh might be problematic.
- solace_agent_mesh/agent/adk/callbacks.py +0 -5
- solace_agent_mesh/agent/adk/models/lite_llm.py +123 -8
- solace_agent_mesh/agent/adk/models/oauth2_token_manager.py +245 -0
- solace_agent_mesh/agent/protocol/event_handlers.py +213 -31
- solace_agent_mesh/agent/proxies/__init__.py +0 -0
- solace_agent_mesh/agent/proxies/a2a/__init__.py +3 -0
- solace_agent_mesh/agent/proxies/a2a/app.py +55 -0
- solace_agent_mesh/agent/proxies/a2a/component.py +1115 -0
- solace_agent_mesh/agent/proxies/a2a/config.py +140 -0
- solace_agent_mesh/agent/proxies/a2a/oauth_token_cache.py +104 -0
- solace_agent_mesh/agent/proxies/base/__init__.py +3 -0
- solace_agent_mesh/agent/proxies/base/app.py +99 -0
- solace_agent_mesh/agent/proxies/base/component.py +650 -0
- solace_agent_mesh/agent/proxies/base/config.py +85 -0
- solace_agent_mesh/agent/proxies/base/proxy_task_context.py +17 -0
- solace_agent_mesh/agent/sac/app.py +58 -5
- solace_agent_mesh/agent/sac/component.py +238 -75
- solace_agent_mesh/agent/sac/task_execution_context.py +46 -0
- solace_agent_mesh/agent/tools/audio_tools.py +125 -8
- solace_agent_mesh/agent/tools/web_tools.py +10 -5
- solace_agent_mesh/agent/utils/artifact_helpers.py +141 -3
- solace_agent_mesh/assets/docs/404.html +3 -3
- solace_agent_mesh/assets/docs/assets/js/5c2bd65f.eda4bcb2.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.f4b15f3b.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/71da7b71.38583438.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/77cf947d.48cb18a2.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/924ffdeb.8095e148.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/9e9d0a82.570c057b.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{ad71b5ed.60668e9e.js → ad71b5ed.af3ecfd1.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/ceb2a7a6.5d92d7d0.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{da0b5bad.9d369087.js → da0b5bad.d08a9466.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/db924877.e98d12a1.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/de915948.27d6b065.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{e3d9abda.2b916f9e.js → e3d9abda.6b9493d0.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/e6f9706b.e74a984d.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/f284c35a.42f59cdd.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/ff4d71f2.15b02f97.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js → main.b12eac43.js} +2 -2
- solace_agent_mesh/assets/docs/assets/js/runtime~main.e268214e.js +1 -0
- solace_agent_mesh/assets/docs/docs/documentation/components/agents/index.html +15 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/artifact-management/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/audio-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/data-analysis-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/embeds/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/cli/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/gateways/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/orchestrator/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/plugins/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/proxies/index.html +262 -0
- solace_agent_mesh/assets/docs/docs/documentation/deploying/debugging/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/deployment-options/index.html +31 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/observability/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/developing/create-agents/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/create-gateways/index.html +5 -5
- solace_agent_mesh/assets/docs/docs/documentation/developing/creating-python-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/creating-service-providers/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/evaluations/index.html +135 -0
- solace_agent_mesh/assets/docs/docs/documentation/developing/index.html +6 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/structure/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/bedrock-agents/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/custom-agent/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/event-mesh-gateway/index.html +5 -5
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mcp-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mongodb-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rag-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rest-gateway/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/slack-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/sql-database/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/installation/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/rbac-setup-guide/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/single-sign-on/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/architecture/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/introduction/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/try-agent-mesh/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/configurations/index.html +6 -5
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/installation/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/large_language_models/index.html +100 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/run-project/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-gateway-upgrade-to-0.3.0/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-technical-migration-map/index.html +3 -3
- solace_agent_mesh/assets/docs/lunr-index-1761248203150.json +1 -0
- solace_agent_mesh/assets/docs/lunr-index.json +1 -1
- solace_agent_mesh/assets/docs/search-doc-1761248203150.json +1 -0
- solace_agent_mesh/assets/docs/search-doc.json +1 -1
- solace_agent_mesh/assets/docs/sitemap.xml +1 -1
- solace_agent_mesh/cli/__init__.py +1 -1
- solace_agent_mesh/cli/commands/add_cmd/agent_cmd.py +2 -69
- solace_agent_mesh/cli/commands/eval_cmd.py +11 -49
- solace_agent_mesh/cli/commands/init_cmd/__init__.py +0 -5
- solace_agent_mesh/cli/commands/init_cmd/env_step.py +10 -12
- solace_agent_mesh/cli/commands/init_cmd/orchestrator_step.py +9 -61
- solace_agent_mesh/cli/commands/init_cmd/webui_gateway_step.py +9 -49
- solace_agent_mesh/cli/commands/plugin_cmd/add_cmd.py +1 -2
- solace_agent_mesh/client/webui/frontend/static/assets/{authCallback-DwrxZE0E.js → authCallback-BTf6dqwp.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/assets/{client-DarGQzyw.js → client-CaY59VuC.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/assets/main-B32noGmR.js +342 -0
- solace_agent_mesh/client/webui/frontend/static/assets/main-DHJKSW1S.css +1 -0
- solace_agent_mesh/client/webui/frontend/static/assets/{vendor-BKIeiHj_.js → vendor-BEmvJSYz.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/auth-callback.html +3 -3
- solace_agent_mesh/client/webui/frontend/static/index.html +4 -4
- solace_agent_mesh/common/a2a/__init__.py +24 -0
- solace_agent_mesh/common/a2a/artifact.py +39 -0
- solace_agent_mesh/common/a2a/events.py +29 -0
- solace_agent_mesh/common/a2a/message.py +68 -0
- solace_agent_mesh/common/a2a/protocol.py +151 -1
- solace_agent_mesh/common/agent_registry.py +83 -3
- solace_agent_mesh/common/constants.py +3 -1
- solace_agent_mesh/common/sac/sam_component_base.py +383 -4
- solace_agent_mesh/common/utils/pydantic_utils.py +12 -0
- solace_agent_mesh/config_portal/backend/common.py +1 -1
- solace_agent_mesh/config_portal/frontend/static/client/assets/_index-ByU1X1HD.js +98 -0
- solace_agent_mesh/config_portal/frontend/static/client/assets/{manifest-44d62be6.js → manifest-61038fc6.js} +1 -1
- solace_agent_mesh/config_portal/frontend/static/client/index.html +1 -1
- solace_agent_mesh/evaluation/evaluator.py +128 -104
- solace_agent_mesh/evaluation/message_organizer.py +116 -110
- solace_agent_mesh/evaluation/report_data_processor.py +84 -86
- solace_agent_mesh/evaluation/report_generator.py +73 -79
- solace_agent_mesh/evaluation/run.py +421 -235
- solace_agent_mesh/evaluation/shared/__init__.py +92 -0
- solace_agent_mesh/evaluation/shared/constants.py +47 -0
- solace_agent_mesh/evaluation/shared/exceptions.py +50 -0
- solace_agent_mesh/evaluation/shared/helpers.py +35 -0
- solace_agent_mesh/evaluation/shared/test_case_loader.py +167 -0
- solace_agent_mesh/evaluation/shared/test_suite_loader.py +280 -0
- solace_agent_mesh/evaluation/subscriber.py +111 -232
- solace_agent_mesh/evaluation/summary_builder.py +227 -117
- solace_agent_mesh/gateway/base/app.py +16 -1
- solace_agent_mesh/gateway/base/component.py +112 -39
- solace_agent_mesh/gateway/http_sse/alembic/versions/20251015_add_session_performance_indexes.py +70 -0
- solace_agent_mesh/gateway/http_sse/component.py +99 -3
- solace_agent_mesh/gateway/http_sse/dependencies.py +4 -4
- solace_agent_mesh/gateway/http_sse/main.py +1 -0
- solace_agent_mesh/gateway/http_sse/repository/chat_task_repository.py +12 -13
- solace_agent_mesh/gateway/http_sse/repository/feedback_repository.py +15 -18
- solace_agent_mesh/gateway/http_sse/repository/interfaces.py +25 -18
- solace_agent_mesh/gateway/http_sse/repository/session_repository.py +30 -26
- solace_agent_mesh/gateway/http_sse/repository/task_repository.py +35 -44
- solace_agent_mesh/gateway/http_sse/routers/agent_cards.py +4 -3
- solace_agent_mesh/gateway/http_sse/routers/artifacts.py +95 -203
- solace_agent_mesh/gateway/http_sse/routers/dto/responses/session_responses.py +4 -3
- solace_agent_mesh/gateway/http_sse/routers/sessions.py +2 -2
- solace_agent_mesh/gateway/http_sse/routers/tasks.py +33 -41
- solace_agent_mesh/gateway/http_sse/routers/users.py +47 -1
- solace_agent_mesh/gateway/http_sse/routers/visualization.py +17 -11
- solace_agent_mesh/gateway/http_sse/services/data_retention_service.py +4 -4
- solace_agent_mesh/gateway/http_sse/services/feedback_service.py +51 -43
- solace_agent_mesh/gateway/http_sse/services/session_service.py +20 -20
- solace_agent_mesh/gateway/http_sse/services/task_logger_service.py +8 -8
- solace_agent_mesh/gateway/http_sse/shared/base_repository.py +45 -71
- solace_agent_mesh/gateway/http_sse/shared/types.py +0 -18
- solace_agent_mesh/templates/gateway_config_template.yaml +0 -5
- solace_agent_mesh/templates/logging_config_template.ini +10 -6
- solace_agent_mesh/templates/plugin_gateway_config_template.yaml +0 -3
- solace_agent_mesh/templates/shared_config.yaml +40 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/METADATA +47 -21
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/RECORD +166 -145
- solace_agent_mesh/assets/docs/assets/js/5c2bd65f.e49689dd.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.39d5851d.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/71da7b71.804d6567.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/77cf947d.64c9bd6c.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/9e9d0a82.dd810042.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/db924877.cbc66f02.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/de915948.139b4b9c.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/e6f9706b.582a78ca.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/f284c35a.5766a13d.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/ff4d71f2.9c0297a6.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/runtime~main.18dc45dd.js +0 -1
- solace_agent_mesh/assets/docs/lunr-index-1760121512891.json +0 -1
- solace_agent_mesh/assets/docs/search-doc-1760121512891.json +0 -1
- solace_agent_mesh/client/webui/frontend/static/assets/main-2nd1gbaH.js +0 -339
- solace_agent_mesh/client/webui/frontend/static/assets/main-DoKXctCM.css +0 -1
- solace_agent_mesh/config_portal/frontend/static/client/assets/_index-BNuqpWDc.js +0 -98
- solace_agent_mesh/evaluation/config_loader.py +0 -657
- solace_agent_mesh/evaluation/test_case_loader.py +0 -714
- /solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js.LICENSE.txt → main.b12eac43.js.LICENSE.txt} +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/WHEEL +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/entry_points.txt +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/licenses/LICENSE +0 -0
solace_agent_mesh/evaluation/run.py

@@ -4,64 +4,95 @@ This module orchestrates the evaluation of AI models against test cases.
 """

 import json
+import logging
+import mimetypes
 import os
+import shutil
+import subprocess
 import sys
+import threading
 import time
-import subprocess
-import requests
 import uuid
-import
-import
-import
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from importlib import metadata
 from pathlib import Path

-
-
+import click
+import requests
 from dotenv import load_dotenv
-
-from .message_organizer import MessageOrganizer
-from .summary_builder import SummaryBuilder
-from .subscriber import Subscriber
+
 from .evaluator import EvaluationOrchestrator
+from .message_organizer import MessageOrganizer
 from .report_generator import ReportGenerator
+from .shared import (
+    DEFAULT_STARTUP_WAIT_TIME,
+    DEFAULT_TEST_TIMEOUT,
+    EVALUATION_DIR,
+    MAX_ARTIFACT_SIZE_MB,
+    EvaluationConfigLoader,
+    TestSuiteConfiguration,
+    get_local_base_url,
+)
+from .subscriber import Subscriber
+from .summary_builder import SummaryBuilder

-
+log = logging.getLogger(__name__)


-
-
-
+def _error_exit(message: str):
+    """Logs an error message and exits."""
+    log.error(message)
+    sys.exit(1)

-# Constants
-DEFAULT_STARTUP_WAIT_TIME = 60
-DEFAULT_TEST_TIMEOUT = 60

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def _ensure_eval_backend_config_exists():
+    """Checks for eval_backend.yaml and creates it from a template if missing."""
+    project_root = Path.cwd()
+    configs_dir = project_root / "configs"
+    eval_backend_config_path = configs_dir / "eval_backend.yaml"
+
+    if eval_backend_config_path.exists():
+        return
+
+    click.echo(
+        f"'{eval_backend_config_path.relative_to(project_root)}' not found. Creating it..."
+    )
+
+    if not (configs_dir / "shared_config.yaml").exists():
+        _error_exit(
+            "Error: 'configs/shared_config.yaml' not found. Please run 'sam init' first."
+        )
+
+    try:
+        # This is a simplified way to get the template content.
+        # In a real CLI, you'd use a more robust method like `importlib.resources`.
+        template_path = Path(__file__).parent.parent / "templates" / "eval_backend_template.yaml"
+        with open(template_path, encoding="utf-8") as f:
+            template_content = f.read()
+
+        with open(eval_backend_config_path, "w", encoding="utf-8") as f:
+            f.write(template_content)
+        click.echo(
+            click.style(
+                f"Successfully created '{eval_backend_config_path.relative_to(project_root)}'.",
+                fg="green",
             )
+        )
+    except Exception as e:
+        _error_exit(f"Failed to create eval_backend.yaml: {e}")
+
+
+def _ensure_sam_rest_gateway_installed():
+    """Checks if the sam-rest-gateway package is installed for local evaluation."""
+    try:
+        metadata.distribution("sam-rest-gateway")
+    except metadata.PackageNotFoundError:
+        _error_exit(
+            "Error: 'sam-rest-gateway' is not installed. "
+            "Please install it using: "
+            'pip install "sam-rest-gateway @ git+https://github.com/SolaceLabs/solace-agent-mesh-core-plugins#subdirectory=sam-rest-gateway"'
+        )


 @dataclass
@@ -70,7 +101,7 @@ class TestRun:

     agent: str
     query: str
-    artifacts:
+    artifacts: list[str]
     wait_time: int
     test_case_file: str
     run_num: int
@@ -78,192 +109,220 @@ class TestRun:
     @property
     def test_case_id(self) -> str:
         """Extract test case ID from filename."""
-
-        return os.path.splitext(base_name)[0].replace(".test", "")
+        return Path(self.test_case_file).stem.replace(".test", "")


 class ProcessManager:
     """Manages subprocess lifecycle for the Solace AI Connector."""

-    def __init__(self, config:
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.config = config
-        self.process:
-        self.namespace:
+        self.process: subprocess.Popen | None = None
+        self.namespace: str | None = None
         self.verbose = verbose

-    def start_services(self) ->
+    def start_services(self) -> tuple[subprocess.Popen, str]:
         """Start the Solace AI Connector and return process and namespace."""
         load_dotenv()
         self.namespace = f"eval-{uuid.uuid4()}"
         os.environ["NAMESPACE"] = self.namespace

-
+        # Set broker environment variables from the required configuration
+        log.info("Setting broker configuration from test suite...")
+        for key, value in self.config.broker.dict().items():
+            if value is not None:
+                env_key = f"SOLACE_BROKER_{key.upper()}"
+                os.environ[env_key] = str(value)
+                log.info(f" - Set {env_key}")
+
+        agent_files = self.config.agent_configs

         command = [sys.executable, "-m", "solace_ai_connector.main", *agent_files]

-
-        project_root =
+        log.info("Starting Solace AI Connector as a subprocess...")
+        project_root = Path(EVALUATION_DIR).parent.resolve()

         self.process = subprocess.Popen(
             command, stdout=sys.stdout, stderr=sys.stderr, cwd=project_root
         )

-
-        self._wait_for_server_ready()
+        log.info("Waiting for server to become healthy...")
+        self._wait_for_server_ready(get_local_base_url())

         return self.process, self.namespace

-    def _wait_for_server_ready(self):
+    def _wait_for_server_ready(self, base_url: str):
         """Poll the health endpoint until the server is ready."""
         start_time = time.time()
-        health_url = f"{
+        health_url = f"{base_url}/health"

-        while time.time() - start_time <
+        while time.time() - start_time < DEFAULT_STARTUP_WAIT_TIME:
             try:
                 response = requests.get(health_url)
                 if response.status_code == 200:
-
-                    time.sleep(
+                    log.info("Server is healthy.")
+                    time.sleep(5)
                     return
             except requests.ConnectionError:
                 # Server is not yet available, wait and retry
                 time.sleep(1)
             except Exception as e:
-
+                log.error(f"An unexpected error occurred during health check: {e}")
                 time.sleep(1)

         raise RuntimeError(
-            f"Server did not become healthy within {
+            f"Server did not become healthy within {DEFAULT_STARTUP_WAIT_TIME} seconds."
         )

-    def stop_services(self, subscriber:
+    def stop_services(self, subscriber: Subscriber | None = None):
         """Clean up running processes."""
         if subscriber:
-
+            log.info("Terminating subscriber")
             subscriber.stop()
             subscriber.join()
-
+            log.info("Subscriber terminated.")

         if self.process:
-
+            log.info("Terminating subprocess")
             self.process.terminate()
             try:
                 self.process.wait(timeout=5)
-
+                log.info("Subprocess terminated.")
             except subprocess.TimeoutExpired:
-
+                log.info("Subprocess did not terminate gracefully, killing.")
                 self.process.kill()

-
+        log.info("Process cleanup completed.")


 class TaskService:
     """Handles task submission and tracking."""

-    def __init__(self, config:
-        self.config = config
-        self.base_url = config.API_BASE_URL
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.verbose = verbose
+        self.config = config
+        if config.remote:
+            self.base_url = config.remote.environment.get("EVAL_REMOTE_URL")
+        else:
+            self.base_url = get_local_base_url()

     def submit_task(
-        self, agent_name: str, message: str, artifact_paths:
-    ) ->
+        self, agent_name: str, message: str, artifact_paths: list[str] | None = None
+    ) -> str | None:
         """Submit a test case to the agent and return the task ID."""
-
-        url = f"{self.base_url}/tasks"
+        log.info("Sending test request")
+        url = f"{self.base_url}/api/v2/tasks"
         data = {
             "agent_name": agent_name,
             "prompt": message,
         }

+        headers = {}
+        if self.config.remote:
+            auth_token = self.config.remote.environment.get("EVAL_AUTH_TOKEN")
+            if auth_token:
+                headers["Authorization"] = f"Bearer {auth_token}"
+
         files_to_upload = []
         if artifact_paths:
             files_to_upload = self._prepare_file_uploads(artifact_paths)

         try:
             with requests.Session() as session:
-                response = session.post(url, data=data, files=files_to_upload)
+                response = session.post(url, data=data, files=files_to_upload, headers=headers)

             response.raise_for_status()
             task_id = response.json()["taskId"]
-
+            log.info(f"Task submitted with ID: {task_id}")
             return task_id

         except requests.RequestException as e:
-
+            log.error(f"Failed to submit task: {e}")
             return None
         finally:
             self._close_file_uploads(files_to_upload)

-    def _prepare_file_uploads(self, artifact_paths:
+    def _prepare_file_uploads(self, artifact_paths: list[str]) -> list[tuple]:
         """Prepare file uploads for the request."""
         files_to_upload = []
-        for
+        for path_str in artifact_paths:
+            path = Path(path_str)
+            # Check file size before reading
+            try:
+                file_size_mb = path.stat().st_size / (1024 * 1024)
+                if file_size_mb > MAX_ARTIFACT_SIZE_MB:
+                    log.warning(
+                        f"Artifact '{path.name}' is {file_size_mb:.2f} MB, "
+                        f"which is larger than the recommended maximum of {MAX_ARTIFACT_SIZE_MB} MB. "
+                        "This may cause memory issues."
+                    )
+            except OSError as e:
+                log.error(f"Could not get size of artifact {path}: {e}")
+                continue
+
             mimetype, _ = mimetypes.guess_type(path)
             if mimetype is None:
                 mimetype = "text/plain"
-
-
-
+            # Read file content with context manager
+            with path.open("rb") as f:
+                file_content = f.read()
+            files_to_upload.append(("files", (path.name, file_content, mimetype)))
         return files_to_upload

-    def _close_file_uploads(self, files_to_upload:
-        """Close file handles after upload."""
-
-
+    def _close_file_uploads(self, files_to_upload: list[tuple]):
+        """Close file handles after upload (no longer needed)."""
+        # No longer needed
+        pass


 class FileService:
     """Handles file operations and path management."""

     @staticmethod
-    def ensure_directory(path:
+    def ensure_directory(path: Path):
         """Ensure directory exists, create if necessary."""
-
+        path.mkdir(parents=True, exist_ok=True)

     @staticmethod
-    def remove_directory(path:
+    def remove_directory(path: Path):
         """Remove directory and all contents."""
-        if
+        if path.exists():
             shutil.rmtree(path)

     @staticmethod
-    def save_json(data:
+    def save_json(data: any, filepath: Path):
         """Save data as JSON to file."""
-        with open(
+        with filepath.open("w") as f:
             json.dump(data, f, indent=4)

     @staticmethod
-    def load_json(filepath:
+    def load_json(filepath: Path) -> any:
         """Load JSON data from file."""
-        with open(
+        with filepath.open() as f:
             return json.load(f)


 class TestRunBuilder:
     """Builds test run configurations from test cases."""

-    def __init__(self, config:
+    def __init__(self, config: TestSuiteConfiguration):
         self.config = config

-    def build_test_runs(self) ->
+    def build_test_runs(self) -> list[TestRun]:
         """Build all test runs from configuration."""
         test_runs = []

-        for test_case_path in self.config.
-            test_case = FileService.load_json(test_case_path)
+        for test_case_path in self.config.test_case_files:
+            test_case = FileService.load_json(Path(test_case_path))

             artifact_paths = self._get_artifact_paths(test_case, test_case_path)

-
-            for run_num in range(1, self.config.runs + 1):
+            for run_num in range(1, self.config.run_count + 1):
                 test_run = TestRun(
                     agent=test_case["target_agent"],
                     query=test_case["query"],
                     artifacts=artifact_paths,
-                    wait_time=test_case.get(
-                        "wait_time", self.config.DEFAULT_TEST_TIMEOUT
-                    ),
+                    wait_time=test_case.get("wait_time", DEFAULT_TEST_TIMEOUT),
                     test_case_file=test_case_path,
                     run_num=run_num,
                 )
@@ -271,14 +330,14 @@ class TestRunBuilder:

         return test_runs

-    def _get_artifact_paths(self, test_case:
+    def _get_artifact_paths(self, test_case: dict, test_case_path: str) -> list[str]:
         """Extract artifact paths from test case."""
         artifact_paths = []
         if "artifacts" in test_case:
-            test_case_dir =
+            test_case_dir = Path(test_case_path).parent
             for artifact in test_case["artifacts"]:
                 if artifact.get("type") == "file":
-                    artifact_paths.append(
+                    artifact_paths.append(str(test_case_dir / artifact["path"]))
         return artifact_paths


@@ -293,13 +352,14 @@ class TestExecutor:
     def execute_test(
         self,
         test_run: TestRun,
-        model_results_path:
-        task_mappings:
+        model_results_path: Path,
+        task_mappings: dict[str, str],
         subscriber: Subscriber,
+        task_mappings_lock: threading.Lock,
     ) -> bool:
         """Execute a single test case and wait for completion."""
-
-            f"
+        log.info(
+            f"Starting test: {test_run.test_case_file} (run {test_run.run_num})"
         )

         # Submit the task
@@ -308,25 +368,23 @@ class TestExecutor:
         )

         if not task_id:
-
+            log.error(
                 f"Failed to start test case: {test_run.test_case_file} (run {test_run.run_num})"
             )
             return False

         # Set up result directory
-
-
-        )
+        test_case_name = Path(test_run.test_case_file).stem.replace(".test", "")
+        run_dir = model_results_path / test_case_name / f"run_{test_run.run_num}"
         self.file_service.ensure_directory(run_dir)

         # Save test case path for summary builder
         test_info = {"path": test_run.test_case_file}
-        self.file_service.save_json(
-            test_info, os.path.join(run_dir, "test_case_info.json")
-        )
+        self.file_service.save_json(test_info, run_dir / "test_case_info.json")

         # Track the task
-
+        with task_mappings_lock:
+            task_mappings[task_id] = str(run_dir)
         subscriber.active_tasks.add(task_id)

         # Wait for completion
@@ -336,26 +394,26 @@ class TestExecutor:
         self, task_id: str, wait_time: int, subscriber: Subscriber
     ) -> bool:
         """Wait for task completion with timeout."""
-
+        log.info(
             f"Waiting for task {task_id} to complete (timeout: {wait_time} seconds)..."
         )

         start_time = time.time()
         while task_id in subscriber.active_tasks:
             if time.time() - start_time > wait_time:
-
+                log.warning(f"Task {task_id} timed out after {wait_time} seconds")
                 subscriber.active_tasks.discard(task_id)
                 return False
             time.sleep(1)

-
+        log.info(f"Task {task_id} completed successfully")
        return True


 class ModelEvaluator:
     """Handles the evaluation of a single model."""

-    def __init__(self, config:
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.config = config
         self.process_manager = ProcessManager(config, verbose=verbose)
         self.task_service = TaskService(config, verbose=verbose)
@@ -363,20 +421,21 @@ class ModelEvaluator:
         self.test_builder = TestRunBuilder(config)
         self.test_executor = TestExecutor(self.task_service, self.file_service, verbose=verbose)
         self.verbose = verbose
+        self._task_mappings_lock = threading.Lock()

     def evaluate_model(
-        self, model_config:
+        self, model_config: dict[str, any], base_results_path: Path
     ) -> float:
         """Evaluate a single model and return execution time."""
-        model_name = model_config
-
+        model_name = model_config.name
+        log.info(f"Starting evaluation for model: {model_name}")
         start_time = time.time()

         # Set environment variables for the model
         self._set_model_environment(model_config)

         # Set up paths
-        model_results_path =
+        model_results_path = base_results_path / model_name
         self.file_service.ensure_directory(model_results_path)

         # Start services
@@ -388,10 +447,10 @@ class ModelEvaluator:
         try:
             # Execute tests
             successful_tests = self._execute_all_tests(model_results_path, subscriber)
-
+            log.info(f"Completed {successful_tests} tests successfully")

         except Exception as e:
-
+            log.error(f"Error during test case execution for model {model_name}: {e}")
         finally:
             # Cleanup
             task_mappings = getattr(self, "_task_mappings", {})
@@ -401,52 +460,84 @@ class ModelEvaluator:

         end_time = time.time()
         execution_time = end_time - start_time
-
-            f"
+        log.info(
+            f"Evaluation for model: {model_name} complete in {execution_time:.2f} seconds"
         )

         return execution_time

-    def _set_model_environment(self, model_config:
+    def _set_model_environment(self, model_config: dict[str, any]):
         """Set environment variables for the model."""
-        for key, value in model_config.
-
+        for key, value in model_config.environment.variables.items():
+            if value is not None:
+                os.environ[key] = value

-    def _setup_subscriber(self, namespace: str, model_results_path:
+    def _setup_subscriber(self, namespace: str, model_results_path: Path) -> Subscriber:
         """Set up and start the subscriber."""
         subscription_ready_event = threading.Event()
         subscriber = Subscriber(
-
+            self.config.broker,
+            namespace,
+            set(),
+            None,
+            subscription_ready_event,
+            model_results_path,
         )
         subscriber.start()

-
+        log.info("Waiting for subscriber to be ready...")
         subscription_ready_event.wait()
-
+        log.info("Subscriber is ready.")

         return subscriber

     def _execute_all_tests(
-        self, model_results_path:
+        self, model_results_path: Path, subscriber: Subscriber
     ) -> int:
-        """Execute all test cases and return count of successful tests."""
+        """Execute all test cases in parallel and return count of successful tests."""
         test_runs = self.test_builder.build_test_runs()

         self._task_mappings = {}
         total_tests = len(test_runs)
         successful_tests = 0

-
+        log.info(
+            f"Starting parallel execution of {total_tests} tests with {self.config.workers} workers."
+        )

-
-
-
-
-
-
-
-
-
+        with ThreadPoolExecutor(max_workers=self.config.workers) as executor:
+            # Create a dictionary to map futures to their test_run
+            future_to_run = {
+                executor.submit(
+                    self.test_executor.execute_test,
+                    test_run,
+                    model_results_path,
+                    self._task_mappings,
+                    subscriber,
+                    self._task_mappings_lock,  # Pass the lock to the worker
+                ): test_run
+                for test_run in test_runs
+            }
+
+            # Process results as they complete
+            for i, future in enumerate(as_completed(future_to_run), 1):
+                test_run = future_to_run[future]
+                log.info(
+                    f"Processing result for test {i}/{total_tests}: {test_run.test_case_id}"
+                )
+                try:
+                    success = future.result()
+                    if success:
+                        successful_tests += 1
+                    else:
+                        log.warning(
+                            f"Test {test_run.test_case_id} (run {test_run.run_num}) failed or timed out."
+                        )
+                except Exception as e:
+                    log.error(
+                        f"Test {test_run.test_case_id} (run {test_run.run_num}) generated an exception: {e}",
+                        exc_info=True,
+                    )

         return successful_tests

@@ -454,16 +545,16 @@ class ModelEvaluator:
         self,
         app_process: subprocess.Popen,
         subscriber: Subscriber,
-        model_results_path:
-        task_mappings:
+        model_results_path: Path,
+        task_mappings: dict[str, str],
     ):
         """Clean up after model evaluation."""
         self.process_manager.stop_services(subscriber)

         # Save task mappings
-        mappings_file =
+        mappings_file = model_results_path / "task_mappings.json"
         self.file_service.save_json(task_mappings, mappings_file)
-
+        log.info(f"Task mappings saved to {mappings_file}")


 class ResultsProcessor:
@@ -471,52 +562,49 @@ class ResultsProcessor:

     def __init__(self, file_service: FileService, verbose: bool = False):
         self.file_service = file_service
-        self.summary_builder =
+        self.summary_builder: SummaryBuilder | None = None
         self.verbose = verbose

-    def summarize_results(self, base_results_path:
+    def summarize_results(self, base_results_path: Path, config: TestSuiteConfiguration):
         """Generate summaries for all test results."""
-
+        log.info("Summarizing results")

-
-            model_path = os.path.join(base_results_path, model_name)
-            if not os.path.isdir(model_path):
-                continue
+        self.summary_builder = SummaryBuilder(config)

+        for model_path in base_results_path.iterdir():
+            if not model_path.is_dir():
+                continue
             self._process_model_results(model_path)

-    def _process_model_results(self, model_path:
+    def _process_model_results(self, model_path: Path):
         """Process results for a single model."""
-        for
-
-            if not os.path.isdir(test_case_path):
+        for test_case_path in model_path.iterdir():
+            if not test_case_path.is_dir():
                 continue
-
            self._process_test_case_results(test_case_path)

-    def _process_test_case_results(self, test_case_path:
+    def _process_test_case_results(self, test_case_path: Path):
         """Process results for a single test case."""
-        for
-
-            if not os.path.isdir(run_path):
+        for run_path in test_case_path.iterdir():
+            if not run_path.is_dir():
                 continue

-            messages_file =
-            if
+            messages_file = run_path / "messages.json"
+            if messages_file.exists():
                 summary_data = self.summary_builder.summarize_run(messages_file)
-                summary_file =
+                summary_file = run_path / "summary.json"
                 self.file_service.save_json(summary_data, summary_file)
-
+                log.info(f"Summary created for {run_path}")


 class EvaluationRunner:
     """Main orchestrator that coordinates the entire evaluation process."""

     def __init__(self, verbose: bool = False):
-        self.config:
+        self.config: TestSuiteConfiguration | None = None
         self.file_service = FileService()
         self.results_processor = ResultsProcessor(self.file_service, verbose=verbose)
-        self.report_generator:
+        self.report_generator: ReportGenerator | None = None
         self.verbose = verbose

     def run_evaluation(self, config_path: str):
@@ -528,91 +616,188 @@ class EvaluationRunner:
             self._load_configuration(config_path)

             # Set up results directory in the current working directory
-            base_results_path = Path.cwd() / "results" / self.config.
+            base_results_path = Path.cwd() / "results" / self.config.results_directory
             self._setup_results_directory(base_results_path)

             # Run model evaluations
-
+            if self.config.remote:
+                model_execution_times = self._run_remote_evaluation(base_results_path)
+            else:
+                model_execution_times = self._run_local_evaluation(base_results_path)

             # Post-process results
             self._post_process_results(
-
+                base_results_path, model_execution_times, config_path
             )

             # Save overall statistics
-            self._save_execution_stats(
+            self._save_execution_stats(base_results_path, start_time)

             # Generate reports
             self._generate_reports(config_path, base_results_path)

-            # Display
-
-                self._display_verbose_summary(base_results_path)
+            # Display summary
+            self._display_summary(base_results_path)

         except Exception as e:
-
+            log.error(f"Evaluation failed: {e}")
             raise

     def _load_configuration(self, config_path: str):
         """Load and validate the evaluation configuration."""
-        config_loader =
-
-        self.config = EvaluationConfig(config_data)
+        config_loader = EvaluationConfigLoader(config_path)
+        self.config = config_loader.load_configuration()
         self.report_generator = ReportGenerator(config_path)
-
+        log.info("Configuration loaded and validated successfully.")

     def _setup_results_directory(self, base_results_path: Path):
         """Set up the results directory."""
         # Clean up existing results
-        self.file_service.remove_directory(
-        self.file_service.ensure_directory(
+        self.file_service.remove_directory(base_results_path)
+        self.file_service.ensure_directory(base_results_path)

-
+        log.info(f"Results directory set up at: {base_results_path}")

-    def
-        """
+    def _run_local_evaluation(self, base_results_path: Path) -> dict[str, float]:
+        """Run the full local evaluation with service management."""
+        _ensure_eval_backend_config_exists()
+        _ensure_sam_rest_gateway_installed()
+        log.info("Starting local evaluation")
         model_execution_times = {}

-
+        # This loop iterates through the models defined in the config
+        for model_config in self.config.model_configurations:
+            # ModelEvaluator manages the lifecycle of local services for each model
             model_evaluator = ModelEvaluator(self.config, verbose=self.verbose)
             execution_time = model_evaluator.evaluate_model(
                 model_config, base_results_path
             )
-            model_execution_times[model_config
+            model_execution_times[model_config.name] = execution_time

         return model_execution_times

+    def _run_remote_evaluation(self, base_results_path: Path) -> dict[str, float]:
+        """Run evaluation against a remote endpoint in parallel."""
+        remote_url = self.config.remote.environment.get("EVAL_REMOTE_URL")
+        log.info(f"Starting remote evaluation against: {remote_url}")
+        start_time = time.time()
+
+        # Check if the remote server is healthy before proceeding
+        process_manager = ProcessManager(self.config, self.verbose)
+        process_manager._wait_for_server_ready(remote_url)
+
+        # Instantiate services with the remote configuration
+        task_service = TaskService(self.config, self.verbose)
+        test_builder = TestRunBuilder(self.config)
+        test_executor = TestExecutor(task_service, self.file_service, self.verbose)
+
+        # In remote mode, there's no model loop. We create a single "remote" results directory.
+        remote_results_path = base_results_path / "remote"
+        self.file_service.ensure_directory(remote_results_path)
+
+        # The subscriber needs to be configured for remote use.
+        subscriber = self._setup_remote_subscriber(str(remote_results_path))
+
+        task_mappings = {}
+        try:
+            test_runs = test_builder.build_test_runs()
+            successful_tests = 0
+            task_mappings_lock = threading.Lock()
+
+            log.info(
+                f"Starting parallel execution of {len(test_runs)} remote tests with {self.config.workers} workers."
+            )
+
+            with ThreadPoolExecutor(max_workers=self.config.workers) as executor:
+                future_to_run = {
+                    executor.submit(
+                        test_executor.execute_test,
+                        test_run,
+                        remote_results_path,
+                        task_mappings,
+                        subscriber,
+                        task_mappings_lock,
+                    ): test_run
+                    for test_run in test_runs
+                }
+
+                for i, future in enumerate(as_completed(future_to_run), 1):
+                    test_run = future_to_run[future]
+                    log.info(
+                        f"Processing result for remote test {i}/{len(test_runs)}: {test_run.test_case_id}"
+                    )
+                    try:
+                        success = future.result()
+                        if success:
+                            successful_tests += 1
+                    except Exception as e:
+                        log.error(
+                            f"Remote test {test_run.test_case_id} generated an exception: {e}",
+                            exc_info=True,
+                        )
+
+            log.info(f"Completed {successful_tests} remote tests successfully")
+
+        finally:
+            if subscriber:
+                subscriber.stop()
+                subscriber.join()
+
+        # Save task mappings for remote run
+        mappings_file = remote_results_path / "task_mappings.json"
+        self.file_service.save_json(task_mappings, mappings_file)
+
+        execution_time = time.time() - start_time
+        return {"remote": execution_time}
+
+    def _setup_remote_subscriber(self, results_path: str) -> Subscriber:
+        """Set up a subscriber for remote evaluation."""
+        subscription_ready_event = threading.Event()
+        namespace = self.config.remote.environment.get("EVAL_NAMESPACE")
+        subscriber = Subscriber(
+            self.config.broker,
+            namespace,
+            set(),
+            None,
+            subscription_ready_event,
+            results_path,
+        )
+        subscriber.start()
+        subscription_ready_event.wait()
+        log.info("Remote subscriber is ready.")
+        return subscriber
+
     def _post_process_results(
         self,
-        base_results_path:
-        model_execution_times:
+        base_results_path: Path,
+        model_execution_times: dict[str, float],
         config_path: str,
     ):
         """Post-process evaluation results."""
         # Categorize messages using the refactored categorizer
-
+        log.info("Categorizing messages")
         message_organizer = MessageOrganizer()
-
-
-        )
-        print("--- Message categorization finished ---")
+        message_organizer.categorize_all_messages(base_results_path)
+        log.info("Message categorization finished")

         # Generate summaries
-        self.results_processor.summarize_results(base_results_path)
+        self.results_processor.summarize_results(base_results_path, self.config)

         # Run evaluation
-
+        log.info("Starting evaluation of results")
         evaluation_orchestrator = EvaluationOrchestrator(config_path)
-        evaluation_orchestrator.run_evaluation(
-
+        evaluation_orchestrator.run_evaluation(
+            base_results_path, model_execution_times
+        )
+        log.info("Evaluation of results finished")

     def _generate_reports(self, config_path: str, base_results_path: Path):
         """Generate evaluation reports."""
         if self.report_generator:
             self.report_generator.generate_report(base_results_path)

-    def
-        """Display a
+    def _display_summary(self, base_results_path: Path):
+        """Display a summary of the evaluation results in the terminal."""

         # Pre-process data to find column widths
         summary_data = []
@@ -628,7 +813,7 @@ class EvaluationRunner:
                 continue

             try:
-                results_data = self.file_service.load_json(
+                results_data = self.file_service.load_json(results_file)
                 model_name = results_data.get("model_name", model_dir.name)
                 max_model_len = max(max_model_len, len(model_name))

@@ -656,44 +841,42 @@ class EvaluationRunner:
                 summary_data.append((model_name, test_case_id, scores))

             except Exception as e:
-
+                log.error(f"Error processing results for {model_dir.name}: {e}")

-        # Print formatted output
         if not summary_data:
-
+            log.warning("No summary data to display.")
             return

-        # Define
-        headers = ["Tool Match", "Response Match", "LLM Eval"]
-
-        # Print header
+        # Define header line
         header_line = (
             f"{'Model':<{max_model_len}} | {'Test Case':<{max_test_case_len}} | "
             f"{'Tool Match':<12} | {'Response Match':<16} | {'LLM Eval':<10}"
         )
-
-
+        click.echo(click.style(header_line, fg="white", bold=True))
+        click.echo(click.style("-" * len(header_line), fg="white", bold=True))

-        # Print data rows
         for model_name, test_case_id, scores in summary_data:
             tool_score = scores.get("Tool Match", "N/A")
             response_score = scores.get("Response Match", "N/A")
             llm_score = scores.get("LLM Eval", "N/A")

-
-
-
+            click.echo(
+                click.style(
+                    f"{model_name:<{max_model_len}} | {test_case_id:<{max_test_case_len}} | "
+                    f"{tool_score:<12} | {response_score:<16} | {llm_score:<10}",
+                    fg="white",
+                )
            )

-    def _get_model_stats(self, model_path:
+    def _get_model_stats(self, model_path: Path) -> dict[str, any]:
         """Process results for a single model and return stats."""
         model_stats = {}
-        results_file =
-        if not
+        results_file = model_path / "results.json"
+        if not results_file.exists():
             return model_stats

         results_data = self.file_service.load_json(results_file)
-        model_name = results_data.get("model_name",
+        model_name = results_data.get("model_name", model_path.name)
         model_stats[model_name] = {}

         for test_case in results_data.get("test_cases", []):
@@ -718,32 +901,35 @@ class EvaluationRunner:
             model_stats[model_name][test_case_id] = scores
         return model_stats

-    def _save_execution_stats(self, base_results_path:
+    def _save_execution_stats(self, base_results_path: Path, start_time: float):
         """Save overall execution statistics."""
         end_time = time.time()
         total_execution_time = end_time - start_time
         stats = {"total_execution_time": total_execution_time, "models": {}}

         try:
-            for
-
-                if not os.path.isdir(model_path):
+            for model_path in base_results_path.iterdir():
+                if not model_path.is_dir():
                     continue
                 model_stats = self._get_model_stats(model_path)
                 stats["models"].update(model_stats)

         except Exception as e:
-
+            log.error(f"Error processing results for stats: {e}")

-        stats_path =
+        stats_path = base_results_path / "stats.json"
         self.file_service.save_json(stats, stats_path)

-
-
+        log.info(f"Overall stats written to {stats_path}")
+        log.info(f"Total execution time: {total_execution_time:.2f} seconds")


 def main(config_path: str, verbose: bool = False):
     """Main entry point for the evaluation script."""
+    if verbose:
+        logging.basicConfig(level=logging.INFO)
+        log.info("Verbose logging enabled.")
+
     orchestrator = EvaluationRunner(verbose=verbose)
     orchestrator.run_evaluation(config_path)
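For reference, a minimal sketch of how the refactored entry point above might be driven. The path configs/eval_suite.yaml is hypothetical; the suite schema (broker, model_configurations, workers, run_count, optional remote settings) is validated by EvaluationConfigLoader in solace_agent_mesh.evaluation.shared and is not reproduced here.

# Hedged sketch: invokes the evaluation runner added in this release.
# "configs/eval_suite.yaml" is an assumed, illustrative suite file path.
from solace_agent_mesh.evaluation.run import main

# Runs locally, or against a remote endpoint if the suite defines a `remote` section.
main("configs/eval_suite.yaml", verbose=True)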