PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/gcp_tools.py ADDED Viewed

@@ -0,0 +1,484 @@
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+logger = logging.getLogger(__name__)
+def _run_gcloud_command(command: List[str], dry_run: bool = False) -> Tuple[bool, str, str]:
+    """
+    Helper to run a gcloud command.
+    In a real scenario, this would interact with subprocess.
+    Returns: (success_status, stdout, stderr)
+    """
+    command_str_for_print = " ".join(["gcloud"] + command)
+    logger.info(f"Executing: {command_str_for_print}")
+    if dry_run:
+        logger.info(f"Dry run mode. Command not executed: {command_str_for_print}")
+        return True, f"Dry run: {command_str_for_print}", ""
+    try:
+        process = subprocess.run(
+            ["gcloud"] + command,
+            capture_output=True,
+            text=True,
+            check=False,  # Handle non-zero exit codes manually
+        )
+        if process.returncode == 0:
+            if process.stderr:  # gcloud sometimes prints informational messages to stderr on success
+                logger.info(f"Command successful with stderr output:\n{process.stderr}")
+            return True, process.stdout.strip(), process.stderr.strip()
+        else:
+            logger.error(f"Command failed. Return code: {process.returncode}")
+            logger.error(f"Stdout:\n{process.stdout}")
+            logger.error(f"Stderr:\n{process.stderr}")
+            return False, process.stdout.strip(), process.stderr.strip()
+    except FileNotFoundError:
+        logger.error("gcloud command not found. Is it installed and in PATH?")
+        return False, "", "gcloud command not found."
+    except Exception as e:
+        logger.error(f"An unexpected error occurred while running gcloud command: {e}")
+        return False, "", str(e)
+def build_and_push_docker_image(
+    image_name_tag: str,  # e.g., gcr.io/my-project/my-reward-func:latest
+    dockerfile_content: str,
+    build_context_dir: str,  # Directory where Dockerfile and user code are (usually CWD)
+    gcp_project_id: Optional[str] = None,  # Required if using gcloud builds submit without local Docker
+    dry_run: bool = False,
+) -> bool:
+    """
+    Builds a Docker image using the provided Dockerfile content and pushes it to a registry (e.g., GCR, Artifact Registry).
+    Can use local Docker or 'gcloud builds submit'.
+    Args:
+        image_name_tag: Full name and tag for the image (e.g., "gcr.io/project-id/image-name:tag").
+        dockerfile_content: String content of the Dockerfile.
+        build_context_dir: The build context directory for Docker.
+        gcp_project_id: GCP Project ID, used for 'gcloud builds submit'.
+        dry_run: If True, prints commands instead of executing them.
+    Returns:
+        True if successful, False otherwise.
+    """
+    logger.info(f"Attempting to build and push Docker image using Google Cloud Build: {image_name_tag}")
+    if not gcp_project_id:
+        logger.error("GCP Project ID is required for Google Cloud Build.")
+        return False
+    # Create Dockerfile in the build_context_dir. It must be named "Dockerfile".
+    dockerfile_path_in_context = Path(build_context_dir) / "Dockerfile"
+    try:
+        with open(dockerfile_path_in_context, "w") as f:
+            f.write(dockerfile_content)
+        logger.info(f"Dockerfile created at: {dockerfile_path_in_context}")
+        # Command for gcloud builds submit
+        # The build_context_dir (e.g., ".") is where gcloud looks for the Dockerfile and other source files.
+        build_cmd_gcloud = [
+            "builds",
+            "submit",
+            build_context_dir,  # Source code to upload (can be "." for CWD)
+            "--tag",
+            image_name_tag,
+            "--project",
+            gcp_project_id,
+        ]
+        success, stdout, stderr = _run_gcloud_command(build_cmd_gcloud, dry_run=dry_run)
+        if not success:
+            logger.error(f"Google Cloud Build failed. Stdout: {stdout}, Stderr: {stderr}")
+            return False
+    except Exception as e:
+        logger.error(f"An error occurred during Dockerfile creation or gcloud command preparation: {e}")
+        return False
+    finally:
+        if dockerfile_path_in_context.exists():
+            os.remove(dockerfile_path_in_context)
+            logger.info(f"Temporary Dockerfile {dockerfile_path_in_context} removed.")
+    if success:
+        logger.info(f"Successfully built and pushed image {image_name_tag}")
+    else:
+        logger.error(f"Failed to build and push image {image_name_tag}")
+    return success
+def deploy_to_cloud_run(
+    service_name: str,
+    image_name_tag: str,
+    gcp_project_id: str,
+    gcp_region: str,
+    allow_unauthenticated: bool = True,  # For --auth api-key, the service itself is open, auth is app-level
+    env_vars: Optional[Dict[str, str]] = None,
+    secrets_to_mount: Optional[Dict[str, str]] = None,
+    service_port: int = 8080,
+    dry_run: bool = False,
+) -> Optional[str]:
+    """
+    Deploys a container image to Google Cloud Run.
+    Args:
+        service_name: Name for the Cloud Run service.
+        image_name_tag: Full name of the Docker image to deploy (e.g., "gcr.io/project/image:tag").
+        gcp_project_id: GCP Project ID.
+        gcp_region: GCP Region for the service.
+        allow_unauthenticated: Whether to allow unauthenticated invocations (publicly accessible).
+        env_vars: Environment variables to set for the service.
+        secrets_to_mount: Secrets from GCP Secret Manager to mount as environment variables.
+        service_port: Port the container exposes.
+        dry_run: If True, prints commands instead of executing them.
+    Returns:
+        The URL of the deployed service if successful, else None.
+    """
+    if not gcp_project_id:
+        logger.error("GCP Project ID is required for deploying to Cloud Run.")
+        return None
+    if not gcp_region:
+        logger.error("GCP Region is required for deploying to Cloud Run.")
+        return None
+    try:
+        logger.info(
+            f"Deploying image {image_name_tag} to Cloud Run service {service_name} in {gcp_region} (Project: {gcp_project_id})"
+        )
+        deploy_cmd_list = [
+            "run",
+            "deploy",
+            service_name,
+            "--image",
+            image_name_tag,
+            "--region",
+            gcp_region,
+            "--project",
+            gcp_project_id,
+            "--port",
+            str(service_port),
+            # "--platform", "managed",
+        ]
+        if allow_unauthenticated:
+            deploy_cmd_list.append("--allow-unauthenticated")
+        else:
+            # For IAM based auth, would be --no-allow-unauthenticated and then set IAM policy
+            deploy_cmd_list.append("--no-allow-unauthenticated")
+            logger.info("Note: --no-allow-unauthenticated set. Further IAM configuration might be needed.")
+        if env_vars:
+            env_vars_str = ",".join([f"{k}={v}" for k, v in env_vars.items()])
+            deploy_cmd_list.extend(["--set-env-vars", env_vars_str])
+        if secrets_to_mount:
+            # Format: ENV_VAR_NAME=secret_name:version,...
+            # Secret name here is just the short ID, not the full path.
+            # gcloud will resolve it within the project.
+            # Example: MY_API_KEY=my-api-key-secret:latest
+            secrets_str_list = []
+            for env_var_name, secret_manager_full_id in secrets_to_mount.items():
+                # Parse projects/PROJECT_ID/secrets/SECRET_ID/versions/VERSION
+                parts = secret_manager_full_id.split("/")
+                if len(parts) == 6 and parts[0] == "projects" and parts[2] == "secrets" and parts[4] == "versions":
+                    secret_id = parts[3]
+                    secret_version = parts[5]
+                    secrets_str_list.append(f"{env_var_name}={secret_id}:{secret_version}")
+                else:
+                    logger.warning(
+                        f"Invalid secret manager full ID format: {secret_manager_full_id}. Skipping secret mount for {env_var_name}."
+                    )
+            if secrets_str_list:
+                deploy_cmd_list.extend(["--update-secrets", ",".join(secrets_str_list)])
+        success, stdout, stderr = _run_gcloud_command(deploy_cmd_list, dry_run=dry_run)
+        if success:
+            if dry_run:
+                service_url_placeholder = f"https://{service_name}-mock-url.a.run.app"
+                logger.info(
+                    f"Successfully deployed service {service_name} (dry run). URL (placeholder): {service_url_placeholder}"
+                )
+                return service_url_placeholder
+            # Get the service URL after successful deployment
+            get_url_cmd = [
+                "run",
+                "services",
+                "describe",
+                service_name,
+                "--region",
+                gcp_region,
+                "--project",
+                gcp_project_id,
+                "--format",
+                "value(status.url)",
+            ]
+            url_success, url_stdout, url_stderr = _run_gcloud_command(
+                get_url_cmd, dry_run=False
+            )  # Always try to get URL if deploy was not dry_run
+            if url_success and url_stdout:
+                service_url = url_stdout.strip()
+                if not service_url.startswith("https://"):
+                    logger.error(f"Service URL is not valid (must be HTTPS): {service_url}")
+                    return None
+                logger.info(f"Successfully deployed service {service_name}. URL: {service_url}")
+                return service_url
+            else:
+                logger.error(f"Deployed service {service_name}, but failed to retrieve its URL. Stderr: {url_stderr}")
+                return None  # Consider deployment failed if URL cannot be retrieved
+        else:
+            logger.error(f"Failed to deploy service {service_name}. Stderr: {stderr}")
+            return None
+    except Exception as e:
+        logger.error(f"An error occurred during Cloud Run deployment for service {service_name}: {e}")
+        return None
+def ensure_artifact_registry_repo_exists(project_id: str, region: str, repo_name: str, dry_run: bool = False) -> bool:
+    """
+    Checks if an Artifact Registry repository exists, and creates it if it doesn't.
+    """
+    logger.info(
+        f"Ensuring Artifact Registry repository '{repo_name}' exists in project '{project_id}', region '{region}'."
+    )
+    try:
+        describe_cmd = [
+            "artifacts",
+            "repositories",
+            "describe",
+            repo_name,
+            "--project",
+            project_id,
+            "--location",
+            region,
+        ]
+        # Don't use dry_run for describe, as we need to know if it exists
+        success, stdout, stderr = _run_gcloud_command(describe_cmd, dry_run=False)
+        if success:
+            logger.info(f"Artifact Registry repository '{repo_name}' already exists.")
+            return True
+        # If describe failed, check if it's because the repo was not found
+        # gcloud typically returns non-zero exit code and an error message to stderr for "not found"
+        if "NOT_FOUND" in stderr.upper() or "failed to find" in stderr.lower():  # Heuristic check
+            logger.info(f"Artifact Registry repository '{repo_name}' not found. Attempting to create it.")
+            create_cmd = [
+                "artifacts",
+                "repositories",
+                "create",
+                repo_name,
+                "--project",
+                project_id,
+                "--repository-format",
+                "docker",
+                "--location",
+                region,
+                "--description",
+                "Repository for reward-kit evaluators (auto-created by reward-kit CLI)",
+            ]
+            create_success, create_stdout, create_stderr = _run_gcloud_command(create_cmd, dry_run=dry_run)
+            if create_success:
+                logger.info(f"Successfully created Artifact Registry repository '{repo_name}'.")
+                return True
+            else:
+                logger.error(f"Failed to create Artifact Registry repository '{repo_name}'. Stderr: {create_stderr}")
+                return False
+        else:
+            # Describe failed for a reason other than "not found"
+            logger.error(f"Error describing Artifact Registry repository '{repo_name}'. Stderr: {stderr}")
+            return False
+    except Exception as e:
+        logger.error(f"An unexpected error occurred while ensuring Artifact Registry repository '{repo_name}': {e}")
+        return False
+def ensure_gcp_secret(
+    project_id: str,
+    secret_id: str,
+    secret_value: str,
+    region: Optional[str] = None,  # For replication policy if needed, or if secrets are regional
+    labels: Optional[Dict[str, str]] = None,
+    dry_run: bool = False,
+) -> Optional[str]:
+    """
+    Ensures a secret exists in GCP Secret Manager and adds the given value as a new version.
+    Returns the full resource name of the new secret version if successful, else None.
+    e.g., projects/PROJECT_ID/secrets/SECRET_ID/versions/VERSION
+    """
+    if not project_id:
+        logger.error("GCP Project ID is required to manage secrets.")
+        return None
+    if not secret_id:
+        logger.error("Secret ID is required to manage secrets.")
+        return None
+    if secret_value is None:
+        logger.error("Secret value is required to create or update a secret.")
+        return None
+    logger.info(f"Ensuring secret '{secret_id}' in project '{project_id}'.")
+    describe_cmd = ["secrets", "describe", secret_id, "--project", project_id]
+    secret_exists, _, describe_stderr = _run_gcloud_command(describe_cmd, dry_run=False)
+    if not secret_exists:
+        if "NOT_FOUND" in describe_stderr.upper() or "failed to find" in describe_stderr.lower():
+            logger.info(f"Secret '{secret_id}' not found. Attempting to create it.")
+            create_cmd_list = [
+                "secrets",
+                "create",
+                secret_id,
+                "--project",
+                project_id,
+            ]
+            # Replication policy: automatic is default (global).
+            # If a region is provided, could set --replication-policy user-managed --locations <region>
+            # For simplicity, using automatic for now.
+            # TODO: Consider if region-specific replication is needed.
+            # if region:
+            #     create_cmd_list.extend(["--replication-policy", "user-managed", "--locations", region])
+            # else:
+            create_cmd_list.extend(["--replication-policy", "automatic"])
+            if labels:
+                labels_str = ",".join([f"{k}={v}" for k, v in labels.items()])
+                create_cmd_list.extend(["--labels", labels_str])
+            create_success, _, create_stderr = _run_gcloud_command(create_cmd_list, dry_run=dry_run)
+            if not create_success:
+                logger.error(f"Failed to create secret '{secret_id}'. Stderr: {create_stderr}")
+                return None
+            logger.info(f"Successfully created secret '{secret_id}'.")
+            secret_exists = True  # Now it exists
+        else:
+            # Describe failed for another reason
+            logger.error(f"Error describing secret '{secret_id}'. Stderr: {describe_stderr}")
+            return None
+    # Add a new version to the secret
+    # Create a temporary file for the secret value
+    try:
+        with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_secret_file:
+            tmp_secret_file.write(secret_value)
+            tmp_secret_file_path = tmp_secret_file.name
+        add_version_cmd = [
+            "secrets",
+            "versions",
+            "add",
+            secret_id,
+            "--project",
+            project_id,
+            "--data-file",
+            tmp_secret_file_path,
+        ]
+        version_success, version_stdout, version_stderr = _run_gcloud_command(add_version_cmd, dry_run=dry_run)
+        if tmp_secret_file_path and os.path.exists(tmp_secret_file_path):
+            os.remove(tmp_secret_file_path)
+        if not version_success:
+            logger.error(f"Failed to add version to secret '{secret_id}'. Stderr: {version_stderr}")
+            return None
+        # The stdout of 'versions add' usually contains the version name, but it's safer to describe.
+        # Let's parse the version from the output if available, or describe to get the latest.
+        # For simplicity, if dry_run, we can't get a real version.
+        if dry_run:
+            logger.info(f"Successfully added version to secret '{secret_id}' (dry run).")
+            return f"projects/{project_id}/secrets/{secret_id}/versions/latest-dry-run"
+        # Get the full name of the newly added version
+        # 'gcloud secrets versions describe latest --secret=SECRET_ID --format="value(name)"' gets the name
+        describe_version_cmd = [
+            "secrets",
+            "versions",
+            "describe",
+            "latest",
+            "--secret",
+            secret_id,
+            "--project",
+            project_id,
+            "--format",
+            "value(name)",
+        ]
+        desc_ver_success, desc_ver_stdout, desc_ver_stderr = _run_gcloud_command(describe_version_cmd, dry_run=False)
+        if desc_ver_success and desc_ver_stdout:
+            secret_version_name = desc_ver_stdout.strip()
+            logger.info(f"Successfully added version to secret '{secret_id}'. Version name: {secret_version_name}")
+            return secret_version_name
+        else:
+            logger.error(
+                f"Added version to secret '{secret_id}', but failed to retrieve new version name. Stderr: {desc_ver_stderr}"
+            )
+            return None
+    except Exception as e:
+        logger.error(f"An error occurred while adding secret version: {e}")
+        if "tmp_secret_file_path" in locals() and os.path.exists(tmp_secret_file_path):
+            os.remove(tmp_secret_file_path)
+        return None
+if __name__ == "__main__":
+    # Basic setup for logger to see output when run directly
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
+    logger.info("--- GCP Tools Module (Placeholder Examples) ---")
+    # Note: Dockerfile content would come from packaging.py
+    dummy_dockerfile = 'FROM python:3.10-slim\nCMD ["echo", "hello"]'
+    img_name = "gcr.io/my-test-project/my-test-reward-eval:latest"  # Old GCR name, update for AR
+    # Example AR image name: us-central1-docker.pkg.dev/my-test-project/my-ar-repo/my-test-reward-eval:latest
+    ar_img_name = "us-central1-docker.pkg.dev/my-test-project/reward-kit-images/my-test-reward-eval:latest"
+    print(f"\n1. Simulating build and push for {ar_img_name} (dry_run=True)")
+    build_and_push_docker_image(
+        image_name_tag=ar_img_name,
+        dockerfile_content=dummy_dockerfile,
+        build_context_dir=".",  # Assumes CWD is build context
+        gcp_project_id="my-test-project",
+        dry_run=True,
+    )
+    print(f"\n2. Simulating deploy to Cloud Run (dry_run=True)")
+    deploy_to_cloud_run(
+        service_name="my-reward-service",
+        image_name_tag=ar_img_name,  # Use AR image name
+        gcp_project_id="my-test-project",
+        gcp_region="us-central1",
+        allow_unauthenticated=True,
+        env_vars={"MY_ENV_VAR": "my_value"},
+        secrets_to_mount={"API_KEY_SECRET": "projects/my-test-project/secrets/my-api-key/versions/latest"},
+        dry_run=True,
+    )
+    print(f"\n3. Simulating ensure_artifact_registry_repo_exists (dry_run=True)")
+    ensure_artifact_registry_repo_exists(
+        project_id="my-test-project",
+        region="us-central1",
+        repo_name="reward-kit-evaluators",
+        dry_run=True,
+    )
+    print(f"\n4. Simulating ensure_gcp_secret (dry_run=True)")
+    ensure_gcp_secret(
+        project_id="my-test-project",
+        secret_id="my-test-api-key-secret",
+        secret_value="supersecretvalue123",
+        labels={"managed-by": "reward-kit-test"},
+        dry_run=True,
+    )
+    print("\nNote: These are placeholder executions. Real implementation requires gcloud CLI and Docker.")

eval_protocol/generation/cache.py ADDED Viewed

@@ -0,0 +1,141 @@
+"""
+Caching for model-generated responses.
+"""
+import hashlib
+import json
+import logging
+import os
+from typing import Any, Dict, Optional
+from omegaconf import DictConfig
+logger = logging.getLogger(__name__)
+class ResponseCache:
+    def __init__(self, cache_config: DictConfig):
+        self.cache_config = cache_config
+        self.cache_dir = cache_config.get("cache_dir", ".eval_protocol_cache/generated_responses")
+        # Resolve cache_dir relative to CWD if not an absolute path.
+        # Consider making this configurable to be relative to project root or Hydra's original CWD.
+        if not os.path.isabs(self.cache_dir):
+            self.cache_dir = os.path.join(os.getcwd(), self.cache_dir)
+        try:
+            os.makedirs(self.cache_dir, exist_ok=True)
+            logger.info(f"Response cache directory: {self.cache_dir}")
+        except OSError as e:
+            logger.error(f"Failed to create cache directory {self.cache_dir}: {e}. Caching will be disabled.")
+            self.cache_dir = None  # Disable caching if dir creation fails
+    def _generate_key(
+        self,
+        sample_id: str,
+        system_prompt: Optional[str],
+        user_query: str,  # Or full messages list for more robustness
+        model_name: str,
+        temperature: float,
+        top_p: float,
+        top_k: int,
+        min_p: float,
+        max_tokens: int,
+        reasoning_effort: Optional[str],  # Added reasoning_effort
+    ) -> str:
+        """Generates a cache key."""
+        key_material = f"{sample_id}-{system_prompt}-{user_query}-{model_name}-{temperature}-{top_p}-{top_k}-{min_p}-{max_tokens}-{reasoning_effort}"
+        return hashlib.md5(key_material.encode()).hexdigest()
+    def get(
+        self,
+        sample_id: str,
+        system_prompt: Optional[str],
+        user_query: str,
+        model_name: str,
+        temperature: float,
+        top_p: float,
+        top_k: int,
+        min_p: float,
+        max_tokens: int,
+        reasoning_effort: Optional[str],  # Added reasoning_effort
+    ) -> Optional[str]:
+        """Retrieves an item from the cache. Returns None if not found or error."""
+        if not self.cache_dir:
+            return None
+        if temperature != 0.0:  # Only cache deterministic (temp=0) generations by default
+            return None
+        cache_key = self._generate_key(
+            sample_id,
+            system_prompt,
+            user_query,
+            model_name,
+            temperature,
+            top_p,
+            top_k,
+            min_p,
+            max_tokens,
+            reasoning_effort,
+        )
+        cache_file_path = os.path.join(self.cache_dir, f"{cache_key}.json")
+        if os.path.exists(cache_file_path):
+            try:
+                with open(cache_file_path, "r", encoding="utf-8") as f:
+                    cached_data = json.load(f)
+                    response = cached_data.get("assistant_response")
+                    if response is not None:
+                        logger.debug(f"Cache hit for key {cache_key} (sample {sample_id})")
+                        return response
+                    else:
+                        logger.warning(f"Cache file {cache_file_path} for key {cache_key} is malformed.")
+            except json.JSONDecodeError:
+                logger.warning(f"Error decoding JSON from cache file {cache_file_path} for key {cache_key}.")
+            except Exception as e:
+                logger.warning(f"Error reading from cache file {cache_file_path}: {e}")
+        else:
+            logger.debug(f"Cache miss for key {cache_key} (sample {sample_id})")
+        return None
+    def put(
+        self,
+        sample_id: str,
+        system_prompt: Optional[str],
+        user_query: str,
+        model_name: str,
+        temperature: float,
+        response: str,
+        top_p: float,
+        top_k: int,
+        min_p: float,
+        max_tokens: int,
+        reasoning_effort: Optional[str],  # Added reasoning_effort
+    ) -> None:
+        """Stores an item in the cache."""
+        if not self.cache_dir:
+            return
+        if temperature != 0.0:  # Only cache deterministic (temp=0) generations
+            return
+        cache_key = self._generate_key(
+            sample_id,
+            system_prompt,
+            user_query,
+            model_name,
+            temperature,
+            top_p,
+            top_k,
+            min_p,
+            max_tokens,
+            reasoning_effort,
+        )
+        cache_file_path = os.path.join(self.cache_dir, f"{cache_key}.json")
+        try:
+            with open(cache_file_path, "w", encoding="utf-8") as f:
+                json.dump({"assistant_response": response}, f)
+            logger.debug(f"Cached response for key {cache_key} (sample {sample_id})")
+        except Exception as e:
+            logger.warning(f"Error writing to cache file {cache_file_path}: {e}")

eval_protocol/generation/clients/base.py ADDED Viewed

@@ -0,0 +1,67 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+import aiohttp
+from omegaconf import DictConfig
+from pydantic import BaseModel, Field
+class ToolCallFunction(BaseModel):
+    # The function part of a tool call: a name plus its serialized arguments.
+    name: str
+    arguments: str  # Should be a JSON string
+class ToolCall(BaseModel):
+    # One tool invocation: an id, a type tag, and the function payload.
+    id: str
+    type: str = "function"  # OpenAI default is "function"
+    function: ToolCallFunction
+class GenerationResult(BaseModel):
+    # Result of a generation call: text content and/or a list of tool calls.
+    # Both fields are optional and default to None; nothing enforces that at least one is set.
+    content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = None
+    # Add a validator to ensure that not both content and tool_calls are None,
+    # and not both are set, if that's a desired constraint.
+    # For now, allowing flexibility.
+class ModelClient(ABC):
+    """Abstract base class for model clients."""
+    def __init__(self, client_config: DictConfig, api_key: Optional[str] = None):
+        self.model_name = client_config.get("model_name", "unknown")
+        self.temperature = client_config.get("temperature", 0.0)
+        self.top_p = client_config.get("top_p", 1.0)
+        self.top_k = client_config.get("top_k", None)  # Optional, None if not used
+        self.min_p = client_config.get("min_p", None)  # Optional, None if not used
+        self.max_tokens = client_config.get("max_tokens", 1024)
+        self.reasoning_effort = client_config.get("reasoning_effort", None)  # Optional
+        self.api_key = api_key
+        self.client_config = client_config  # Store the raw config for other params
+    @abstractmethod
+    async def generate(
+        self,
+        messages: List[Dict[str, str]],
+        session: aiohttp.ClientSession,
+        tools: Optional[List[Dict[str, Any]]] = None,  # For OpenAI-style tool definitions
+        **kwargs: Any,  # For additional model-specific parameters
+    ) -> GenerationResult:
+        """
+        Generates a response from the model.
+        Args:
+            messages: A list of messages comprising the conversation history.
+            session: An aiohttp.ClientSession for making HTTP requests.
+            tools: Optional list of tool definitions to provide to the model.
+            **kwargs: Additional keyword arguments for model-specific parameters.
+        Returns:
+            A GenerationResult object containing either text content or tool calls.
+        """
+        pass
+    @property
+    def name(self) -> str:
+        return self.model_name