eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,484 @@
1
+ import logging
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def _run_gcloud_command(command: List[str], dry_run: bool = False) -> Tuple[bool, str, str]:
    """
    Invoke the ``gcloud`` CLI with the given arguments via ``subprocess.run``.

    Args:
        command: Arguments to pass after the ``gcloud`` executable name.
        dry_run: If True, the command line is only logged and nothing is executed.

    Returns:
        A ``(success, stdout, stderr)`` tuple. ``success`` is True when the process
        exits with code 0 (or in dry-run mode); stdout/stderr are whitespace-stripped.
        Failures to launch the process are reported as ``(False, "", <message>)``
        rather than raised.
    """
    argv = ["gcloud"] + command
    printable = " ".join(argv)
    logger.info(f"Executing: {printable}")

    if dry_run:
        logger.info(f"Dry run mode. Command not executed: {printable}")
        return True, f"Dry run: {printable}", ""

    try:
        # check=False: non-zero exit codes are translated into the returned tuple below.
        completed = subprocess.run(argv, capture_output=True, text=True, check=False)

        if completed.returncode != 0:
            logger.error(f"Command failed. Return code: {completed.returncode}")
            logger.error(f"Stdout:\n{completed.stdout}")
            logger.error(f"Stderr:\n{completed.stderr}")
            return False, completed.stdout.strip(), completed.stderr.strip()

        # gcloud sometimes prints informational messages to stderr even on success.
        if completed.stderr:
            logger.info(f"Command successful with stderr output:\n{completed.stderr}")
        return True, completed.stdout.strip(), completed.stderr.strip()
    except FileNotFoundError:
        logger.error("gcloud command not found. Is it installed and in PATH?")
        return False, "", "gcloud command not found."
    except Exception as e:
        logger.error(f"An unexpected error occurred while running gcloud command: {e}")
        return False, "", str(e)
46
+
47
+
48
def build_and_push_docker_image(
    image_name_tag: str,  # e.g., gcr.io/my-project/my-reward-func:latest
    dockerfile_content: str,
    build_context_dir: str,  # Directory where Dockerfile and user code are (usually CWD)
    gcp_project_id: Optional[str] = None,  # Required: the build runs on Google Cloud Build
    dry_run: bool = False,
) -> bool:
    """
    Builds and pushes a Docker image with ``gcloud builds submit`` (Google Cloud Build).

    The Dockerfile content is written to ``<build_context_dir>/Dockerfile`` for the
    duration of the build. If a Dockerfile already exists at that path, its bytes are
    captured first and written back afterwards, so a user's own Dockerfile is never
    lost; otherwise the temporary Dockerfile is removed when the build finishes.

    Args:
        image_name_tag: Full name and tag for the image (e.g., "gcr.io/project-id/image-name:tag").
        dockerfile_content: String content of the Dockerfile.
        build_context_dir: The build context directory uploaded to Cloud Build.
        gcp_project_id: GCP Project ID; the function returns False when it is missing.
        dry_run: If True, the gcloud command is logged instead of executed. The
            temporary Dockerfile is still written and cleaned up.

    Returns:
        True if the build command succeeded (or was a dry run), False otherwise.
    """
    logger.info(f"Attempting to build and push Docker image using Google Cloud Build: {image_name_tag}")

    if not gcp_project_id:
        logger.error("GCP Project ID is required for Google Cloud Build.")
        return False

    # Cloud Build looks for a file literally named "Dockerfile" in the uploaded source.
    dockerfile_path_in_context = Path(build_context_dir) / "Dockerfile"
    previous_dockerfile: Optional[bytes] = None
    dockerfile_touched = False

    try:
        # Snapshot a pre-existing Dockerfile so it can be restored after the build.
        if dockerfile_path_in_context.is_file():
            previous_dockerfile = dockerfile_path_in_context.read_bytes()

        dockerfile_touched = True
        with open(dockerfile_path_in_context, "w", encoding="utf-8") as f:
            f.write(dockerfile_content)
        logger.info(f"Dockerfile created at: {dockerfile_path_in_context}")

        build_cmd_gcloud = [
            "builds",
            "submit",
            build_context_dir,  # Source code to upload (can be "." for CWD)
            "--tag",
            image_name_tag,
            "--project",
            gcp_project_id,
        ]

        success, stdout, stderr = _run_gcloud_command(build_cmd_gcloud, dry_run=dry_run)

        if not success:
            logger.error(f"Google Cloud Build failed. Stdout: {stdout}, Stderr: {stderr}")
            return False

    except Exception as e:
        logger.error(f"An error occurred during Dockerfile creation or gcloud command preparation: {e}")
        return False
    finally:
        # Only clean up if this call may have modified the file.
        if dockerfile_touched:
            try:
                if previous_dockerfile is not None:
                    dockerfile_path_in_context.write_bytes(previous_dockerfile)
                    logger.info(f"Restored pre-existing Dockerfile at {dockerfile_path_in_context}.")
                elif dockerfile_path_in_context.exists():
                    os.remove(dockerfile_path_in_context)
                    logger.info(f"Temporary Dockerfile {dockerfile_path_in_context} removed.")
            except OSError as cleanup_error:
                logger.warning(f"Could not clean up Dockerfile {dockerfile_path_in_context}: {cleanup_error}")

    # Every failure path has already returned False above.
    logger.info(f"Successfully built and pushed image {image_name_tag}")
    return True
114
+
115
+
116
def _join_gcloud_key_values(items: List[str]) -> str:
    """
    Join ``KEY=VALUE`` items into one argument for a gcloud dict-style flag.

    Items are comma-joined when no item contains a comma. Otherwise gcloud's
    alternate-delimiter syntax (``^DELIM^item1DELIMitem2``) is used with the first
    candidate delimiter that appears in none of the items, so that commas inside
    values are not treated as pair separators.
    """
    if not any("," in item for item in items):
        return ",".join(items)

    for delimiter in ("@", "#", "|", ";", ":", "~", "%"):
        if not any(delimiter in item for item in items):
            return f"^{delimiter}^{delimiter.join(items)}"

    # No candidate delimiter is free; fall back to the plain comma join.
    logger.warning("No safe delimiter found for gcloud key/value flag; values containing commas may be split.")
    return ",".join(items)


def deploy_to_cloud_run(
    service_name: str,
    image_name_tag: str,
    gcp_project_id: str,
    gcp_region: str,
    allow_unauthenticated: bool = True,  # For --auth api-key, the service itself is open, auth is app-level
    env_vars: Optional[Dict[str, str]] = None,
    secrets_to_mount: Optional[Dict[str, str]] = None,
    service_port: int = 8080,
    dry_run: bool = False,
) -> Optional[str]:
    """
    Deploys a container image to Google Cloud Run via ``gcloud run deploy``.

    Args:
        service_name: Name for the Cloud Run service.
        image_name_tag: Full name of the Docker image to deploy (e.g., "gcr.io/project/image:tag").
        gcp_project_id: GCP Project ID.
        gcp_region: GCP Region for the service.
        allow_unauthenticated: Whether to allow unauthenticated invocations (publicly accessible).
        env_vars: Environment variables to set for the service. Values may contain commas.
        secrets_to_mount: Mapping of environment variable name to a Secret Manager version
            resource name of the form ``projects/P/secrets/S/versions/V``. Entries that
            do not match that form are skipped with a warning.
        service_port: Port the container exposes.
        dry_run: If True, logs the deploy command instead of executing it and returns
            a placeholder URL.

    Returns:
        The HTTPS URL of the deployed service if successful, else None. None is also
        returned when the deployment succeeded but its URL could not be retrieved.
    """

    if not gcp_project_id:
        logger.error("GCP Project ID is required for deploying to Cloud Run.")
        return None
    if not gcp_region:
        logger.error("GCP Region is required for deploying to Cloud Run.")
        return None

    try:
        logger.info(
            f"Deploying image {image_name_tag} to Cloud Run service {service_name} in {gcp_region} (Project: {gcp_project_id})"
        )

        deploy_cmd_list = [
            "run",
            "deploy",
            service_name,
            "--image",
            image_name_tag,
            "--region",
            gcp_region,
            "--project",
            gcp_project_id,
            "--port",
            str(service_port),
        ]

        if allow_unauthenticated:
            deploy_cmd_list.append("--allow-unauthenticated")
        else:
            # For IAM based auth, would be --no-allow-unauthenticated and then set IAM policy
            deploy_cmd_list.append("--no-allow-unauthenticated")
            logger.info("Note: --no-allow-unauthenticated set. Further IAM configuration might be needed.")

        if env_vars:
            # A plain comma join would split any value that itself contains a comma.
            env_vars_str = _join_gcloud_key_values([f"{k}={v}" for k, v in env_vars.items()])
            deploy_cmd_list.extend(["--set-env-vars", env_vars_str])

        if secrets_to_mount:
            # Flag format: ENV_VAR_NAME=secret_name:version,... where secret_name is the
            # short secret ID (gcloud resolves it within the project).
            # Example: MY_API_KEY=my-api-key-secret:latest
            secrets_str_list = []
            for env_var_name, secret_manager_full_id in secrets_to_mount.items():
                # Parse projects/PROJECT_ID/secrets/SECRET_ID/versions/VERSION
                parts = secret_manager_full_id.split("/")
                if len(parts) == 6 and parts[0] == "projects" and parts[2] == "secrets" and parts[4] == "versions":
                    secret_id = parts[3]
                    secret_version = parts[5]
                    secrets_str_list.append(f"{env_var_name}={secret_id}:{secret_version}")
                else:
                    logger.warning(
                        f"Invalid secret manager full ID format: {secret_manager_full_id}. Skipping secret mount for {env_var_name}."
                    )

            if secrets_str_list:
                deploy_cmd_list.extend(["--update-secrets", ",".join(secrets_str_list)])

        success, stdout, stderr = _run_gcloud_command(deploy_cmd_list, dry_run=dry_run)

        if not success:
            logger.error(f"Failed to deploy service {service_name}. Stderr: {stderr}")
            return None

        if dry_run:
            service_url_placeholder = f"https://{service_name}-mock-url.a.run.app"
            logger.info(
                f"Successfully deployed service {service_name} (dry run). URL (placeholder): {service_url_placeholder}"
            )
            return service_url_placeholder

        # Get the service URL after successful deployment
        get_url_cmd = [
            "run",
            "services",
            "describe",
            service_name,
            "--region",
            gcp_region,
            "--project",
            gcp_project_id,
            "--format",
            "value(status.url)",
        ]
        # The deploy was real at this point, so the lookup is always executed for real.
        url_success, url_stdout, url_stderr = _run_gcloud_command(get_url_cmd, dry_run=False)

        if url_success and url_stdout:
            service_url = url_stdout.strip()
            if not service_url.startswith("https://"):
                logger.error(f"Service URL is not valid (must be HTTPS): {service_url}")
                return None
            logger.info(f"Successfully deployed service {service_name}. URL: {service_url}")
            return service_url

        logger.error(f"Deployed service {service_name}, but failed to retrieve its URL. Stderr: {url_stderr}")
        return None  # Consider deployment failed if URL cannot be retrieved
    except Exception as e:
        logger.error(f"An error occurred during Cloud Run deployment for service {service_name}: {e}")
        return None
247
+
248
+
249
def ensure_artifact_registry_repo_exists(project_id: str, region: str, repo_name: str, dry_run: bool = False) -> bool:
    """
    Make sure a Docker-format Artifact Registry repository exists, creating it if missing.

    The existence check (``gcloud artifacts repositories describe``) is always executed
    for real, even when ``dry_run`` is True; only the create command honours ``dry_run``.

    Returns:
        True if the repository already exists or was created (or the create was a dry
        run); False if the describe failed for a reason other than "not found", the
        create failed, or an unexpected exception occurred.
    """
    logger.info(
        f"Ensuring Artifact Registry repository '{repo_name}' exists in project '{project_id}', region '{region}'."
    )

    try:
        lookup_args = [
            "artifacts",
            "repositories",
            "describe",
            repo_name,
            "--project",
            project_id,
            "--location",
            region,
        ]
        # The lookup is never a dry run: its real result decides whether to create.
        found, _, lookup_err = _run_gcloud_command(lookup_args, dry_run=False)
        if found:
            logger.info(f"Artifact Registry repository '{repo_name}' already exists.")
            return True

        # Heuristic: treat these stderr markers as "repository does not exist".
        looks_missing = "NOT_FOUND" in lookup_err.upper() or "failed to find" in lookup_err.lower()
        if not looks_missing:
            # Describe failed for a reason other than "not found"
            logger.error(f"Error describing Artifact Registry repository '{repo_name}'. Stderr: {lookup_err}")
            return False

        logger.info(f"Artifact Registry repository '{repo_name}' not found. Attempting to create it.")
        creation_args = [
            "artifacts",
            "repositories",
            "create",
            repo_name,
            "--project",
            project_id,
            "--repository-format",
            "docker",
            "--location",
            region,
            "--description",
            "Repository for reward-kit evaluators (auto-created by reward-kit CLI)",
        ]
        created, _, creation_err = _run_gcloud_command(creation_args, dry_run=dry_run)
        if not created:
            logger.error(f"Failed to create Artifact Registry repository '{repo_name}'. Stderr: {creation_err}")
            return False

        logger.info(f"Successfully created Artifact Registry repository '{repo_name}'.")
        return True
    except Exception as e:
        logger.error(f"An unexpected error occurred while ensuring Artifact Registry repository '{repo_name}': {e}")
        return False
308
+
309
+
310
def ensure_gcp_secret(
    project_id: str,
    secret_id: str,
    secret_value: str,
    region: Optional[str] = None,  # Currently unused; reserved for region-specific replication
    labels: Optional[Dict[str, str]] = None,
    dry_run: bool = False,
) -> Optional[str]:
    """
    Ensures a secret exists in GCP Secret Manager and adds the given value as a new version.

    The secret is created with automatic replication when ``gcloud secrets describe``
    reports it as not found. The value is passed to gcloud through a temporary file
    (``--data-file``) that is always removed once the add-version command returns,
    including when writing the file or running the command fails.

    Args:
        project_id: GCP Project ID.
        secret_id: Short ID of the secret.
        secret_value: Payload for the new version; written to the temp file as UTF-8
            without newline translation.
        region: Accepted for interface compatibility; not used by the current implementation.
        labels: Labels applied only when the secret is newly created.
        dry_run: If True, the create/add commands are logged instead of executed. The
            existence check is always executed for real.

    Returns:
        The full resource name of the new secret version if successful
        (e.g., projects/PROJECT_ID/secrets/SECRET_ID/versions/VERSION), else None.
        In dry-run mode a placeholder name ending in ``latest-dry-run`` is returned.
    """
    if not project_id:
        logger.error("GCP Project ID is required to manage secrets.")
        return None
    if not secret_id:
        logger.error("Secret ID is required to manage secrets.")
        return None
    if secret_value is None:
        logger.error("Secret value is required to create or update a secret.")
        return None

    logger.info(f"Ensuring secret '{secret_id}' in project '{project_id}'.")
    describe_cmd = ["secrets", "describe", secret_id, "--project", project_id]
    secret_exists, _, describe_stderr = _run_gcloud_command(describe_cmd, dry_run=False)

    if not secret_exists:
        # Heuristic: treat these stderr markers as "secret does not exist".
        not_found = "NOT_FOUND" in describe_stderr.upper() or "failed to find" in describe_stderr.lower()
        if not not_found:
            # Describe failed for another reason
            logger.error(f"Error describing secret '{secret_id}'. Stderr: {describe_stderr}")
            return None

        logger.info(f"Secret '{secret_id}' not found. Attempting to create it.")
        # Replication policy: automatic (global) is used for simplicity.
        # TODO: Consider if region-specific replication is needed
        # (--replication-policy user-managed --locations <region>).
        create_cmd_list = [
            "secrets",
            "create",
            secret_id,
            "--project",
            project_id,
            "--replication-policy",
            "automatic",
        ]

        if labels:
            labels_str = ",".join([f"{k}={v}" for k, v in labels.items()])
            create_cmd_list.extend(["--labels", labels_str])

        create_success, _, create_stderr = _run_gcloud_command(create_cmd_list, dry_run=dry_run)
        if not create_success:
            logger.error(f"Failed to create secret '{secret_id}'. Stderr: {create_stderr}")
            return None
        logger.info(f"Successfully created secret '{secret_id}'.")

    # Add a new version to the secret, handing the value to gcloud through a temp file.
    try:
        tmp_secret_file_path: Optional[str] = None
        try:
            # newline="" and an explicit encoding keep the payload bytes independent of
            # the platform's newline translation and locale encoding.
            with tempfile.NamedTemporaryFile(
                mode="w", encoding="utf-8", newline="", delete=False
            ) as tmp_secret_file:
                # Record the path before writing so cleanup still happens if write() fails.
                tmp_secret_file_path = tmp_secret_file.name
                tmp_secret_file.write(secret_value)

            add_version_cmd = [
                "secrets",
                "versions",
                "add",
                secret_id,
                "--project",
                project_id,
                "--data-file",
                tmp_secret_file_path,
            ]
            version_success, _, version_stderr = _run_gcloud_command(add_version_cmd, dry_run=dry_run)
        finally:
            # The plaintext secret must not outlive the gcloud call.
            if tmp_secret_file_path and os.path.exists(tmp_secret_file_path):
                os.remove(tmp_secret_file_path)

        if not version_success:
            logger.error(f"Failed to add version to secret '{secret_id}'. Stderr: {version_stderr}")
            return None

        # In dry-run mode no real version exists, so return a placeholder name.
        if dry_run:
            logger.info(f"Successfully added version to secret '{secret_id}' (dry run).")
            return f"projects/{project_id}/secrets/{secret_id}/versions/latest-dry-run"

        # Resolve the full resource name of the version that "latest" now points to.
        describe_version_cmd = [
            "secrets",
            "versions",
            "describe",
            "latest",
            "--secret",
            secret_id,
            "--project",
            project_id,
            "--format",
            "value(name)",
        ]
        desc_ver_success, desc_ver_stdout, desc_ver_stderr = _run_gcloud_command(describe_version_cmd, dry_run=False)
        if desc_ver_success and desc_ver_stdout:
            secret_version_name = desc_ver_stdout.strip()
            logger.info(f"Successfully added version to secret '{secret_id}'. Version name: {secret_version_name}")
            return secret_version_name

        logger.error(
            f"Added version to secret '{secret_id}', but failed to retrieve new version name. Stderr: {desc_ver_stderr}"
        )
        return None

    except Exception as e:
        logger.error(f"An error occurred while adding secret version: {e}")
        return None
434
+
435
+
436
if __name__ == "__main__":
    # Manual smoke run: every call below is made with dry_run=True.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
    logger.info("--- GCP Tools Module (Placeholder Examples) ---")

    # In real use the Dockerfile content comes from packaging.py.
    dummy_dockerfile = 'FROM python:3.10-slim\nCMD ["echo", "hello"]'
    # Legacy GCR-style name, kept for reference only.
    img_name = "gcr.io/my-test-project/my-test-reward-eval:latest"
    # Artifact Registry-style name: <region>-docker.pkg.dev/<project>/<repo>/<image>:<tag>
    ar_img_name = "us-central1-docker.pkg.dev/my-test-project/reward-kit-images/my-test-reward-eval:latest"

    print(f"\n1. Simulating build and push for {ar_img_name} (dry_run=True)")
    build_and_push_docker_image(
        image_name_tag=ar_img_name,
        dockerfile_content=dummy_dockerfile,
        build_context_dir=".",  # current working directory is the build context
        gcp_project_id="my-test-project",
        dry_run=True,
    )

    print("\n2. Simulating deploy to Cloud Run (dry_run=True)")
    deploy_to_cloud_run(
        service_name="my-reward-service",
        image_name_tag=ar_img_name,
        gcp_project_id="my-test-project",
        gcp_region="us-central1",
        allow_unauthenticated=True,
        env_vars={"MY_ENV_VAR": "my_value"},
        secrets_to_mount={"API_KEY_SECRET": "projects/my-test-project/secrets/my-api-key/versions/latest"},
        dry_run=True,
    )

    print("\n3. Simulating ensure_artifact_registry_repo_exists (dry_run=True)")
    ensure_artifact_registry_repo_exists(
        project_id="my-test-project",
        region="us-central1",
        repo_name="reward-kit-evaluators",
        dry_run=True,
    )

    print("\n4. Simulating ensure_gcp_secret (dry_run=True)")
    ensure_gcp_secret(
        project_id="my-test-project",
        secret_id="my-test-api-key-secret",
        secret_value="supersecretvalue123",
        labels={"managed-by": "reward-kit-test"},
        dry_run=True,
    )
    print("\nNote: These are placeholder executions. Real implementation requires gcloud CLI and Docker.")
@@ -0,0 +1,141 @@
1
+ """
2
+ Caching for model-generated responses.
3
+ """
4
+
5
+ import hashlib
6
+ import json
7
+ import logging
8
+ import os
9
+ from typing import Any, Dict, Optional
10
+
11
+ from omegaconf import DictConfig
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class ResponseCache:
    """
    File-backed cache of model responses, one JSON file per cache key.

    Only generations requested with ``temperature == 0.0`` are read from or written
    to the cache. If the cache directory cannot be created, ``cache_dir`` is set to
    None and both ``get`` and ``put`` become no-ops.
    """

    def __init__(self, cache_config: "DictConfig"):
        self.cache_config = cache_config
        self.cache_dir = cache_config.get("cache_dir", ".eval_protocol_cache/generated_responses")
        # Relative paths are anchored at the current working directory.
        # Consider making this configurable to be relative to project root or Hydra's original CWD.
        if not os.path.isabs(self.cache_dir):
            self.cache_dir = os.path.join(os.getcwd(), self.cache_dir)

        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            logger.info(f"Response cache directory: {self.cache_dir}")
        except OSError as e:
            logger.error(f"Failed to create cache directory {self.cache_dir}: {e}. Caching will be disabled.")
            self.cache_dir = None  # a None directory switches caching off

    def _generate_key(
        self,
        sample_id: str,
        system_prompt: Optional[str],
        user_query: str,  # Or full messages list for more robustness
        model_name: str,
        temperature: float,
        top_p: float,
        top_k: int,
        min_p: float,
        max_tokens: int,
        reasoning_effort: Optional[str],
    ) -> str:
        """Return the MD5 hex digest of the dash-joined request parameters."""
        key_material = f"{sample_id}-{system_prompt}-{user_query}-{model_name}-{temperature}-{top_p}-{top_k}-{min_p}-{max_tokens}-{reasoning_effort}"
        digest = hashlib.md5(key_material.encode())
        return digest.hexdigest()

    def _cache_location(self, *key_parts: Any):
        """Return ``(cache_key, cache_file_path)`` for the given key parameters."""
        cache_key = self._generate_key(*key_parts)
        return cache_key, os.path.join(self.cache_dir, f"{cache_key}.json")

    def get(
        self,
        sample_id: str,
        system_prompt: Optional[str],
        user_query: str,
        model_name: str,
        temperature: float,
        top_p: float,
        top_k: int,
        min_p: float,
        max_tokens: int,
        reasoning_effort: Optional[str],
    ) -> Optional[str]:
        """
        Look up a cached response.

        Returns None when caching is disabled, the temperature is non-zero, the entry
        is missing, or the cache file cannot be read or lacks "assistant_response".
        """
        # Caching disabled, or a non-deterministic (temp != 0) request: never served from cache.
        if not self.cache_dir or temperature != 0.0:
            return None

        cache_key, cache_file_path = self._cache_location(
            sample_id,
            system_prompt,
            user_query,
            model_name,
            temperature,
            top_p,
            top_k,
            min_p,
            max_tokens,
            reasoning_effort,
        )

        if not os.path.exists(cache_file_path):
            logger.debug(f"Cache miss for key {cache_key} (sample {sample_id})")
            return None

        try:
            with open(cache_file_path, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            cached_response = payload.get("assistant_response")
            if cached_response is None:
                logger.warning(f"Cache file {cache_file_path} for key {cache_key} is malformed.")
            else:
                logger.debug(f"Cache hit for key {cache_key} (sample {sample_id})")
                return cached_response
        except json.JSONDecodeError:
            logger.warning(f"Error decoding JSON from cache file {cache_file_path} for key {cache_key}.")
        except Exception as e:
            logger.warning(f"Error reading from cache file {cache_file_path}: {e}")
        return None

    def put(
        self,
        sample_id: str,
        system_prompt: Optional[str],
        user_query: str,
        model_name: str,
        temperature: float,
        response: str,
        top_p: float,
        top_k: int,
        min_p: float,
        max_tokens: int,
        reasoning_effort: Optional[str],
    ) -> None:
        """
        Store a response as ``{"assistant_response": response}`` under its cache key.

        Does nothing when caching is disabled or the temperature is non-zero; write
        errors are logged as warnings and not raised.
        """
        # Caching disabled, or a non-deterministic (temp != 0) request: nothing is stored.
        if not self.cache_dir or temperature != 0.0:
            return

        cache_key, cache_file_path = self._cache_location(
            sample_id,
            system_prompt,
            user_query,
            model_name,
            temperature,
            top_p,
            top_k,
            min_p,
            max_tokens,
            reasoning_effort,
        )

        try:
            with open(cache_file_path, "w", encoding="utf-8") as fh:
                json.dump({"assistant_response": response}, fh)
            logger.debug(f"Cached response for key {cache_key} (sample {sample_id})")
        except Exception as e:
            logger.warning(f"Error writing to cache file {cache_file_path}: {e}")
@@ -0,0 +1,67 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import aiohttp
5
+ from omegaconf import DictConfig
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
class ToolCallFunction(BaseModel):
    # Function part of a tool call: which function is invoked and with what arguments.
    name: str
    # Expected to hold a JSON-encoded string; typed as plain str.
    arguments: str
12
+
13
+
14
class ToolCall(BaseModel):
    # One tool invocation requested by the model.
    id: str
    # Defaults to "function", the OpenAI default tool-call type.
    type: str = "function"
    function: ToolCallFunction
18
+
19
+
20
class GenerationResult(BaseModel):
    # Output of a single generation: text content and/or tool calls; both default to None.
    content: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = None

    # No validator is defined: the model does not enforce that at least one of
    # content / tool_calls is set, nor that they are mutually exclusive.
    # Add such a validator if that constraint becomes desirable.
27
+
28
+
29
class ModelClient(ABC):
    """Abstract base class for model clients; subclasses implement ``generate``."""

    def __init__(self, client_config: "DictConfig", api_key: Optional[str] = None):
        # Keep the raw config so subclasses can read additional parameters from it.
        self.client_config = client_config
        self.api_key = api_key

        setting = client_config.get
        self.model_name = setting("model_name", "unknown")
        self.temperature = setting("temperature", 0.0)
        self.top_p = setting("top_p", 1.0)
        # The three settings below are optional and default to None when absent.
        self.top_k = setting("top_k", None)
        self.min_p = setting("min_p", None)
        self.reasoning_effort = setting("reasoning_effort", None)
        self.max_tokens = setting("max_tokens", 1024)

    @abstractmethod
    async def generate(
        self,
        messages: List[Dict[str, str]],
        session: "aiohttp.ClientSession",
        tools: Optional[List[Dict[str, Any]]] = None,  # OpenAI-style tool definitions
        **kwargs: Any,  # additional model-specific parameters
    ) -> "GenerationResult":
        """
        Produce a model response for the given conversation.

        Args:
            messages: The conversation history as a list of message dicts.
            session: The aiohttp.ClientSession to use for HTTP requests.
            tools: Optional tool definitions to offer to the model.
            **kwargs: Extra model-specific keyword arguments.

        Returns:
            A GenerationResult carrying text content and/or tool calls.
        """

    @property
    def name(self) -> str:
        """The configured model name."""
        return self.model_name