eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,486 @@
1
+ """
2
+ CLI command for creating and deploying an evaluator,
3
+ or registering a pre-deployed remote evaluator.
4
+ """
5
+
6
+ import importlib # For dynamically importing modules
7
+ import json
8
+ import os # For os.path.join, os.makedirs, os.getcwd (already imported but good to be explicit if used extensively)
9
+ import secrets # For API key generation (already imported but good to be explicit)
10
+ import sys # For sys.executable
11
+ import time # For sleep
12
+ from pathlib import Path # For path operations
13
+ from typing import Any, Dict
14
+
15
+ import yaml # For saving config if save_config helper doesn't exist
16
+
17
+ # TODO: Consider moving subprocess_manager functions to a more central location if used by core CLI
18
try:
    from development.utils.subprocess_manager import (
        start_ngrok_and_get_url,  # Added ngrok function
        start_process,
        start_serveo_and_get_url,
        stop_process,
    )
except ImportError:
    # Fallback implementations when development module is not available
    import signal
    import socket
    import subprocess

    def start_process(command, log_path, env=None):
        """Start ``command`` with stdout/stderr redirected to ``log_path``.

        Args:
            command: Argument list handed to ``subprocess.Popen``.
            log_path: File that receives the child's combined output; it is
                truncated on every call.
            env: Optional environment mapping for the child process.

        Returns:
            The ``subprocess.Popen`` object, or ``None`` if the log file could
            not be opened or the process could not be spawned.
        """
        try:
            # The child inherits its own copy of the descriptor, so closing the
            # parent's handle when the ``with`` block exits is safe.
            with open(log_path, "w") as log_file:
                process = subprocess.Popen(command, stdout=log_file, stderr=subprocess.STDOUT, env=env)
            return process
        except Exception as e:
            print(f"Error starting process: {e}")
            return None

    def stop_process(pid):
        """Best-effort request (SIGTERM) for process ``pid`` to terminate.

        Never raises: a missing PID is ignored, a process that has already
        exited is treated as success, and any other failure is reported as a
        warning instead of being silently discarded.
        """
        if pid is None:
            return
        import os

        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # The process is already gone; nothing left to stop.
            pass
        except Exception as e:
            print(f"Warning: Failed to stop process {pid}: {e}")

    def start_serveo_and_get_url(local_port, log_path):
        """Fallback serveo tunnel - returns ``(None, None)`` to indicate unavailable."""
        print("Serveo tunneling not available - development module not found")
        return None, None

    def start_ngrok_and_get_url(local_port, log_path):
        """Fallback ngrok tunnel - returns ``(None, None)`` to indicate unavailable."""
        print("ngrok tunneling not available - development module not found")
        return None, None
59
+
60
+
61
+ from eval_protocol.auth import get_fireworks_account_id
62
+ from eval_protocol.config import GCPCloudRunConfig, RewardKitConfig
63
+ from eval_protocol.config import _config_file_path as global_loaded_config_path
64
+ from eval_protocol.config import get_config
65
+ from eval_protocol.evaluation import create_evaluation
66
+ from eval_protocol.gcp_tools import (
67
+ build_and_push_docker_image,
68
+ deploy_to_cloud_run,
69
+ ensure_artifact_registry_repo_exists,
70
+ ensure_gcp_secret,
71
+ )
72
+ from eval_protocol.packaging import generate_dockerfile_content
73
+ from eval_protocol.platform_api import ( # For catching errors from create_evaluation
74
+ PlatformAPIError,
75
+ create_or_update_fireworks_secret,
76
+ )
77
+
78
+ from .common import check_environment
79
+
80
+
81
def _establish_local_server_and_tunnel(args):
    """
    Start the local generic server and expose it through a public tunnel,
    trying Serveo first and falling back to ngrok.

    Reads ``args.function_ref``, ``args.id`` and ``args.local_port``. Server and
    tunnel logs are written under ``<cwd>/logs/reward-kit-local``.

    Returns: (public_url, tunnel_provider_name, local_server_pid, tunnel_process_pid)
    Returns (None, None, None, None) if --function-ref is missing or the local
    server fails to start or exits during its startup wait.
    Returns (None, None, local_server_pid, None) if both tunnels fail; the local
    server has already been asked to stop in that case.
    """
    if not args.function_ref:
        print("Error: --function-ref is required for local-serve target.")
        return None, None, None, None

    evaluator_id = args.id
    function_ref = args.function_ref
    local_server_port = args.local_port

    log_dir = os.path.join(os.getcwd(), "logs", "reward-kit-local")
    os.makedirs(log_dir, exist_ok=True)
    generic_server_log_path = os.path.join(log_dir, f"generic_server_{evaluator_id}.log")

    server_env = None  # Run local server without API key protection
    print(f"Note: Local server for '{evaluator_id}' will run without API key protection.")

    print(f"Starting local reward function server for '{function_ref}' on port {local_server_port}...")
    server_command = [
        sys.executable,
        "-m",
        "eval_protocol.generic_server",
        function_ref,
        "--port",
        str(local_server_port),
    ]

    local_server_process = start_process(server_command, generic_server_log_path, env=server_env)

    if not local_server_process or local_server_process.poll() is not None:
        print(f"Error: Failed to start local generic server. Check log: {generic_server_log_path}")
        return None, None, None, None  # No server, no tunnel

    local_server_pid = local_server_process.pid
    print(f"Local server started (PID: {local_server_pid}). Log: {generic_server_log_path}")
    print("Waiting for server to initialize...")
    time.sleep(5)

    # The server may have died while initializing (bad function ref, port
    # already in use, ...). Tunnelling to a dead port would only register an
    # unreachable URL, so re-check before going any further.
    if local_server_process.poll() is not None:
        print(f"Error: Local generic server exited during startup. Check log: {generic_server_log_path}")
        return None, None, None, None  # No server, no tunnel

    # Attempt Serveo first
    print(f"Attempting Serveo tunnel for local port {local_server_port}...")
    serveo_log_path = os.path.join(log_dir, f"serveo_{evaluator_id}.log")
    serveo_tunnel_process, serveo_url = start_serveo_and_get_url(local_server_port, serveo_log_path)

    if serveo_url and serveo_tunnel_process:
        print(f"Serveo tunnel established: {serveo_url} (PID: {serveo_tunnel_process.pid}). Log: {serveo_log_path}")
        return serveo_url, "serveo", local_server_pid, serveo_tunnel_process.pid

    print(f"Serveo tunnel failed. Check log: {serveo_log_path}")
    print("Attempting fallback to ngrok...")

    ngrok_log_path = os.path.join(log_dir, f"ngrok_{evaluator_id}.log")
    # Assuming ngrok authtoken is pre-configured by the user or via NGROK_AUTHTOKEN env var
    ngrok_tunnel_process, ngrok_url = start_ngrok_and_get_url(local_server_port, ngrok_log_path)

    if ngrok_url and ngrok_tunnel_process:
        print(f"ngrok tunnel established: {ngrok_url} (PID: {ngrok_tunnel_process.pid}). Log: {ngrok_log_path}")
        return ngrok_url, "ngrok", local_server_pid, ngrok_tunnel_process.pid

    print(f"ngrok tunnel also failed. Check log: {ngrok_log_path}")
    # Both failed, stop the local server we started
    stop_process(local_server_pid)
    return (
        None,
        None,
        local_server_pid,
        None,
    )  # URL, provider, server_pid, tunnel_pid
154
+
155
+
156
def _runtime_to_python_version(runtime):
    """Convert a ``--runtime`` value into a dotted Python version string.

    ``"python310"`` -> ``"3.10"``, ``"python3.10"`` -> ``"3.10"``,
    ``"python3"`` -> ``"3"``. Values without the ``python`` prefix are passed
    through with any embedded ``"python"`` removed.
    """
    if not runtime.startswith("python"):
        return runtime.replace("python", "")
    suffix = runtime[len("python"):]
    # An already dotted suffix must not get a second dot inserted
    # ("python3.10" used to come out as "3..10").
    if "." in suffix or len(suffix) < 2:
        return suffix
    return f"{suffix[0]}.{suffix[1:]}"


def _sanitize_gcp_secret_component(evaluator_id):
    """Keep only alphanumerics, '-' and '_' for use inside a GCP secret ID."""
    cleaned = "".join(ch for ch in evaluator_id if ch.isalnum() or ch in ("-", "_"))
    return cleaned or "rewardkit-evaluator"


def _sanitize_fireworks_secret_component(evaluator_id):
    """Lower-case, keep alphanumerics and single hyphens, cap at 40 characters."""
    kept = "".join(ch for ch in evaluator_id.lower() if ch.isalnum() or ch == "-")
    # Splitting on '-' and dropping empty parts collapses runs of hyphens and
    # trims leading/trailing ones.
    collapsed = "-".join(part for part in kept.split("-") if part)
    return (collapsed or "evaluator")[:40]


def _deploy_to_gcp_cloud_run(args, current_config, gcp_config_from_yaml):
    """Handles the logic for --target gcp-cloud-run up to service deployment.

    Steps: resolve project/region/Artifact Registry repo (CLI flags win over
    the ``gcp_cloud_run`` section of the config), ensure the repo exists,
    generate a Dockerfile, build and push the image, set up API-key auth when
    that mode is selected, and deploy the Cloud Run service.

    Args:
        args: Parsed CLI namespace; reads ``id``, ``function_ref``,
            ``gcp_project``, ``gcp_region``, ``gcp_ar_repo``, ``runtime`` and
            ``gcp_auth_mode``.
        current_config: Loaded config object; ``evaluator_endpoint_keys`` is
            read and, when a new key is generated, updated in place.
        gcp_config_from_yaml: GCP section of the config, or ``None``.

    Returns:
        The Cloud Run service URL, or ``None`` if any step fails (an error
        message has been printed in that case).
    """
    print(f"Starting GCP Cloud Run deployment for evaluator '{args.id}'...")

    # Resolve function_ref (must be from CLI for GCP)
    if not args.function_ref:
        print("Error: --function-ref is required for GCP Cloud Run deployment.")
        return None

    # Dynamically import the reward function to get its requirements
    inline_requirements_content = None
    try:
        module_name, func_name = args.function_ref.rsplit(".", 1)
        module = importlib.import_module(module_name)
        reward_func = getattr(module, func_name)
        if hasattr(reward_func, "_reward_function_requirements"):
            inline_requirements_content = reward_func._reward_function_requirements
            if inline_requirements_content:
                print(f"Found inline requirements for {args.function_ref}")
    except Exception as e:
        # Continue without inline requirements if loading fails
        print(f"Warning: Could not load reward function {args.function_ref} to check for inline requirements: {e}")

    # Resolve GCP project_id
    gcp_project_id = args.gcp_project
    if not gcp_project_id and gcp_config_from_yaml:
        gcp_project_id = gcp_config_from_yaml.project_id
    if not gcp_project_id:
        print("Error: GCP Project ID must be provided via --gcp-project argument or in rewardkit.yaml.")
        return None

    # Resolve GCP region
    gcp_region = args.gcp_region
    if not gcp_region and gcp_config_from_yaml:
        gcp_region = gcp_config_from_yaml.region
    if not gcp_region:
        print("Error: GCP Region must be provided via --gcp-region argument or in rewardkit.yaml.")
        return None

    # Resolve GCP AR repo name
    gcp_ar_repo_name = args.gcp_ar_repo
    if not gcp_ar_repo_name and gcp_config_from_yaml:
        gcp_ar_repo_name = gcp_config_from_yaml.artifact_registry_repository
    if not gcp_ar_repo_name:
        gcp_ar_repo_name = "reward-kit-evaluators"

    print(f"Using GCP Project: {gcp_project_id}, Region: {gcp_region}, AR Repo: {gcp_ar_repo_name}")

    if not ensure_artifact_registry_repo_exists(
        project_id=gcp_project_id, region=gcp_region, repo_name=gcp_ar_repo_name
    ):
        print(f"Failed to ensure Artifact Registry repository '{gcp_ar_repo_name}' exists. Aborting.")
        return None

    dockerfile_content = generate_dockerfile_content(
        function_ref=args.function_ref,
        python_version=_runtime_to_python_version(args.runtime),
        eval_protocol_install_source=".",
        user_requirements_path=None,  # Explicitly None, inline_requirements_content will be used
        inline_requirements_content=inline_requirements_content,
        service_port=8080,
    )
    if not dockerfile_content:
        print("Failed to generate Dockerfile content. Aborting.")
        return None

    image_tag = "latest"
    image_name_tag = f"{gcp_region}-docker.pkg.dev/{gcp_project_id}/{gcp_ar_repo_name}/{args.id}:{image_tag}"
    build_context_dir = os.getcwd()

    if not build_and_push_docker_image(
        image_name_tag=image_name_tag,
        dockerfile_content=dockerfile_content,
        build_context_dir=build_context_dir,
        gcp_project_id=gcp_project_id,
    ):
        print(f"Failed to build and push Docker image {image_name_tag}. Aborting.")
        return None
    print(f"Successfully built and pushed Docker image: {image_name_tag}")

    gcp_env_vars: Dict[str, str] = {}
    parsed_gcp_secrets: Dict[str, Any] = {}
    # NOTE(review): this stays True for every auth mode, not only "api-key" -
    # confirm that is intended for modes where Cloud Run itself should
    # enforce authentication.
    allow_unauthenticated_gcp = True

    # Precedence: built-in default < config file < CLI flag.
    resolved_auth_mode = "api-key"
    if gcp_config_from_yaml and gcp_config_from_yaml.default_auth_mode:
        resolved_auth_mode = gcp_config_from_yaml.default_auth_mode
    if args.gcp_auth_mode is not None:
        resolved_auth_mode = args.gcp_auth_mode
    print(f"Using GCP Auth Mode for service: {resolved_auth_mode}")

    if resolved_auth_mode == "api-key":
        print("Configuring GCP Cloud Run service for API key authentication (application layer).")
        evaluator_id = args.id
        # Read the config path from the module at call time. The file-level
        # ``from ... import _config_file_path`` alias is bound once at import
        # and would not see a path assigned later (presumably by get_config()
        # - confirm against eval_protocol.config). Fall back to the alias.
        config_module = importlib.import_module("eval_protocol.config")
        config_path = getattr(config_module, "_config_file_path", None) or global_loaded_config_path

        if current_config.evaluator_endpoint_keys and evaluator_id in current_config.evaluator_endpoint_keys:
            # This is the key the service itself will use
            api_key_for_service = current_config.evaluator_endpoint_keys[evaluator_id]
            print(f"Using existing API key for '{evaluator_id}' from configuration for the service.")
        else:
            api_key_for_service = secrets.token_hex(32)
            print(f"Generated new API key for '{evaluator_id}' for the service.")
            if not current_config.evaluator_endpoint_keys:
                current_config.evaluator_endpoint_keys = {}
            current_config.evaluator_endpoint_keys[evaluator_id] = api_key_for_service
            if config_path:
                _save_config(current_config, config_path)
            else:
                print(f"Warning: No rewardkit.yaml found to save API key for '{evaluator_id}'.")

        gcp_sanitized_eval_id = _sanitize_gcp_secret_component(args.id)
        secret_id_for_auth_key = f"rk-eval-{gcp_sanitized_eval_id}-authkey"
        secret_labels = {"managed-by": "reward-kit", "evaluator-id": evaluator_id}

        api_key_secret_version_id = ensure_gcp_secret(
            project_id=gcp_project_id,
            secret_id=secret_id_for_auth_key,
            secret_value=api_key_for_service,
            labels=secret_labels,
        )
        if not api_key_secret_version_id:
            print(f"Error: Failed to store API key in GCP Secret Manager for '{evaluator_id}'. Aborting.")
            return None
        print(f"API key for service stored in GCP Secret Manager: {secret_id_for_auth_key}")
        parsed_gcp_secrets["RK_ENDPOINT_API_KEY"] = api_key_secret_version_id

        # Register this key with Fireworks secrets for the shim
        fireworks_account_id_for_secret = get_fireworks_account_id()
        if fireworks_account_id_for_secret:
            fw_eval_id_sanitized = _sanitize_fireworks_secret_component(args.id)
            fw_secret_key_name = f"rkeval-{fw_eval_id_sanitized}-shim-key"
            print(f"Registering API key on Fireworks platform as secret '{fw_secret_key_name}' for shim...")
            if create_or_update_fireworks_secret(
                account_id=fireworks_account_id_for_secret,
                key_name=fw_secret_key_name,
                secret_value=api_key_for_service,
            ):
                print(f"Successfully registered/updated secret '{fw_secret_key_name}' on Fireworks platform.")
            else:
                print(f"Warning: Failed to register/update secret '{fw_secret_key_name}' on Fireworks platform.")
        else:
            print("Warning: Fireworks Account ID not found, cannot store shim API key on Fireworks platform.")

    cloud_run_service_url = deploy_to_cloud_run(
        service_name=args.id,
        image_name_tag=image_name_tag,
        gcp_project_id=gcp_project_id,
        gcp_region=gcp_region,
        allow_unauthenticated=allow_unauthenticated_gcp,
        env_vars=gcp_env_vars if gcp_env_vars else None,
        secrets_to_mount=parsed_gcp_secrets,
    )

    if not cloud_run_service_url:
        print("Failed to deploy to Cloud Run or retrieve service URL. Aborting.")
        return None

    print(f"Successfully deployed to Cloud Run. Service URL: {cloud_run_service_url}")
    return cloud_run_service_url
327
+
328
+
329
+ # Helper to save config (can be moved to config.py later)
330
def _save_config(config_data: RewardKitConfig, path: str):
    """Write ``config_data`` to ``path`` as YAML, omitting ``None`` fields.

    Best-effort: any failure is reported as a warning and not raised.

    Args:
        config_data: Config object to persist (serialized via ``model_dump``).
        path: Destination file; overwritten on success.
    """
    # Basic save, ideally config.py would provide a robust method
    try:
        # Serialize before opening the file: opening with "w" truncates it, so
        # a serialization error must not be able to wipe the existing config.
        serialized = yaml.dump(config_data.model_dump(exclude_none=True), sort_keys=False)
        with open(path, "w") as f:
            f.write(serialized)
        print(f"Config updated and saved to {path}")
    except Exception as e:
        print(f"Warning: Failed to save updated config to {path}: {e}")
338
+
339
+
340
def deploy_command(args):
    """Create and deploy an evaluator or register a remote one.

    Depending on ``args.target`` / ``args.remote_url`` this either:
      * deploys the reward function to GCP Cloud Run and registers its URL,
      * starts a local server plus tunnel and registers the tunnel URL,
      * registers an existing ``--remote-url``, or
      * packages ``--metrics-folders`` and uploads them (target ``fireworks``).

    Returns:
        0 on success, 1 on any failure (an error message has been printed).
    """
    # Check environment variables
    if not check_environment():
        return 1

    if not args.id:  # ID is always required
        print("Error: Evaluator ID (--id) is required.")
        return 1

    # Process HuggingFace key mapping if provided
    huggingface_message_key_map = None
    if args.huggingface_key_map:
        try:
            huggingface_message_key_map = json.loads(args.huggingface_key_map)
        except json.JSONDecodeError:
            print("Error: Invalid JSON format for --huggingface-key-map")
            return 1

    # State for the URL registration path
    service_url_to_register = None
    # Local-serve bookkeeping, used for reporting and for cleanup on failure
    local_server_pid_to_clean = None
    local_tunnel_pid_to_clean = None
    tunnel_provider = None

    if args.target == "gcp-cloud-run":
        current_config = get_config()  # Needed by the helper
        gcp_config_from_yaml = current_config.gcp_cloud_run or None

        cloud_run_service_url = _deploy_to_gcp_cloud_run(args, current_config, gcp_config_from_yaml)
        if not cloud_run_service_url:
            return 1  # Error already printed by helper
        service_url_to_register = cloud_run_service_url

    elif args.target == "local-serve":
        url, tunnel_provider, server_pid, tunnel_pid = _establish_local_server_and_tunnel(args)
        if not url:
            # _establish_local_server_and_tunnel handles cleanup of server if tunnel fails completely
            return 1  # Error already printed by helper
        service_url_to_register = url
        local_server_pid_to_clean = server_pid
        local_tunnel_pid_to_clean = tunnel_pid
        print(f"Tunnel established using {tunnel_provider}.")

    elif args.remote_url:
        # This is for --target fireworks (default) but with --remote-url
        print(f"Registering remote URL: {args.remote_url} for evaluator '{args.id}'")
        if not args.remote_url.startswith(("http://", "https://")):
            print(f"Error: Invalid --remote-url '{args.remote_url}'. Must start with http:// or https://")
            return 1
        if args.metrics_folders:
            print("Info: --metrics-folders are ignored when deploying with --remote-url.")
        service_url_to_register = args.remote_url
        # No specific shim auth provided by this path.

    # Common registration step for targets that produce a URL
    if service_url_to_register:
        # Explicit success flag: the cleanup decision must not depend on the
        # truthiness of whatever create_evaluation() returned.
        registered = False
        try:
            print(f"Registering URL '{service_url_to_register}' with Fireworks AI for evaluator '{args.id}'...")
            evaluator = create_evaluation(
                evaluator_id=args.id,
                remote_url=service_url_to_register,
                display_name=args.display_name or args.id,
                description=args.description or f"Evaluator for {args.id} at {service_url_to_register}",
                force=args.force,
                huggingface_dataset=args.huggingface_dataset,
                huggingface_split=args.huggingface_split,
                huggingface_message_key_map=huggingface_message_key_map,
                huggingface_prompt_key=args.huggingface_prompt_key,
                huggingface_response_key=args.huggingface_response_key,
            )
            evaluator_name = evaluator.get("name", args.id)
            print(
                f"Successfully registered evaluator '{evaluator_name}' on Fireworks AI, pointing to '{service_url_to_register}'."
            )
            if args.target == "local-serve":
                print(
                    f"Local server (PID: {local_server_pid_to_clean}) and {tunnel_provider} tunnel (PID: {local_tunnel_pid_to_clean}) are running."
                )
                print("They will be stopped automatically when this command exits (e.g., Ctrl+C).")
            registered = True
            return 0
        except PlatformAPIError as e:
            print(f"Error registering URL with Fireworks AI: {str(e)}")
        except Exception as e:
            print(f"An unexpected error occurred during Fireworks AI registration: {str(e)}")
        finally:
            # If registration fails (or is interrupted) for local-serve, clean
            # up the processes started above. Tunnel first, then the server.
            if args.target == "local-serve" and not registered:
                print("Registration failed or was interrupted for local-serve. Cleaning up local processes...")
                if local_tunnel_pid_to_clean:
                    stop_process(local_tunnel_pid_to_clean)
                if local_server_pid_to_clean:
                    stop_process(local_server_pid_to_clean)
        return 1

    # Fallback to original behavior: Deploying by packaging local metrics_folders (target=fireworks, no remote_url)
    if args.target == "fireworks" and not args.remote_url:
        if not args.metrics_folders:
            print("Error: --metrics-folders are required for 'fireworks' target if --remote-url is not provided.")
            return 1
        for folder_spec in args.metrics_folders:
            if "=" not in folder_spec:
                print(f"Error: Metric folder format should be 'name=path', got '{folder_spec}'")
                return 1
        try:
            print(f"Packaging and deploying metrics for evaluator '{args.id}' to Fireworks AI...")
            evaluator = create_evaluation(
                evaluator_id=args.id,
                metric_folders=args.metrics_folders,
                display_name=args.display_name or args.id,
                description=args.description or f"Evaluator: {args.id}",
                force=args.force,
                huggingface_dataset=args.huggingface_dataset,
                huggingface_split=args.huggingface_split,
                huggingface_message_key_map=huggingface_message_key_map,
                huggingface_prompt_key=args.huggingface_prompt_key,
                huggingface_response_key=args.huggingface_response_key,
            )
            evaluator_name = evaluator.get("name", args.id)
            print(f"Successfully created/updated evaluator: {evaluator_name}")
            return 0
        except PlatformAPIError as e:
            print(f"Error creating/updating evaluator '{args.id}': {str(e)}")
            return 1
        except Exception as e:
            print(f"Error creating/updating evaluator '{args.id}': {str(e)}")
            return 1

    # No branch matched (unrecognized target without --remote-url): report it
    # instead of silently returning None, which a caller could read as success.
    print(f"Error: Unsupported deployment target '{args.target}'.")
    return 1