eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1045 @@
1
+ import ast # Added for AST parsing
2
+ import importlib.util # Added for dynamic module loading
3
+ import json
4
+ import logging
5
+ import os
6
+ import sys # Added for path manipulation
7
+ import time
8
+ import types
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
11
+
12
+ if TYPE_CHECKING:
13
+ # For type checking only
14
+ import datasets
15
+
16
+ import requests
17
+
18
+ from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
19
+ from eval_protocol.typed_interface import EvaluationMode
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Flag to track if the preview API was successfully used
24
+ used_preview_api = False
25
+
26
+
27
def huggingface_dataset_to_jsonl(
    dataset_name: str,
    split: str = "train",
    output_file: Optional[str] = None,
    max_samples: int = 100,
    message_key_map: Optional[Dict[str, str]] = None,
    response_key: str = "response",
    prompt_key: str = "prompt",
) -> str:
    """
    Export rows of a HuggingFace dataset as chat-style JSONL records.

    Each exported line is a JSON object whose ``messages`` entry holds one user
    turn (the prompt) and one assistant turn (the response). Rows that lack a
    prompt or a response are skipped and do not count towards ``max_samples``.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``.
        split: Dataset split to load (default: "train").
        output_file: Destination path. When falsy, a timestamped file is created
            in the system temp directory.
        max_samples: Maximum number of records to write.
        message_key_map: Optional mapping of dataset column -> output key; mapped
            columns are copied under the new key.
        response_key: Column holding the response text. Falls back to
            "reference_solution", then "expected_proof".
        prompt_key: Column holding the prompt text. Falls back to "statement".

    Returns:
        Path to the generated JSONL file.

    Raises:
        ImportError: If the ``datasets`` package is not installed.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        raise ImportError(
            "The 'datasets' package is required to use this function. "
            "Please install it with 'pip install \"reward-kit[deepseek]\"'"
        )

    import tempfile

    logger.info(f"Loading dataset {dataset_name} (split: {split})")
    rows = load_dataset(dataset_name, split=split)

    if not output_file:
        short_name = dataset_name.split("/")[-1]
        stamp = int(time.time())
        output_file = os.path.join(tempfile.gettempdir(), f"{short_name}_{split}_{stamp}.jsonl")

    os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)

    key_map = {} if message_key_map is None else message_key_map
    # Columns already represented by the "messages" entry.
    consumed_keys = (prompt_key, response_key)

    written = 0
    saw_any_row = False  # distinguishes an empty dataset from "every row skipped"
    with open(output_file, "w") as sink:
        for row_number, row in enumerate(rows):
            saw_any_row = True
            if written >= max_samples:
                break

            if not (prompt_key in row or "statement" in row):
                logger.debug(f"Skipping sample {row_number} due to missing prompt/statement key.")
                continue

            prompt_text = row.get(prompt_key, row.get("statement", ""))
            fallback_response = row.get("reference_solution", row.get("expected_proof", ""))
            response_text = row.get(response_key, fallback_response)

            if not (prompt_text and response_text):
                logger.debug(f"Skipping sample {row_number} due to missing prompt or response text.")
                continue

            record = {
                "messages": [
                    {"role": "user", "content": prompt_text},
                    {"role": "assistant", "content": response_text},
                ]
            }

            # Renamed columns first, then every remaining column verbatim.
            for source_key, target_key in key_map.items():
                if source_key in row:
                    record[target_key] = row[source_key]
            for column, value in row.items():
                if column in consumed_keys or column in key_map:
                    continue
                record[column] = value

            sink.write(json.dumps(record) + "\n")
            written += 1

    if saw_any_row:
        logger.info(f"Converted {written} samples to JSONL format: {output_file}")
    else:
        logger.info(f"No samples converted to JSONL format: {output_file}")
    return output_file
118
+
119
+
120
class EvaluatorPreviewResult:
    """Collects per-sample preview outcomes and prints them as a plain-text report."""

    def __init__(self):
        self.results = []  # one namespace per sample, in insertion order
        self.total_samples = 0
        self.total_runtime_ms = 0

    def add_result(self, sample_index, success, score, per_metric_evals):
        """Append one sample outcome (index, success flag, score, per-metric data)."""
        self.results.append(
            types.SimpleNamespace(
                index=sample_index,
                success=success,
                score=score,
                per_metric_evals=per_metric_evals,
            )
        )

    def display(self):
        """Print the summary header followed by every recorded sample result."""
        print("Evaluation Preview Results")
        print("------------------------")
        print(f"Total Samples: {self.total_samples}")
        print(f"Total Runtime: {self.total_runtime_ms} ms\n")
        print("Individual Results:")
        print("------------------")

        last_position = len(self.results) - 1
        for position, entry in enumerate(self.results):
            print(f"Sample {entry.index + 1}:")
            print(f" Success: {entry.success}")
            print(f" Score: {entry.score}")
            if hasattr(entry, "per_metric_evals"):
                if isinstance(entry.per_metric_evals, dict):
                    # One line per metric when the data is a mapping ...
                    for metric, value in entry.per_metric_evals.items():
                        print(f" {metric}: {value}")
                else:
                    # ... otherwise dump whatever was recorded as-is.
                    print(f" Per-Metric Evals: {entry.per_metric_evals}")
            # Blank separator between samples, but not after the final one.
            if position != last_position:
                print()
153
+
154
+
155
+ class Evaluator:
156
def __init__(
    self,
    multi_metrics=False,  # Relates to output structure (dict of metrics vs single)
    remote_url: Optional[str] = None,
    ts_mode_config: Optional[Dict[str, Any]] = None,
    reward_function_mode: EvaluationMode = "pointwise",  # Input processing mode
    account_id: Optional[str] = None,
    api_key: Optional[str] = None,
):
    """
    Set up an evaluator definition.

    Args:
        multi_metrics: Flag used for folder-based loading of a multi-metric evaluator.
        remote_url: Optional URL of a remote evaluator.
        ts_mode_config: Optional inline-code configuration. When given it must
            contain "python_code"; "file_name" defaults to "main.py".
        reward_function_mode: Input processing mode, stored as given.
        account_id: Optional account id, stored as given.
        api_key: Optional API key, stored as given.

    Raises:
        ValueError: If ``ts_mode_config`` is provided without "python_code".
    """
    # Caller-supplied configuration.
    self.multi_metrics = multi_metrics
    self.remote_url = remote_url
    self.ts_mode_config = ts_mode_config
    self.reward_function_mode = reward_function_mode
    self.account_id = account_id
    self.api_key = api_key

    # State filled in later by the load_* / create methods.
    self.code_files = {}
    self.metric_folders: Dict[str, Dict[str, Any]] = {}  # metric -> path and requirements
    self.description = ""
    self.display_name = ""
    self.api_base = os.environ.get("FIREWORKS_API_BASE", "https://api.fireworks.ai")

    if not self.ts_mode_config:
        return

    # Inline code takes precedence for the code definition; the multi_metrics
    # flag only matters for folder-based loading when ts_mode_config is unused.
    inline_source = self.ts_mode_config.get("python_code")
    if not inline_source:
        raise ValueError("python_code is required in ts_mode_config")
    target_name = self.ts_mode_config.get("file_name", "main.py")
    self.code_files[target_name] = inline_source
190
+
191
+ def _load_python_files_from_folder(self, folder_path: str) -> Dict[str, str]:
192
+ """
193
+ Loads all Python files from a given folder.
194
+
195
+ Args:
196
+ folder_path: Absolute path to the folder.
197
+
198
+ Returns:
199
+ A dictionary mapping filenames to their content.
200
+
201
+ Raises:
202
+ ValueError: If folder_path is invalid, not a directory,
203
+ or if main.py is missing or doesn't contain 'evaluate'.
204
+ """
205
+ if not os.path.exists(folder_path):
206
+ raise ValueError(f"Folder does not exist: {folder_path}")
207
+
208
+ if not os.path.isdir(folder_path):
209
+ raise ValueError(f"Not a directory: {folder_path}")
210
+
211
+ files = {}
212
+ for file_path in Path(folder_path).glob("*.py"):
213
+ if file_path.is_file():
214
+ with open(file_path, "r") as f:
215
+ filename = file_path.name
216
+ content = f.read()
217
+ files[filename] = content
218
+
219
+ # Check for main.py with evaluate function
220
+ if filename == "main.py" and "evaluate" not in content:
221
+ raise ValueError(f"main.py in {folder_path} must contain an evaluate function")
222
+
223
+ if "main.py" not in files:
224
+ raise ValueError(f"main.py is required in {folder_path}")
225
+
226
+ return files
227
+
228
def load_metric_folder(self, metric_name, folder_path):
    """
    Load the code files of one metric folder and register them on this evaluator.

    ``main.py`` is additionally parsed (never imported or executed) to find an
    ``evaluate`` function decorated with ``@reward_function(requirements=...)``.
    Only literal values are recognised: a list of string literals, or a single
    string literal. Parse problems are logged and leave the requirements unset.

    Args:
        metric_name: Name of the metric; used as key prefix in ``self.code_files``.
        folder_path: Path to the folder containing the metric's code files.

    Returns:
        Dict mapping file names to their contents.

    Raises:
        ValueError: Propagated from ``_load_python_files_from_folder`` when the
            folder is invalid.
    """

    def _literal_requirements(decorator_node):
        """Return the literal requirements of a ``reward_function(...)`` call, else None."""
        is_reward_function_call = (
            isinstance(decorator_node, ast.Call)
            and isinstance(decorator_node.func, ast.Name)
            and decorator_node.func.id == "reward_function"
        )
        if not is_reward_function_call:
            return None
        for keyword in decorator_node.keywords:
            if keyword.arg != "requirements":
                continue
            value = keyword.value
            # String literals are always ast.Constant on Python 3.8+; the
            # deprecated ast.Str alias is intentionally not referenced.
            if isinstance(value, ast.List):
                literal_items = [
                    elt.value
                    for elt in value.elts
                    if isinstance(elt, ast.Constant) and isinstance(elt.value, str)
                ]
                return literal_items or None
            if isinstance(value, ast.Constant) and isinstance(value.value, str):
                return [value.value]
            return None
        return None

    folder_path = os.path.abspath(folder_path)
    files = self._load_python_files_from_folder(folder_path)
    metric_requirements_list = None  # list of strings once found, else None

    main_py_content = files.get("main.py")
    if main_py_content:
        try:
            for node in ast.walk(ast.parse(main_py_content)):
                if not (isinstance(node, ast.FunctionDef) and node.name == "evaluate"):
                    continue
                found = next(
                    (reqs for reqs in map(_literal_requirements, node.decorator_list) if reqs),
                    None,
                )
                if found:
                    metric_requirements_list = found
                    logger.info(
                        f"Found requirements for metric '{metric_name}' via AST: {metric_requirements_list}"
                    )
                    break
        except SyntaxError as e:
            logger.error(f"Syntax error parsing main.py for metric '{metric_name}' to find requirements: {e}")
        except Exception as e:
            # Best effort: requirement discovery must never block loading the files.
            logger.error(f"Error parsing main.py AST for metric '{metric_name}': {e}")

    self.metric_folders[metric_name] = {
        "path": folder_path,
        "requirements": metric_requirements_list,
    }

    for filename, content in files.items():
        self.code_files[f"{metric_name}/{filename}"] = content

    logger.info(f"Loaded {len(files)} Python files for metric '{metric_name}' from {folder_path}")
    return files
297
+
298
def load_multi_metrics_folder(self, folder_path):
    """
    Load the code files of a single folder that implements several metrics.

    The loaded mapping replaces ``self.code_files`` entirely.

    Args:
        folder_path: Path to the folder containing code files.

    Returns:
        Dict mapping file names to their contents.
    """
    absolute_folder = os.path.abspath(folder_path)
    loaded = self._load_python_files_from_folder(absolute_folder)
    self.code_files = loaded

    file_count = len(loaded)
    logger.info(f"Loaded {file_count} Python files from {absolute_folder} for multi-metrics evaluation")
    return loaded
314
+
315
def load_samples_from_jsonl(self, sample_file, max_samples=5):
    """
    Read up to ``max_samples`` JSON objects from a JSONL file.

    Blank lines and lines that are not valid JSON are skipped (the latter with
    a warning) and do not count towards ``max_samples``; the limit applies to
    the number of samples actually loaded.

    Args:
        sample_file: Path to the JSONL file (read as UTF-8).
        max_samples: Maximum number of samples to return.

    Returns:
        List of decoded samples, in file order.

    Raises:
        ValueError: If ``sample_file`` does not exist.
    """
    if not os.path.exists(sample_file):
        raise ValueError(f"Sample file does not exist: {sample_file}")

    samples = []
    with open(sample_file, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            if len(samples) >= max_samples:
                break
            line = raw_line.strip()
            if not line:
                continue
            try:
                samples.append(json.loads(line))
            except json.JSONDecodeError:
                logger.warning(f"Invalid JSON on line {line_number}, skipping")
    logger.info(f"Loaded {len(samples)} samples from {sample_file}")
    return samples
333
+
334
def preview(self, sample_file, max_samples=5):
    """
    Preview this evaluator against samples read from a JSONL file.

    The evaluator definition and the samples are posted to the
    ``evaluators:previewEvaluator`` endpoint. If that request fails for any
    reason, the error is logged and a locally simulated preview is returned
    instead. The module-level ``used_preview_api`` flag records which path ran.

    Args:
        sample_file: Path to a JSONL file with one sample per line.
        max_samples: Maximum number of samples to load and send.

    Returns:
        An ``EvaluatorPreviewResult`` with one entry per evaluated sample.

    Raises:
        ValueError: If no code source is configured, if folder-based code has
            no ``main.py``, if the sample file yields no samples, or if the
            account id / API key cannot be resolved.
    """
    global used_preview_api

    if not self.remote_url and not self.ts_mode_config and not self.code_files:
        raise ValueError("No code files loaded. Load metric folder(s) or provide ts_mode_config/remote_url first.")

    # If not remote and not ts_mode, then main.py check applies to loaded code_files
    if not self.remote_url and not self.ts_mode_config:
        if "main.py" not in self.code_files and not any(k.endswith("/main.py") for k in self.code_files):
            raise ValueError("No main.py found in loaded code files for folder-based evaluation.")

    samples = self.load_samples_from_jsonl(sample_file, max_samples)
    if not samples:
        raise ValueError(f"No valid samples found in {sample_file}")

    account_id = self.account_id or get_fireworks_account_id()
    auth_token = self.api_key or get_fireworks_api_key()
    logger.debug(f"Preview using account_id: {account_id}")

    if not account_id or not auth_token:
        logger.error("Authentication error: Missing Fireworks Account ID or API Key.")
        raise ValueError("Missing Fireworks Account ID or API Key.")

    evaluator_payload_data = {
        "displayName": self.display_name or "Preview Evaluator",
        "description": self.description or "Preview Evaluator",
        "multiMetrics": True,
        "criteria": self._construct_criteria(criteria_data={}),
        "requirements": self._get_combined_requirements(),
        "rollupSettings": {"skipRollup": True},
    }

    payload = {
        "evaluator": evaluator_payload_data,
        "sampleData": [json.dumps(sample) for sample in samples],
        "maxSamples": max_samples,
    }

    api_base = os.environ.get("FIREWORKS_API_BASE", "https://api.fireworks.ai")
    if "dev.api.fireworks.ai" in api_base and account_id == "fireworks":
        account_id = "pyroworks-dev"

    url = f"{api_base}/v1/accounts/{account_id}/evaluators:previewEvaluator"
    headers = {
        "Authorization": f"Bearer {auth_token}",
        "Content-Type": "application/json",
    }
    # Never write the bearer token to the log; log a redacted copy instead.
    loggable_headers = dict(headers, Authorization="Bearer ***")
    logger.info(f"Previewing evaluator using API endpoint: {url} with account: {account_id}")
    logger.debug(f"Preview API Request URL: {url}")
    logger.debug(f"Preview API Request Headers: {json.dumps(loggable_headers, indent=2)}")
    logger.debug(f"Preview API Request Payload: {json.dumps(payload, indent=2)}")

    try:
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()
        result = response.json()
        used_preview_api = True
        preview_result_obj = EvaluatorPreviewResult()
        preview_result_obj.total_samples = result.get("totalSamples", len(samples))
        preview_result_obj.total_runtime_ms = int(result.get("totalRuntimeMs", 0))
        for i, sample_result_item in enumerate(result.get("results", [])):
            preview_result_obj.add_result(
                sample_index=i,
                success=sample_result_item.get("success", False),
                score=sample_result_item.get("score", 0.0),
                per_metric_evals=sample_result_item.get("perMetricEvals", {}),
            )
        return preview_result_obj
    except Exception as e:
        # Deliberate best effort: any failure degrades to the simulated preview.
        logger.error(f"Error previewing evaluator: {str(e)}")
        if isinstance(e, requests.exceptions.HTTPError) and getattr(e, "response", None) is not None:
            logger.error(f"Response: {e.response.text}")
        used_preview_api = False
        logger.warning("Falling back to simulated preview mode")
        return self._simulated_preview(samples)
418
+
419
+ def _get_combined_requirements(self) -> str:
420
+ """Combines requirements from all loaded metrics."""
421
+ all_requirements_set = set()
422
+ for metric_data in self.metric_folders.values():
423
+ req_list_or_str = metric_data.get("requirements")
424
+ if req_list_or_str:
425
+ if isinstance(req_list_or_str, list):
426
+ for req_item in req_list_or_str:
427
+ if isinstance(req_item, str):
428
+ all_requirements_set.add(req_item.strip())
429
+ elif isinstance(req_list_or_str, str): # Fallback if somehow a string is still passed
430
+ items = [r.strip() for r in req_list_or_str.splitlines() if r.strip()]
431
+ for item in items:
432
+ all_requirements_set.add(item)
433
+
434
+ # For multi_metrics loaded directly into self.code_files (not via metric_folders)
435
+ # This part is more complex as it requires loading the 'main.py' from self.code_files
436
+ # if self.multi_metrics and not self.metric_folders and "main.py" in self.code_files:
437
+ # We would need a temporary way to load this main.py to get its requirements.
438
+ # For now, focusing on metric_folders which is the primary path for --metrics-folders.
439
+ # If a multi_metrics folder is loaded via load_multi_metrics_folder, it also needs a similar
440
+ # dynamic import logic to fetch requirements from its main 'evaluate' function.
441
+ # This part is NOT YET IMPLEMENTED for multi_metrics folders.
442
+
443
+ if not all_requirements_set and hasattr(self, "_loaded_multi_metric_requirements_str"):
444
+ # Fallback for multi_metrics if requirements were loaded differently (hypothetical)
445
+ # This attribute doesn't exist yet, placeholder for future enhancement if needed.
446
+ if self._loaded_multi_metric_requirements_str: # type: ignore
447
+ requirements_list = [r.strip() for r in self._loaded_multi_metric_requirements_str.splitlines() if r.strip()] # type: ignore
448
+ for req_item in requirements_list:
449
+ all_requirements_set.add(req_item)
450
+
451
+ logger.info(f"Combined unique requirements: {all_requirements_set}")
452
+ return "\n".join(sorted(list(all_requirements_set)))
453
+
454
def _simulated_preview(self, samples):
    """
    Produce a placeholder preview without calling any API.

    No user code is executed. Every sample that has a "messages" field gets
    fixed placeholder metric values; a sample without it (or one that raises
    while being checked) is recorded as a failure with the error text.

    Args:
        samples: Decoded samples, as returned by ``load_samples_from_jsonl``.

    Returns:
        An ``EvaluatorPreviewResult`` with one entry per input sample and a
        total runtime of at least 1 ms.
    """
    preview_result = EvaluatorPreviewResult()
    preview_result.total_samples = len(samples)
    # Monotonic clock: a wall-clock adjustment cannot skew the measured duration.
    started = time.perf_counter()
    for i, sample in enumerate(samples):
        try:
            if "messages" not in sample:
                raise ValueError(f"Sample {i+1} is missing 'messages' field")

            # A fresh dict per sample so results never share mutable state.
            if self.multi_metrics or self.ts_mode_config:
                per_metric_evals = {"quality": 0.8, "relevance": 0.7, "safety": 0.9}
            else:
                per_metric_evals = {metric_name: 0.75 for metric_name in self.metric_folders}

            score = sum(per_metric_evals.values()) / len(per_metric_evals) if per_metric_evals else 0.0
            preview_result.add_result(
                sample_index=i,
                success=True,
                score=score,
                per_metric_evals=per_metric_evals,
            )
        except Exception as e:
            logger.error(f"Error processing sample {i+1}: {str(e)}")
            preview_result.add_result(
                sample_index=i,
                success=False,
                score=0.0,
                per_metric_evals={"error": str(e)},
            )
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    preview_result.total_runtime_ms = max(1, elapsed_ms)
    return preview_result
499
+
500
def create(self, evaluator_id, display_name=None, description=None, force=False):
    """
    Create this evaluator through the evaluators API.

    With ``force=True`` an existing evaluator with the same id is looked up and
    deleted first. That lookup/delete step is best effort: its failures are
    logged and creation is still attempted. The create request itself is sent
    exactly once.

    Args:
        evaluator_id: Id of the evaluator to create.
        display_name: Display name; defaults to ``evaluator_id``.
        description: Description; defaults to "Evaluator created from <id>".
        force: Delete an already existing evaluator with this id before creating.

    Returns:
        The decoded JSON body of the create response.

    Raises:
        ValueError: If no code source is configured or credentials are missing.
        Exception: Any error raised by the create request (including HTTP error
            statuses) is logged and re-raised.
    """
    if not self.remote_url and not self.ts_mode_config and not self.code_files:
        raise ValueError("No code files loaded. Load metric folder(s) or provide ts_mode_config/remote_url first.")

    account_id = self.account_id or get_fireworks_account_id()
    auth_token = self.api_key or get_fireworks_api_key()
    if not auth_token or not account_id:
        logger.error("Authentication error: API credentials appear to be invalid or incomplete.")
        raise ValueError("Invalid or missing API credentials.")

    self.display_name = display_name or evaluator_id
    self.description = description or f"Evaluator created from {evaluator_id}"

    payload_data = {
        "evaluator": {
            "displayName": self.display_name,
            "description": self.description,
            "multiMetrics": True,  # How results are structured
            "criteria": self._construct_criteria(criteria_data={}),
            # NOTE(review): preview() sends self._get_combined_requirements() here,
            # while create sends an empty string — confirm this is intended.
            "requirements": "",
            "rollupSettings": {"skipRollup": True},
        },
        "evaluatorId": evaluator_id,
    }

    if "dev.api.fireworks.ai" in self.api_base and account_id == "fireworks":
        account_id = "pyroworks-dev"

    base_url = f"{self.api_base}/v1/accounts/{account_id}/evaluators"
    headers = {
        "Authorization": f"Bearer {auth_token}",
        "Content-Type": "application/json",
    }
    logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...")

    try:
        if force:
            evaluator_url = f"{base_url}/{evaluator_id}"
            # Only the lookup is guarded here, so a failure of the create
            # request below can never trigger a second create request.
            try:
                already_exists = requests.get(evaluator_url, headers=headers).status_code == 200
            except requests.exceptions.RequestException as e_check:
                logger.warning(f"Unable to check for existing evaluator '{evaluator_id}': {str(e_check)}")
                already_exists = False

            if already_exists:
                logger.info(f"Evaluator '{evaluator_id}' already exists, deleting and recreating...")
                try:
                    delete_response = requests.delete(evaluator_url, headers=headers)
                    if delete_response.status_code < 400:
                        logger.info(f"Successfully deleted evaluator '{evaluator_id}'")
                    else:
                        logger.warning(
                            f"Unable to delete evaluator '{evaluator_id}', status: {delete_response.status_code}"
                        )
                except Exception as e_del:
                    logger.warning(f"Error deleting evaluator: {str(e_del)}")

        response = requests.post(base_url, json=payload_data, headers=headers)
        response.raise_for_status()
        result = response.json()
        logger.info(f"Successfully created evaluator '{evaluator_id}'")
        return result
    except Exception as e:
        logger.error(f"Error creating evaluator: {str(e)}")
        if isinstance(e, requests.exceptions.HTTPError) and getattr(e, "response", None) is not None:
            logger.error(f"Response: {e.response.text}")
        raise
575
+
576
def _construct_criteria(self, criteria_data: Any) -> Any:
    """Build the list of ``CODE_SNIPPETS`` assertions for the evaluator payload.

    Exactly one code source is used, checked in this order:

    1. ``self.remote_url`` -- a generated ``main.py`` shim that POSTs the
       evaluation inputs to the remote URL and returns its JSON response.
    2. ``self.ts_mode_config`` -- a single inline Python snippet.
    3. ``self.multi_metrics`` -- every ``.py`` entry of ``self.code_files``
       bundled into one assertion named ``"eval"``.
    4. otherwise -- one assertion per name in ``self.metric_folders``, each
       carrying only that metric's ``main.py``.

    Args:
        criteria_data: Not used by this method.

    Returns:
        A non-empty list of assertion dicts.

    Raises:
        ValueError: If ``ts_mode_config`` lacks ``python_code``, if
            multi-metrics mode has no ``.py`` files, or if no assertion
            could be built at all.
    """
    import json  # local: only needed to render the URL as a string literal

    assertions = []
    if self.remote_url:
        # json.dumps yields a double-quoted, escaped literal that is also a
        # valid Python string literal, so quotes or backslashes in the URL
        # cannot break (or inject code into) the generated shim. For an
        # ordinary URL the emitted text is the same as plain interpolation.
        remote_url_literal = json.dumps(str(self.remote_url), ensure_ascii=False)
        # The shim annotates `evaluate` with typing names, and annotations
        # are evaluated when the `def` runs, so the shim must import them.
        shim_main_py_content = f"""
import json
import os
import requests
from typing import Any, Dict, List, Optional, Union

REMOTE_EVALUATOR_URL = {remote_url_literal}

def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
    payload = {{
        "messages": messages,
        "ground_truth": ground_truth,
        "tools": tools,
        "kwargs": kwargs
    }}
    headers = {{"Content-Type": "application/json"}}
    try:
        response = requests.post(REMOTE_EVALUATOR_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        error_info = {{
            "error": f"Failed to call remote evaluator at {{REMOTE_EVALUATOR_URL}}: {{str(e)}}",
            "status_code": getattr(e.response, 'status_code', None),
            "response_text": getattr(e.response, 'text', None)
        }}
        return {{
            "score": 0.0, "reason": f"Error calling remote evaluator: {{str(e)}}",
            "is_score_valid": False, "metrics": {{"remote_call_error": {{"score": 0.0, "is_score_valid": False, "reason": json.dumps(error_info)}}}}
        }}
    except Exception as e:
        return {{
            "score": 0.0, "reason": f"Unexpected error in remote evaluator shim: {{str(e)}}",
            "is_score_valid": False, "metrics": {{"shim_error": {{"score": 0.0, "is_score_valid": False, "reason": str(e)}}}}
        }}
"""
        assertions.append(
            {
                "codeSnippets": {
                    "language": "python",
                    "fileContents": {"main.py": shim_main_py_content},
                },
                "name": "remote_eval_proxy",
                "type": "CODE_SNIPPETS",
                "description": f"Proxies evaluation to remote URL: {self.remote_url}",
            }
        )
    elif self.ts_mode_config:
        python_code = self.ts_mode_config.get("python_code")
        file_name = self.ts_mode_config.get("file_name", "main.py")
        criterion_name = self.ts_mode_config.get("criterion_name", "default_code_criterion")
        description = self.ts_mode_config.get("description", "Python code execution")
        if not python_code:
            raise ValueError("python_code is required in ts_mode_config")
        assertions.append(
            {
                "type": "CODE_SNIPPETS",
                "name": criterion_name,
                "description": description,
                "codeSnippets": {
                    "language": "python",
                    "fileContents": {file_name: python_code},
                },
            }
        )
    elif self.multi_metrics:
        # Only Python sources are shipped; each gets its evaluate() signature normalized.
        file_contents = {
            filename: self._update_evaluate_signature(content)
            for filename, content in self.code_files.items()
            if filename.endswith(".py")
        }
        if not file_contents:
            raise ValueError("No Python files found for multi-metrics mode.")
        assertions.append(
            {
                "codeSnippets": {
                    "language": "python",
                    "fileContents": file_contents,
                },
                "name": "eval",
                "type": "CODE_SNIPPETS",
                "description": self.description or "Multi-metric evaluation",
            }
        )
    else:  # Folder-based, non-multi_metrics
        for metric_name in self.metric_folders:
            # Only main.py is sent for each metric.
            main_py_key = f"{metric_name}/main.py"
            if main_py_key not in self.code_files:
                logger.warning(
                    f"main.py not found for metric '{metric_name}' with key '{main_py_key}'. "
                    "The preview payload might be incorrect or incomplete."
                )
                logger.warning(
                    f"No Python files (specifically main.py) prepared for metric '{metric_name}', skipping this metric for criteria."
                )
                continue

            assertions.append(
                {
                    "codeSnippets": {
                        "language": "python",
                        "fileContents": {
                            "main.py": self._update_evaluate_signature(self.code_files[main_py_key])
                        },
                    },
                    "name": metric_name,
                    "type": "CODE_SNIPPETS",
                    "description": f"Metric: {metric_name}",
                }
            )

    if not assertions:
        raise ValueError("No valid criteria could be constructed.")
    return assertions
699
+
700
def _update_evaluate_signature(self, content):
    """Rewrite a legacy ``evaluate`` signature in ``content`` to the current one.

    Two legacy forms are recognized (only the first occurrence is rewritten):

    * ``def evaluate(entry)`` / ``def evaluate(entry: dict)`` -- the signature
      is replaced and a short compatibility prologue is inserted as the first
      lines of the function body, defaulting ``ground_truth`` to ``messages``.
    * ``def evaluate(messages, original_messages=None, tools=None, **kwargs)``
      -- a plain parameter rename, no prologue.

    Any other content is returned unchanged.

    Args:
        content: Python source text.

    Returns:
        The (possibly rewritten) source text.
    """
    import re

    # Simple regex to match the old evaluate function signature
    old_pattern = r"def\s+evaluate\s*\(\s*entry\s*(?::\s*dict)?\s*\)"
    # Regex to match the signature we are changing from (original_messages)
    current_signature_pattern = (
        r"def\s+evaluate\s*\(\s*messages,\s*original_messages\s*=\s*None,\s*tools\s*=\s*None,\s*\*\*kwargs\s*\)"
    )
    # NOTE(review): this signature references typing names; the rewritten
    # source is presumably expected to import them itself -- confirm.
    new_signature = "def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs)"

    legacy_match = re.search(old_pattern, content)
    if legacy_match is None:
        # Direct rename; re.sub returns `content` untouched when the pattern
        # is absent, which also covers the "no known signature" case.
        return re.sub(current_signature_pattern, lambda _m: new_signature, content, count=1)

    sig_start = legacy_match.start()
    updated_content = content[:sig_start] + new_signature + content[legacy_match.end():]
    header_end = sig_start + len(new_signature)

    # Compatibility prologue, written with indentation relative to the body.
    # NOTE(review): the rewritten function has no `entry` parameter; bodies
    # that still reference `entry` are not adapted here.
    compat_lines = (
        "# Compatibility layer for old 'entry' format",
        "if ground_truth is None: # Default ground_truth from messages if not provided",
        "    ground_truth = messages",
        "# Assuming 'entry' dict was constructed from messages, original_messages (now ground_truth), tools, kwargs",
        "# This part might need more context on how 'entry' was used.",
        "# For now, we'll assume ground_truth takes precedence or is derived.",
    )

    # Group 1: remainder of the def line (optional return annotation, the
    # colon, optional trailing comment) through its newline. Group 2: the
    # indentation of the first non-blank body line. One-line defs, whose
    # body follows the colon on the same line, deliberately do not match.
    body_start = re.compile(r"([^\n#]*:[ \t\r]*(?:#[^\n]*)?\n)(?:[ \t\r]*\n)*([ \t]+)")
    body_match = body_start.match(updated_content, header_end)
    if body_match is None:
        return updated_content

    indent = body_match.group(2)
    insert_at = body_match.end(1)
    compat_block = "".join(f"{indent}{line}\n" for line in compat_lines)
    return updated_content[:insert_at] + compat_block + updated_content[insert_at:]
752
+
753
def _get_combined_code(self):
    """Placeholder that does nothing and returns ``None``.

    Appears to be unused now; candidate for removal.
    """
    return None
756
+
757
def _get_code_from_files(self, files):
    """Placeholder that ignores ``files`` and returns ``None``.

    Appears to be unused now; candidate for removal.
    """
    return None
760
+
761
def _get_authentication(self):
    """Return the ``(account_id, api_key)`` pair used for Fireworks requests.

    Both values are fetched up front via ``get_fireworks_account_id()`` and
    ``get_fireworks_api_key()``; the account ID is validated first.

    Raises:
        ValueError: If either value is missing or empty (the problem is
            also logged at error level).
    """
    credentials = (
        ("Fireworks Account ID", get_fireworks_account_id()),
        ("Fireworks API Key", get_fireworks_api_key()),
    )
    for label, value in credentials:
        if not value:
            logger.error(f"Authentication error: {label} not found.")
            raise ValueError(f"{label} not found.")
    (_, account_id), (_, auth_token) = credentials
    return account_id, auth_token
771
+
772
+
773
+ # Helper functions for CLI commands
774
def preview_evaluation(
    metric_folders: Optional[List[str]] = None,
    multi_metrics: bool = False,
    folder: Optional[str] = None,
    python_code_to_evaluate: Optional[str] = None,
    python_file_name_for_code: str = "main.py",
    criterion_name_for_code: str = "default_code_criterion",
    criterion_description_for_code: str = "Python code execution",
    sample_file: Optional[str] = None,
    max_samples: int = 5,
    huggingface_dataset: Optional[str] = None,
    huggingface_split: str = "train",
    huggingface_message_key_map: Optional[Dict[str, str]] = None,
    huggingface_response_key: str = "response",
    huggingface_prompt_key: str = "prompt",
    reward_function_mode: EvaluationMode = "pointwise",
    account_id: Optional[str] = None,
    api_key: Optional[str] = None,
):
    """Build an ``Evaluator`` from the given code source and preview it on samples.

    The code source is either an inline snippet (``python_code_to_evaluate``),
    a multi-metrics ``folder``, or a list of ``"name=path"`` ``metric_folders``.
    Samples come from ``sample_file`` or, if given, from ``huggingface_dataset``
    (converted to JSONL first; it wins over ``sample_file`` with a warning).

    Returns:
        Whatever ``evaluator.preview(sample_file, max_samples)`` returns.

    Raises:
        ValueError: If the inline snippet is combined with folder parameters,
            if the required folder argument for the chosen mode is missing,
            if a ``metric_folders`` entry has no ``=``, or if no sample
            source is provided.
    """
    shared_kwargs = {
        "multi_metrics": multi_metrics,
        "reward_function_mode": reward_function_mode,
        "account_id": account_id,
        "api_key": api_key,
    }

    if python_code_to_evaluate:
        # Inline snippet mode is mutually exclusive with folder-based inputs.
        if metric_folders or folder:
            raise ValueError(
                "Cannot use python_code_to_evaluate with folder-based parameters (metric_folders, folder)."
            )
        snippet_config = {
            "python_code": python_code_to_evaluate,
            "file_name": python_file_name_for_code,
            "criterion_name": criterion_name_for_code,
            "description": criterion_description_for_code,
        }
        evaluator = Evaluator(ts_mode_config=snippet_config, **shared_kwargs)
    else:
        # The evaluator is created before the folder arguments are validated.
        evaluator = Evaluator(**shared_kwargs)
        if multi_metrics:
            if not folder:
                raise ValueError("`folder` must be specified for multi_metrics mode.")
            evaluator.load_multi_metrics_folder(folder)
        else:
            if not metric_folders:
                raise ValueError("At least one metric_folder must be specified.")
            for pair in metric_folders:
                metric_name, separator, folder_path = pair.partition("=")
                if not separator:
                    raise ValueError(f"Invalid metric-folder format: {pair}.")
                evaluator.load_metric_folder(metric_name, folder_path)

    if huggingface_dataset:
        if sample_file:
            logger.warning("Both sample_file and huggingface_dataset specified. Using HuggingFace dataset.")
        sample_file = huggingface_dataset_to_jsonl(
            dataset_name=huggingface_dataset,
            split=huggingface_split,
            max_samples=max_samples,
            message_key_map=huggingface_message_key_map,
            response_key=huggingface_response_key,
            prompt_key=huggingface_prompt_key,
        )
        logger.info(f"Converted dataset saved to: {sample_file}")

    if sample_file:
        return evaluator.preview(sample_file, max_samples)
    raise ValueError("Either sample_file or huggingface_dataset must be specified.")
851
+
852
+
853
def preview_folder_evaluation(
    evaluator_folder,
    sample_file=None,
    max_samples=5,
    multi_metrics=False,
    huggingface_dataset=None,
    huggingface_split="train",
    huggingface_message_key_map=None,
    huggingface_response_key="response",
    huggingface_prompt_key="prompt",
):
    """Preview an evaluator stored in a single folder via ``preview_evaluation``.

    Multi-metrics mode is used when requested, or auto-detected when the
    folder has a ``main.py`` plus at least one other ``*.py`` file at its
    root. Otherwise the folder is passed as a single metric named after the
    folder's basename.

    Returns:
        The result of ``preview_evaluation``.

    Raises:
        ValueError: If ``evaluator_folder`` does not exist or is not a directory.
    """
    folder_abs = os.path.abspath(evaluator_folder)
    if not os.path.exists(folder_abs):
        raise ValueError(f"Evaluator folder does not exist: {folder_abs}")
    if not os.path.isdir(folder_abs):
        raise ValueError(f"Not a directory: {folder_abs}")

    use_multi_metrics = multi_metrics
    root_has_main = os.path.exists(os.path.join(folder_abs, "main.py"))
    if root_has_main and not multi_metrics:
        # Several .py files next to main.py are taken as a multi-metrics layout.
        root_py_count = len(list(Path(folder_abs).glob("*.py")))
        if root_py_count > 1:
            logger.info("Auto-detecting multi-metrics mode based on folder structure for preview_folder_evaluation")
            use_multi_metrics = True

    if use_multi_metrics:
        metric_folder_args = None
        multi_folder_arg = folder_abs
    else:
        metric_folder_args = [f"{os.path.basename(folder_abs)}={folder_abs}"]
        multi_folder_arg = None

    # This helper never supplies an inline code snippet.
    return preview_evaluation(
        metric_folders=metric_folder_args,
        multi_metrics=use_multi_metrics,
        folder=multi_folder_arg,
        python_code_to_evaluate=None,
        sample_file=sample_file,
        max_samples=max_samples,
        huggingface_dataset=huggingface_dataset,
        huggingface_split=huggingface_split,
        huggingface_message_key_map=huggingface_message_key_map,
        huggingface_response_key=huggingface_response_key,
        huggingface_prompt_key=huggingface_prompt_key,
    )
896
+
897
+
898
def create_evaluation(
    evaluator_id: str,
    metric_folders: Optional[List[str]] = None,
    multi_metrics: bool = False,
    folder: Optional[str] = None,
    python_code_to_evaluate: Optional[str] = None,
    python_file_name_for_code: str = "main.py",
    criterion_name_for_code: str = "default_code_criterion",
    criterion_description_for_code: str = "Python code execution",
    display_name: Optional[str] = None,
    description: Optional[str] = None,
    force: bool = False,
    huggingface_dataset: Optional[str] = None,
    huggingface_split: str = "train",
    huggingface_message_key_map: Optional[Dict[str, str]] = None,
    huggingface_response_key: str = "response",
    huggingface_prompt_key: str = "prompt",
    remote_url: Optional[str] = None,
    reward_function_mode: EvaluationMode = "pointwise",
    account_id: Optional[str] = None,
    api_key: Optional[str] = None,
):
    """Build an ``Evaluator`` from one code source and create it under ``evaluator_id``.

    Source precedence after construction: ``remote_url``, then the inline
    snippet (``python_code_to_evaluate``), then a multi-metrics ``folder``,
    then ``"name=path"`` entries in ``metric_folders``. The HuggingFace
    arguments are only logged here; no dataset is loaded.

    Returns:
        Whatever ``evaluator.create(evaluator_id, display_name, description, force)`` returns.

    Raises:
        ValueError: If the inline snippet is combined with folder parameters,
            if the folder argument required by the chosen mode is missing, or
            if a ``metric_folders`` entry has no ``=``.
    """
    if python_code_to_evaluate:
        if metric_folders or folder:
            raise ValueError("Cannot use python_code_to_evaluate with folder-based parameters.")
        ts_mode_config = {
            "python_code": python_code_to_evaluate,
            "file_name": python_file_name_for_code,
            "criterion_name": criterion_name_for_code,
            "description": criterion_description_for_code,
        }
    else:
        ts_mode_config = None

    evaluator = Evaluator(
        multi_metrics=multi_metrics,
        remote_url=remote_url,
        ts_mode_config=ts_mode_config,
        reward_function_mode=reward_function_mode,
        account_id=account_id,
        api_key=api_key,
    )

    if remote_url:
        logger.info(f"Configuring evaluator to use remote URL: {remote_url}")
        # Other code sources are not loaded when a remote URL is given.
        if any((metric_folders, folder, python_code_to_evaluate)):
            logger.warning(
                "When remote_url is provided, other code sources (folders, python_code_to_evaluate) are ignored for execution logic by the platform."
            )
    elif ts_mode_config:
        # Nothing to load: the snippet was handed to the Evaluator constructor.
        logger.info("Configuring evaluator with direct Python code snippet (ts_mode).")
    elif multi_metrics:
        if not folder:
            raise ValueError("`folder` must be specified for folder-based multi_metrics mode.")
        evaluator.load_multi_metrics_folder(folder)
    else:
        if not metric_folders:
            raise ValueError("At least one metric_folder must be specified.")
        for pair in metric_folders:
            metric_name, separator, folder_path = pair.partition("=")
            if not separator:
                raise ValueError(f"Invalid metric-folder format: {pair}.")
            evaluator.load_metric_folder(metric_name, folder_path)

    if huggingface_dataset:
        logger.info(f"HuggingFace dataset specified: {huggingface_dataset} (currently for preview only).")

    return evaluator.create(evaluator_id, display_name, description, force)
968
+
969
+
970
def deploy_folder_evaluation(
    evaluator_id,
    evaluator_folder,
    display_name=None,
    description=None,
    force=False,
    multi_metrics=False,
    huggingface_dataset=None,
    huggingface_split="train",
    huggingface_message_key_map=None,
    huggingface_response_key="response",
    huggingface_prompt_key="prompt",
    remote_url: Optional[str] = None,
):
    """Deploy an evaluator stored in a folder (or behind ``remote_url``) via ``create_evaluation``.

    Without ``remote_url`` the folder layout decides how code is loaded:

    * multi-metrics when requested, or auto-detected when the root has a
      ``main.py`` plus at least one other ``*.py`` file;
    * a single metric named after the folder when the root has a ``main.py``;
    * otherwise one metric per immediate subdirectory that contains a ``main.py``.

    With ``remote_url`` nothing is loaded from the folder, but a given folder
    is still checked for existence.

    Returns:
        The result of ``create_evaluation``.

    Raises:
        ValueError: If neither ``remote_url`` nor ``evaluator_folder`` is given,
            if the folder is missing or not a directory, or if no metric can
            be found in non-multi-metrics mode.
    """
    folder_abs = os.path.abspath(evaluator_folder) if evaluator_folder else None

    if not (remote_url or folder_abs):
        raise ValueError("evaluator_folder must be specified if not using remote_url.")

    if folder_abs:
        if not os.path.exists(folder_abs):
            raise ValueError(f"Evaluator folder does not exist: {folder_abs}")
        if not os.path.isdir(folder_abs):
            raise ValueError(f"Not a directory: {folder_abs}")

    use_multi_metrics = multi_metrics
    multi_folder = None
    metric_pairs = None

    # Folder contents are only inspected when code is actually loaded from disk.
    if folder_abs and not remote_url:
        root_has_main = os.path.exists(os.path.join(folder_abs, "main.py"))
        if root_has_main and not multi_metrics:
            # Heuristic: extra .py files beside main.py suggest a multi-metric layout.
            if len(list(Path(folder_abs).glob("*.py"))) > 1:
                logger.info("Auto-detecting multi-metrics mode for deploy_folder_evaluation.")
                use_multi_metrics = True

        if use_multi_metrics:
            multi_folder = folder_abs
        else:
            if root_has_main:
                # The folder itself is the single metric.
                metric_pairs = [f"{os.path.basename(folder_abs)}={folder_abs}"]
            else:
                # Each subdirectory holding a main.py becomes its own metric.
                metric_pairs = [
                    f"{entry}={os.path.join(folder_abs, entry)}"
                    for entry in os.listdir(folder_abs)
                    if os.path.isdir(os.path.join(folder_abs, entry))
                    and os.path.exists(os.path.join(folder_abs, entry, "main.py"))
                ]
            if not metric_pairs:
                raise ValueError(
                    f"No valid metrics found in {folder_abs} for non-multi-metric deployment."
                )

    # This helper never supplies an inline code snippet.
    return create_evaluation(
        evaluator_id=evaluator_id,
        metric_folders=metric_pairs,
        multi_metrics=use_multi_metrics,
        folder=multi_folder,
        python_code_to_evaluate=None,
        display_name=display_name,
        description=description,
        force=force,
        huggingface_dataset=huggingface_dataset,
        huggingface_split=huggingface_split,
        huggingface_message_key_map=huggingface_message_key_map,
        huggingface_response_key=huggingface_response_key,
        huggingface_prompt_key=huggingface_prompt_key,
        remote_url=remote_url,
    )