eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,422 @@
1
+ import json
2
+ import re
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ from ..models import EvaluateResult, Message, MetricResult
6
+ from ..typed_interface import reward_function
7
+ from .function_calling import (
8
+ calculate_jaccard_similarity,
9
+ extract_schema_properties,
10
+ normalize_schema,
11
+ )
12
+
13
+
14
def _json_schema_error(reason: str, metric_reason: str) -> EvaluateResult:
    """Build the zero-score result shared by every early-exit failure path."""
    return EvaluateResult(
        score=0.0,
        reason=reason,
        metrics={"error": MetricResult(score=0.0, reason=metric_reason, is_score_valid=False)},
    )


def _extract_json_candidate(content_text: Any) -> Optional[str]:
    """
    Pull the most plausible JSON snippet out of free-form assistant text.

    Fenced code blocks (``` or ```json) are preferred. The first block that
    actually parses as JSON wins, so a leading non-JSON fence (for example a
    Python snippet) no longer shadows a later JSON block. If no fenced block
    parses, the first one is returned unchanged so the caller can still report
    it as invalid JSON. Without any fence, the widest ``{...}`` / ``[...]`` span
    is used, but only when it parses.

    Returns:
        The candidate JSON text, or None when nothing usable was found.
    """
    # Non-string content cannot be searched with a regex.
    if not isinstance(content_text, str) or not content_text:
        return None

    fenced_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)```", content_text)
    if fenced_blocks:
        for block in fenced_blocks:
            try:
                json.loads(block)
            except json.JSONDecodeError:
                continue
            return block
        return fenced_blocks[0]

    bare_match = re.search(r"(\{[\s\S]*\}|\[[\s\S]*\])", content_text, re.DOTALL)
    if bare_match is None:
        return None
    try:
        json.loads(bare_match.group(0))
    except json.JSONDecodeError:
        return None
    return bare_match.group(0)


def _build_schema_from_content(content: Any) -> Dict[str, Any]:
    """
    Recursively derive a minimal JSON-schema-like description of ``content``.

    Arrays are described by their first element only; empty arrays carry no
    ``items`` entry.
    """
    if isinstance(content, dict):
        return {
            "type": "object",
            "properties": {key: _build_schema_from_content(value) for key, value in content.items()},
        }
    if isinstance(content, list):
        if content:
            return {"type": "array", "items": _build_schema_from_content(content[0])}
        return {"type": "array"}
    if isinstance(content, str):
        return {"type": "string"}
    # bool must be tested before int/float because bool is a subclass of int.
    if isinstance(content, bool):
        return {"type": "boolean"}
    if isinstance(content, (int, float)):
        return {"type": "number"}
    if content is None:
        return {"type": "null"}
    return {"type": "any"}


@reward_function
def json_schema_reward(
    messages: Union[List[Message], List[Dict[str, Any]]],
    ground_truth: Optional[Union[List[Message], List[Dict[str, Any]]]] = None,
    json_content: Optional[Union[Dict[str, Any], str]] = None,
    expected_schema: Optional[Union[Dict[str, Any], str]] = None,
    **kwargs,
) -> EvaluateResult:
    """
    Evaluate JSON content against an expected schema using Jaccard similarity.

    The model's response (containing JSON) is assumed to be the last message in
    the `messages` list. A schema is inferred from the JSON content, the
    properties of the inferred and expected schemas are extracted, and the two
    property sets are compared.

    Args:
        messages: List of conversation messages, where `messages[-1]` is the
            model's response. Only consulted when `json_content` is None.
        ground_truth: Optional. Expected assistant response trajectory. Not
            directly used by this reward.
        json_content: The JSON content to evaluate, as a parsed value or a JSON
            string. If None, it is extracted from the last message. An
            explicitly supplied empty object or array is evaluated like any
            other content rather than reported as missing.
        expected_schema: The expected schema for the JSON content.
        **kwargs: Additional keyword arguments (unused here).

    Returns:
        EvaluateResult whose score is the schema similarity. Every failure path
        (no messages, non-assistant last message, no JSON found, no schema,
        invalid JSON) returns score 0.0 with a single "error" metric.
    """
    if json_content is None:
        if not messages:
            return _json_schema_error(
                "No messages provided to extract JSON content.",
                "No messages provided",
            )

        last_message = messages[-1]

        if isinstance(last_message, Message):
            if last_message.role == "assistant" and last_message.content is not None:
                content_text = last_message.content
            else:
                return _json_schema_error(
                    "Last message is not a valid assistant response to extract JSON from.",
                    "Invalid assistant message for JSON extraction.",
                )
        elif isinstance(last_message, dict):
            if last_message.get("role") == "assistant" and last_message.get("content") is not None:
                content_text = last_message.get("content", "")
            else:
                return _json_schema_error(
                    "Last message is not a valid assistant response (dict) to extract JSON from.",
                    "Invalid assistant message (dict) for JSON extraction.",
                )
        else:
            return _json_schema_error(
                f"Unexpected type for last message: {type(last_message)}.",
                "Invalid message type for JSON extraction.",
            )

        extracted_json_str = _extract_json_candidate(content_text)
        if extracted_json_str:
            json_content = extracted_json_str

    # Only a truly absent value counts as "not found"; {} and [] are valid JSON.
    if json_content is None or json_content == "":
        return _json_schema_error(
            "No JSON content found in messages.",
            "No JSON content found in messages",
        )

    if expected_schema is None:
        return _json_schema_error(
            "No expected schema provided for comparison.",
            "No expected schema provided",
        )

    expected_schema = normalize_schema(expected_schema)

    if isinstance(json_content, str):
        try:
            parsed_content = json.loads(json_content)
        except json.JSONDecodeError:
            invalid_reason = f"Invalid JSON content: {json_content}"
            return _json_schema_error(invalid_reason, invalid_reason)
    else:
        parsed_content = json_content

    content_schema = _build_schema_from_content(parsed_content)
    # NOTE(review): the helpers below come from .function_calling; the loops
    # assume they yield sets of (property, type) pairs - confirm against that module.
    expected_properties = extract_schema_properties(expected_schema)
    actual_properties = extract_schema_properties(content_schema)
    schema_similarity = calculate_jaccard_similarity(expected_properties, actual_properties)

    report_sections = (
        ("Matching", expected_properties.intersection(actual_properties)),
        ("Missing", expected_properties - actual_properties),
        ("Extra", actual_properties - expected_properties),
    )
    comparison_details = []
    for label, props in report_sections:
        if not props:
            continue
        comparison_details.append(f"{label} properties ({len(props)}):")
        comparison_details.extend(f" - {prop}: {prop_type}" for prop, prop_type in sorted(props))

    schema_comparison_reason = "\n".join(comparison_details)

    metrics = {
        "schema_similarity": MetricResult(
            score=schema_similarity,
            reason=f"Schema similarity: {schema_similarity:.2f}\n{schema_comparison_reason}",
            is_score_valid=schema_similarity == 1.0,
        )
    }

    final_score = schema_similarity
    final_reason = f"Final score based on schema similarity: {final_score:.2f}."

    return EvaluateResult(score=final_score, reason=final_reason, metrics=metrics)
226
+
227
+
228
def _llm_judge_message_field(message: Any, field: str) -> Any:
    """Read ``field`` from a message given either as a dict or as an attribute-style object."""
    if isinstance(message, dict):
        return message.get(field)
    return getattr(message, field, None)


def _llm_judge_find_json(text: Any) -> Optional[str]:
    """
    Locate a JSON snippet in assistant text for presentation to the LLM judge.

    Prefers the first fenced code block that parses as JSON, then the first
    fenced block of any kind, then the widest ``{...}`` / ``[...]`` span.
    Returns None when nothing is found or ``text`` is not a string.
    """
    if not isinstance(text, str) or not text:
        return None

    fenced_blocks = re.findall(r"```(?:json)?\s*([\s\S]*?)```", text)
    if fenced_blocks:
        for block in fenced_blocks:
            try:
                json.loads(block)
            except json.JSONDecodeError:
                continue
            return block
        return fenced_blocks[0]

    bare_match = re.search(r"(\{[\s\S]*\}|\[[\s\S]*\])", text)
    return bare_match.group(0) if bare_match else None


def json_schema_reward_with_llm_judge(
    messages: Union[List[Message], List[Dict[str, Any]]],
    ground_truth: Optional[Union[List[Message], List[Dict[str, Any]]]] = None,
    json_content: Optional[Union[Dict[str, Any], str]] = None,
    expected_schema: Optional[Union[Dict[str, Any], str]] = None,
    expected_behavior: Optional[str] = None,
    openai_api_key: Optional[str] = None,
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
    weights: Optional[Dict[str, float]] = None,
    **kwargs,
) -> EvaluateResult:
    """
    Combined reward function that evaluates JSON content using both schema
    validation and LLM judgment.

    Args:
        messages: The conversation messages, where `messages[-1]` is the
            model's response. Dict messages and attribute-style message
            objects are both accepted.
        ground_truth: Optional. Expected assistant response trajectory. Not
            directly used by this reward.
        json_content: The JSON content to evaluate (if not provided, extracts
            from the last message).
        expected_schema: The expected schema for the JSON content.
        expected_behavior: Description of the expected behavior/content. When
            empty, the LLM judge is skipped and contributes a score of 0.0.
        openai_api_key: OpenAI API key (if not provided, uses the
            OPENAI_API_KEY environment variable).
        model: Model to use for LLM evaluation (default: gpt-4o-mini).
        temperature: Temperature for the model generation (default: 0.0).
        weights: Dictionary of weights for each component
            (default: {"schema": 0.7, "llm": 0.3}). Only the "schema" and
            "llm" keys are used; a missing key takes its default, the pair is
            normalized to sum to 1, and a non-positive total falls back to
            the defaults.
        **kwargs: Additional keyword arguments, forwarded to
            `json_schema_reward`.

    Returns:
        EvaluateResult with the weighted composite score and per-component
        metrics. If the OpenAI package is missing, or JSON must be extracted
        from the messages and schema evaluation fails, an error result is
        returned instead.
    """
    # Import OpenAI at call time to make this optional
    try:
        from openai import OpenAI
    except ImportError:
        return EvaluateResult(
            score=0.0,
            reason="OpenAI package not installed.",
            metrics={
                "error": MetricResult(
                    score=0.0,
                    reason="OpenAI package not installed. Install it with: pip install openai",
                    is_score_valid=False,
                )
            },
        )

    import os

    default_schema_weight, default_llm_weight = 0.7, 0.3
    if weights is None:
        weights = {"schema": default_schema_weight, "llm": default_llm_weight}

    # Normalize over just the two components that are combined below, so the
    # effective weights always sum to 1 and a zero total cannot divide by zero.
    raw_schema_weight = weights.get("schema", default_schema_weight)
    raw_llm_weight = weights.get("llm", default_llm_weight)
    total_weight = raw_schema_weight + raw_llm_weight
    if total_weight > 0:
        schema_weight = raw_schema_weight / total_weight
        llm_weight = raw_llm_weight / total_weight
    else:
        schema_weight, llm_weight = default_schema_weight, default_llm_weight

    schema_result = json_schema_reward(
        messages=messages,
        ground_truth=ground_truth,
        json_content=json_content,
        expected_schema=expected_schema,
        **kwargs,
    )

    llm_score = 0.0
    llm_reason = "Skipped: No expected behavior provided"
    if expected_behavior:
        if json_content is None:
            # Extraction from the messages already failed in the schema step;
            # there is nothing meaningful to show the judge.
            if "error" in schema_result.metrics:
                return schema_result

            last_content = _llm_judge_message_field(messages[-1], "content") or ""
            json_str_from_msg = _llm_judge_find_json(last_content)
            if json_str_from_msg:
                try:
                    json_content = json.loads(json_str_from_msg)
                except json.JSONDecodeError:
                    json_content = json_str_from_msg
            else:
                # Show the judge the raw reply rather than the literal "None".
                json_content = last_content

        # Serialize arrays as JSON too; str() would emit a Python repr.
        if isinstance(json_content, (dict, list)):
            json_str_for_llm = json.dumps(json_content, indent=2)
        else:
            json_str_for_llm = str(json_content)

        if not expected_schema:
            expected_schema_str = "No schema provided"
        elif isinstance(expected_schema, str):
            # Already textual; json.dumps would wrap it in quotes and escape it.
            expected_schema_str = expected_schema
        else:
            expected_schema_str = json.dumps(expected_schema, indent=2)

        conversation_msg = "No conversation context provided"
        if messages:
            conversation_parts = []
            for msg in messages[:-1]:
                role = _llm_judge_message_field(msg, "role") or ""
                content_part = _llm_judge_message_field(msg, "content") or ""
                if role and content_part:
                    conversation_parts.append(f"{role}: {content_part}")
            if conversation_parts:
                conversation_msg = "\n".join(conversation_parts)

        prompt = f"""You are evaluating the quality of JSON content provided by an AI assistant.
Your job is to assess whether the JSON structure and content is appropriate, correctly formatted,
and follows the expected schema and behavior.

CONVERSATION CONTEXT:
{conversation_msg}

JSON CONTENT:
{json_str_for_llm}

EXPECTED SCHEMA:
{expected_schema_str}

EXPECTED BEHAVIOR/CONTENT:
{expected_behavior}

Evaluate the JSON content and provide:
1. A score from 0.0 to 1.0 (where 1.0 is perfect)
2. A detailed explanation of your rating
3. Specific issues or strengths of the JSON content

Format your response as:
SCORE: [number between 0.0 and 1.0]
EXPLANATION: [your detailed explanation]
"""
        # Any failure talking to the judge is reported through the metric
        # reason rather than raised, so the schema score is still returned.
        try:
            api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OpenAI API key not provided")
            client = OpenAI(api_key=api_key)
            response = client.chat.completions.create(
                model=model,
                temperature=temperature,
                messages=[{"role": "user", "content": prompt}],
            )
            llm_response = response.choices[0].message.content or ""
            # Match one well-formed number so trailing punctuation ("0.9.")
            # is not swallowed into the capture.
            score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?|\.\d+)", llm_response)
            explanation_match = re.search(r"EXPLANATION:\s*(.*)", llm_response, re.DOTALL)
            if score_match:
                llm_score = max(0.0, min(float(score_match.group(1)), 1.0))
            else:
                # Unparseable verdict: fall back to a neutral score.
                llm_score = 0.5
            llm_reason = explanation_match.group(1).strip() if explanation_match else "No explanation provided"
        except Exception as e:
            llm_score = 0.0
            llm_reason = f"Error calling OpenAI API: {str(e)}"

    # Re-export the schema metrics; everything except the similarity metric is
    # namespaced with a "schema_" prefix.
    combined_metrics = {}
    for key, metric_val in schema_result.metrics.items():
        if key != "schema_similarity":
            combined_metrics[f"schema_{key}"] = metric_val
        else:
            combined_metrics[key] = metric_val

    combined_metrics["llm_judge"] = MetricResult(
        score=llm_score,
        reason=llm_reason,
        is_score_valid=llm_score >= 0.8,
    )
    combined_metrics["schema_score"] = MetricResult(
        score=schema_result.score,
        reason=f"Schema validation score: {schema_result.score:.2f}",
        is_score_valid=schema_result.score == 1.0,
    )
    combined_metrics["llm_score"] = MetricResult(
        score=llm_score,
        reason=f"LLM judge score: {llm_score:.2f}",
        is_score_valid=llm_score >= 0.8,
    )

    final_score = (schema_result.score * schema_weight) + (llm_score * llm_weight)
    final_reason = f"Composite score. Schema ({schema_result.score:.2f} * {schema_weight:.2f}) + LLM ({llm_score:.2f} * {llm_weight:.2f})."

    combined_metrics["weights"] = MetricResult(
        score=0.0,
        reason=f"Weights used - Schema: {schema_weight:.2f}, LLM: {llm_weight:.2f}",
        is_score_valid=True,
    )

    return EvaluateResult(score=final_score, reason=final_reason, metrics=combined_metrics)