ibm_watsonx_gov-1.3.3-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py (new file)
@@ -0,0 +1,1882 @@
1
+ from typing import Any, Dict, List, Optional
2
+ import asyncio
3
+ import json
4
+ import logging
5
+
6
+ from llmevalkit.llm.output_parser import ValidatingLLMClient
7
+ from llmevalkit.metrics import Metric, StandardMetric, MetricPrompt
8
+ from .base import BaseComparator
9
+ from ..types import (
10
+ ParameterComparisonResult,
11
+ ComparisonStrategy,
12
+ ParameterStatus,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class ParameterComparisonMetric(StandardMetric):
19
+ """Metric for parameter comparison using LLM judge."""
20
+
21
+ def __init__(self):
22
+ super().__init__(
23
+ name="parameter_comparison",
24
+ description="Compare two parameter values for semantic equivalence",
25
+ output_range=(0.0, 1.0),
26
+ confidence_range=(0.0, 1.0),
27
+ )
28
+
29
+ # Add custom field for match determination
30
+ from llmevalkit.metrics.field import BaseField
31
+
32
+ self.add_field(
33
+ BaseField(
34
+ name="is_match",
35
+ json_type="boolean",
36
+ description="Whether the parameters are functionally equivalent",
37
+ )
38
+ )
39
+
40
+
41
+ class ParameterComparisonPrompt(MetricPrompt):
42
+ """Prompt template for individual parameter comparison."""
43
+
44
+ system_template = """You are an expert system for comparing function call parameters. Your task is to determine how similar two parameter values are in the context of API function calls.
45
+
46
+ **SCORING GUIDELINES**:
47
+
48
+ 1. **EXACT MATCH (score = 1.0)**: Identical values
49
+ - "hello" == "hello"
50
+ - 123 == 123
51
+ - true == true
52
+
53
+ 2. **SEMANTIC EQUIVALENCE (score = 0.9-1.0)**: Different representation, same meaning
54
+ - "true" vs true (string vs boolean)
55
+ - "123" vs 123 (string vs number)
56
+ - "2023-01-01" vs "January 1, 2023" (different date formats)
57
+ - "yes" vs true (affirmative equivalents)
58
+
59
+ 3. **PARTIAL MATCH (score = 0.3-0.8)**: Similar but not equivalent
60
+ - "hello world" vs "hello" (partial string match)
61
+ - Related but different values
62
+ - Values with minor formatting differences
63
+
64
+ 4. **NO MATCH (score = 0.0-0.2)**: Completely different values
65
+ - "hello" vs 123
66
+ - true vs "no"
67
+ - Unrelated values
68
+
69
+ **Context**: Consider the conversation history and tool purpose when determining equivalence.
70
+
71
+ {% if custom_instructions %}
72
+ **CUSTOM EVALUATION INSTRUCTIONS**:
73
+ {{ custom_instructions }}
74
+
75
+ Follow these custom instructions carefully when making your comparison. They take priority over general guidelines when there are conflicts.
76
+ {% endif %}
77
+
78
+ {% if custom_schema %}
79
+ **CUSTOM RESPONSE SCHEMA**:
80
+ Use this custom JSON schema for your response instead of the default schema:
81
+ {{ custom_schema }}
82
+
83
+ IMPORTANT: Your response must strictly follow this custom schema format.
84
+ {% else %}
85
+ {{ metric_jsonschema }}
86
+ {% endif %}"""
87
+
88
+ user_template = """**Parameter Comparison Task**
89
+
90
+ Parameter Name: {{ parameter_name }}
91
+ Parameter Type: {{ parameter_type }}
92
+ Required: {{ is_required }}
93
+ Status: {{ parameter_status }}
94
+ Default Value: {{ default_value }}
95
+ Tool Function: {{ function_name }}
96
+
97
+ **Predicted Value**:
98
+ - Value: {{ predicted_value }}
99
+ - Type: {{ predicted_type }}
100
+
101
+ **Ground Truth Value**:
102
+ - Value: {{ ground_truth_value }}
103
+ - Type: {{ ground_truth_type }}
104
+
105
+ **Parameter Definition**:
106
+ {{ parameter_definition }}
107
+
108
+ **Tool Specification (OpenAI Format)**:
109
+ {% if tool_specification %}
110
+ {{ tool_specification }}
111
+ {% else %}
112
+ No tool specification provided
113
+ {% endif %}
114
+
115
+ **Conversation Context**:
116
+ {% for message in conversation_context %}
117
+ {{ message.role }}: {{ message.content }}
118
+ {% endfor %}
119
+
120
+ {% if custom_instructions %}
121
+ **CUSTOM EVALUATION INSTRUCTIONS**:
122
+ {{ custom_instructions }}
123
+
124
+ IMPORTANT: Follow these custom instructions carefully when making your comparison. They take priority over general guidelines when there are conflicts.
125
+ {% endif %}
126
+
127
+ {% if custom_schema %}
128
+ **CUSTOM RESPONSE SCHEMA**:
129
+ Use this custom JSON schema for your response instead of the default schema:
130
+ {{ custom_schema }}
131
+
132
+ IMPORTANT: Your response must strictly follow this custom schema format.
133
+ {% endif %}
134
+
135
+ Please compare these parameter values and provide a detailed analysis."""
136
+
137
+
138
+ class BulkParameterComparisonMetric(StandardMetric):
139
+ """Metric for bulk parameter comparison using LLM judge."""
140
+
141
+ def __init__(self):
142
+ super().__init__(
143
+ name="bulk_parameter_comparison",
144
+ description="Compare all parameters between predicted and ground truth tool calls in one evaluation",
145
+ output_range=(0.0, 1.0),
146
+ confidence_range=(0.0, 1.0),
147
+ )
148
+
149
+ # Add custom field for parameter-level scores
150
+ from llmevalkit.metrics.field import BaseField
151
+
152
+ self.add_field(
153
+ BaseField(
154
+ name="parameter_scores",
155
+ json_type="object",
156
+ description="Individual parameter comparison scores and explanations",
157
+ )
158
+ )
159
+
160
+
161
+ class BulkParameterComparisonPrompt(MetricPrompt):
162
+ """Prompt template for bulk parameter comparison."""
163
+
164
+ system_template = """You are an expert system for comparing function call parameters. Your task is to evaluate ALL parameters between two tool calls simultaneously and provide both individual parameter scores and an overall assessment.
165
+
166
+ **SCORING GUIDELINES**:
167
+
168
+ 1. **EXACT MATCH (score = 1.0)**: Identical values
169
+ 2. **SEMANTIC EQUIVALENCE (score = 0.9-1.0)**: Different representation, same meaning
170
+ 3. **PARTIAL MATCH (score = 0.3-0.8)**: Similar but not equivalent
171
+ 4. **NO MATCH (score = 0.0-0.2)**: Completely different values
172
+
173
+ **EVALUATION PROCESS**:
174
+ 1. Compare each parameter individually using the scoring guidelines
175
+ 2. Consider parameter importance (required vs optional)
176
+ 3. Account for default values and missing parameters
177
+ 4. Provide an overall score that weights individual parameter scores appropriately
178
+
179
+ **OUTPUT REQUIREMENTS**:
180
+ - Overall score: Weighted average considering parameter importance
181
+ - Individual parameter scores: For each compared parameter
182
+ - Detailed explanations: For both overall and individual assessments
183
+
184
+ {% if custom_instructions %}
185
+ **CUSTOM EVALUATION INSTRUCTIONS**:
186
+ {{ custom_instructions }}
187
+
188
+ Follow these custom instructions carefully when making your comparison. They take priority over general guidelines when there are conflicts.
189
+ {% endif %}
190
+
191
+ {% if custom_schema %}
192
+ **CUSTOM RESPONSE SCHEMA**:
193
+ Use this custom JSON schema for your response instead of the default schema:
194
+ {{ custom_schema }}
195
+
196
+ IMPORTANT: Your response must strictly follow this custom schema format.
197
+ {% else %}
198
+ {{ metric_jsonschema }}
199
+ {% endif %}"""
200
+
201
+ user_template = """**Bulk Parameter Comparison Task**
202
+
203
+ Function Name: {{ function_name }}
204
+
205
+ **Predicted Call Arguments**:
206
+ {{ predicted_arguments_json }}
207
+
208
+ **Ground Truth Call Arguments**:
209
+ {{ ground_truth_arguments_json }}
210
+
211
+ **Tool Specification (OpenAI Format)**:
212
+ {% if tool_specification %}
213
+ {{ tool_specification }}
214
+ {% else %}
215
+ No tool specification provided
216
+ {% endif %}
217
+
218
+ **Parameters to Compare**:
219
+ {% for param_name, param_info in parameters_info.items() %}
220
+ - **{{ param_name }}**:
221
+ - Required: {{ param_info.is_required }}
222
+ - Type: {{ param_info.parameter_type }}
223
+ - Default: {{ param_info.default_value }}
224
+ - Status: {{ param_info.status }}
225
+ - Predicted: {{ param_info.predicted_value }} ({{ param_info.predicted_type }})
226
+ - Ground Truth: {{ param_info.ground_truth_value }} ({{ param_info.ground_truth_type }})
227
+ {% endfor %}
228
+
229
+ **Conversation Context**:
230
+ {% for message in conversation_context %}
231
+ {{ message.role }}: {{ message.content }}
232
+ {% endfor %}
233
+
234
+ {% if custom_instructions %}
235
+ **CUSTOM EVALUATION INSTRUCTIONS**:
236
+ {{ custom_instructions }}
237
+
238
+ IMPORTANT: Follow these custom instructions carefully when making your comparison. They take priority over general guidelines when there are conflicts.
239
+ {% endif %}
240
+
241
+ {% if custom_schema %}
242
+ **CUSTOM RESPONSE SCHEMA**:
243
+ Use this custom JSON schema for your response instead of the default schema:
244
+ {{ custom_schema }}
245
+
246
+ IMPORTANT: Your response must strictly follow this custom schema format.
247
+ {% endif %}
248
+
249
+ Please evaluate all parameters and provide individual scores plus an overall assessment."""
250
+
251
+
252
+ class LLMJudgeComparator(BaseComparator):
253
+ """LLM-based semantic comparison using ValidatingLLMClient and metrics framework."""
254
+
255
+ def __init__(self, config, llm_client):
256
+ super().__init__(config)
257
+
258
+ # Accept ValidatingLLMClient or compatible objects (for testing)
259
+ if not (
260
+ isinstance(llm_client, ValidatingLLMClient)
261
+ or hasattr(llm_client, "generate")
262
+ or hasattr(llm_client, "generate_async")
263
+ ):
264
+ raise TypeError(
265
+ "LLMJudgeComparator requires a ValidatingLLMClient or compatible client"
266
+ )
267
+
268
+ self.llm_client = llm_client
269
+
270
+ # Initialize metrics and prompts for both individual and bulk comparison
271
+ self.metric = ParameterComparisonMetric()
272
+ self.prompt = ParameterComparisonPrompt(
273
+ metric=self.metric,
274
+ system_template=ParameterComparisonPrompt.system_template,
275
+ user_template=ParameterComparisonPrompt.user_template,
276
+ )
277
+
278
+ # Initialize bulk comparison metric and prompt
279
+ self.bulk_metric = BulkParameterComparisonMetric()
280
+ self.bulk_prompt = BulkParameterComparisonPrompt(
281
+ metric=self.bulk_metric,
282
+ system_template=BulkParameterComparisonPrompt.system_template,
283
+ user_template=BulkParameterComparisonPrompt.user_template,
284
+ )
285
+
286
+ # Add few-shot examples
287
+ self._add_examples()
288
+
289
+ @staticmethod
290
+ def get_default_individual_schema():
291
+ """Get default JSON schema for individual parameter comparison."""
292
+ return {
293
+ "type": "object",
294
+ "properties": {
295
+ "score": {
296
+ "type": "number",
297
+ "description": "Similarity score between 0.0 and 1.0",
298
+ },
299
+ "is_match": {
300
+ "type": "boolean",
301
+ "description": "Whether the parameters are functionally equivalent",
302
+ },
303
+ "explanation": {
304
+ "type": "string",
305
+ "description": "Brief explanation of the comparison result",
306
+ },
307
+ "confidence": {
308
+ "type": "number",
309
+ "description": "Confidence in the assessment between 0.0 and 1.0",
310
+ },
311
+ },
312
+ "required": ["score", "is_match", "explanation", "confidence"],
313
+ }
314
+
315
+ @staticmethod
316
+ def get_default_bulk_schema():
317
+ """Get default JSON schema for bulk parameter comparison."""
318
+ return {
319
+ "type": "object",
320
+ "properties": {
321
+ "overall_score": {
322
+ "type": "number",
323
+ "description": "Overall similarity score between 0.0 and 1.0",
324
+ },
325
+ "overall_explanation": {
326
+ "type": "string",
327
+ "description": "Detailed explanation of the comparison",
328
+ },
329
+ "confidence": {
330
+ "type": "number",
331
+ "description": "Confidence in the assessment between 0.0 and 1.0",
332
+ },
333
+ "parameter_scores": {
334
+ "type": "object",
335
+ "additionalProperties": {
336
+ "type": "object",
337
+ "properties": {
338
+ "score": {
339
+ "type": "number",
340
+ "description": "Individual parameter score",
341
+ },
342
+ "explanation": {
343
+ "type": "string",
344
+ "description": "Explanation for this parameter",
345
+ },
346
+ "is_match": {
347
+ "type": "boolean",
348
+ "description": "Whether this parameter matches",
349
+ },
350
+ },
351
+ "required": ["score", "explanation", "is_match"],
352
+ },
353
+ },
354
+ },
355
+ "required": [
356
+ "overall_score",
357
+ "overall_explanation",
358
+ "confidence",
359
+ "parameter_scores",
360
+ ],
361
+ }
362
+
363
+ def _add_examples(self):
364
+ """Add few-shot examples to improve LLM performance."""
365
+ examples = [
366
+ {
367
+ "user_kwargs": {
368
+ "parameter_name": "enabled",
369
+ "function_name": "set_feature",
370
+ "parameter_type": "boolean",
371
+ "is_required": "true",
372
+ "parameter_status": "both_present",
373
+ "default_value": "null",
374
+ "predicted_value": "true",
375
+ "predicted_type": "string",
376
+ "ground_truth_value": "true",
377
+ "ground_truth_type": "boolean",
378
+ "conversation_context": [],
379
+ "parameter_definition": "Boolean flag to enable feature",
380
+ },
381
+ "output": {
382
+ "explanation": "String 'true' is semantically equivalent to boolean true",
383
+ "evidence": "Both values represent the same boolean state despite different types",
384
+ "output": 1.0,
385
+ "confidence": 0.95,
386
+ "is_match": True,
387
+ "correction": {
388
+ "has_issues": False,
389
+ "issue_type": "none",
390
+ "corrected_value": None,
391
+ },
392
+ },
393
+ },
394
+ {
395
+ "user_kwargs": {
396
+ "parameter_name": "count",
397
+ "function_name": "process_items",
398
+ "parameter_type": "integer",
399
+ "is_required": "true",
400
+ "parameter_status": "both_present",
401
+ "default_value": "null",
402
+ "predicted_value": "10",
403
+ "predicted_type": "string",
404
+ "ground_truth_value": "10",
405
+ "ground_truth_type": "integer",
406
+ "conversation_context": [],
407
+ "parameter_definition": "Number of items to process",
408
+ },
409
+ "output": {
410
+ "explanation": "String '10' represents the same numeric value as integer 10",
411
+ "evidence": "Both values represent the same quantity despite type difference",
412
+ "output": 1.0,
413
+ "confidence": 0.98,
414
+ "is_match": True,
415
+ "correction": {
416
+ "has_issues": False,
417
+ "issue_type": "none",
418
+ "corrected_value": None,
419
+ },
420
+ },
421
+ },
422
+ {
423
+ "user_kwargs": {
424
+ "parameter_name": "message",
425
+ "function_name": "send_notification",
426
+ "parameter_type": "string",
427
+ "is_required": "true",
428
+ "parameter_status": "both_present",
429
+ "default_value": "null",
430
+ "predicted_value": "Hello World",
431
+ "predicted_type": "string",
432
+ "ground_truth_value": "Hello world",
433
+ "ground_truth_type": "string",
434
+ "conversation_context": [],
435
+ "parameter_definition": "Message text to display",
436
+ },
437
+ "output": {
438
+ "explanation": "Minor capitalization difference in otherwise identical strings",
439
+ "evidence": "Content is essentially the same with only case variation",
440
+ "output": 0.9,
441
+ "confidence": 0.8,
442
+ "is_match": False,
443
+ "correction": {
444
+ "has_issues": True,
445
+ "issue_type": "formatting",
446
+ "corrected_value": "Hello world",
447
+ },
448
+ },
449
+ },
450
+ ]
451
+
452
+ for example in examples:
453
+ self.prompt.add_example(example["user_kwargs"], example["output"])
454
+
455
+ def compare_parameter(
456
+ self,
457
+ param_name: str,
458
+ predicted_value: Any,
459
+ ground_truth_value: Any,
460
+ context: Optional[Dict[str, Any]] = None,
461
+ custom_instructions: Optional[str] = None,
462
+ custom_schema: Optional[Dict[str, Any]] = None,
463
+ ) -> ParameterComparisonResult:
464
+ """Compare parameters with optional custom instructions - SYNC VERSION"""
465
+
466
+ context = context or {}
467
+ param_def = context.get("parameter_definition", {})
468
+ param_status = context.get("parameter_status", ParameterStatus.BOTH_PRESENT)
469
+
470
+ # Normalize values if configured
471
+ param_type = param_def.get("type", "string") if param_def else "string"
472
+
473
+ if self.config.normalize_types:
474
+ predicted_value = self._normalize_value(predicted_value, param_type)
475
+ ground_truth_value = self._normalize_value(ground_truth_value, param_type)
476
+
477
+ # Build prompt arguments
478
+ user_kwargs = {
479
+ "parameter_name": param_name,
480
+ "function_name": context.get("function_name", "unknown"),
481
+ "parameter_type": param_type,
482
+ "is_required": (
483
+ str(param_def.get("required", False)) if param_def else "false"
484
+ ),
485
+ "parameter_status": (
486
+ param_status.value
487
+ if hasattr(param_status, "value")
488
+ else str(param_status)
489
+ ),
490
+ "default_value": (
491
+ str(param_def.get("default"))
492
+ if param_def and param_def.get("default") is not None
493
+ else "null"
494
+ ),
495
+ "predicted_value": str(predicted_value),
496
+ "predicted_type": type(predicted_value).__name__,
497
+ "ground_truth_value": str(ground_truth_value),
498
+ "ground_truth_type": type(ground_truth_value).__name__,
499
+ "conversation_context": context.get("conversation_history", []),
500
+ "parameter_definition": (
501
+ param_def.get("description", "") if param_def else ""
502
+ ),
503
+ "tool_specification": context.get("tool_specification", None),
504
+ "custom_instructions": custom_instructions or " ",
505
+ }
506
+
507
+ # Build messages using the prompt template
508
+ messages = self.prompt.build_messages(user_kwargs)
509
+
510
+ try:
511
+ # Use ValidatingLLMClient with schema - try metric schema first, fallback to default
512
+ schema = None
513
+ try:
514
+ schema = self.metric.to_jsonschema()
515
+ except Exception:
516
+ # Fallback to default schema if metric schema fails
517
+ schema = self.get_default_individual_schema()
518
+
519
+ response = self.llm_client.generate(
520
+ prompt=messages,
521
+ schema=schema,
522
+ )
523
+
524
+ # Parse the validated response
525
+ if isinstance(response, str):
526
+ result_data = json.loads(response)
527
+ else:
528
+ result_data = response
529
+
530
+ score = float(result_data.get("output", 0.0))
531
+ explanation = result_data.get("explanation", "")
532
+ confidence = float(result_data.get("confidence", 0.5))
533
+ is_match = bool(result_data.get("is_match", False))
534
+ evidence = result_data.get("evidence", "")
535
+
536
+ # Clamp score to valid range
537
+ score = max(0.0, min(1.0, score))
538
+ confidence = max(0.0, min(1.0, confidence))
539
+
540
+ except Exception as e:
541
+ logger.warning(f"LLM comparison failed for parameter {param_name}: {e}")
542
+
543
+ # Fallback to exact match
544
+ is_match = predicted_value == ground_truth_value
545
+ score = 1.0 if is_match else 0.0
546
+ explanation = f"LLM comparison failed ({str(e)}), using exact match. Result: {is_match}"
547
+ confidence = 0.3 # Low confidence due to fallback
548
+ evidence = "Fallback comparison due to LLM error"
549
+
550
+ return ParameterComparisonResult(
551
+ parameter_name=param_name,
552
+ predicted_value=predicted_value,
553
+ ground_truth_value=ground_truth_value,
554
+ predicted_resolved_value=predicted_value,
555
+ ground_truth_resolved_value=ground_truth_value,
556
+ parameter_status=param_status,
557
+ comparison_strategy=ComparisonStrategy.LLM_JUDGE,
558
+ score=score,
559
+ explanation=explanation,
560
+ evidence=evidence,
561
+ is_match=is_match,
562
+ confidence=confidence,
563
+ )
564
+
565
+ def compare_function_name(
566
+ self,
567
+ predicted_name: str,
568
+ ground_truth_name: str,
569
+ context: Optional[Dict[str, Any]] = None,
570
+ ) -> float:
571
+ """Compare function names with semantic understanding."""
572
+
573
+ # Exact match gets perfect score
574
+ if predicted_name == ground_truth_name:
575
+ return 1.0
576
+
577
+ # Use LLM for semantic function name comparison
578
+ system_prompt = """You are comparing two function names for semantic similarity.
579
+ Consider:
580
+ 1. Exact matches = 1.0
581
+ 2. Synonymous functions (e.g., "get_user" vs "fetch_user") = 0.8-0.9
582
+ 3. Related functions (e.g., "get_user" vs "get_profile") = 0.3-0.7
583
+ 4. Unrelated functions = 0.0-0.2
584
+
585
+ Return only a numeric score between 0.0 and 1.0."""
586
+
587
+ messages = [
588
+ {"role": "system", "content": system_prompt},
589
+ {
590
+ "role": "user",
591
+ "content": f"Function 1: {predicted_name}\nFunction 2: {ground_truth_name}",
592
+ },
593
+ ]
594
+
595
+ try:
596
+ response = self.llm_client.generate(prompt=messages)
597
+
598
+ # Extract numeric score
599
+ import re
600
+
601
+ score_match = re.search(r"\b([0-1](?:\.\d+)?)\b", str(response))
602
+ if score_match:
603
+ score = float(score_match.group(1))
604
+ return max(0.0, min(1.0, score))
605
+
606
+ except Exception as e:
607
+ logger.warning(f"LLM function name comparison failed: {e}")
608
+
609
+ # Conservative fallback
610
+ return 0.0
611
+
612
+ async def compare_parameter_async(
613
+ self,
614
+ param_name: str,
615
+ predicted_value: Any,
616
+ ground_truth_value: Any,
617
+ context: Optional[Dict[str, Any]] = None,
618
+ custom_instructions: Optional[str] = None,
619
+ custom_schema: Optional[Dict[str, Any]] = None,
620
+ ) -> ParameterComparisonResult:
621
+ """Async version of parameter comparison."""
622
+
623
+ context = context or {}
624
+ param_def = context.get("parameter_definition", {})
625
+ param_status = context.get("parameter_status", ParameterStatus.BOTH_PRESENT)
626
+
627
+ # Normalize values if configured
628
+ param_type = param_def.get("type", "string") if param_def else "string"
629
+
630
+ if self.config.normalize_types:
631
+ predicted_value = self._normalize_value(predicted_value, param_type)
632
+ ground_truth_value = self._normalize_value(ground_truth_value, param_type)
633
+
634
+ # Build prompt arguments
635
+ user_kwargs = {
636
+ "parameter_name": param_name,
637
+ "function_name": context.get("function_name", "unknown"),
638
+ "parameter_type": param_type,
639
+ "is_required": (
640
+ str(param_def.get("required", False)) if param_def else "false"
641
+ ),
642
+ "parameter_status": (
643
+ param_status.value
644
+ if hasattr(param_status, "value")
645
+ else str(param_status)
646
+ ),
647
+ "default_value": (
648
+ str(param_def.get("default"))
649
+ if param_def and param_def.get("default") is not None
650
+ else "null"
651
+ ),
652
+ "predicted_value": str(predicted_value),
653
+ "predicted_type": type(predicted_value).__name__,
654
+ "ground_truth_value": str(ground_truth_value),
655
+ "ground_truth_type": type(ground_truth_value).__name__,
656
+ "conversation_context": context.get("conversation_history", []),
657
+ "parameter_definition": (
658
+ param_def.get("description", "") if param_def else ""
659
+ ),
660
+ "tool_specification": context.get("tool_specification", None),
661
+ "custom_instructions": custom_instructions or " ",
662
+ }
663
+
664
+ # Build messages using the prompt template
665
+ messages = self.prompt.build_messages(user_kwargs)
666
+
667
+ try:
668
+ # Use ValidatingLLMClient with schema - try metric schema first, fallback to default
669
+ schema = None
670
+ try:
671
+ schema = self.metric.to_jsonschema()
672
+ except Exception:
673
+ # Fallback to default schema if metric schema fails
674
+ schema = self.get_default_individual_schema()
675
+
676
+ response = await self.llm_client.generate_async(
677
+ prompt=messages,
678
+ schema=schema,
679
+ )
680
+
681
+ # Parse the validated response
682
+ if isinstance(response, str):
683
+ result_data = json.loads(response)
684
+ else:
685
+ result_data = response
686
+
687
+ score = float(result_data.get("output", 0.0))
688
+ explanation = result_data.get("explanation", "")
689
+ confidence = float(result_data.get("confidence", 0.5))
690
+ is_match = bool(result_data.get("is_match", False))
691
+ evidence = result_data.get("evidence", "")
692
+
693
+ # Clamp score to valid range
694
+ score = max(0.0, min(1.0, score))
695
+ confidence = max(0.0, min(1.0, confidence))
696
+
697
+ except Exception as e:
698
+ logger.warning(
699
+ f"Async LLM comparison failed for parameter {param_name}: {e}"
700
+ )
701
+
702
+ # Fallback to exact match
703
+ is_match = predicted_value == ground_truth_value
704
+ score = 1.0 if is_match else 0.0
705
+ explanation = f"LLM comparison failed ({str(e)}), using exact match. Result: {is_match}"
706
+ confidence = 0.3 # Low confidence due to fallback
707
+ evidence = "Fallback comparison due to LLM error"
708
+
709
+ return ParameterComparisonResult(
710
+ parameter_name=param_name,
711
+ predicted_value=predicted_value,
712
+ ground_truth_value=ground_truth_value,
713
+ predicted_resolved_value=predicted_value,
714
+ ground_truth_resolved_value=ground_truth_value,
715
+ parameter_status=param_status,
716
+ comparison_strategy=ComparisonStrategy.LLM_JUDGE,
717
+ score=score,
718
+ explanation=explanation,
719
+ evidence=evidence,
720
+ is_match=is_match,
721
+ confidence=confidence,
722
+ )
723
+
724
+ async def compare_tool_calls_async(
725
+ self,
726
+ predicted_call: Dict[str, Any],
727
+ ground_truth_call: Dict[str, Any],
728
+ conversation_history: Optional[List[Dict[str, str]]] = None,
729
+ tool_specs: Optional[List[Dict[str, Any]]] = None,
730
+ custom_instructions: Optional[str] = None,
731
+ ) -> Any: # Return type imported dynamically to avoid circular imports
732
+ """Async version of tool call comparison with parameter-level async support."""
733
+
734
+ # Import here to avoid circular imports
735
+ from ..types import ToolCallComparisonResult, FunctionCallInput
736
+
737
+ # Use the base class logic but with async parameter comparison
738
+ # Extract function names
739
+ pred_name = predicted_call.get("function", {}).get("name", "")
740
+ gt_name = ground_truth_call.get("function", {}).get("name", "")
741
+
742
+ # Compare function names (sync operation)
743
+ fn_score = self.compare_function_name(pred_name, gt_name)
744
+ fn_match = fn_score >= 0.95
745
+
746
+ # Extract tool specification
747
+ tool_spec = self._extract_tool_spec(
748
+ gt_name, tool_specs
749
+ ) or self._extract_tool_spec(pred_name, tool_specs)
750
+
751
+ # Extract and parse parameters
752
+ pred_params = predicted_call.get("function", {}).get("arguments", {})
753
+ gt_params = ground_truth_call.get("function", {}).get("arguments", {})
754
+
755
+ if isinstance(pred_params, str):
756
+ try:
757
+ pred_params = json.loads(pred_params)
758
+ except json.JSONDecodeError:
759
+ logger.warning(f"Failed to parse predicted parameters: {pred_params}")
760
+ pred_params = {}
761
+
762
+ if isinstance(gt_params, str):
763
+ try:
764
+ gt_params = json.loads(gt_params)
765
+ except json.JSONDecodeError:
766
+ logger.warning(f"Failed to parse ground truth parameters: {gt_params}")
767
+ gt_params = {}
768
+
769
+ # Resolve parameters with defaults
770
+ pred_resolved = self._resolve_parameters_with_defaults(pred_params, tool_spec)
771
+ gt_resolved = self._resolve_parameters_with_defaults(gt_params, tool_spec)
772
+
773
+ # Determine all parameters to compare
774
+ params_to_compare = self.config.parameters_to_compare
775
+ if params_to_compare is None:
776
+ if self.config.include_default_parameters:
777
+ params_to_compare = set(pred_resolved.keys()) | set(gt_resolved.keys())
778
+ else:
779
+ params_to_compare = set(pred_params.keys()) | set(gt_params.keys())
780
+
781
+ # Compare each parameter asynchronously
782
+ param_tasks = []
783
+ context = {
784
+ "conversation_history": conversation_history,
785
+ "tool_specs": tool_specs,
786
+ "tool_spec": tool_spec,
787
+ "predicted_call": predicted_call,
788
+ "ground_truth_call": ground_truth_call,
789
+ "function_name": gt_name or pred_name,
790
+ }
791
+
792
+ import asyncio
793
+
794
+ for param_name in params_to_compare:
795
+ pred_val = pred_params.get(param_name)
796
+ gt_val = gt_params.get(param_name)
797
+ pred_resolved_val = pred_resolved.get(param_name)
798
+ gt_resolved_val = gt_resolved.get(param_name)
799
+
800
+ # Get parameter definition from tool spec
801
+ param_def = None
802
+ if tool_spec:
803
+ param_def = next(
804
+ (p for p in tool_spec.parameters if p.name == param_name), None
805
+ )
806
+
807
+ # Determine parameter status
808
+ param_status = self._determine_parameter_status(
809
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
810
+ )
811
+
812
+ # Enhanced context for this parameter
813
+ param_context = context.copy()
814
+ param_context.update(
815
+ {
816
+ "parameter_definition": param_def.dict() if param_def else None,
817
+ "parameter_status": param_status,
818
+ "predicted_resolved": pred_resolved_val,
819
+ "ground_truth_resolved": gt_resolved_val,
820
+ }
821
+ )
822
+
823
+ # Create async task for parameter comparison
824
+ task = self.compare_parameter_async(
825
+ param_name,
826
+ pred_resolved_val,
827
+ gt_resolved_val,
828
+ param_context,
829
+ custom_instructions=custom_instructions,
830
+ )
831
+ param_tasks.append(task)
832
+
833
+ # Wait for all parameter comparisons to complete
834
+ param_results = await asyncio.gather(*param_tasks)
835
+
836
+ # Enhance results with additional information
837
+ for result, param_name in zip(param_results, params_to_compare):
838
+ pred_resolved_val = pred_resolved.get(param_name)
839
+ gt_resolved_val = gt_resolved.get(param_name)
840
+ param_status = self._determine_parameter_status(
841
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
842
+ )
843
+
844
+ param_def = None
845
+ if tool_spec:
846
+ param_def = next(
847
+ (p for p in tool_spec.parameters if p.name == param_name), None
848
+ )
849
+
850
+ result.predicted_resolved_value = pred_resolved_val
851
+ result.ground_truth_resolved_value = gt_resolved_val
852
+ result.parameter_status = param_status
853
+ result.parameter_definition = param_def.dict() if param_def else None
854
+ result.is_required = param_def.required if param_def else False
855
+ result.default_value = param_def.default if param_def else None
856
+
857
+ # Calculate overall score using weighted approach
858
+ param_score = self._calculate_weighted_score(param_results)
859
+
860
+ overall_score = (
861
+ self.config.weight_function_name * fn_score
862
+ + self.config.weight_parameters * param_score
863
+ )
864
+
865
+ # Find missing required parameters and unexpected parameters
866
+ missing_required = []
867
+ unexpected_params = []
868
+
869
+ if tool_spec:
870
+ required_params = {p.name for p in tool_spec.parameters if p.required}
871
+ all_defined_params = {p.name for p in tool_spec.parameters}
872
+
873
+ # Check for missing required parameters
874
+ for req_param in required_params:
875
+ if req_param not in pred_resolved and req_param not in gt_resolved:
876
+ missing_required.append(req_param)
877
+
878
+ # Check for unexpected parameters
879
+ for param_name in params_to_compare:
880
+ if param_name not in all_defined_params:
881
+ unexpected_params.append(param_name)
882
+
883
+ # Apply penalties for missing required parameters
884
+ if missing_required:
885
+ penalty = len(missing_required) * self.config.missing_parameter_penalty
886
+ overall_score *= 1 - penalty
887
+ overall_score = max(0.0, overall_score)
888
+
889
+ # Generate overall explanation
890
+ overall_explanation = self._generate_overall_explanation(
891
+ fn_match,
892
+ fn_score,
893
+ param_results,
894
+ overall_score,
895
+ missing_required,
896
+ unexpected_params,
897
+ )
898
+
899
+ return ToolCallComparisonResult(
900
+ predicted_call=predicted_call,
901
+ ground_truth_call=ground_truth_call,
902
+ function_name_match=fn_match,
903
+ function_name_score=fn_score,
904
+ parameter_results=param_results,
905
+ overall_score=overall_score,
906
+ overall_explanation=overall_explanation,
907
+ strategy_used=self.config.strategy,
908
+ missing_required_params=missing_required,
909
+ unexpected_params=unexpected_params,
910
+ metadata={
911
+ "tool_spec_used": tool_spec.dict() if tool_spec else None,
912
+ "parameters_compared": list(params_to_compare),
913
+ "default_parameters_included": self.config.include_default_parameters,
914
+ "execution_mode": "async",
915
+ },
916
+ )
917
+
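The overall score computed above is a weighted blend of the function-name score and the aggregated parameter score, discounted when required parameters are missing. A small worked sketch of that arithmetic (the weight and penalty values below are illustrative assumptions, not defaults taken from this package):

    # Illustrative values only; the real values come from self.config.
    weight_function_name = 0.3        # assumed
    weight_parameters = 0.7           # assumed
    missing_parameter_penalty = 0.2   # assumed
    fn_score, param_score = 1.0, 0.8
    overall = weight_function_name * fn_score + weight_parameters * param_score  # 0.86
    overall *= 1 - 1 * missing_parameter_penalty  # one missing required parameter
    overall = max(0.0, overall)                   # 0.688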
918
+ def compare_tool_calls(
919
+ self,
920
+ predicted_call: Dict[str, Any],
921
+ ground_truth_call: Dict[str, Any],
922
+ conversation_history: Optional[List[Dict[str, str]]] = None,
923
+ tool_specs: Optional[List[Dict[str, Any]]] = None,
924
+ custom_instructions: Optional[str] = None,
925
+ custom_schema: Optional[str] = None,
926
+ ) -> Any:
927
+ """
928
+ Sync version that checks if bulk comparison is enabled.
929
+ """
930
+ if self.config.llm_bulk_comparison:
931
+ return self._compare_tool_calls_bulk(
932
+ predicted_call,
933
+ ground_truth_call,
934
+ conversation_history,
935
+ tool_specs,
936
+ custom_instructions,
937
+ )
938
+ else:
939
+ # Use the base class implementation (individual parameter comparison)
940
+ # Note: Base class doesn't support custom_instructions yet, but we can still call it
941
+ return super().compare_tool_calls(
942
+ predicted_call, ground_truth_call, conversation_history, tool_specs
943
+ )
944
+
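A minimal usage sketch for the dispatcher above, assuming `judge` is an instance of this comparator and that calls follow the {"function": {"name": ..., "arguments": ...}} shape parsed by the methods in this class (arguments may be a dict or a JSON string):

    # Sketch only; `judge` is an assumed instance of this comparator class.
    predicted = {"function": {"name": "get_weather",
                              "arguments": '{"city": "Paris", "unit": "celsius"}'}}
    ground_truth = {"function": {"name": "get_weather",
                                 "arguments": {"city": "Paris", "unit": "celsius"}}}
    result = judge.compare_tool_calls(predicted, ground_truth)
    print(result.overall_score, result.overall_explanation)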
945
+ async def compare_tool_calls_async(
946
+ self,
947
+ predicted_call: Dict[str, Any],
948
+ ground_truth_call: Dict[str, Any],
949
+ conversation_history: Optional[List[Dict[str, str]]] = None,
950
+ tool_specs: Optional[List[Dict[str, Any]]] = None,
951
+ custom_instructions: Optional[str] = None,
952
+ custom_schema: Optional[str] = None,
953
+ ) -> Any:
954
+ """
955
+ Async version that checks if bulk comparison is enabled.
956
+ """
957
+ if self.config.llm_bulk_comparison:
958
+ return await self._compare_tool_calls_bulk_async(
959
+ predicted_call,
960
+ ground_truth_call,
961
+ conversation_history,
962
+ tool_specs,
963
+ custom_instructions,
964
+ custom_schema,
965
+ )
966
+ else:
967
+ # Use individual parameter comparison async (implemented below)
968
+ return await self._compare_tool_calls_individual_async(
969
+ predicted_call,
970
+ ground_truth_call,
971
+ conversation_history,
972
+ tool_specs,
973
+ custom_instructions,
974
+ )
975
+
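Because the async dispatcher issues its LLM calls without blocking, several predicted/ground-truth pairs can be scored concurrently. A sketch, assuming `judge` is an instance of this class and `pairs` is a list of (predicted, ground_truth) call dicts:

    import asyncio

    async def score_pairs(judge, pairs):
        # One comparison coroutine per (predicted, ground_truth) pair, run concurrently.
        tasks = [judge.compare_tool_calls_async(pred, gt) for pred, gt in pairs]
        return await asyncio.gather(*tasks)

    # results = asyncio.run(score_pairs(judge, pairs))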
976
+ def _compare_tool_calls_bulk(
977
+ self,
978
+ predicted_call: Dict[str, Any],
979
+ ground_truth_call: Dict[str, Any],
980
+ conversation_history: Optional[List[Dict[str, str]]] = None,
981
+ tool_specs: Optional[List[Dict[str, Any]]] = None,
982
+ custom_instructions: Optional[str] = None,
983
+ ) -> Any:
984
+ """Sync bulk comparison of all parameters in one LLM call."""
985
+
986
+ # Import here to avoid circular imports
987
+ from ..types import ToolCallComparisonResult
988
+
989
+ # Extract function names
990
+ pred_name = predicted_call.get("function", {}).get("name", "")
991
+ gt_name = ground_truth_call.get("function", {}).get("name", "")
992
+
993
+ # Compare function names (sync operation)
994
+ fn_score = self.compare_function_name(pred_name, gt_name)
995
+ fn_match = fn_score >= 0.95
996
+
997
+ # Extract tool specification
998
+ tool_spec = self._extract_tool_spec(
999
+ gt_name, tool_specs
1000
+ ) or self._extract_tool_spec(pred_name, tool_specs)
1001
+
1002
+ # Extract and parse parameters
1003
+ pred_params = predicted_call.get("function", {}).get("arguments", {})
1004
+ gt_params = ground_truth_call.get("function", {}).get("arguments", {})
1005
+
1006
+ if isinstance(pred_params, str):
1007
+ try:
1008
+ pred_params = json.loads(pred_params)
1009
+ except json.JSONDecodeError:
1010
+ logger.warning(f"Failed to parse predicted parameters: {pred_params}")
1011
+ pred_params = {}
1012
+
1013
+ if isinstance(gt_params, str):
1014
+ try:
1015
+ gt_params = json.loads(gt_params)
1016
+ except json.JSONDecodeError:
1017
+ logger.warning(f"Failed to parse ground truth parameters: {gt_params}")
1018
+ gt_params = {}
1019
+
1020
+ # Resolve parameters with defaults
1021
+ pred_resolved = self._resolve_parameters_with_defaults(pred_params, tool_spec)
1022
+ gt_resolved = self._resolve_parameters_with_defaults(gt_params, tool_spec)
1023
+
1024
+ # Determine all parameters to compare
1025
+ params_to_compare = self.config.parameters_to_compare
1026
+ if params_to_compare is None:
1027
+ if self.config.include_default_parameters:
1028
+ params_to_compare = set(pred_resolved.keys()) | set(gt_resolved.keys())
1029
+ else:
1030
+ params_to_compare = set(pred_params.keys()) | set(gt_params.keys())
1031
+
1032
+ # Build bulk comparison context
1033
+ parameters_info = {}
1034
+ for param_name in params_to_compare:
1035
+ pred_val = pred_resolved.get(param_name)
1036
+ gt_val = gt_resolved.get(param_name)
1037
+ param_status = self._determine_parameter_status(
1038
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
1039
+ )
1040
+
1041
+ # Get parameter definition from tool spec
1042
+ param_def = None
1043
+ if tool_spec:
1044
+ param_def = next(
1045
+ (p for p in tool_spec.parameters if p.name == param_name), None
1046
+ )
1047
+
1048
+ parameters_info[param_name] = {
1049
+ "predicted_value": pred_val,
1050
+ "ground_truth_value": gt_val,
1051
+ "predicted_type": type(pred_val).__name__,
1052
+ "ground_truth_type": type(gt_val).__name__,
1053
+ "is_required": param_def.required if param_def else False,
1054
+ "parameter_type": param_def.type if param_def else "unknown",
1055
+ "default_value": param_def.default if param_def else None,
1056
+ "status": (
1057
+ param_status.value
1058
+ if hasattr(param_status, "value")
1059
+ else str(param_status)
1060
+ ),
1061
+ }
1062
+
1063
+ # Build prompt arguments for bulk comparison
1064
+ user_kwargs = {
1065
+ "function_name": gt_name or pred_name,
1066
+ "predicted_arguments_json": json.dumps(pred_resolved, indent=2),
1067
+ "ground_truth_arguments_json": json.dumps(gt_resolved, indent=2),
1068
+ "tool_specification": json.dumps(
1069
+ tool_spec.dict() if tool_spec else {}, indent=2
1070
+ ),
1071
+ "parameters_info": parameters_info,
1072
+ "conversation_context": conversation_history or [],
1073
+ "custom_instructions": custom_instructions or " ",
1074
+ }
1075
+
1076
+ # Build messages using the bulk prompt template
1077
+ messages = self.bulk_prompt.build_messages(user_kwargs)
1078
+
1079
+ try:
1080
+ # Use ValidatingLLMClient with schema - try bulk metric schema first, fallback to default
1081
+ schema = None
1082
+ try:
1083
+ schema = self.bulk_metric.to_jsonschema()
1084
+ except Exception:
1085
+ # Fallback to default schema if bulk metric schema fails
1086
+ schema = self.get_default_bulk_schema()
1087
+
1088
+ response = self.llm_client.generate(
1089
+ prompt=messages,
1090
+ schema=schema,
1091
+ )
1092
+
1093
+ # Parse the validated response
1094
+ if isinstance(response, str):
1095
+ result_data = json.loads(response)
1096
+ else:
1097
+ result_data = response
1098
+
1099
+ overall_score = float(result_data.get("overall_score", 0.0))
1100
+ overall_explanation = result_data.get("overall_explanation", "")
1101
+ confidence = float(result_data.get("confidence", 0.5))
1102
+ parameter_scores = result_data.get("parameter_scores", {})
1103
+
1104
+ # Convert bulk result to individual parameter results
1105
+ param_results = []
1106
+ for param_name in params_to_compare:
1107
+ param_score_data = parameter_scores.get(param_name, {})
1108
+ param_score = float(param_score_data.get("score", 0.0))
1109
+ param_explanation = param_score_data.get(
1110
+ "explanation", f"No explanation for {param_name}"
1111
+ )
1112
+ param_is_match = bool(param_score_data.get("is_match", False))
1113
+
1114
+ param_info = parameters_info[param_name]
1115
+ param_status = self._determine_parameter_status(
1116
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
1117
+ )
1118
+
1119
+ param_result = ParameterComparisonResult(
1120
+ parameter_name=param_name,
1121
+ predicted_value=pred_params.get(param_name),
1122
+ ground_truth_value=gt_params.get(param_name),
1123
+ predicted_resolved_value=param_info["predicted_value"],
1124
+ ground_truth_resolved_value=param_info["ground_truth_value"],
1125
+ parameter_status=param_status,
1126
+ comparison_strategy=ComparisonStrategy.LLM_JUDGE,
1127
+ score=param_score,
1128
+ explanation=param_explanation,
1129
+ evidence=f"Bulk LLM comparison (confidence: {confidence:.2f})",
1130
+ is_match=param_is_match,
1131
+ confidence=confidence,
1132
+ )
1133
+ param_results.append(param_result)
1134
+
1135
+ except Exception as e:
1136
+ logger.warning(f"Bulk LLM comparison failed: {e}")
1137
+
1138
+ # Fallback to individual parameter comparison
1139
+ return super().compare_tool_calls(
1140
+ predicted_call, ground_truth_call, conversation_history, tool_specs
1141
+ )
1142
+
1143
+ # Calculate overall score (already provided by bulk LLM)
1144
+ overall_score = max(0.0, min(1.0, overall_score))
1145
+
1146
+ # Apply function name weight
1147
+ final_score = (
1148
+ self.config.weight_function_name * fn_score
1149
+ + self.config.weight_parameters * overall_score
1150
+ )
1151
+
1152
+ # Find missing required parameters and unexpected parameters
1153
+ missing_required = []
1154
+ unexpected_params = []
1155
+
1156
+ if tool_spec:
1157
+ required_params = {p.name for p in tool_spec.parameters if p.required}
1158
+ all_defined_params = {p.name for p in tool_spec.parameters}
1159
+
1160
+ for req_param in required_params:
1161
+ if req_param not in pred_resolved and req_param not in gt_resolved:
1162
+ missing_required.append(req_param)
1163
+
1164
+ for param_name in params_to_compare:
1165
+ if param_name not in all_defined_params:
1166
+ unexpected_params.append(param_name)
1167
+
1168
+ # Apply penalties for missing required parameters
1169
+ if missing_required:
1170
+ penalty = len(missing_required) * self.config.missing_parameter_penalty
1171
+ final_score *= 1 - penalty
1172
+ final_score = max(0.0, final_score)
1173
+
1174
+ return ToolCallComparisonResult(
1175
+ predicted_call=predicted_call,
1176
+ ground_truth_call=ground_truth_call,
1177
+ function_name_match=fn_match,
1178
+ function_name_score=fn_score,
1179
+ parameter_results=param_results,
1180
+ overall_score=final_score,
1181
+ overall_explanation=f"Bulk LLM comparison: {overall_explanation}",
1182
+ strategy_used=self.config.strategy,
1183
+ missing_required_params=missing_required,
1184
+ unexpected_params=unexpected_params,
1185
+ metadata={
1186
+ "tool_spec_used": tool_spec.dict() if tool_spec else None,
1187
+ "parameters_compared": list(params_to_compare),
1188
+ "default_parameters_included": self.config.include_default_parameters,
1189
+ "bulk_comparison": True,
1190
+ "llm_confidence": confidence,
1191
+ "execution_mode": "sync_bulk",
1192
+ },
1193
+ )
1194
+
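The parsing above shows the payload shape the bulk judge is expected to return: an overall score and explanation, a confidence value, and one entry per parameter. An illustrative example of that shape (field names taken from the result_data.get(...) calls; the values are invented):

    example_bulk_response = {
        "overall_score": 0.85,
        "overall_explanation": "Arguments are functionally equivalent except for the unit.",
        "confidence": 0.9,
        "parameter_scores": {
            "city": {"score": 1.0, "explanation": "Exact match", "is_match": True},
            "unit": {"score": 0.7, "explanation": "Compatible but different unit", "is_match": False},
        },
    }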
1195
+ async def _compare_tool_calls_bulk_async(
1196
+ self,
1197
+ predicted_call: Dict[str, Any],
1198
+ ground_truth_call: Dict[str, Any],
1199
+ conversation_history: Optional[List[Dict[str, str]]] = None,
1200
+ tool_specs: Optional[List[Dict[str, Any]]] = None,
1201
+ custom_instructions: Optional[str] = None,
1202
+ ) -> Any:
1203
+ """Async bulk comparison of all parameters in one LLM call."""
1204
+
1205
+ # Most of the logic is the same as the sync version, but with an async LLM call
1206
+ # Import here to avoid circular imports
1207
+ from ..types import ToolCallComparisonResult
1208
+
1209
+ # [Same parameter extraction and processing logic as sync version]
1210
+ # Extract function names
1211
+ pred_name = predicted_call.get("function", {}).get("name", "")
1212
+ gt_name = ground_truth_call.get("function", {}).get("name", "")
1213
+
1214
+ # Compare function names (sync operation)
1215
+ fn_score = self.compare_function_name(pred_name, gt_name)
1216
+ fn_match = fn_score >= 0.95
1217
+
1218
+ # Extract tool specification
1219
+ tool_spec = self._extract_tool_spec(
1220
+ gt_name, tool_specs
1221
+ ) or self._extract_tool_spec(pred_name, tool_specs)
1222
+
1223
+ # Extract and parse parameters
1224
+ pred_params = predicted_call.get("function", {}).get("arguments", {})
1225
+ gt_params = ground_truth_call.get("function", {}).get("arguments", {})
1226
+
1227
+ if isinstance(pred_params, str):
1228
+ try:
1229
+ pred_params = json.loads(pred_params)
1230
+ except json.JSONDecodeError:
1231
+ logger.warning(f"Failed to parse predicted parameters: {pred_params}")
1232
+ pred_params = {}
1233
+
1234
+ if isinstance(gt_params, str):
1235
+ try:
1236
+ gt_params = json.loads(gt_params)
1237
+ except json.JSONDecodeError:
1238
+ logger.warning(f"Failed to parse ground truth parameters: {gt_params}")
1239
+ gt_params = {}
1240
+
1241
+ # Resolve parameters with defaults
1242
+ pred_resolved = self._resolve_parameters_with_defaults(pred_params, tool_spec)
1243
+ gt_resolved = self._resolve_parameters_with_defaults(gt_params, tool_spec)
1244
+
1245
+ # Determine all parameters to compare
1246
+ params_to_compare = self.config.parameters_to_compare
1247
+ if params_to_compare is None:
1248
+ if self.config.include_default_parameters:
1249
+ params_to_compare = set(pred_resolved.keys()) | set(gt_resolved.keys())
1250
+ else:
1251
+ params_to_compare = set(pred_params.keys()) | set(gt_params.keys())
1252
+
1253
+ # Build bulk comparison context
1254
+ parameters_info = {}
1255
+ for param_name in params_to_compare:
1256
+ pred_val = pred_resolved.get(param_name)
1257
+ gt_val = gt_resolved.get(param_name)
1258
+ param_status = self._determine_parameter_status(
1259
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
1260
+ )
1261
+
1262
+ # Get parameter definition from tool spec
1263
+ param_def = None
1264
+ if tool_spec:
1265
+ param_def = next(
1266
+ (p for p in tool_spec.parameters if p.name == param_name), None
1267
+ )
1268
+
1269
+ parameters_info[param_name] = {
1270
+ "predicted_value": pred_val,
1271
+ "ground_truth_value": gt_val,
1272
+ "predicted_type": type(pred_val).__name__,
1273
+ "ground_truth_type": type(gt_val).__name__,
1274
+ "is_required": param_def.required if param_def else False,
1275
+ "parameter_type": param_def.type if param_def else "unknown",
1276
+ "default_value": param_def.default if param_def else None,
1277
+ "status": (
1278
+ param_status.value
1279
+ if hasattr(param_status, "value")
1280
+ else str(param_status)
1281
+ ),
1282
+ }
1283
+
1284
+ # Build prompt arguments for bulk comparison
1285
+ user_kwargs = {
1286
+ "function_name": gt_name or pred_name,
1287
+ "predicted_arguments_json": json.dumps(pred_resolved, indent=2),
1288
+ "ground_truth_arguments_json": json.dumps(gt_resolved, indent=2),
1289
+ "tool_specification": json.dumps(
1290
+ tool_spec.dict() if tool_spec else {}, indent=2
1291
+ ),
1292
+ "parameters_info": parameters_info,
1293
+ "conversation_context": conversation_history or [],
1294
+ }
1295
+
1296
+ # Build messages using the bulk prompt template
1297
+ messages = self.bulk_prompt.build_messages(user_kwargs)
1298
+
1299
+ try:
1300
+ # Use ValidatingLLMClient with schema - try bulk metric schema first, fallback to default
1301
+ schema = None
1302
+ try:
1303
+ schema = self.bulk_metric.to_jsonschema()
1304
+ except Exception:
1305
+ # Fallback to default schema if bulk metric schema fails
1306
+ schema = self.get_default_bulk_schema()
1307
+
1308
+ response = await self.llm_client.generate_async(
1309
+ prompt=messages,
1310
+ schema=schema,
1311
+ )
1312
+
1313
+ # Parse the validated response
1314
+ if isinstance(response, str):
1315
+ result_data = json.loads(response)
1316
+ else:
1317
+ result_data = response
1318
+
1319
+ overall_score = float(result_data.get("overall_score", 0.0))
1320
+ overall_explanation = result_data.get("overall_explanation", "")
1321
+ confidence = float(result_data.get("confidence", 0.5))
1322
+ parameter_scores = result_data.get("parameter_scores", {})
1323
+
1324
+ # Convert bulk result to individual parameter results
1325
+ param_results = []
1326
+ for param_name in params_to_compare:
1327
+ param_score_data = parameter_scores.get(param_name, {})
1328
+ param_score = float(param_score_data.get("score", 0.0))
1329
+ param_explanation = param_score_data.get(
1330
+ "explanation", f"No explanation for {param_name}"
1331
+ )
1332
+ param_is_match = bool(param_score_data.get("is_match", False))
1333
+
1334
+ param_info = parameters_info[param_name]
1335
+ param_status = self._determine_parameter_status(
1336
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
1337
+ )
1338
+
1339
+ param_result = ParameterComparisonResult(
1340
+ parameter_name=param_name,
1341
+ predicted_value=pred_params.get(param_name),
1342
+ ground_truth_value=gt_params.get(param_name),
1343
+ predicted_resolved_value=param_info["predicted_value"],
1344
+ ground_truth_resolved_value=param_info["ground_truth_value"],
1345
+ parameter_status=param_status,
1346
+ comparison_strategy=ComparisonStrategy.LLM_JUDGE,
1347
+ score=param_score,
1348
+ explanation=param_explanation,
1349
+ evidence=f"Bulk LLM comparison (confidence: {confidence:.2f})",
1350
+ is_match=param_is_match,
1351
+ confidence=confidence,
1352
+ )
1353
+ param_results.append(param_result)
1354
+
1355
+ except Exception as e:
1356
+ logger.warning(f"Async bulk LLM comparison failed: {e}")
1357
+
1358
+ # Fallback to individual parameter comparison async
1359
+ return await self._compare_tool_calls_individual_async(
1360
+ predicted_call, ground_truth_call, conversation_history, tool_specs
1361
+ )
1362
+
1363
+ # Calculate overall score (already provided by bulk LLM)
1364
+ overall_score = max(0.0, min(1.0, overall_score))
1365
+
1366
+ # Apply function name weight
1367
+ final_score = (
1368
+ self.config.weight_function_name * fn_score
1369
+ + self.config.weight_parameters * overall_score
1370
+ )
1371
+
1372
+ # Find missing required parameters and unexpected parameters
1373
+ missing_required = []
1374
+ unexpected_params = []
1375
+
1376
+ if tool_spec:
1377
+ required_params = {p.name for p in tool_spec.parameters if p.required}
1378
+ all_defined_params = {p.name for p in tool_spec.parameters}
1379
+
1380
+ for req_param in required_params:
1381
+ if req_param not in pred_resolved and req_param not in gt_resolved:
1382
+ missing_required.append(req_param)
1383
+
1384
+ for param_name in params_to_compare:
1385
+ if param_name not in all_defined_params:
1386
+ unexpected_params.append(param_name)
1387
+
1388
+ # Apply penalties for missing required parameters
1389
+ if missing_required:
1390
+ penalty = len(missing_required) * self.config.missing_parameter_penalty
1391
+ final_score *= 1 - penalty
1392
+ final_score = max(0.0, final_score)
1393
+
1394
+ return ToolCallComparisonResult(
1395
+ predicted_call=predicted_call,
1396
+ ground_truth_call=ground_truth_call,
1397
+ function_name_match=fn_match,
1398
+ function_name_score=fn_score,
1399
+ parameter_results=param_results,
1400
+ overall_score=final_score,
1401
+ overall_explanation=f"Bulk LLM comparison: {overall_explanation}",
1402
+ strategy_used=self.config.strategy,
1403
+ missing_required_params=missing_required,
1404
+ unexpected_params=unexpected_params,
1405
+ metadata={
1406
+ "tool_spec_used": tool_spec.dict() if tool_spec else None,
1407
+ "parameters_compared": list(params_to_compare),
1408
+ "default_parameters_included": self.config.include_default_parameters,
1409
+ "bulk_comparison": True,
1410
+ "llm_confidence": confidence,
1411
+ "execution_mode": "async_bulk",
1412
+ },
1413
+ )
1414
+
1415
+ async def _compare_tool_calls_individual_async(
1416
+ self,
1417
+ predicted_call: Dict[str, Any],
1418
+ ground_truth_call: Dict[str, Any],
1419
+ conversation_history: Optional[List[Dict[str, str]]] = None,
1420
+ tool_specs: Optional[List[Dict[str, Any]]] = None,
1421
+ custom_instructions: Optional[str] = None,
1422
+ ) -> Any:
1423
+ """Wrapper for the original async individual parameter comparison logic."""
1424
+
1425
+ # This is the existing async logic that was in compare_tool_calls_async
1426
+ # Import here to avoid circular imports
1427
+ from ..types import ToolCallComparisonResult, FunctionCallInput
1428
+
1429
+ # Use the base class logic but with async parameter comparison
1430
+ # Extract function names
1431
+ pred_name = predicted_call.get("function", {}).get("name", "")
1432
+ gt_name = ground_truth_call.get("function", {}).get("name", "")
1433
+
1434
+ # Compare function names (sync operation)
1435
+ fn_score = self.compare_function_name(pred_name, gt_name)
1436
+ fn_match = fn_score >= 0.95
1437
+
1438
+ # Extract tool specification
1439
+ tool_spec = self._extract_tool_spec(
1440
+ gt_name, tool_specs
1441
+ ) or self._extract_tool_spec(pred_name, tool_specs)
1442
+
1443
+ # Extract and parse parameters
1444
+ pred_params = predicted_call.get("function", {}).get("arguments", {})
1445
+ gt_params = ground_truth_call.get("function", {}).get("arguments", {})
1446
+
1447
+ if isinstance(pred_params, str):
1448
+ try:
1449
+ pred_params = json.loads(pred_params)
1450
+ except json.JSONDecodeError:
1451
+ logger.warning(f"Failed to parse predicted parameters: {pred_params}")
1452
+ pred_params = {}
1453
+
1454
+ if isinstance(gt_params, str):
1455
+ try:
1456
+ gt_params = json.loads(gt_params)
1457
+ except json.JSONDecodeError:
1458
+ logger.warning(f"Failed to parse ground truth parameters: {gt_params}")
1459
+ gt_params = {}
1460
+
1461
+ # Resolve parameters with defaults
1462
+ pred_resolved = self._resolve_parameters_with_defaults(pred_params, tool_spec)
1463
+ gt_resolved = self._resolve_parameters_with_defaults(gt_params, tool_spec)
1464
+
1465
+ # Determine all parameters to compare
1466
+ params_to_compare = self.config.parameters_to_compare
1467
+ if params_to_compare is None:
1468
+ if self.config.include_default_parameters:
1469
+ params_to_compare = set(pred_resolved.keys()) | set(gt_resolved.keys())
1470
+ else:
1471
+ params_to_compare = set(pred_params.keys()) | set(gt_params.keys())
1472
+
1473
+ # Compare each parameter asynchronously
1474
+ param_tasks = []
1475
+ context = {
1476
+ "conversation_history": conversation_history,
1477
+ "tool_specs": tool_specs,
1478
+ "tool_spec": tool_spec,
1479
+ "predicted_call": predicted_call,
1480
+ "ground_truth_call": ground_truth_call,
1481
+ "function_name": gt_name or pred_name,
1482
+ }
1483
+
1484
+ import asyncio
1485
+
1486
+ for param_name in params_to_compare:
1487
+ pred_val = pred_params.get(param_name)
1488
+ gt_val = gt_params.get(param_name)
1489
+ pred_resolved_val = pred_resolved.get(param_name)
1490
+ gt_resolved_val = gt_resolved.get(param_name)
1491
+
1492
+ # Get parameter definition from tool spec
1493
+ param_def = None
1494
+ if tool_spec:
1495
+ param_def = next(
1496
+ (p for p in tool_spec.parameters if p.name == param_name), None
1497
+ )
1498
+
1499
+ # Determine parameter status
1500
+ param_status = self._determine_parameter_status(
1501
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
1502
+ )
1503
+
1504
+ # Enhanced context for this parameter
1505
+ param_context = context.copy()
1506
+ param_context.update(
1507
+ {
1508
+ "parameter_definition": param_def.dict() if param_def else None,
1509
+ "parameter_status": param_status,
1510
+ "predicted_resolved": pred_resolved_val,
1511
+ "ground_truth_resolved": gt_resolved_val,
1512
+ }
1513
+ )
1514
+
1515
+ # Create async task for parameter comparison
1516
+ task = self.compare_parameter_async(
1517
+ param_name,
1518
+ pred_resolved_val,
1519
+ gt_resolved_val,
1520
+ param_context,
1521
+ custom_instructions=custom_instructions,
1522
+ )
1523
+ param_tasks.append(task)
1524
+
1525
+ # Wait for all parameter comparisons to complete
1526
+ param_results = await asyncio.gather(*param_tasks)
1527
+
1528
+ # Enhance results with additional information
1529
+ for result, param_name in zip(param_results, params_to_compare):
1530
+ pred_resolved_val = pred_resolved.get(param_name)
1531
+ gt_resolved_val = gt_resolved.get(param_name)
1532
+ param_status = self._determine_parameter_status(
1533
+ param_name, pred_params, gt_params, pred_resolved, gt_resolved
1534
+ )
1535
+
1536
+ param_def = None
1537
+ if tool_spec:
1538
+ param_def = next(
1539
+ (p for p in tool_spec.parameters if p.name == param_name), None
1540
+ )
1541
+
1542
+ result.predicted_resolved_value = pred_resolved_val
1543
+ result.ground_truth_resolved_value = gt_resolved_val
1544
+ result.parameter_status = param_status
1545
+ result.parameter_definition = param_def.dict() if param_def else None
1546
+ result.is_required = param_def.required if param_def else False
1547
+ result.default_value = param_def.default if param_def else None
1548
+
1549
+ # Calculate overall score using weighted approach
1550
+ param_score = self._calculate_weighted_score(param_results)
1551
+
1552
+ overall_score = (
1553
+ self.config.weight_function_name * fn_score
1554
+ + self.config.weight_parameters * param_score
1555
+ )
1556
+
1557
+ # Find missing required parameters and unexpected parameters
1558
+ missing_required = []
1559
+ unexpected_params = []
1560
+
1561
+ if tool_spec:
1562
+ required_params = {p.name for p in tool_spec.parameters if p.required}
1563
+ all_defined_params = {p.name for p in tool_spec.parameters}
1564
+
1565
+ # Check for missing required parameters
1566
+ for req_param in required_params:
1567
+ if req_param not in pred_resolved and req_param not in gt_resolved:
1568
+ missing_required.append(req_param)
1569
+
1570
+ # Check for unexpected parameters
1571
+ for param_name in params_to_compare:
1572
+ if param_name not in all_defined_params:
1573
+ unexpected_params.append(param_name)
1574
+
1575
+ # Apply penalties for missing required parameters
1576
+ if missing_required:
1577
+ penalty = len(missing_required) * self.config.missing_parameter_penalty
1578
+ overall_score *= 1 - penalty
1579
+ overall_score = max(0.0, overall_score)
1580
+
1581
+ # Generate overall explanation
1582
+ overall_explanation = self._generate_overall_explanation(
1583
+ fn_match,
1584
+ fn_score,
1585
+ param_results,
1586
+ overall_score,
1587
+ missing_required,
1588
+ unexpected_params,
1589
+ )
1590
+
1591
+ return ToolCallComparisonResult(
1592
+ predicted_call=predicted_call,
1593
+ ground_truth_call=ground_truth_call,
1594
+ function_name_match=fn_match,
1595
+ function_name_score=fn_score,
1596
+ parameter_results=param_results,
1597
+ overall_score=overall_score,
1598
+ overall_explanation=overall_explanation,
1599
+ strategy_used=self.config.strategy,
1600
+ missing_required_params=missing_required,
1601
+ unexpected_params=unexpected_params,
1602
+ metadata={
1603
+ "tool_spec_used": tool_spec.dict() if tool_spec else None,
1604
+ "parameters_compared": list(params_to_compare),
1605
+ "default_parameters_included": self.config.include_default_parameters,
1606
+ "execution_mode": "async_individual",
1607
+ },
1608
+ )
1609
+
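Each ParameterComparisonResult built above is enriched with resolved values, status, and spec-derived flags. A short sketch of how a caller might inspect them, assuming `result` came from one of the comparison methods in this class:

    # Sketch only; `result` is an assumed ToolCallComparisonResult.
    for pr in result.parameter_results:
        print(pr.parameter_name, pr.score, pr.is_match,
              pr.parameter_status, pr.is_required, pr.predicted_resolved_value)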
1610
+ # Enhanced LLM Judge Methods with Custom Schema Support
1611
+
1612
+ async def compare_tool_calls_with_custom_schema(
1613
+ self,
1614
+ predicted_call: Dict[str, Any],
1615
+ ground_truth_call: Dict[str, Any],
1616
+ conversation_history: Optional[List[Dict[str, str]]] = None,
1617
+ tool_specs: Optional[List[Dict[str, Any]]] = None,
1618
+ custom_instructions: Optional[str] = None,
1619
+ custom_schema: Optional[str] = None,
1620
+ ) -> Any:
1621
+ """
1622
+ Compare tool calls using custom schema and instructions.
1623
+
1624
+ Args:
1625
+ predicted_call: The predicted function call
1626
+ ground_truth_call: The ground truth function call
1627
+ conversation_history: Optional conversation context
1628
+ tool_specs: Optional tool specifications
1629
+ custom_instructions: Custom evaluation instructions
1630
+ custom_schema: Custom JSON schema for response format
1631
+
1632
+ Returns:
1633
+ Comparison result following the custom schema format
1634
+ """
1635
+
1636
+ # Build detailed context for evaluation
1637
+ user_prompt = self._build_custom_evaluation_prompt(
1638
+ predicted_call=predicted_call,
1639
+ ground_truth_call=ground_truth_call,
1640
+ conversation_history=conversation_history or [],
1641
+ tool_specs=tool_specs or [],
1642
+ custom_instructions=custom_instructions,
1643
+ )
1644
+
1645
+ # Build system prompt with custom schema
1646
+ system_prompt = self._build_custom_system_prompt(
1647
+ custom_instructions=custom_instructions,
1648
+ custom_schema=custom_schema,
1649
+ )
1650
+
1651
+ messages = [
1652
+ {"role": "system", "content": system_prompt},
1653
+ {"role": "user", "content": user_prompt},
1654
+ ]
1655
+
1656
+ try:
1657
+ # Use custom schema if provided, otherwise fallback to default
1658
+ schema = None
1659
+ if custom_schema:
1660
+ try:
1661
+ schema = json.loads(custom_schema)
1662
+ except json.JSONDecodeError as e:
1663
+ logger.warning(
1664
+ f"Invalid custom schema JSON: {e}. Using default schema."
1665
+ )
1666
+ schema = self.get_default_bulk_schema()
1667
+ else:
1668
+ schema = self.get_default_bulk_schema()
1669
+
1670
+ # Generate response with schema validation
1671
+ response = await self.llm_client.generate_async(
1672
+ prompt=messages,
1673
+ schema=schema,
1674
+ )
1675
+
1676
+ # Parse response
1677
+ if isinstance(response, str):
1678
+ result_data = json.loads(response)
1679
+ else:
1680
+ result_data = response
1681
+
1682
+ # Create result object with custom schema data
1683
+ from llmevalkit.function_calling.comparison.types import (
1684
+ ToolCallComparisonResult,
1685
+ )
1686
+
1687
+ # Extract standard fields with fallbacks
1688
+ overall_score = self._extract_overall_score(result_data)
1689
+ overall_explanation = self._extract_overall_explanation(result_data)
1690
+ function_name_match = self._extract_function_match(
1691
+ predicted_call, ground_truth_call
1692
+ )
1693
+
1694
+ # Create result object
1695
+ result = ToolCallComparisonResult(
1696
+ predicted_call=predicted_call,
1697
+ ground_truth_call=ground_truth_call,
1698
+ function_name_match=function_name_match,
1699
+ function_name_score=1.0 if function_name_match else 0.0,
1700
+ parameter_results=[], # Can be populated from custom schema
1701
+ overall_score=overall_score,
1702
+ overall_explanation=overall_explanation,
1703
+ strategy_used=ComparisonStrategy.LLM_JUDGE,
1704
+ metadata={
1705
+ "custom_schema_response": result_data,
1706
+ "custom_schema_used": True,
1707
+ "execution_mode": "async_custom_schema",
1708
+ }, # Store full custom response
1709
+ )
1710
+
1711
+ return result
1712
+
1713
+ except Exception as e:
1714
+ logger.error(f"Custom schema comparison failed: {e}")
1715
+ # Fallback to standard comparison
1716
+ return await self.compare_tool_calls_async(
1717
+ predicted_call=predicted_call,
1718
+ ground_truth_call=ground_truth_call,
1719
+ conversation_history=conversation_history,
1720
+ tool_specs=tool_specs,
1721
+ custom_instructions=custom_instructions,
1722
+ )
1723
+
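The extraction helpers below probe overall_assessment.overall_score and overall_assessment.summary first, so a custom schema whose properties follow those paths round-trips cleanly. An illustrative custom_schema value built on that assumption:

    import json

    # Illustrative schema; property names mirror the paths probed by the extractors below.
    custom_schema = json.dumps({
        "type": "object",
        "properties": {
            "overall_assessment": {
                "type": "object",
                "properties": {
                    "overall_score": {"type": "number", "minimum": 0, "maximum": 1},
                    "summary": {"type": "string"},
                },
                "required": ["overall_score", "summary"],
            },
        },
        "required": ["overall_assessment"],
    })
    # result = await judge.compare_tool_calls_with_custom_schema(
    #     predicted, ground_truth, custom_schema=custom_schema)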
1724
+ def _build_custom_system_prompt(
1725
+ self,
1726
+ custom_instructions: Optional[str] = None,
1727
+ custom_schema: Optional[str] = None,
1728
+ ) -> str:
1729
+ """Build system prompt with custom instructions and schema."""
1730
+
1731
+ base_prompt = """You are an expert system for comparing function call parameters and tool calls. Your task is to evaluate the similarity and functional equivalence between predicted and ground truth function calls.
1732
+
1733
+ **EVALUATION PRINCIPLES**:
1734
+ 1. Focus on functional equivalence rather than literal matching
1735
+ 2. Consider context and user intent when making comparisons
1736
+ 3. Account for different representations of the same logical concepts
1737
+ 4. Provide detailed reasoning for your assessments
1738
+
1739
+ **SCORING GUIDELINES**:
1740
+ - 1.0: Perfect functional equivalence
1741
+ - 0.9-0.99: Semantically equivalent with minor differences
1742
+ - 0.7-0.89: Mostly equivalent with some differences
1743
+ - 0.5-0.69: Partially equivalent
1744
+ - 0.3-0.49: Some similarity but significant differences
1745
+ - 0.0-0.29: Not functionally equivalent
1746
+ """
1747
+
1748
+ if custom_instructions:
1749
+ base_prompt += f"""
1750
+
1751
+ **CUSTOM EVALUATION INSTRUCTIONS**:
1752
+ {custom_instructions}
1753
+
1754
+ IMPORTANT: Follow these custom instructions carefully. They take priority over general guidelines when there are conflicts.
1755
+ """
1756
+
1757
+ if custom_schema:
1758
+ base_prompt += f"""
1759
+
1760
+ **RESPONSE FORMAT**:
1761
+ You must respond using this exact JSON schema format:
1762
+
1763
+ {custom_schema}
1764
+
1765
+ Your response must be valid JSON that strictly follows this schema structure.
1766
+ """
1767
+ else:
1768
+ base_prompt += """
1769
+
1770
+ **RESPONSE FORMAT**:
1771
+ Provide your response as a JSON object with detailed analysis and scoring.
1772
+ """
1773
+
1774
+ return base_prompt
1775
+
1776
+ def _build_custom_evaluation_prompt(
1777
+ self,
1778
+ predicted_call: Dict[str, Any],
1779
+ ground_truth_call: Dict[str, Any],
1780
+ conversation_history: List[Dict[str, str]],
1781
+ tool_specs: List[Dict[str, Any]],
1782
+ custom_instructions: Optional[str] = None,
1783
+ ) -> str:
1784
+ """Build the user evaluation prompt with all context."""
1785
+
1786
+ prompt = "**FUNCTION CALL COMPARISON TASK**\n\n"
1787
+
1788
+ # Add function calls
1789
+ prompt += "**Predicted Function Call**:\n"
1790
+ prompt += f"```json\n{json.dumps(predicted_call, indent=2)}\n```\n\n"
1791
+
1792
+ prompt += "**Ground Truth Function Call**:\n"
1793
+ prompt += f"```json\n{json.dumps(ground_truth_call, indent=2)}\n```\n\n"
1794
+
1795
+ # Add tool specifications if provided
1796
+ if tool_specs:
1797
+ prompt += "**Tool Specifications**:\n"
1798
+ for spec in tool_specs:
1799
+ prompt += f"```json\n{json.dumps(spec, indent=2)}\n```\n"
1800
+ prompt += "\n"
1801
+
1802
+ # Add conversation history if provided
1803
+ if conversation_history:
1804
+ prompt += "**Conversation Context**:\n"
1805
+ for msg in conversation_history:
1806
+ role = msg.get("role", "unknown")
1807
+ content = msg.get("content", "")
1808
+ prompt += f"**{role.title()}**: {content}\n"
1809
+ prompt += "\n"
1810
+
1811
+ # Add custom instructions if provided
1812
+ if custom_instructions:
1813
+ prompt += "**SPECIAL INSTRUCTIONS**:\n"
1814
+ prompt += f"{custom_instructions}\n\n"
1815
+
1816
+ prompt += "Please evaluate these function calls and provide a detailed comparison following the specified schema format."
1817
+
1818
+ return prompt
1819
+
1820
+ def _extract_overall_score(self, result_data: Dict[str, Any]) -> float:
1821
+ """Extract overall score from custom schema response."""
1822
+ # Try different possible locations for the score
1823
+ score_paths = [
1824
+ ["overall_assessment", "overall_score"],
1825
+ ["overall_score"],
1826
+ ["score"],
1827
+ ["output"],
1828
+ ]
1829
+
1830
+ for path in score_paths:
1831
+ value = result_data
1832
+ try:
1833
+ for key in path:
1834
+ value = value[key]
1835
+ return float(value)
1836
+ except (KeyError, TypeError, ValueError):
1837
+ continue
1838
+
1839
+ # Fallback: estimate from parameter scores
1840
+ param_scores = result_data.get("parameter_scores", {})
1841
+ if param_scores:
1842
+ scores = []
1843
+ for param_data in param_scores.values():
1844
+ if isinstance(param_data, dict) and "score" in param_data:
1845
+ scores.append(float(param_data["score"]))
1846
+ if scores:
1847
+ return sum(scores) / len(scores)
1848
+
1849
+ return 0.5 # Default fallback
1850
+
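When none of the probed paths yields a score, the helper falls back to the mean of any per-parameter scores. A tiny worked case, assuming `judge` is an instance of this class:

    data = {"parameter_scores": {"city": {"score": 1.0}, "unit": {"score": 0.5}}}
    # judge._extract_overall_score(data) returns (1.0 + 0.5) / 2 == 0.75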
1851
+ def _extract_overall_explanation(self, result_data: Dict[str, Any]) -> str:
1852
+ """Extract overall explanation from custom schema response."""
1853
+ explanation_paths = [
1854
+ ["overall_assessment", "summary"],
1855
+ ["overall_assessment", "explanation"],
1856
+ ["summary"],
1857
+ ["explanation"],
1858
+ ["reasoning"],
1859
+ ]
1860
+
1861
+ for path in explanation_paths:
1862
+ value = result_data
1863
+ try:
1864
+ for key in path:
1865
+ value = value[key]
1866
+ return str(value)
1867
+ except (KeyError, TypeError):
1868
+ continue
1869
+
1870
+ return "Custom schema evaluation completed"
1871
+
1872
+ def _extract_function_match(
1873
+ self, predicted_call: Dict[str, Any], ground_truth_call: Dict[str, Any]
1874
+ ) -> bool:
1875
+ """Extract function name match."""
1876
+ pred_name = predicted_call.get("function", {}).get(
1877
+ "name"
1878
+ ) or predicted_call.get("name")
1879
+ gt_name = ground_truth_call.get("function", {}).get(
1880
+ "name"
1881
+ ) or ground_truth_call.get("name")
1882
+ return pred_name == gt_name
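The extractor accepts both the nested OpenAI-style shape and a flat one with a top-level name key. A quick sketch, assuming `judge` is an instance of this class:

    nested = {"function": {"name": "get_weather", "arguments": {}}}
    flat = {"name": "get_weather"}
    # judge._extract_function_match(nested, flat) returns True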