ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,516 @@
1
+ from typing import Any, Dict, Optional, List
2
+ import logging
3
+
4
+ from .base import BaseComparator
5
+ from .exact_match import ExactMatchComparator
6
+ from .fuzzy_string import FuzzyStringComparator
7
+ from .llm_judge import LLMJudgeComparator
8
+ from ..types import (
9
+ ParameterComparisonResult,
10
+ ComparisonStrategy,
11
+ ParameterStatus,
12
+ ComparisonConfig,
13
+ )
14
+
15
+ # Import code agent conditionally to avoid dependency issues
16
+ try:
17
+ from .code_agent import CodeAgentComparator
18
+
19
+ CODE_AGENT_AVAILABLE = True
20
+ except ImportError:
21
+ CODE_AGENT_AVAILABLE = False
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class HybridComparator(BaseComparator):
27
+ """
28
+ Hybrid comparator that uses multiple strategies and picks the best result.
29
+
30
+ Strategy order:
31
+ 1. Exact match (if perfect match found)
32
+ 2. Fuzzy string similarity (for near matches)
33
+ 3. LLM judge (for semantic understanding) - now with enhanced capabilities
34
+ 4. Code agent (for complex programmatic analysis) - if available
35
+
36
+ Enhanced LLM Judge capabilities:
37
+ - Supports custom instructions for specialized evaluation scenarios
38
+ - Supports custom schemas for tailored response formats
39
+ - Both sync and async operation modes
40
+ - Bulk and individual parameter comparison modes
41
+ """
42
+
43
def __init__(self, config: ComparisonConfig, llm_client=None):
    """
    Set up the hybrid comparator and the sub-comparators it delegates to.

    Args:
        config: Comparison configuration; passed to the base class and to
            every sub-comparator constructed here.
        llm_client: Optional client handed to the LLM judge and code agent
            comparators. When it is falsy, neither of them is created.

    A failure while constructing the LLM judge or the code agent comparator
    is logged as a warning and leaves the matching attribute set to ``None``.
    """
    super().__init__(config)
    self.llm_client = llm_client

    # Strategies that only need the configuration are always created.
    self.exact_comparator = ExactMatchComparator(config)
    self.fuzzy_comparator = FuzzyStringComparator(config)

    # Client-backed strategies stay None unless construction succeeds.
    self.llm_comparator = None
    self.code_agent_comparator = None
    if not llm_client:
        return

    try:
        self.llm_comparator = LLMJudgeComparator(config, llm_client)
    except Exception as e:
        logger.warning(f"Failed to initialize LLM comparator: {e}")

    # CodeAgentComparator is only bound when its optional import succeeded,
    # so the name must not be touched otherwise.
    if not CODE_AGENT_AVAILABLE:
        return

    try:
        self.code_agent_comparator = CodeAgentComparator(config, llm_client)
    except Exception as e:
        logger.warning(f"Failed to initialize Code Agent comparator: {e}")
66
+
67
def compare_parameter(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[Dict[str, Any]] = None,
) -> ParameterComparisonResult:
    """
    Compare a single parameter by cascading through the available strategies.

    The strategies run in this order; a strategy that raises is logged as a
    warning and skipped:

    1. Exact match. A score of at least 0.95 is returned immediately.
    2. Fuzzy string matching.
    3. LLM judge, only when one is configured and the best score collected
       so far is below 0.8 (or nothing has been collected).
    4. Code agent, only when one is configured and the best score collected
       so far is below 0.85 (or nothing has been collected).

    Args:
        param_name: Name of the parameter being compared.
        predicted_value: Value produced by the system under evaluation.
        ground_truth_value: Reference value to compare against.
        context: Optional mapping forwarded to every strategy; its
            ``"parameter_status"`` entry, if any, is used for the fallback
            result.
        custom_instructions: Optional instructions forwarded to every
            strategy.
        custom_schema: Optional schema forwarded to the LLM judge only.

    Returns:
        The selected strategy result, relabelled as ``HYBRID``. Unless the
        exact match returned early, its explanation is extended with the
        list of strategies that ran. If every strategy failed, a fresh
        zero-score result with ``error_type="comparison_failed"`` is
        returned instead.
    """
    candidates = []
    attempted = []

    def best_score_is_below(threshold):
        # Also true when no strategy has produced a result yet.
        return not candidates or max(c.score for c in candidates) < threshold

    # Positional arguments common to every strategy call.
    shared_args = (
        param_name,
        predicted_value,
        ground_truth_value,
        context,
        custom_instructions,
    )

    # 1. Exact match; a near-perfect score short-circuits the cascade.
    try:
        exact_outcome = self.exact_comparator.compare_parameter(*shared_args)
        candidates.append(exact_outcome)
        attempted.append(ComparisonStrategy.EXACT_MATCH)

        if exact_outcome.score >= 0.95:
            exact_outcome.comparison_strategy = ComparisonStrategy.HYBRID
            return exact_outcome
    except Exception as e:
        logger.warning(f"Exact match comparison failed: {e}")

    # 2. Fuzzy string matching.
    try:
        fuzzy_outcome = self.fuzzy_comparator.compare_parameter(*shared_args)
        candidates.append(fuzzy_outcome)
        attempted.append(ComparisonStrategy.FUZZY_STRING)
    except Exception as e:
        logger.warning(f"Fuzzy string comparison failed: {e}")

    # 3. LLM judge, the only strategy that also receives custom_schema.
    if self.llm_comparator and best_score_is_below(0.8):
        try:
            llm_outcome = self.llm_comparator.compare_parameter(
                *shared_args, custom_schema
            )
            candidates.append(llm_outcome)
            attempted.append(ComparisonStrategy.LLM_JUDGE)
        except Exception as e:
            logger.warning(f"LLM judge comparison failed: {e}")

    # 4. Code agent; custom_schema is deliberately not passed to it.
    if self.code_agent_comparator and best_score_is_below(0.85):
        try:
            code_outcome = self.code_agent_comparator.compare_parameter(
                *shared_args
            )
            candidates.append(code_outcome)
            attempted.append(ComparisonStrategy.CODE_AGENT)
        except Exception as e:
            logger.warning(f"Code agent comparison failed: {e}")

    # Every strategy raised: report the failure rather than a comparison.
    if not candidates:
        if context:
            status = context.get("parameter_status", ParameterStatus.BOTH_PRESENT)
        else:
            status = ParameterStatus.BOTH_PRESENT
        return ParameterComparisonResult(
            parameter_name=param_name,
            predicted_value=predicted_value,
            ground_truth_value=ground_truth_value,
            predicted_resolved_value=predicted_value,
            ground_truth_resolved_value=ground_truth_value,
            parameter_status=status,
            comparison_strategy=ComparisonStrategy.HYBRID,
            score=0.0,
            explanation="All comparison strategies failed",
            is_match=False,
            confidence=0.1,
            error_type="comparison_failed",
        )

    # Pick the winner, relabel it, and record which strategies took part.
    winner = self._select_best_result(candidates, attempted)
    winner.comparison_strategy = ComparisonStrategy.HYBRID

    strategy_names = [s.value for s in attempted]
    winner.explanation += (
        f" (Hybrid strategies used: {', '.join(strategy_names)})"
    )
    return winner
186
+
187
+ def _select_best_result(
188
+ self,
189
+ results: List[ParameterComparisonResult],
190
+ strategies: List[ComparisonStrategy],
191
+ ) -> ParameterComparisonResult:
192
+ """Select the best result from multiple comparison strategies."""
193
+
194
+ if len(results) == 1:
195
+ return results[0]
196
+
197
+ # Calculate weighted scores
198
+ strategy_weights = {
199
+ ComparisonStrategy.EXACT_MATCH: 1.0, # Highest priority for exact matches
200
+ ComparisonStrategy.CODE_AGENT: 0.95, # Very high priority for code analysis
201
+ ComparisonStrategy.LLM_JUDGE: 0.9, # High priority for LLM understanding
202
+ ComparisonStrategy.FUZZY_STRING: 0.7, # Medium priority for fuzzy matching
203
+ }
204
+
205
+ best_result = None
206
+ best_weighted_score = -1
207
+
208
+ for result, strategy in zip(results, strategies):
209
+ # Weighted score combines result score, confidence, and strategy preference
210
+ weight = strategy_weights.get(strategy, 0.5)
211
+ weighted_score = (result.score * 0.6 + result.confidence * 0.2) * weight
212
+
213
+ # Bonus for exact matches
214
+ if result.score >= 0.95:
215
+ weighted_score += 0.1
216
+
217
+ # Bonus for high confidence
218
+ if result.confidence >= 0.9:
219
+ weighted_score += 0.05
220
+
221
+ if weighted_score > best_weighted_score:
222
+ best_weighted_score = weighted_score
223
+ best_result = result
224
+
225
+ return best_result or results[0]
226
+
227
def compare_function_name(
    self,
    predicted_name: str,
    ground_truth_name: str,
    context: Optional[Dict[str, Any]] = None,
) -> float:
    """Score how well a predicted function name matches the ground truth.

    Strategies run in a fixed order and the best score seen so far is kept:

    1. Exact comparator: a score of at least 0.95 is returned immediately.
    2. Fuzzy comparator.
    3. LLM comparator, only when one is configured and the best score so
       far is below 0.8.
    4. Code-agent comparator, only when one is configured and the best
       score so far is below 0.85.

    A strategy that raises is logged at warning level and skipped, in line
    with the parameter-level comparison methods of this class, so one
    failing comparator does not prevent the remaining ones from answering.

    Args:
        predicted_name: Function name produced by the model under test.
        ground_truth_name: Expected function name.
        context: Optional extra data forwarded unchanged to every
            comparator.

    Returns:
        The highest score reported by any strategy that ran, or ``0.0``
        when none of them produced a score.
    """
    collected = []

    # 1. Exact match: cheapest, and decisive when it succeeds.
    try:
        exact_score = self.exact_comparator.compare_function_name(
            predicted_name, ground_truth_name, context
        )
    except Exception as e:
        logger.warning(f"Exact function name comparison failed: {e}")
    else:
        if exact_score >= 0.95:
            return exact_score
        # Keep a sub-threshold exact score in the running so the final
        # answer is never lower than what the exact comparator reported.
        collected.append(exact_score)

    # 2. Fuzzy matching.
    try:
        collected.append(
            self.fuzzy_comparator.compare_function_name(
                predicted_name, ground_truth_name, context
            )
        )
    except Exception as e:
        logger.warning(f"Fuzzy function name comparison failed: {e}")

    best_score = max(collected, default=0.0)

    # 3. LLM judge, only while the cheaper strategies are inconclusive.
    if self.llm_comparator and best_score < 0.8:
        try:
            llm_score = self.llm_comparator.compare_function_name(
                predicted_name, ground_truth_name, context
            )
            best_score = max(best_score, llm_score)
        except Exception as e:
            logger.warning(f"LLM function name comparison failed: {e}")

    # 4. Code agent, only while the score is still not high.
    if self.code_agent_comparator and best_score < 0.85:
        try:
            code_score = self.code_agent_comparator.compare_function_name(
                predicted_name, ground_truth_name, context
            )
            best_score = max(best_score, code_score)
        except Exception as e:
            logger.warning(f"Code agent function name comparison failed: {e}")

    return best_score
271
+
272
async def compare_parameter_async(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[Dict[str, Any]] = None,
) -> ParameterComparisonResult:
    """Compare one parameter value asynchronously using every strategy.

    Strategies run in this order; a strategy that raises is logged at
    warning level and skipped:

    1. Exact comparator (sync call). A score of at least 0.95 is returned
       immediately, relabelled as ``HYBRID``.
    2. Fuzzy comparator (sync call).
    3. LLM comparator (awaited), only when configured and either nothing
       has been collected yet or the best collected score is below 0.8.
       This is the only strategy that receives ``custom_schema``.
    4. Code-agent comparator, only when configured and either nothing has
       been collected yet or the best collected score is below 0.85. Its
       ``compare_parameter_async`` is awaited when present; otherwise its
       sync ``compare_parameter`` is called directly.

    Args:
        param_name: Name of the parameter under comparison.
        predicted_value: Value produced by the model under test.
        ground_truth_value: Expected value.
        context: Optional extra data forwarded to every comparator; its
            ``"parameter_status"`` entry, when present, is used for the
            failure result.
        custom_instructions: Optional instructions forwarded to every
            comparator.
        custom_schema: Optional schema forwarded to the LLM comparator.

    Returns:
        The result chosen by ``_select_best_result``, relabelled as
        ``HYBRID`` and with the list of strategies appended to its
        explanation. When no strategy produced a result, a zero-score
        ``HYBRID`` result with ``error_type="comparison_failed"``.
    """
    # Positional arguments shared by every comparator call.
    shared_args = (
        param_name,
        predicted_value,
        ground_truth_value,
        context,
        custom_instructions,
    )
    results = []
    strategies_used = []

    def still_uncertain(threshold):
        """True if nothing was collected or all scores are under threshold."""
        return not results or max(r.score for r in results) < threshold

    # 1. Exact match, with a shortcut on a (near) perfect score.
    try:
        exact_result = self.exact_comparator.compare_parameter(*shared_args)
        results.append(exact_result)
        strategies_used.append(ComparisonStrategy.EXACT_MATCH)
        if exact_result.score >= 0.95:
            exact_result.comparison_strategy = ComparisonStrategy.HYBRID
            return exact_result
    except Exception as err:
        logger.warning(f"Exact match comparison failed: {err}")

    # 2. Fuzzy string matching.
    try:
        fuzzy_result = self.fuzzy_comparator.compare_parameter(*shared_args)
        results.append(fuzzy_result)
        strategies_used.append(ComparisonStrategy.FUZZY_STRING)
    except Exception as err:
        logger.warning(f"Fuzzy string comparison failed: {err}")

    # 3. LLM judge, skipped once a cheaper strategy scored 0.8 or more.
    if self.llm_comparator and still_uncertain(0.8):
        try:
            llm_result = await self.llm_comparator.compare_parameter_async(
                *shared_args, custom_schema
            )
            results.append(llm_result)
            strategies_used.append(ComparisonStrategy.LLM_JUDGE)
        except Exception as err:
            logger.warning(f"Async LLM judge comparison failed: {err}")

    # 4. Code agent, skipped once any strategy scored 0.85 or more.
    if self.code_agent_comparator and still_uncertain(0.85):
        try:
            if hasattr(self.code_agent_comparator, "compare_parameter_async"):
                code_result = (
                    await self.code_agent_comparator.compare_parameter_async(
                        *shared_args
                    )
                )
            else:
                # No async entry point on the code agent: use the sync one.
                code_result = self.code_agent_comparator.compare_parameter(
                    *shared_args
                )
            results.append(code_result)
            strategies_used.append(ComparisonStrategy.CODE_AGENT)
        except Exception as err:
            logger.warning(f"Async code agent comparison failed: {err}")

    # Nothing usable: report an explicit failure instead of raising.
    if not results:
        if context:
            status = context.get("parameter_status", ParameterStatus.BOTH_PRESENT)
        else:
            status = ParameterStatus.BOTH_PRESENT
        return ParameterComparisonResult(
            parameter_name=param_name,
            predicted_value=predicted_value,
            ground_truth_value=ground_truth_value,
            predicted_resolved_value=predicted_value,
            ground_truth_resolved_value=ground_truth_value,
            parameter_status=status,
            comparison_strategy=ComparisonStrategy.HYBRID,
            score=0.0,
            explanation="All async comparison strategies failed",
            is_match=False,
            confidence=0.1,
            error_type="comparison_failed",
        )

    # Pick the winner and record which strategies contributed.
    best_result = self._select_best_result(results, strategies_used)
    best_result.comparison_strategy = ComparisonStrategy.HYBRID
    used = ", ".join([s.value for s in strategies_used])
    best_result.explanation += f" (Async hybrid strategies used: {used})"
    return best_result
404
+
405
async def compare_tool_calls_async(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[str] = None,
) -> Any:
    """Compare two whole tool calls asynchronously, LLM-based strategies first.

    The first delegate that returns without raising decides the outcome;
    a delegate that raises is logged at warning level and the next one is
    tried:

    1. LLM comparator's ``compare_tool_calls_with_custom_schema``, only
       when an LLM comparator is configured and ``custom_schema`` is truthy.
    2. LLM comparator's ``compare_tool_calls_async``.
    3. Code-agent comparator: its ``compare_tool_calls_async`` when it has
       one, otherwise its sync ``compare_tool_calls``.
    4. The parent class's ``compare_tool_calls_async``, which is not given
       ``custom_instructions`` or ``custom_schema``.

    Args:
        predicted_call: Tool call produced by the model under test.
        ground_truth_call: Expected tool call.
        conversation_history: Optional prior messages, forwarded as-is.
        tool_specs: Optional tool specifications, forwarded as-is.
        custom_instructions: Optional instructions for the LLM and
            code-agent delegates.
        custom_schema: Optional schema; enables the custom-schema delegate.

    Returns:
        Whatever the delegate that handled the comparison returned.
    """
    # Arguments every LLM / code-agent delegate receives, in order.
    delegate_args = (
        predicted_call,
        ground_truth_call,
        conversation_history,
        tool_specs,
        custom_instructions,
    )

    judge = self.llm_comparator
    if judge:
        if custom_schema:
            # Schema-aware judge gets the first chance when a schema is given.
            try:
                return await judge.compare_tool_calls_with_custom_schema(
                    *delegate_args, custom_schema
                )
            except Exception as err:
                logger.warning(f"Custom schema LLM comparison failed: {err}")

        try:
            return await judge.compare_tool_calls_async(*delegate_args)
        except Exception as err:
            logger.warning(f"Async LLM tool call comparison failed: {err}")

    agent = self.code_agent_comparator
    if agent:
        try:
            if not hasattr(agent, "compare_tool_calls_async"):
                # No async entry point on the code agent: use the sync one.
                return agent.compare_tool_calls(*delegate_args)
            return await agent.compare_tool_calls_async(*delegate_args)
        except Exception as err:
            logger.warning(f"Async code agent tool call comparison failed: {err}")

    # Last resort: the parent class implementation.
    return await super().compare_tool_calls_async(
        predicted_call, ground_truth_call, conversation_history, tool_specs
    )
472
+
473
def compare_tool_calls(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[str] = None,
) -> Any:
    """Compare two whole tool calls synchronously, LLM-based strategies first.

    The first delegate that returns without raising decides the outcome;
    a delegate that raises is logged at warning level and the next one is
    tried:

    1. LLM comparator's ``compare_tool_calls``, which also receives
       ``custom_schema``.
    2. Code-agent comparator's ``compare_tool_calls``.
    3. The parent class's ``compare_tool_calls``, which is not given
       ``custom_instructions`` or ``custom_schema``.

    Args:
        predicted_call: Tool call produced by the model under test.
        ground_truth_call: Expected tool call.
        conversation_history: Optional prior messages, forwarded as-is.
        tool_specs: Optional tool specifications, forwarded as-is.
        custom_instructions: Optional instructions for the LLM and
            code-agent delegates.
        custom_schema: Optional schema, forwarded to the LLM comparator only.

    Returns:
        Whatever the delegate that handled the comparison returned.
    """
    # Arguments every LLM / code-agent delegate receives, in order.
    delegate_args = (
        predicted_call,
        ground_truth_call,
        conversation_history,
        tool_specs,
        custom_instructions,
    )

    judge = self.llm_comparator
    if judge:
        try:
            return judge.compare_tool_calls(*delegate_args, custom_schema)
        except Exception as err:
            logger.warning(f"LLM tool call comparison failed: {err}")

    agent = self.code_agent_comparator
    if agent:
        try:
            return agent.compare_tool_calls(*delegate_args)
        except Exception as err:
            logger.warning(f"Code agent tool call comparison failed: {err}")

    # Last resort: the parent class implementation.
    return super().compare_tool_calls(
        predicted_call, ground_truth_call, conversation_history, tool_specs
    )