ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,144 @@
1
+ import asyncio
2
+ from typing import Any, Awaitable, Callable, Dict, List, Optional, TypeVar, Union, Tuple
3
+ from pydantic import BaseModel
4
+
5
# A prompt is either a plain string or a list of chat-message dicts.
Prompt = Union[str, List[Dict[str, Any]]]

# A prompt paired with an optional schema dict for the generator.
PromptAndSchema = Tuple[Prompt, Optional[Dict[str, Any]]]

# Generator callables, by calling convention:
#   SyncGen       - one prompt in, one result out (blocking)
#   BatchGen      - list of prompts in, list of results out (blocking)
#   AsyncGen      - one prompt in, awaitable result out
#   AsyncBatchGen - list of prompts in, awaitable list of results out
SyncGen = Callable[
    [Prompt],
    Union[str, Any],
]
BatchGen = Callable[
    [List[Prompt]],
    List[Union[str, Any]],
]
AsyncGen = Callable[
    [Prompt],
    Awaitable[Union[str, Any]],
]
AsyncBatchGen = Callable[
    [List[Prompt]],
    Awaitable[List[Union[str, Any]]],
]

# Generic type variable made available to the rest of the module.
T = TypeVar("T")
13
+
14
+
15
class PromptResult(BaseModel):
    """
    Outcome of running a single prompt.

    Carries the prompt that was sent together with either the generator's
    response or the text of the error recorded for it.
    """

    # The prompt that was submitted (plain string or list of chat-message dicts).
    prompt: Prompt
    # Value returned by the generator; defaults to None when none was recorded.
    response: Optional[Any] = None
    # Error text recorded for this prompt; defaults to None.
    error: Optional[str] = None
23
+
24
+
25
class PromptRunner:
    """
    Runs a collection of prompts through various generation strategies.

    Each stored entry is either a bare prompt (a string or a list of chat
    message dicts) or a ``(prompt, schema)`` tuple.

    Attributes:
        prompts: the list of prompts to run.
    """

    def __init__(
        self, prompts: Optional[List[Union[Prompt, PromptAndSchema]]] = None
    ) -> None:
        """
        Args:
            prompts: initial list of prompts (strings, chat messages, or
                ``(prompt, schema)`` tuples). A non-empty list is stored by
                reference, so later add/remove/clear calls mutate it.
        """
        self.prompts: List[Union[Prompt, PromptAndSchema]] = prompts or []

    def add_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
        """Append a prompt to the runner."""
        self.prompts.append(prompt)

    def remove_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
        """
        Remove a prompt (first occurrence).

        Raises:
            ValueError: if the prompt is not present (raised by ``list.remove``).
        """
        self.prompts.remove(prompt)

    def clear_prompts(self) -> None:
        """Remove all prompts."""
        self.prompts.clear()

    def get_prompt_and_schema(
        self, prompt: Union[Prompt, PromptAndSchema]
    ) -> Tuple[Prompt, Optional[Dict[str, Any]]]:
        """
        Extract the prompt and schema from a stored entry.

        Args:
            prompt: A bare prompt or a ``(prompt, schema)`` tuple.

        Returns:
            Tuple of (prompt, schema); schema is None for a bare prompt.
        """
        if isinstance(prompt, tuple):
            return prompt[0], prompt[1]
        return prompt, None

    @staticmethod
    def _build_call_args(
        prompt: Prompt,
        schema: Optional[Dict[str, Any]],
        prompt_param_name: str,
        schema_param_name: Optional[str],
        extra: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Assemble the keyword arguments passed to a generator callable."""
        # Extra kwargs are merged after the prompt, so they win on a key clash.
        call_args: Dict[str, Any] = {prompt_param_name: prompt, **extra}
        # The schema is forwarded only when a parameter name was given and
        # the schema itself is truthy (None and empty dicts are skipped).
        if schema_param_name and schema:
            call_args[schema_param_name] = schema
        return call_args

    def run_all(
        self,
        gen_fn: SyncGen,
        prompt_param_name: str = "prompt",
        schema_param_name: Optional[str] = None,
        **kwargs: Any,
    ) -> List[PromptResult]:
        """
        Run each prompt through a synchronous single-prompt generator.

        A failure on one prompt is recorded in that prompt's result and does
        not stop the remaining prompts from running.

        Args:
            gen_fn: Callable taking one Prompt, returning str or Any.
            prompt_param_name: Name of the parameter for the prompt.
            schema_param_name: Name of the parameter for the schema.
            kwargs: Additional arguments to pass to the function.

        Returns:
            List of PromptResult, in the same order as self.prompts.
        """
        results: List[PromptResult] = []
        for entry in self.prompts:
            # Bind a fallback before the try block so the error handler never
            # reads an unbound name or the previous iteration's prompt when
            # extraction itself fails.
            prompt: Any = entry
            try:
                prompt, schema = self.get_prompt_and_schema(entry)
                call_args = self._build_call_args(
                    prompt, schema, prompt_param_name, schema_param_name, kwargs
                )
                response = gen_fn(**call_args)
                results.append(PromptResult(prompt=prompt, response=response))
            except Exception as exc:
                # Deliberate best-effort: capture the error per prompt.
                results.append(PromptResult(prompt=prompt, error=str(exc)))
        return results

    async def run_async(
        self,
        async_fn: AsyncGen,
        max_parallel: int = 10,
        prompt_param_name: str = "prompt",
        schema_param_name: Optional[str] = None,
        **kwargs: Any,
    ) -> List[PromptResult]:
        """
        Run each prompt through an async single-prompt generator with concurrency limit.
        Results are returned in the same order as self.prompts.

        Args:
            async_fn: Async callable taking one Prompt, returning str or Any.
            max_parallel: Max concurrent tasks; must be at least 1.
            prompt_param_name: Name of the parameter for the prompt.
            schema_param_name: Name of the parameter for the schema.
            kwargs: Additional arguments to pass to the async function.

        Returns:
            List of PromptResult.

        Raises:
            ValueError: if max_parallel is less than 1.
        """
        # A semaphore created with 0 permits would block every task forever.
        if max_parallel < 1:
            raise ValueError("max_parallel must be at least 1")
        semaphore = asyncio.Semaphore(max_parallel)

        async def _run_one(entry: Union[Prompt, PromptAndSchema]) -> PromptResult:
            async with semaphore:
                # Fallback binding so the handler always has a prompt to report.
                prompt: Any = entry
                try:
                    prompt, schema = self.get_prompt_and_schema(entry)
                    call_args = self._build_call_args(
                        prompt, schema, prompt_param_name, schema_param_name, kwargs
                    )
                    response = await async_fn(**call_args)
                    return PromptResult(prompt=prompt, response=response)
                except Exception as exc:
                    # Deliberate best-effort: capture the error per prompt.
                    return PromptResult(prompt=prompt, error=str(exc))

        # asyncio.gather returns results in the order the awaitables were
        # passed, so no index bookkeeping or sorting is needed.
        gathered = await asyncio.gather(
            *(_run_one(entry) for entry in self.prompts)
        )
        return list(gathered)
@@ -0,0 +1,455 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ import json
10
+ from functools import lru_cache
11
+
12
+ import pandas as pd
13
+ from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
14
+ from llmevalkit.function_calling.pipeline.types import ToolCall, ToolSpec
15
+
16
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
17
+ from ibm_watsonx_gov.entities.base_classes import Error
18
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
19
+ RecordMetricResult)
20
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
21
+ from ibm_watsonx_gov.utils.python_utils import (
22
+ get, parse_functions_to_openai_schema)
23
+
24
+
25
class ToolCallMetricProvider():
    """
    Base class for Tool Call Metrics Computation.

    Holds the evaluation configuration and the metric definition, and provides
    the syntactic (static) and semantic (LLM judge based) tool call checks
    together with the pre and post processing around them.
    """

    def __init__(self, configuration: "GenAIConfiguration | AgenticAIConfiguration", metric: "GenAIMetric"):
        """
        Initialize the ToolCallMetricProvider with the configuration.

        Args:
            configuration (GenAIConfiguration | AgenticAIConfiguration): The configuration for the metric computation.
            metric (GenAIMetric): The metric to be computed.
        """
        self.configuration = configuration
        self.metric = metric

    @staticmethod
    def _loads_if_str(value):
        """Decode ``value`` from JSON when it is a string, otherwise return it unchanged."""
        return json.loads(value) if isinstance(value, str) else value

    @staticmethod
    def _is_missing(value) -> bool:
        """
        Tell whether a cell value holds no usable list.

        ``None`` and floats are treated as missing; a float can never be a
        list of tools or tool calls.
        NOTE(review): the float case is meant to cover pandas NaN cells - confirm
        against the data the callers provide.
        """
        return value is None or isinstance(value, float)

    @staticmethod
    def _min_score(explanations: list):
        """
        Return the lowest non-None ``value`` among the tool call explanations.

        Args:
            explanations (list): List of dictionaries, each optionally holding a "value"

        Returns:
            float | None: The minimum score, or None when no entry has a score
        """
        scores = [entry.get("value") for entry in explanations
                  if entry.get("value") is not None]
        return min(scores, default=None)

    @staticmethod
    def _aggregate_values(values: list, syntactic_only: bool = True) -> tuple:
        """
        Compute the aggregate value, minimum and maximum of the record scores.

        Args:
            values (list): Record level scores (already defaulted to 0.0 when missing)
            syntactic_only (bool): If True the aggregate is the fraction of records
                scoring 0.0, otherwise it is the mean score.

        Returns:
            tuple: (value, min_value, max_value); all 0.0 for an empty list
        """
        min_value = min(values, default=0.0)
        max_value = max(values, default=0.0)
        if not values:
            # Nothing to aggregate; avoid dividing by zero
            return 0.0, min_value, max_value
        if syntactic_only:
            value = sum(val == 0.0 for val in values) / len(values)
        else:
            value = sum(values) / len(values)
        return value, min_value, max_value

    def pre_process(self, data: pd.DataFrame):
        """
        Preprocess the dataframe and tool list for metrics computation

        Callable tools in the configuration are converted to tool specifications
        and JSON strings in the available tools and tool calls columns are decoded.
        Both ``self.configuration.tools`` and ``data`` are updated in place.

        Args:
            data (pd.DataFrame): Input dataframe

        Returns:
            pd.Dataframe: Processed dataframe
        """
        # Get the specification of tools used in the application
        # in proper format if it is a list of Callable
        tools = self.configuration.tools
        if isinstance(tools, list) and all(callable(item) for item in tools):
            self.configuration.tools = self.get_tools_list_schema(tools)

        available_tools_field = self.configuration.available_tools_field
        if available_tools_field and available_tools_field in data.columns:
            data[available_tools_field] = data[available_tools_field].apply(
                self._loads_if_str)

        # TODO: Add validation for the tool_call_field data schema
        tool_call_field = self.configuration.tool_calls_field
        if tool_call_field:
            data[tool_call_field] = data[tool_call_field].apply(
                self._loads_if_str)
        return data

    @staticmethod
    def get_tools_list_schema(tools: list) -> list:
        """
        Convert the list of callable objects to the
        format needed for the TCH computation

        Callables for which no schema could be produced are left out.

        Args:
            tools (list): List of Callable objects

        Returns:
            list: List of ToolSpec objects containing the tool
            specifications
        """
        tools_specifications = []
        for func in tools:
            tool_schema = parse_functions_to_openai_schema(func)
            if not tool_schema:
                continue
            tools_specifications.append(ToolSpec.model_validate(tool_schema))

        return tools_specifications

    async def compute_metrics(self, data: pd.DataFrame, syntactic_only: bool = True, metric_result_mapping_name: str = None, **kwargs):
        """
        Compute the Tool Call Metrics for the given data

        Records without available tools or without tool calls get a None value
        and a BAD_REQUEST error instead of a score.

        Args:
            data (pd.DataFrame): Input data including the tools used for the application
            syntactic_only (bool): If True, compute only syntactic metrics.
            metric_result_mapping_name (str): The mapping name for the metric result with the llmevalkit
            kwargs: Additional keyword arguments for the pipeline

        Returns:
            AggregateMetricResult: The aggregated result holding the record level metrics,
            or an empty list when a semantic metric is requested without an llm_judge

        Raises:
            Exception: If any step of the computation fails; the original error is chained.
        """
        try:

            data = self.pre_process(data)
            tool_calls_field = self.configuration.tool_calls_field
            record_id_field = self.configuration.record_id_field
            record_level_metrics = []

            # Do not compute metrics if llm_judge is not set
            # and trying to compute a non syntactic metrics
            if not getattr(self.metric, "llm_judge", None) and not syntactic_only:
                return []

            for _, row in data.iterrows():

                available_tools = self.configuration.tools or row.get(
                    self.configuration.available_tools_field, [])
                # A missing cell is handled as "no tools" so that the record
                # gets the BAD_REQUEST error below instead of failing the run
                if self._is_missing(available_tools):
                    available_tools = []
                if not all(isinstance(t, ToolSpec) for t in available_tools):
                    available_tools = [ToolSpec.model_validate(
                        func) for func in available_tools]

                tool_calls = self.extract_tool_calls_from_response(
                    row[tool_calls_field])

                if not available_tools:
                    record_level_metrics.append({
                        "value": None,  # Treat no available tools as None score as we are not able to compute a score
                        "record_id": row[record_id_field],
                        "errors": [Error(code="BAD_REQUEST", message_en="The list of available tools is empty.")]
                    })
                    continue

                if not tool_calls:
                    record_level_metrics.append({
                        "value": None,  # Treat no tool calls as None score as we are not able to compute a score
                        "record_id": row[record_id_field],
                        "errors": [Error(code="BAD_REQUEST", message_en="The list of tool calls made by LLM is empty.")]
                    })
                    continue

                if syntactic_only:
                    tool_call_level_explanation = self.compute_syntactic_metrics(
                        data=row, tool_calls=tool_calls, available_tools=available_tools)
                    record_level_metrics.append({
                        # Any reported hallucination makes the record invalid
                        "value": 0.0 if tool_call_level_explanation else 1.0,
                        "record_id": row[record_id_field],
                        "explanations": tool_call_level_explanation
                    })
                else:
                    tool_call_level_explanation = await self.compute_semantic_metrics(
                        data=row, tool_calls=tool_calls, available_tools=available_tools, metric_result_mapping_name=metric_result_mapping_name, **kwargs)
                    record_level_metrics.append({
                        # The record score is the worst tool call score. Failed
                        # judge calls carry a None value and are ignored here;
                        # they are reported through "errors" instead.
                        "value": self._min_score(tool_call_level_explanation),
                        "errors": [Error(code="REQUEST_FAILED", message_en=entry.get("error")) for entry in
                                   tool_call_level_explanation if entry.get("error")],
                        "record_id": row[record_id_field],
                        "explanations": tool_call_level_explanation
                    })

            metric_result = self.post_process(
                record_level_metrics, syntactic_only=syntactic_only)

            return metric_result
        except Exception as ex:
            raise Exception(
                f"Error while computing metrics: '{self.metric.name}' using '{self.metric.method}'. Reason: {str(ex)}") from ex

    def compute_syntactic_metrics(self, data: pd.Series, tool_calls: list, available_tools: list):
        """
        Compute the Tool Call Metrics for the given data
        in static mode

        Args:
            data (pd.Series): The record being evaluated (not used by the static check)
            tool_calls (list): List of tool calls made by the LLM
            available_tools (list): Tool specifications the calls are checked against

        Returns:
            list: One entry per tool call whose final decision is False, holding the
            tool name and the metrics that are not valid
        """
        tool_call_level_explanation = []
        for call in tool_calls:
            explanations = ReflectionPipeline.static_only(
                inventory=available_tools, call=ToolCall.model_validate(call))
            explanations = explanations.model_dump()
            if explanations.get("final_decision") is False:
                tool_call_level_explanation.append({
                    "tool_name": call.get("function").get("name"),
                    "hallucinations": {
                        key: val for key, val in explanations["metrics"].items() if not val["valid"]
                    }
                })
        return tool_call_level_explanation

    async def compute_semantic_metrics(self, data: pd.Series, tool_calls: list, available_tools: list, metric_result_mapping_name: str, **kwargs):
        """
        Compute the Tool Call Metrics for the given data
        in semantic mode

        Args:
            data (pd.Series): The record being evaluated; the first configured
                input field is passed to the pipeline as the conversation
            tool_calls (list): List of tool calls made by the LLM
            available_tools (list): Tool specifications the calls are checked against
            metric_result_mapping_name (str): The mapping name for the metric result with the llmevalkit
            kwargs: Additional keyword arguments for the pipeline

        Returns:
            list: One entry per tool call for which the pipeline returned this metric;
            the value is None when the entry carries an error
        """
        tool_call_level_explanation = []
        metrics_client = self.get_llm_metric_client()
        pipeline = ReflectionPipeline(
            metrics_client=metrics_client,
            **kwargs
        )
        for call in tool_calls:
            result = await pipeline.semantic_async(
                conversation=data[self.configuration.input_fields[0]],
                inventory=available_tools,
                call=ToolCall.model_validate(call),
                retries=2
            )

            explanations = get(
                result.model_dump(), f"{metric_result_mapping_name}.metrics.{self.metric.metric_mapping_name}")
            if not explanations:
                # The pipeline returned nothing for this metric
                continue

            error = get(explanations, "error")
            tool_call_level_explanation.append({
                "tool_name": get(call, "function.name"),
                # The raw output is divided by 5 to get the reported score
                "value": float(get(explanations, "raw_response.output", 0.0))/5 if not error else None,
                "error": error,
                "explanation": get(explanations, "raw_response.explanation"),
                "evidence": get(explanations, "raw_response.evidence"),
                "correction": get(explanations, "raw_response.correction")
            })
        return tool_call_level_explanation

    @staticmethod
    def extract_tool_calls_from_response(tool_calls_response) -> list:
        """
        Extracts the tool calls from the response

        Args:
            tool_calls_response (Any): The tool calls response
            can be a list of dictionary, an AIMessage object
            or a dictionary

        Returns:
            list: List of openai formatted tool call; empty when the
            response holds no tool calls
        """
        if isinstance(tool_calls_response, dict):
            tool_calls = get(tool_calls_response, "kwargs.tool_calls")
        elif hasattr(tool_calls_response, "tool_calls"):
            tool_calls = tool_calls_response.tool_calls
        else:
            tool_calls = tool_calls_response

        # A missing value is handled as "no tool calls" so that the caller
        # can report it on the record instead of failing on iteration
        if ToolCallMetricProvider._is_missing(tool_calls):
            return []

        converted = []
        for call in tool_calls:
            # check if tool call is already in the required format, else convert it
            if (isinstance(call, dict) and
                    "id" in call and
                    call.get("type") == "function" and
                    isinstance(call.get("function"), dict) and
                    "name" in call["function"] and
                    "arguments" in call["function"]):
                converted.append(call)
            else:
                converted.append({
                    "id": call["id"],
                    "type": "function",
                    "function": {
                        "name": call["name"],
                        "arguments": json.dumps(call["args"])
                    }
                })
        return converted

    def post_process(self, results: list, syntactic_only: bool = True):
        """
        Post process the computed metrics to get the Aggregated Result and
        Record level metric result in the proper format

        Args:
            results (list): Computed record level results, one dictionary per record
            syntactic_only (bool): If True the aggregate value is the fraction of
                records scoring 0.0, otherwise it is the mean record score.

        Returns:
            AggregateMetricResult: The AggregateMetricResult object containing the calculated
            metrics information
        """

        # Preparing the record level metrics
        record_level_metrics: list[RecordMetricResult] = []

        for row in results:
            record_level_metrics.append(
                RecordMetricResult(
                    name=self.metric.name,
                    display_name=self.metric.display_name,
                    method=self.metric.method,
                    value=row.get("value"),
                    provider="ibm",
                    errors=row.get("errors", []),
                    group=self.metric.group,
                    record_id=row["record_id"],
                    thresholds=self.metric.thresholds,
                    additional_info={"explanations": row.get("explanations")}
                )
            )

        # Get the aggregate value, min and max of the record scores.
        # NOTE(review): records without a score (None) are counted as 0.0 here,
        # so they lower min/mean and count as invalid - confirm this is intended.
        values = [item.get("value") or 0.0 for item in results]
        value, min_value, max_value = self._aggregate_values(
            values, syntactic_only=syntactic_only)

        # creating AggregateMetricResult
        aggregated_result = AggregateMetricResult(
            name=self.metric.name,
            display_name=self.metric.display_name,
            method=self.metric.method,
            provider="ibm",
            group=self.metric.group,
            value=value,
            total_records=len(results),
            record_level_metrics=record_level_metrics,
            min=min_value,
            max=max_value,
            thresholds=self.metric.thresholds
        )

        # return the aggregated result
        return aggregated_result

    @staticmethod
    @lru_cache(maxsize=128)
    def _create_client_impl(
        provider: str,
        model_id: str,
        project_id: str,
        space_id: str,
        credentials_json: str
    ):
        """
        Cached static method for creating LLM clients.

        All arguments are strings so that they can be used as the cache key;
        repeated calls with the same arguments return the same client object.

        Args:
            provider: Provider name
            model_id: Model identifier
            project_id: Project ID (empty string if None)
            space_id: Space ID (empty string if None)
            credentials_json: JSON string of credentials

        Returns:
            Configured LLM client

        Raises:
            ValueError: If the provider is not supported
        """
        if provider == "ibm_watsonx.ai":
            from llmevalkit.llm.providers.ibm_watsonx_ai.ibm_watsonx_ai import \
                WatsonxLLMClientOutputVal

            provider_kwargs = json.loads(credentials_json)
            provider_kwargs["model_id"] = model_id

            if project_id:
                provider_kwargs["project_id"] = project_id
            if space_id:
                provider_kwargs["space_id"] = space_id

            return WatsonxLLMClientOutputVal(**provider_kwargs)

        elif provider == "openai":
            # Imported here as this is the only branch using it
            from llmevalkit.llm import get_llm

            MetricsClientCls = get_llm("openai.async")
            return MetricsClientCls(model_name=model_id)

        elif provider == "wxo_ai_gateway":
            from llmevalkit.llm.providers.wxo_ai_gateway.wxo_ai_gateway import \
                WxoAIGatewayClientOutputVal

            provider_kwargs = json.loads(credentials_json)
            return WxoAIGatewayClientOutputVal(**provider_kwargs)
        else:
            raise ValueError(f"Unsupported provider: {provider}")

    def get_llm_metric_client(self):
        """
        Get or create cached LLM metrics client.

        The judge settings are reduced to strings so that they can be used
        as the key of the client cache.

        Returns:
            Cached or newly created LLM client
        """
        llm_judge = self.metric.llm_judge

        # Extract hashable parameters
        provider = llm_judge.get_model_provider()
        model_id = getattr(llm_judge.model, 'model_id', None) or ""
        project_id = getattr(llm_judge.model, 'project_id', None) or ""
        space_id = getattr(llm_judge.model, 'space_id', None) or ""

        credentials = llm_judge.model.provider.credentials.model_dump(
            exclude_none=True, exclude_unset=True
        )
        # Sorted keys give the same string for the same credentials
        credentials_json = json.dumps(credentials, sort_keys=True)

        # Call cached method with hashable parameters
        return ToolCallMetricProvider._create_client_impl(
            provider, model_id, project_id, space_id, credentials_json
        )

    def extract_parameter_info(self, data, metric_mapping_name):
        """
        Extract parameter metrics into a list

        Args:
            data (dict): Response data to be extracted
            metric_mapping_name (str): Metric mapping name

        Returns:
            dict: Dictionary with "is_issue" (True when any parameter has an issue)
            and "raw_response" (list of parameter based explanations)
        """
        result = {
            "is_issue": False,
            "raw_response": []
        }

        for param_name, param_data in data.get("parameter", {}).items():
            metrics = get(param_data, f"metrics.{metric_mapping_name}")
            if not metrics:
                # This metric was not reported for the parameter
                continue
            raw_response = metrics['raw_response']
            is_issue = metrics.get('is_issue', False)

            if is_issue:
                result["is_issue"] = True

            param_info = {
                "parameter": param_name,
                "explanation": raw_response['explanation'],
                "evidence": raw_response['evidence'],
                "output": raw_response['output'],
                "confidence": raw_response['confidence'],
                "correction": raw_response['correction'],
                "is_issue": is_issue
            }

            result["raw_response"].append(param_info)

        return result
@@ -0,0 +1,10 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from .core.tool_loader import load_tool
@@ -0,0 +1,11 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from ..entities.ai_tools import ToolRegistrationPayload, ToolUpdatePayload
10
+ from .ai_tool_client import (delete_tool, delete_tool_with_name, get_tool,
11
+ get_tool_info, list_tools, register_tool)