ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,123 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ import textstat
14
+ from pydantic import Field
15
+
16
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
17
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
18
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
19
+ RecordMetricResult)
20
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
21
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
22
+ from ibm_watsonx_gov.utils.python_utils import replace_none_with_empty_string
23
+ from ibm_watsonx_gov.utils.validation_util import validate_output
24
+
25
# Metric identity constants shared by TextReadingEaseMetric and its
# per-record result model, so both always report the same
# name / display name / provider / method strings.
TEXT_READING_EASE = "text_reading_ease"
TEXT_READING_EASE_DISPLAY_NAME = "Text Reading Ease"
FLESCH_READING_EASE = "flesch_reading_ease"
TEXTSTAT = "textstat"
29
+
30
+
31
class TextReadingEaseResult(RecordMetricResult):
    """Per-record result row for the Text Reading Ease metric.

    Pins the metric identity fields (name, display name, provider, method)
    to fixed defaults so every record produced by the metric reports a
    consistent identity; record id, value, thresholds and group are
    supplied by the caller via the inherited RecordMetricResult fields.
    """
    name: str = TEXT_READING_EASE
    display_name: str = TEXT_READING_EASE_DISPLAY_NAME
    provider: str = TEXTSTAT
    method: str = FLESCH_READING_EASE
36
+
37
+
38
class TextReadingEaseMetric(GenAIMetric):
    """
    Defines the Text Reading Ease metric class.

    The Text Reading Ease metric measures how readable the text is.
    It is computed using the flesch_reading_ease method.
    The score ranges broadly from 0 to 100, where a higher score indicates that a text is easier to read

    Examples:
        1. Create Text Reading Ease metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = TextReadingEaseMetric()
                result = MetricsEvaluator().evaluate(data={"generated_text": "..."},
                                                     metrics=[metric])

        2. Create Text Reading Ease metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=70)
                metric = TextReadingEaseMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["text_reading_ease"],
                    Field(title="name",
                          description="The text reading ease metric name.",
                          default=TEXT_READING_EASE, frozen=True)]
    display_name: Annotated[Literal["Text Reading Ease"],
                            Field(title="Display Name",
                                  description="The text reading ease metric display name.",
                                  default=TEXT_READING_EASE_DISPLAY_NAME, frozen=True)]
    method: Annotated[Literal["flesch_reading_ease"],
                      Field(title="Method",
                            description="The method used to compute text reading ease metric.",
                            default=FLESCH_READING_EASE)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=70)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.READABILITY, frozen=True)]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> AggregateMetricResult:
        """Compute the Text Reading Ease metric for every record in ``data``.

        Args:
            data: Dataframe containing the generated-text column named by
                ``configuration.output_fields[0]`` and the record id column
                named by ``configuration.record_id_field``.
            configuration: Evaluation configuration identifying those columns.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            AggregateMetricResult: Aggregate min/max/mean summary with the
            per-record ``TextReadingEaseResult`` rows attached.

        Note:
            The original signature was annotated ``-> list[AggregateMetricResult]``
            while a single ``AggregateMetricResult`` object was returned; the
            annotation has been corrected to match the actual runtime behavior.
        """
        # Imported lazily to avoid a circular import at module load time.
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        # Fail fast if the configured output column is missing from the data.
        validate_output(data.columns.to_list(), configuration)
        predictions = data[configuration.output_fields[0]].to_list()
        record_ids = data[configuration.record_id_field].to_list()
        # textstat cannot score None values; replaced in place with "".
        replace_none_with_empty_string(predictions)

        all_scores = self._compute(predictions=predictions)
        record_level_metrics = [
            TextReadingEaseResult(record_id=record_id,
                                  value=score, thresholds=self.thresholds,
                                  group=MetricGroup.READABILITY.value)
            for score, record_id in zip(all_scores, record_ids)
        ]
        summary = get_summaries(all_scores)
        aggregate_metric_result = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            provider=TEXTSTAT,
            method=self.method,
            group=self.group,
            min=summary.get("min"),
            max=summary.get("max"),
            mean=summary.get("mean"),
            # The aggregate value is the mean of the per-record scores.
            value=summary.get("mean"),
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            thresholds=self.thresholds,
        )

        return aggregate_metric_result

    def _compute(self, predictions: list) -> list:
        """Return the Flesch reading-ease score for each prediction."""
        return [textstat.flesch_reading_ease(pred) for pred in predictions]
File without changes
@@ -0,0 +1,67 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
19
+ from ibm_watsonx_gov.metrics.tool_call_accuracy.tool_call_accuracy_metric import \
20
+ ToolCallAccuracyMetric
21
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
22
+ ToolCallMetricProvider
23
+
24
+
25
class ToolCallAccuracyDecorator(BaseMetricDecorator):
    """Decorator class for computing the tool call accuracy metric on an agentic node."""

    def evaluate_tool_call_accuracy(self,
                                    func: Optional[Callable] = None,
                                    *,
                                    configuration: Optional[AgenticAIConfiguration] = None,
                                    metrics: Optional[list[GenAIMetric]] = None
                                    ) -> dict:
        """
        An evaluation decorator for computing tool call accuracy metric on an agentic node.

        Args:
            func (Optional[Callable]): The agentic node function being decorated.
                When omitted, a partial is returned so the decorator can be
                applied with keyword arguments.
            configuration (Optional[AgenticAIConfiguration]): Metric configuration
                for the node.
            metrics (Optional[list[GenAIMetric]]): Metrics to compute; defaults
                to ``[ToolCallAccuracyMetric()]``.

        Returns:
            The wrapped function's original result.

        Raises:
            Exception: Wrapping any error raised during metric computation.
        """
        # Support both @evaluate_tool_call_accuracy and
        # @evaluate_tool_call_accuracy(configuration=..., metrics=...) usage.
        if func is None:
            return partial(self.evaluate_tool_call_accuracy, configuration=configuration, metrics=metrics)

        # Default is None (not a mutable []) so the fallback list is built per
        # call instead of being shared across invocations.
        if not metrics:
            metrics = [ToolCallAccuracyMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(ToolCallAccuracyMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [
                    EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]

                # Convert python callables into the schema representation the
                # provider expects; guard against a missing configuration.
                if configuration is not None and isinstance(configuration.tools, list) \
                        and all(callable(item) for item in configuration.tools):
                    configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
                        configuration.tools)

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating tool call accuracy metric on {func.__name__}.") from ex

        return wrapper(func)
@@ -0,0 +1,162 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
14
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
15
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
16
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
17
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
18
+ from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
19
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
20
+ ToolCallMetricProvider
21
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
22
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
23
+ from ibm_watsonx_gov.utils.validation_util import validate_tool_calls, validate_input
24
+ from pydantic import Field
25
+
26
# Module-level logger for this metric implementation.
logger = GovSDKLogger.get_logger(__name__)
# Canonical metric identifier used as the metric's default name.
TOOL_CALL_ACCURACY = "tool_call_accuracy"
# Detector risk name passed to the granite_guardian method.
FUNCTION_CALL = "function_call"
29
+
30
+
31
class ToolCallAccuracyMetric(GenAIMetric):
    """
    ToolCallAccuracyMetric checks whether the tool call in the LLM response is
    syntactically correct and semantically meaningful, given the user's query and
    the available tool definitions.

    The ToolCallAccuracyMetric can be computed using the below methods:

    1. syntactic (default)
    2. granite_guardian

    Examples:
        1. Create ToolCallAccuracyMetric by passing the basic configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools = [get_weather,fetch_stock_price])
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")

                metrics = [ToolCallAccuracyMetric()]
                result = evaluator.evaluate(data=df, metrics=metrics)

        2. Create ToolCallAccuracyMetric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.8)
                metric = ToolCallAccuracyMetric(threshold=threshold)

        3. Create ToolCallAccuracyMetric by passing custom tool calls field in configuration.
            .. code-block:: python

                test_data = {"input_text": "What's the latest on Tesla today?",
                             "tools_used":[{"name": "get_weather", "args": {"location": "Tesla"}, "id": "0724", "type": "tool_call"}]}

                config = GenAIConfiguration(tools = [get_weather,fetch_stock_price],
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                metrics = [ToolCallAccuracyMetric()]
                result = evaluator.evaluate(data=test_data, metrics=metrics)

        4. Create ToolCallAccuracyMetric by passing a list of dictionary items as tools field in configuration.
            .. code-block:: python

                available_tools = [{"type":"function","function":{"name":"f1_name","description":"f1_description.","parameters":{"parameter1":{"description":"parameter_description","type":"parameter_type","default":"default_value"}}}}]
                config = GenAIConfiguration(tools = available_tools,
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")

                metrics = [ToolCallAccuracyMetric()]
                result = evaluator.evaluate(data=df, metrics=metrics)
    """

    # Canonical metric name; constrained by Literal and defaulted.
    name: Annotated[Literal["tool_call_accuracy"], Field(title="Metric Name",
                                                         description="The tool call accuracy metric name.",
                                                         default=TOOL_CALL_ACCURACY)]
    # Human-readable name; frozen so it cannot diverge from the metric.
    display_name: Annotated[Literal["Tool Call Accuracy"], Field(title="Display Name",
                                                                 description="The tool call accuracy metric display name.",
                                                                 default="Tool Call Accuracy", frozen=True)]
    tasks: Annotated[list[TaskType], Field(title="Task Type",
                                           description="The generative task type.",
                                           default=[TaskType.RAG])]
    group: Annotated[MetricGroup, Field(
        default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]

    # Literal restricts the computation method to the two supported values,
    # so the branches in evaluate_async below are exhaustive.
    method: Annotated[Literal["syntactic", "granite_guardian"], Field(title="Computation Method",
                                                                      description="The method used to compute the metric.",
                                                                      default="syntactic")]
    thresholds: Annotated[list[MetricThreshold], Field(title="Metric threshold",
                                                       description="Value that defines the violation limit for the metric",
                                                       default=[MetricThreshold(
                                                           type="lower_limit", value=0.7)]
                                                       )]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs
    ) -> AggregateMetricResult | None:
        """
        Asynchronously compute the tool call accuracy metric.

        Args:
            data (pd.DataFrame): Data to be evaluated.
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration.
            **kwargs: Additional keyword arguments; ``ignore_validation_errors``
                turns validation failures into a warning + ``None`` result.

        Returns:
            The computed metric result, or None when validation failed and
            ``ignore_validation_errors`` was set.
        """
        data_cols = data.columns.to_list()

        try:
            validate_tool_calls(data_cols, configuration)
            validate_input(data_cols, configuration)
        except ValueError as ve:
            # Optionally skip instead of failing when required columns are missing.
            if kwargs.get("ignore_validation_errors"):
                message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
                logger.warning(message)
                return
            raise ve

        if self.method == "granite_guardian":
            # Delegate to the detectors provider with the function_call risk;
            # the 0.001 threshold is the detector-side cutoff, not the metric's.
            kwargs["detector_params"] = {
                "risk_name": FUNCTION_CALL, "threshold": 0.001}
            tool_call_provider = DetectorsProvider(configuration=configuration,
                                                   metric_name=self.name,
                                                   metric_display_name=self.display_name,
                                                   metric_method=self.method,
                                                   metric_group=self.group,
                                                   thresholds=self.thresholds,
                                                   **kwargs)
            metric_result = await tool_call_provider.evaluate_async(data=data)
        elif self.method == "syntactic":
            tool_call_provider = ToolCallMetricProvider(
                configuration=configuration, metric=self)

            # Compute the metrics
            metric_result = await tool_call_provider.compute_metrics(data)
        return metric_result

    def evaluate(self, data: pd.DataFrame | dict,
                 configuration: GenAIConfiguration | AgenticAIConfiguration,
                 **kwargs):
        """
        Evaluate the data for ToolCallAccuracyMetric
        Args:
            data (pd.DataFrame | dict): Data to be evaluated
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration
            **kwargs: Additional keyword arguments

        Returns:
            AggregateMetricResult: The computed metrics
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
@@ -0,0 +1,68 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
13
+ AgenticAIConfiguration
14
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
15
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
16
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
17
+ from ibm_watsonx_gov.metrics.tool_call_parameter_accuracy.tool_call_parameter_accuracy_metric import \
18
+ ToolCallParameterAccuracyMetric
19
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
20
+ ToolCallMetricProvider
21
+ from wrapt import decorator
22
+
23
+
24
class ToolCallParameterAccuracyDecorator(BaseMetricDecorator):
    """Decorator class for computing the tool call parameter accuracy metric on an agentic node."""

    def evaluate_tool_call_parameter_accuracy(self,
                                              func: Optional[Callable] = None,
                                              *,
                                              configuration: Optional[AgenticAIConfiguration] = None,
                                              metrics: Optional[list[GenAIMetric]] = None
                                              ) -> dict:
        """
        An evaluation decorator for computing tool call parameter accuracy metric on an agentic node.

        Args:
            func (Optional[Callable]): The agentic node function being decorated.
                When omitted, a partial is returned so the decorator can be
                applied with keyword arguments.
            configuration (Optional[AgenticAIConfiguration]): Metric configuration
                for the node.
            metrics (Optional[list[GenAIMetric]]): Metrics to compute; defaults
                to ``[ToolCallParameterAccuracyMetric()]``.

        Returns:
            The wrapped function's original result.

        Raises:
            Exception: Wrapping any error raised during metric computation.
        """
        # Support both bare and parameterized decorator usage.
        if func is None:
            return partial(self.evaluate_tool_call_parameter_accuracy, configuration=configuration, metrics=metrics)

        # Default is None (not a mutable []) so the fallback list is built per
        # call instead of being shared across invocations.
        if not metrics:
            metrics = [ToolCallParameterAccuracyMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(ToolCallParameterAccuracyMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [
                    EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]

                # Convert python callables into the schema representation the
                # provider expects; guard against a missing configuration.
                if configuration is not None and isinstance(configuration.tools, list) \
                        and all(callable(item) for item in configuration.tools):
                    configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
                        configuration.tools)

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating tool call parameter accuracy metric on {func.__name__}.") from ex

        return wrapper(func)
@@ -0,0 +1,151 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from pydantic import Field
14
+
15
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
17
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
18
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
19
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
20
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
21
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
22
+ ToolCallMetricProvider
23
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
24
+ from ibm_watsonx_gov.utils.validation_util import validate_tool_calls
25
+
26
# Canonical metric identifier used as the metric's default name.
TOOL_CALL_PARAMETER_ACCURACY = "tool_call_parameter_accuracy"
27
+
28
+
29
class ToolCallParameterAccuracyMetric(GenAIMetric):
    """
    ToolCallParameterAccuracyMetric assesses whether ALL parameter values
    in a function call are directly supported by conversation history
    or API specifications. Identifies hallucinated values, missing information,
    format errors, and contradictory values.

    The ToolCallParameterAccuracyMetric will be computed using llm_as_judge.

    Examples:
        1. Create ToolCallParameterAccuracyMetric by passing the basic configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools = [get_weather,fetch_stock_price])
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                metrics = [ToolCallParameterAccuracyMetric(llm_judge=llm_judge)]
                result = evaluator.evaluate(data=df, metrics=metrics)

        2. Create ToolCallParameterAccuracyMetric by passing custom tool calls field in configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools = [get_weather,fetch_stock_price],
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                metrics = [ToolCallParameterAccuracyMetric(llm_judge=llm_judge)]
                result = evaluator.evaluate(data=df, metrics=metrics)

        3. Create ToolCallParameterAccuracyMetric with a custom threshold.
            .. code-block:: python

                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                threshold = MetricThreshold(type="upper_limit", value=0.8)
                metric = ToolCallParameterAccuracyMetric(llm_judge=llm_judge, threshold=threshold)
    """

    # Canonical metric name; constrained by Literal and defaulted.
    name: Annotated[Literal["tool_call_parameter_accuracy"], Field(title="Metric Name",
                                                                   description="The name of metric.",
                                                                   default=TOOL_CALL_PARAMETER_ACCURACY)]
    display_name: Annotated[Literal["Tool Call Parameter Accuracy"], Field(title="Display Name",
                                                                           description="The tool call parameter accuracy metric display name.",
                                                                           default="Tool Call Parameter Accuracy", frozen=True)]
    tasks: Annotated[list[TaskType], Field(title="Task Type",
                                           description="The generative task type.",
                                           default=[TaskType.RAG])]
    group: Annotated[MetricGroup, Field(
        default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]

    # Judge model used for the llm_as_judge computation; None until provided.
    llm_judge: Annotated[LLMJudge | None, Field(
        description="The LLM judge used to compute the metric.", default=None)]

    method: Annotated[Literal["llm_as_judge"], Field(title="Computation Method",
                                                     description="The method used to compute the metric.",
                                                     default="llm_as_judge")]
    thresholds: Annotated[list[MetricThreshold], Field(title="Metric threshold",
                                                       description="Value that defines the violation limit for the metric",
                                                       default=[MetricThreshold(
                                                           type="lower_limit", value=0.8)]
                                                       )]
    # Name this metric maps to inside llmevalkit.
    metric_mapping_name: Annotated[Literal["general_hallucination_check"], Field(title="Metric Mapping Name",
                                                                                 description="The mapping name of metric with llmevalkit.",
                                                                                 default="general_hallucination_check")]

    async def evaluate_async(self, data: pd.DataFrame | dict,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult:
        """
        Evaluate the data for ToolCallParameterAccuracyMetric
        Args:
            data (pd.DataFrame | dict): Data to be evaluated
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration
            **kwargs: Additional keyword arguments

        Returns:
            AggregateMetricResult: The computed metrics
        """
        # NOTE(review): data.columns assumes a DataFrame even though the
        # annotation also accepts dict — confirm callers convert dicts upstream.
        data_cols = data.columns.to_list()
        validate_tool_calls(data_cols, configuration)

        tool_call_provider = ToolCallMetricProvider(
            configuration=configuration, metric=self)
        # llmevalkit configuration: run only the general hallucination check;
        # function/parameter level metrics are disabled.
        metric_config = {
            "general_metrics": [self.metric_mapping_name],
            "function_metrics": None,
            "parameter_metrics": None,
            "transform_enabled": False
        }
        metric_result = await tool_call_provider.compute_metrics(
            data, syntactic_only=False, metric_result_mapping_name="general", **metric_config)

        return metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ):
        """
        Synchronously evaluate the data for ToolCallParameterAccuracyMetric.

        Args:
            data (pd.DataFrame | dict): Data to be evaluated
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration
            **kwargs: Additional keyword arguments

        Returns:
            AggregateMetricResult: The computed metrics
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
@@ -0,0 +1,71 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
13
+ AgenticAIConfiguration
14
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
15
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
16
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
17
+ from ibm_watsonx_gov.metrics.tool_call_relevance.tool_call_relevance_metric import \
18
+ ToolCallRelevanceMetric
19
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
20
+ ToolCallMetricProvider
21
+ from ibm_watsonx_gov.utils.python_utils import parse_functions_to_openai_schema
22
+ from wrapt import decorator
23
+
24
+
25
class ToolCallRelevanceDecorator(BaseMetricDecorator):
    """Decorator class for computing the tool call relevance metric on an agentic node."""

    def evaluate_tool_call_relevance(self,
                                     func: Optional[Callable] = None,
                                     *,
                                     configuration: Optional[AgenticAIConfiguration] = None,
                                     metrics: Optional[list[GenAIMetric]] = None
                                     ) -> dict:
        """
        An evaluation decorator for computing tool call relevance metric on an agentic node.

        Args:
            func (Optional[Callable]): The agentic node function being decorated.
                When omitted, a partial is returned so the decorator can be
                applied with keyword arguments.
            configuration (Optional[AgenticAIConfiguration]): Metric configuration
                for the node.
            metrics (Optional[list[GenAIMetric]]): Metrics to compute; defaults
                to ``[ToolCallRelevanceMetric()]``.

        Returns:
            The wrapped function's original result.

        Raises:
            Exception: Wrapping any error raised during metric computation.
        """
        # Support both bare and parameterized decorator usage.
        if func is None:
            return partial(self.evaluate_tool_call_relevance, configuration=configuration, metrics=metrics)

        # Build the default metric lazily: a default of [ToolCallRelevanceMetric()]
        # in the signature would be instantiated once at class-definition time
        # and shared (mutable default) across every decorated function.
        if not metrics:
            metrics = [ToolCallRelevanceMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(ToolCallRelevanceMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [
                    EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]

                # Convert python callables into the schema representation the
                # provider expects; guard against a missing configuration.
                if configuration is not None and isinstance(configuration.tools, list) \
                        and all(callable(item) for item in configuration.tools):
                    configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
                        configuration.tools)

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating tool call relevance metric on {func.__name__}.") from ex

        return wrapper(func)