ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/evaluators/agentic_evaluator.py
@@ -0,0 +1,2725 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ import os
+ import time
+ from pathlib import Path
+ from threading import Lock
+ from typing import Annotated, Callable, List, Optional, Set
+ from uuid import uuid4
+
+ from pydantic import Field, PrivateAttr
+
+ from ibm_watsonx_gov.ai_experiments.ai_experiments_client import \
+     AIExperimentsClient
+ from ibm_watsonx_gov.config import AgenticAIConfiguration
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
+     TracingConfiguration
+ from ibm_watsonx_gov.entities import ai_experiment as ai_experiment_entity
+ from ibm_watsonx_gov.entities.agentic_app import AgenticApp, Node
+ from ibm_watsonx_gov.entities.agentic_evaluation_result import \
+     AgenticEvaluationResult
+ from ibm_watsonx_gov.entities.ai_evaluation import AIEvaluationAsset
+ from ibm_watsonx_gov.entities.ai_experiment import (AIExperiment,
+                                                     AIExperimentRun,
+                                                     AIExperimentRunRequest)
+ from ibm_watsonx_gov.entities.evaluation_result import AgentMetricResult
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
+ from ibm_watsonx_gov.metric_groups.answer_quality.answer_quality_decorator import \
+     AnswerQualityDecorator
+ from ibm_watsonx_gov.metric_groups.content_safety.content_safety_decorator import \
+     ContentSafetyDecorator
+ from ibm_watsonx_gov.metric_groups.readability.readability_decorator import \
+     ReadabilityDecorator
+ from ibm_watsonx_gov.metric_groups.retrieval_quality.retrieval_quality_decorator import \
+     RetrievalQualityDecorator
+ from ibm_watsonx_gov.metrics.answer_relevance.answer_relevance_decorator import \
+     AnswerRelevanceDecorator
+ from ibm_watsonx_gov.metrics.answer_similarity.answer_similarity_decorator import \
+     AnswerSimilarityDecorator
+ from ibm_watsonx_gov.metrics.average_precision.average_precision_decorator import \
+     AveragePrecisionDecorator
+ from ibm_watsonx_gov.metrics.context_relevance.context_relevance_decorator import \
+     ContextRelevanceDecorator
+ from ibm_watsonx_gov.metrics.evasiveness.evasiveness_decorator import \
+     EvasivenessDecorator
+ from ibm_watsonx_gov.metrics.faithfulness.faithfulness_decorator import \
+     FaithfulnessDecorator
+ from ibm_watsonx_gov.metrics.hap.hap_decorator import HAPDecorator
+ from ibm_watsonx_gov.metrics.harm.harm_decorator import HarmDecorator
+ from ibm_watsonx_gov.metrics.harm_engagement.harm_engagement_decorator import \
+     HarmEngagementDecorator
+ from ibm_watsonx_gov.metrics.hit_rate.hit_rate_decorator import \
+     HitRateDecorator
+ from ibm_watsonx_gov.metrics.jailbreak.jailbreak_decorator import \
+     JailbreakDecorator
+ from ibm_watsonx_gov.metrics.keyword_detection.keyword_detection_decorator import \
+     KeywordDetectionDecorator
+ from ibm_watsonx_gov.metrics.ndcg.ndcg_decorator import NDCGDecorator
+ from ibm_watsonx_gov.metrics.pii.pii_decorator import PIIDecorator
+ from ibm_watsonx_gov.metrics.profanity.profanity_decorator import \
+     ProfanityDecorator
+ from ibm_watsonx_gov.metrics.prompt_safety_risk.prompt_safety_risk_decorator import \
+     PromptSafetyRiskDecorator
+ from ibm_watsonx_gov.metrics.reciprocal_rank.reciprocal_rank_decorator import \
+     ReciprocalRankDecorator
+ from ibm_watsonx_gov.metrics.regex_detection.regex_detection_decorator import \
+     RegexDetectionDecorator
+ from ibm_watsonx_gov.metrics.retrieval_precision.retrieval_precision_decorator import \
+     RetrievalPrecisionDecorator
+ from ibm_watsonx_gov.metrics.sexual_content.sexual_content_decorator import \
+     SexualContentDecorator
+ from ibm_watsonx_gov.metrics.social_bias.social_bias_decorator import \
+     SocialBiasDecorator
+ from ibm_watsonx_gov.metrics.text_grade_level.text_grade_level_decorator import \
+     TextGradeLevelDecorator
+ from ibm_watsonx_gov.metrics.text_reading_ease.text_reading_ease_decorator import \
+     TextReadingEaseDecorator
+ from ibm_watsonx_gov.metrics.tool_call_accuracy.tool_call_accuracy_decorator import \
+     ToolCallAccuracyDecorator
+ from ibm_watsonx_gov.metrics.tool_call_parameter_accuracy.tool_call_parameter_accuracy_decorator import \
+     ToolCallParameterAccuracyDecorator
+ from ibm_watsonx_gov.metrics.tool_call_relevance.tool_call_relevance_decorator import \
+     ToolCallRelevanceDecorator
+ from ibm_watsonx_gov.metrics.tool_call_syntactic_accuracy.tool_call_syntactic_accuracy_decorator import \
+     ToolCallSyntacticAccuracyDecorator
+ from ibm_watsonx_gov.metrics.topic_relevance.topic_relevance_decorator import \
+     TopicRelevanceDecorator
+ from ibm_watsonx_gov.metrics.unethical_behavior.unethical_behavior_decorator import \
+     UnethicalBehaviorDecorator
+ from ibm_watsonx_gov.metrics.unsuccessful_requests.unsuccessful_requests_decorator import \
+     UnsuccessfulRequestsDecorator
+ from ibm_watsonx_gov.metrics.violence.violence_decorator import \
+     ViolenceDecorator
+ from ibm_watsonx_gov.traces.span_util import get_attributes
+ from ibm_watsonx_gov.traces.trace_utils import TraceUtils
+ from ibm_watsonx_gov.utils.aggregation_util import \
+     get_agentic_evaluation_result
+ from ibm_watsonx_gov.utils.async_util import (gather_with_concurrency,
+                                               run_in_event_loop)
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
+ from ibm_watsonx_gov.utils.python_utils import add_if_unique
+ from ibm_watsonx_gov.utils.singleton_meta import SingletonMeta
+
+ try:
+     from ibm_watsonx_gov.traces.span_exporter import WxGovSpanExporter
+ except Exception:
+     pass
+
+ logger = GovSDKLogger.get_logger(__name__)
+ PROCESS_TRACES = True
+
+
+ try:
+     from ibm_agent_analytics.instrumentation import agent_analytics_sdk
+     from ibm_agent_analytics.instrumentation.configs import OTLPCollectorConfig
+     from ibm_agent_analytics.instrumentation.utils import get_current_trace_id
+ except ImportError as e:
+     logger.warning(str(e))
+     PROCESS_TRACES = False
+
+
+ update_lock = Lock()
+ TRACE_LOG_FILE_NAME = os.getenv(
+     "TRACE_LOG_FILE_NAME", f"experiment_traces_{str(uuid4())}")
+ TRACE_LOG_FILE_PATH = os.getenv("TRACE_LOG_FILE_PATH", "./wxgov_traces")
+
+ AI_SERVICE_QUALITY = "ai_service_quality"
+ CUSTOM_METRICS = "custom_metrics"
+ MAX_CONCURRENCY = 10
+ AGENTIC_RESULT_COMPONENTS = ["conversation", "message", "node"]
+
+
+ class AgenticEvaluator(BaseEvaluator, metaclass=SingletonMeta):
+     """
+     The class to evaluate an agentic application.
+
+     Examples:
+         1. Evaluate an agent with default parameters. This computes only the performance (latency, duration) and usage (cost, input_token_count, output_token_count) metrics.
+             .. code-block:: python
+
+                 agentic_evaluator = AgenticEvaluator()
+                 agentic_evaluator.start_run()
+                 # Invoke the agentic application
+                 agentic_evaluator.end_run()
+                 result = agentic_evaluator.get_result()
+
+         2. Evaluate an agent by specifying the agent or message level metrics and the node level metrics, which are computed after graph invocation when end_run() is called.
+             .. code-block:: python
+
+                 # The example below provides the node configuration to compute the ContextRelevanceMetric and all the Retrieval Quality group metrics.
+                 nodes = [Node(name="Retrieval Node",
+                               metrics_configurations=[MetricsConfiguration(metrics=[ContextRelevanceMetric()],
+                                                                             metric_groups=[MetricGroup.RETRIEVAL_QUALITY])])]
+                 # Please refer to the MetricsConfiguration class for advanced usage where the field details can be specified, in case the graph state attributes have non-default names.
+
+                 # The example below provides the agent configuration to compute the AnswerRelevanceMetric and all the Content Safety group metrics on the agent or message level.
+                 agentic_app = AgenticApp(name="Agentic App",
+                                          metrics_configuration=MetricsConfiguration(metrics=[AnswerRelevanceMetric()],
+                                                                                     metric_groups=[MetricGroup.CONTENT_SAFETY]),
+                                          nodes=nodes)
+
+                 agentic_evaluator = AgenticEvaluator(agentic_app=agentic_app)
+                 agentic_evaluator.start_run()
+                 # Invoke the agentic application
+                 agentic_evaluator.end_run()
+                 result = agentic_evaluator.get_result()
+
+         3. Evaluate an agent by specifying the agent or message level metrics and using decorators to compute node level metrics during graph invocation.
+             .. code-block:: python
+
+                 # The example below provides the agent configuration to compute the AnswerRelevanceMetric and all the Content Safety group metrics on the agent or message level.
+                 # Agent or message level metrics will be computed post graph invocation when end_run() is called.
+                 agentic_app = AgenticApp(name="Agentic App",
+                                          metrics_configuration=MetricsConfiguration(metrics=[AnswerRelevanceMetric()],
+                                                                                     metric_groups=[MetricGroup.CONTENT_SAFETY]))
+
+                 agentic_evaluator = AgenticEvaluator(agentic_app=agentic_app)
+
+                 # Add decorators when defining the node functions
+                 @agentic_evaluator.evaluate_retrieval_quality(configuration=AgenticAIConfiguration(**{"input_fields": ["input_text"], "context_fields": ["local_context"]}))
+                 @agentic_evaluator.evaluate_content_safety()  # Here the default AgenticAIConfiguration is used
+                 def local_search_node(state: GraphState, config: RunnableConfig) -> dict:
+                     # Retrieve data from vector db
+                     # ...
+                     return {"local_context": []}
+
+                 agentic_evaluator.start_run()
+                 # Invoke the agentic application
+                 agentic_evaluator.end_run()
+                 result = agentic_evaluator.get_result()
+
+         4. Evaluate an agent with experiment tracking
+             .. code-block:: python
+
+                 tracing_config = TracingConfiguration(project_id=project_id)
+                 agentic_evaluator = AgenticEvaluator(tracing_configuration=tracing_config)
+
+                 agentic_evaluator.track_experiment(name="my_experiment")
+                 agentic_evaluator.start_run(AIExperimentRunRequest(name="run1"))
+                 # Invoke the agentic application
+                 agentic_evaluator.end_run()
+                 result = agentic_evaluator.get_result()
+
+     """
+     agentic_app: Annotated[Optional[AgenticApp],
+                            Field(title="Agentic application configuration details",
+                                  description="The agentic application configuration details.",
+                                  default=None)]
+     tracing_configuration: Annotated[Optional[TracingConfiguration],
+                                      Field(title="Tracing Configuration",
+                                            description="The tracing configuration details.",
+                                            default=None)]
+     ai_experiment_client: Annotated[Optional[AIExperimentsClient],
+                                     Field(title="AI experiments client",
+                                           description="The AI experiment client object.",
+                                           default=None)]
+     max_concurrency: Annotated[int,
+                                Field(title="Max Concurrency",
+                                      description="The maximum concurrency to use for evaluating metrics.",
+                                      default=MAX_CONCURRENCY)]
+     __latest_experiment_name: Annotated[Optional[str], PrivateAttr(
+         default=None)]
+     __latest_experiment_id: Annotated[Optional[str], PrivateAttr(
+         default=None)]
+     __experiment_results: Annotated[dict,
+                                     PrivateAttr(default={})]
+     __run_results: Annotated[dict[str, AgenticEvaluationResult],
+                              PrivateAttr(default={})]
+     __online_metric_results: Annotated[list[AgentMetricResult],
+                                        PrivateAttr(default=[])]
+     """__metric_results holds the results of all the evaluations done for a particular evaluation instance."""
+     __execution_counts: Annotated[dict[str, dict[str, int]],
+                                   PrivateAttr(default={})]
+     """__execution_counts holds the execution count for a particular node, for a given record_id."""
+     __nodes_being_run: Annotated[dict[str, Set[str]],
+                                  PrivateAttr(default={})]
+     """__nodes_being_run holds the name of the current nodes being run for a given record_id. Multiple decorators can be applied on a single node using chaining. We don't want to hold multiple copies of same node here."""
+     __latest_run_name: Annotated[str, PrivateAttr(default=None)]
+     __nodes: Annotated[list[Node], PrivateAttr(default=[])]
+     __experiment_run_details: Annotated[AIExperimentRun, PrivateAttr(
+         default=None)]
+     __custom_metrics: Annotated[List[dict], PrivateAttr(default=None)]
+
+     def __init__(self, /, **data):
+         """
+         Initialize the AgenticEvaluator object and start the tracing framework.
+         """
+         super().__init__(**data)
+         # Initialize the agent analytics sdk
+         if PROCESS_TRACES:
+             tracing_params = self.__get_tracing_params(
+                 data.get("tracing_configuration"))
+
+             agent_analytics_sdk.initialize_logging(
+                 tracer_type=agent_analytics_sdk.SUPPORTED_TRACER_TYPES.CUSTOM,
+                 custom_exporter=WxGovSpanExporter(
+                     tracing_params.get("enable_local_traces"),
+                     tracing_params.get("enable_server_traces"),
+                     file_name=TRACE_LOG_FILE_NAME,
+                     storage_path=TRACE_LOG_FILE_PATH,
+                     # manually passing endpoint and timeout
+                     endpoint=tracing_params.get("endpoint"),
+                     timeout=tracing_params.get("timeout"),
+                     headers=tracing_params.get("headers"),
+                 ),
+                 new_trace_on_workflow=True,
+                 resource_attributes={
+                     "wxgov.config.agentic_app": self.agentic_app.model_dump_json(exclude_none=True) if self.agentic_app else "",
+                     **tracing_params.get("resource_attributes")
+                 },
+                 # Check: does this config have any effect on CUSTOM exporters
+                 config=OTLPCollectorConfig(
+                     **tracing_params.get("otlp_config_dict")) if tracing_params.get("otlp_config_dict") else None
+             )
+
+         self.__latest_experiment_name = "experiment_1"
+
+     def __get_tracing_params(self, tracing_config):
+         tracing_params = {
+             "enable_local_traces": True,
+             "enable_server_traces": False,
+             "endpoint": None,
+             "timeout": None,
+             "headers": None,
+             "resource_attributes": {},
+             "otlp_config_dict": {}
+         }
+
+         if tracing_config:
+             resource_attributes = tracing_config.resource_attributes
+             if tracing_config.project_id:
+                 resource_attributes["wx-project-id"] = tracing_config.project_id
+             elif tracing_config.space_id:
+                 resource_attributes["wx-space-id"] = tracing_config.space_id
+             tracing_params["resource_attributes"] = resource_attributes
+             otlp_collector_config = tracing_config.otlp_collector_config
+
+             if otlp_collector_config:
+                 tracing_params["endpoint"] = otlp_collector_config.endpoint
+                 tracing_params["timeout"] = otlp_collector_config.timeout
+                 tracing_params["headers"] = otlp_collector_config.headers
+                 tracing_params["otlp_config_dict"] = {k: v for k, v in otlp_collector_config.dict().items()
+                                                       if k != "headers"}
+                 tracing_params["enable_server_traces"] = True
+             tracing_params["enable_local_traces"] = tracing_config.log_traces_to_file
+
+         return tracing_params
+
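__get_tracing_params only reads attributes that TracingConfiguration already exposes (project_id or space_id, resource_attributes, otlp_collector_config with endpoint/timeout/headers, and log_traces_to_file). A hedged sketch of the two common setups, where the constructor keyword names are assumed from those attribute names:

.. code-block:: python

    # Local-only tracing: spans are written to the trace log file
    tracing_config = TracingConfiguration(project_id="<project_id>")

    # Server-side tracing: providing a collector config enables server traces,
    # and log_traces_to_file controls whether the local trace file is still written
    tracing_config = TracingConfiguration(
        project_id="<project_id>",
        otlp_collector_config=my_otlp_collector_config,  # hypothetical object with endpoint, timeout, headers
        log_traces_to_file=False)
    evaluator = AgenticEvaluator(tracing_configuration=tracing_config)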
+     def track_experiment(self, name: str = "experiment_1", description: str = None, use_existing: bool = True) -> str:
+         """
+         Start tracking an experiment for the metrics evaluation.
+         The experiment will be created if it doesn't exist.
+         If an existing experiment with the same name is found, it will be reused based on the use_existing flag.
+
+         Args:
+             name (string): The name of the experiment.
+             description (str): The description of the experiment.
+             use_existing (bool): The flag to specify if the experiment should be reused if an existing experiment with the given name is found.
+
+         Returns:
+             The ID of the AI experiment asset.
+         """
+         self.__latest_experiment_name = name
+         # Check if an AI experiment already exists with the given name when use_existing is enabled.
+         # If it does, reuse it; otherwise create a new AI experiment.
+         # Set the experiment_name and experiment_id
+         self.ai_experiment_client = AIExperimentsClient(
+             api_client=self.api_client,
+             project_id=self.tracing_configuration.project_id
+         )
+         ai_experiment = None
+         if use_existing:
+             ai_experiment = self.ai_experiment_client.search(name)
+
+         # If no AI experiment exists with the specified name or use_existing is False, create a new AI experiment
+         if not ai_experiment:
+             ai_experiment_details = AIExperiment(
+                 name=name,
+                 description=description or "AI experiment for Agent governance"
+             )
+             ai_experiment = self.ai_experiment_client.create(
+                 ai_experiment_details)
+
+         ai_experiment_id = ai_experiment.asset_id
+
+         # The experiment id is set only when the experiment is tracked
+         self.__latest_experiment_id = ai_experiment_id
+         self.__run_results = {}
+         return ai_experiment_id
+
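A compact sketch of the tracking flow defined above, paired with the run methods that follow; the project id and the experiment/run names are placeholders:

.. code-block:: python

    evaluator = AgenticEvaluator(tracing_configuration=TracingConfiguration(project_id="<project_id>"))
    experiment_id = evaluator.track_experiment(name="agent_experiment", use_existing=True)
    run_details = evaluator.start_run(AIExperimentRunRequest(name="baseline_run"))
    # ... invoke the agentic application ...
    evaluator.end_run()
    result = evaluator.get_result()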
+     def start_run(self, run_request: AIExperimentRunRequest = AIExperimentRunRequest(name="run_1")) -> AIExperimentRun:
+         """
+         Start a run to track the metrics computation within an experiment.
+         This method must be called before any metrics computation.
+
+         Args:
+             run_request (AIExperimentRunRequest): The run_request instance containing name, source_name, source_url, custom_tags
+
+         Returns:
+             The details of the experiment run such as id, name, and description.
+         """
+         name = run_request.name
+         self.__latest_run_name = name
+         self.__experiment_results[self.__latest_experiment_name] = self.__run_results
+         self.__start_time = time.time()
+         # Having an experiment id indicates the user is tracking experiments
+         if self.__latest_experiment_id:
+             # Create the run object
+             self.__experiment_run_details = AIExperimentRun(
+                 run_id=str(uuid4()),
+                 run_name=name,
+                 source_name=run_request.source_name,
+                 source_url=run_request.source_url,
+                 custom_tags=run_request.custom_tags,
+                 agent_method_name=run_request.agent_method_name,
+             )
+
+         return self.__experiment_run_details
+
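The run request can carry source metadata that start_run copies onto the AIExperimentRun; a hedged sketch, where the field values and the type of custom_tags are illustrative assumptions:

.. code-block:: python

    run_request = AIExperimentRunRequest(
        name="run_2",
        source_name="agent_notebook.ipynb",  # illustrative source
        source_url="https://example.com/agent_notebook.ipynb",
        custom_tags=["baseline"])
    run_details = agentic_evaluator.start_run(run_request)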
+     def log_custom_metrics(self, custom_metrics):
+         """
+         Collect the custom metrics provided by the user and append them to the metrics of the current run.
+
+         Args:
+             custom_metrics (List[Dict]): The list of custom metrics. Each metric needs 'name' and 'value' fields, and a 'node_name' field when 'applies_to' is "node".
+         """
+         required_fields = ["name", "value"]
+         is_valid = True
+         for metric in custom_metrics:
+             # Check required fields
+             for key in required_fields:
+                 if key not in metric or metric[key] in [None, ""]:
+                     is_valid = False
+
+             # Conditional check: applies_to == "node" => node_name must exist and be non-empty
+             if metric.get("applies_to") == "node":
+                 if "node_name" not in metric or metric["node_name"] in [None, ""]:
+                     is_valid = False
+
+         if not is_valid:
+             message = "Invalid custom metrics format. Each metric requires 'name' and 'value', and 'node_name' when 'applies_to' is 'node'."
+             logger.error(message)
+             raise Exception(message)
+
+         self.__custom_metrics = custom_metrics
+
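A minimal sketch of the payload this validation expects; the metric names and values are illustrative:

.. code-block:: python

    agentic_evaluator.log_custom_metrics([
        {"name": "resolution_rate", "value": 0.82},
        {"name": "retrieved_chunks", "value": 12,
         "applies_to": "node", "node_name": "Retrieval Node"},
    ])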
+     def end_run(self, track_notebook: Optional[bool] = False):
+         """
+         End a run, then collect and compute the metrics for the current run.
+
+         Args:
+             track_notebook (bool): The flag to specify whether the notebook should be stored with the current run.
+         """
+         eval_result = self.__compute_metrics_from_traces()
+         self.__run_results[self.__latest_run_name] = eval_result
+         # Having an experiment id indicates the user is tracking experiments and the run details need to be submitted
+         if self.__latest_experiment_id:
+             self.__store_run_results(track_notebook)
+
+         self.__reset_results()
+
+     def compare_ai_experiments(self,
+                                ai_experiments: List[AIExperiment] = None,
+                                ai_evaluation_details: AIEvaluationAsset = None
+                                ) -> str:
+         """
+         Creates an AI Evaluation asset to compare AI experiment runs.
+
+         Args:
+             ai_experiments (List[AIExperiment], optional):
+                 List of AI experiments to be compared. If all runs for an experiment need to be compared, then specify the runs value as an empty list for the experiment.
+             ai_evaluation_details (AIEvaluationAsset, optional):
+                 An instance of AIEvaluationAsset having details (name, description and metrics configuration)
+         Returns:
+             The href of the created AI evaluation asset.
+
+         Examples:
+             1. Create an AI evaluation with a list of experiment IDs
+
+             .. code-block:: python
+
+                 # Initialize the API client with credentials
+                 api_client = APIClient(credentials=Credentials(api_key="", url="wos_url"))
+
+                 # Create the instance of Agentic evaluator
+                 evaluator = AgenticEvaluator(api_client=api_client, tracing_configuration=TracingConfiguration(project_id=project_id))
+
+                 # [Optional] Define evaluation configuration
+                 evaluation_config = EvaluationConfig(
+                     monitors={
+                         "agentic_ai_quality": {
+                             "parameters": {
+                                 "metrics_configuration": {}
+                             }
+                         }
+                     }
+                 )
+
+                 # Create the evaluation asset
+                 ai_evaluation_details = AIEvaluationAsset(
+                     name="AI Evaluation for agent",
+                     evaluation_configuration=evaluation_config
+                 )
+
+                 # Compare two or more AI experiments using the evaluation asset
+                 ai_experiment_1 = AIExperiment(
+                     asset_id=ai_experiment_id_1,
+                     runs=[<Run1 details>, <Run2 details>]  # Run details are returned by the start_run method
+                 )
+                 ai_experiment_2 = AIExperiment(
+                     asset_id=ai_experiment_id_2,
+                     runs=[]  # Empty list means all runs for this experiment will be compared
+                 )
+                 ai_evaluation_asset_href = evaluator.compare_ai_experiments(
+                     ai_experiments=[ai_experiment_1, ai_experiment_2],
+                     ai_evaluation_details=ai_evaluation_details
+                 )
+         """
+         # If the experiment runs to be compared are not provided, use all runs from the latest tracked experiment
+         if not ai_experiments:
+             ai_experiments = [AIExperiment(
+                 asset_id=self.__latest_experiment_id, runs=[])]
+
+         # Construct the experiment_runs map
+         ai_experiment_runs = {exp.asset_id: exp.runs for exp in ai_experiments}
+
+         ai_evaluation_asset = self.ai_experiment_client.create_ai_evaluation_asset(
+             ai_experiment_runs=ai_experiment_runs,
+             ai_evaluation_details=ai_evaluation_details
+         )
+         ai_evaluation_asset_href = self.ai_experiment_client.get_ai_evaluation_asset_href(
+             ai_evaluation_asset)
+
+         return ai_evaluation_asset_href
+
+     def __compute_metrics_from_traces(self):
+         """
+         Computes the metrics using the traces collected in the log file.
+         """
+         if not PROCESS_TRACES:
+             return
+
+         trace_log_file_path = Path(
+             f"{TRACE_LOG_FILE_PATH}/{TRACE_LOG_FILE_NAME}.log")
+         spans = []
+         for span in TraceUtils.stream_trace_data(trace_log_file_path):
+             spans.append(span)
+
+         metrics_result = []
+         coros = []
+         span_trees = TraceUtils.build_span_trees(
+             spans=spans, agentic_app=self.agentic_app)
+         for span_tree in span_trees:
+             # Process only the spans that are associated with the agent application
+             attrs = get_attributes(span_tree.span.attributes, [
+                 "traceloop.span.kind"])
+             if not attrs.get("traceloop.span.kind") == "workflow":
+                 continue
+             # Append coroutine for metric computation
+             coros.append(
+                 TraceUtils.compute_metrics_from_trace_async(
+                     span_tree=span_tree,
+                     api_client=self.api_client,
+                     max_concurrency=self.max_concurrency,
+                 )
+             )
+         # Run all coroutines in parallel with concurrency control
+         results = run_in_event_loop(
+             gather_with_concurrency,
+             coros=coros,
+             max_concurrency=self.max_concurrency)
+
+         # Process results
+         for mr, ns, _ in results:
+             metrics_result.extend(mr)
+             for n in ns:
+                 add_if_unique(n, self.__nodes, ["name", "func_name"], [
+                     "foundation_models"])
+
+         return get_agentic_evaluation_result(
+             metrics_result=metrics_result, nodes=self.__nodes)
+
+     def __store_run_results(self, track_notebook: Optional[bool] = False):
+
+         aggregated_results = self.get_result().get_aggregated_metrics_results()
+         # Fetch the node details to update in the experiment run
+         nodes = []
+         for node in self.get_nodes():
+             nodes.append(ai_experiment_entity.Node(
+                 id=node.func_name, name=node.name, foundation_models=set(node.foundation_models)))
+         self.__experiment_run_details.nodes = nodes
+         # Duration of run in seconds
+         self.__experiment_run_details.duration = int(
+             time.time() - self.__start_time)
+
+         # Store the run result as an attachment and update the run info in the AI experiment
+         # Todo - keeping the List[AggregateAgentMetricResult] - is that compatible? should store full AgenticEvaluationResult?
+         evaluation_result = {
+             AI_SERVICE_QUALITY: aggregated_results
+         }
+         # Add custom metrics, if they exist
+         if self.__custom_metrics:
+             evaluation_result[CUSTOM_METRICS] = self.__custom_metrics
+
+         self.ai_experiment_client.update(
+             self.__latest_experiment_id,
+             self.__experiment_run_details,
+             evaluation_result,
+             track_notebook,
+         )
+
+     def get_nodes(self) -> list[Node]:
+         """
+         Get the list of nodes used in the agentic application.
+
+         Returns:
+             nodes (list[Node]): The list of nodes used in the agentic application.
+         """
+         return self.__nodes
+
+     def get_result(self, run_name: Optional[str] = None) -> AgenticEvaluationResult:
+         """
+         Get the AgenticEvaluationResult for a run. By default the result for the latest run is returned.
+         Specify the run name to get the result for a specific run.
+
+         Args:
+             run_name (string): The evaluation run name
+         Returns:
+             agentic_evaluation_result (AgenticEvaluationResult): The AgenticEvaluationResult object for the run.
+         """
+         if run_name:
+             result = self.__run_results.get(run_name)
+         else:
+             result = self.__run_results.get(self.__latest_run_name)
+
+         return result
+
+     def get_metric_result(self, metric_name: str, node_name: str) -> AgentMetricResult:
+         """
+         Get the AgentMetricResult for the given metric and node name.
+         This is used to get the result of a metric computed during agent execution.
+
+         Args:
+             metric_name (string): The metric name
+             node_name (string): The node name
+         Returns:
+             agent_metric_result (AgentMetricResult): The AgentMetricResult object for the metric.
+         """
+         for metric in self.__online_metric_results:
+             if metric.applies_to == "node" and metric.name == metric_name \
+                     and metric.node_name == node_name and metric.message_id == get_current_trace_id():
+                 return metric
+
+         return None
+
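A short usage sketch for the two getters above; the run and node names are illustrative, and the metric name string is assumed to match the identifier used in the stored results:

.. code-block:: python

    # Inside a decorated node: read a metric computed in real time for the current message
    cr_result = agentic_evaluator.get_metric_result("context_relevance", "Retrieval Node")

    # After end_run(): fetch the result of a specific run and aggregate it
    run_result = agentic_evaluator.get_result(run_name="run_1")
    aggregated = run_result.get_aggregated_metrics_results()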
+     def __reset_results(self):
+         self.__online_metric_results.clear()
+         self.__execution_counts.clear()
+         self.__nodes_being_run.clear()
+         trace_log_file_path = Path(
+             f"{TRACE_LOG_FILE_PATH}/{TRACE_LOG_FILE_NAME}.log")
+         if os.path.exists(trace_log_file_path):
+             os.remove(trace_log_file_path)
+
+     def evaluate_context_relevance(self,
+                                    func: Optional[Callable] = None,
+                                    *,
+                                    configuration: Optional[AgenticAIConfiguration] = None,
+                                    metrics: list[GenAIMetric] = [],
+                                    compute_real_time: Optional[bool] = True) -> dict:
+         """
+         An evaluation decorator for computing the context relevance metric on an agentic node.
+
+         For more details, see :class:`ibm_watsonx_gov.metrics.ContextRelevanceMetric`
+
+         Args:
+             func (Optional[Callable], optional): The node on which the metric is to be computed.
+             configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+             metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ContextRelevanceMetric() ].
+             compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
+
+         Raises:
+             Exception: If there is any error during evaluation.
+
+         Returns:
+             dict: The result of the wrapped node.
+
+         Examples:
+             1. Basic usage
+                 .. code-block:: python
+
+                     evaluator = AgenticEvaluator()
+                     @evaluator.evaluate_context_relevance
+                     def agentic_node(*args, **kwargs):
+                         pass
+
+             2. Usage with different thresholds and methods
+                 .. code-block:: python
+
+                     metric_1 = ContextRelevanceMetric(
+                         method="sentence_bert_bge", thresholds=MetricThreshold(type="lower_limit", value=0.5))
+                     metric_2 = ContextRelevanceMetric(
+                         method="sentence_bert_mini_lm", thresholds=MetricThreshold(type="lower_limit", value=0.6))
+                     metric_3 = ContextRelevanceMetric(
+                         method="granite_guardian", thresholds=MetricThreshold(type="lower_limit", value=0.6))
+                     evaluator = AgenticEvaluator()
+                     @evaluator.evaluate_context_relevance(metrics=[metric_1, metric_2, metric_3])
+                     def agentic_node(*args, **kwargs):
+                         pass
+         """
+         return ContextRelevanceDecorator(api_client=self.api_client,
+                                          configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                                          metric_results=self.__online_metric_results,
+                                          execution_counts=self.__execution_counts,
+                                          nodes_being_run=self.__nodes_being_run,
+                                          lock=update_lock,
+                                          compute_real_time=compute_real_time).evaluate_context_relevance(func, configuration=configuration, metrics=metrics)
+
+ def evaluate_average_precision(self,
689
+ func: Optional[Callable] = None,
690
+ *,
691
+ configuration: Optional[AgenticAIConfiguration] = None,
692
+ metrics: list[GenAIMetric] = [],
693
+ compute_real_time: Optional[bool] = True) -> dict:
694
+ """
695
+ An evaluation decorator for computing average precision metric on an agentic tool.
696
+ This metric uses context relevance values for computation, context relevance metric would be computed as a prerequisite.
697
+
698
+ For more details, see :class:`ibm_watsonx_gov.metrics.AveragePrecisionMetric`
699
+
700
+ Args:
701
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
702
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
703
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ AveragePrecisionMetric() ].
704
+
705
+ Raises:
706
+ Exception: If there is any error while evaluation.
707
+
708
+ Returns:
709
+ dict: The result of the wrapped tool.
710
+
711
+ Example:
712
+ 1. Basic usage
713
+ .. code-block:: python
714
+
715
+ evaluator = AgenticEvaluator()
716
+ @evaluator.evaluate_average_precision
717
+                def agentic_tool(*args, **kwargs):
718
+ pass
719
+
720
+ 2. Usage with different thresholds and methods
721
+ .. code-block:: python
722
+
723
+ metric_1 = AveragePrecisionMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
724
+ metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
725
+
726
+ evaluator = AgenticEvaluator()
727
+ @evaluator.evaluate_average_precision(metrics=[metric_1, metric_2])
728
+                def agentic_tool(*args, **kwargs):
729
+ pass
730
+ """
731
+ return AveragePrecisionDecorator(api_client=self.api_client,
732
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
733
+ metric_results=self.__online_metric_results,
734
+ execution_counts=self.__execution_counts,
735
+ nodes_being_run=self.__nodes_being_run,
736
+ lock=update_lock,
737
+ compute_real_time=compute_real_time).evaluate_average_precision(func, configuration=configuration, metrics=metrics)
738
+
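As a toy illustration of what the average precision aggregation does with per-context relevance judgments (the standard definition; the library's exact handling of relevance-score thresholds is not shown in this diff):

.. code-block:: python

    # Binary relevance of the retrieved contexts, in rank order.
    relevant = [1, 0, 1, 1]

    # Average precision = mean of precision@k over the ranks k that hold a
    # relevant context.
    hits, precisions_at_hits = 0, []
    for k, rel in enumerate(relevant, start=1):
        if rel:
            hits += 1
            precisions_at_hits.append(hits / k)

    average_precision = sum(precisions_at_hits) / len(precisions_at_hits)
    # (1/1 + 2/3 + 3/4) / 3 ≈ 0.806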
739
+ def evaluate_ndcg(self,
740
+ func: Optional[Callable] = None,
741
+ *,
742
+ configuration: Optional[AgenticAIConfiguration] = None,
743
+ metrics: list[GenAIMetric] = [],
744
+ compute_real_time: Optional[bool] = True) -> dict:
745
+ """
746
+        An evaluation decorator for computing NDCG metric on an agentic tool.
747
+        This metric is derived from context relevance values, so the context relevance metric is computed as a prerequisite.
748
+
749
+ For more details, see :class:`ibm_watsonx_gov.metrics.NDCGMetric`
750
+
751
+ Args:
752
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
753
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
754
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ NDCGMetric() ].
755
+
756
+ Raises:
757
+ Exception: If there is any error while evaluation.
758
+
759
+ Returns:
760
+ dict: The result of the wrapped tool.
761
+
762
+ Example:
763
+ 1. Basic usage
764
+ .. code-block:: python
765
+
766
+ evaluator = AgenticEvaluator()
767
+ @evaluator.evaluate_ndcg
768
+                def agentic_tool(*args, **kwargs):
769
+ pass
770
+
771
+ 2. Usage with different thresholds and methods
772
+ .. code-block:: python
773
+
774
+ metric_1 = NDCGMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
775
+ metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
776
+
777
+ evaluator = AgenticEvaluator()
778
+ @evaluator.evaluate_ndcg(metrics=[metric_1, metric_2])
779
+                def agentic_tool(*args, **kwargs):
780
+ pass
781
+ """
782
+ return NDCGDecorator(api_client=self.api_client,
783
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
784
+ metric_results=self.__online_metric_results,
785
+ execution_counts=self.__execution_counts,
786
+ nodes_being_run=self.__nodes_being_run,
787
+ lock=update_lock,
788
+ compute_real_time=compute_real_time).evaluate_ndcg(func, configuration=configuration, metrics=metrics)
789
+
790
+ def evaluate_reciprocal_rank(self,
791
+ func: Optional[Callable] = None,
792
+ *,
793
+ configuration: Optional[AgenticAIConfiguration] = None,
794
+ metrics: list[GenAIMetric] = [],
795
+ compute_real_time: Optional[bool] = True) -> dict:
796
+ """
797
+        An evaluation decorator for computing reciprocal rank metric on an agentic tool.
798
+        This metric is derived from context relevance values, so the context relevance metric is computed as a prerequisite.
799
+
800
+ For more details, see :class:`ibm_watsonx_gov.metrics.ReciprocalRankMetric`
801
+
802
+ Args:
803
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
804
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
805
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ReciprocalRankMetric() ].
806
+
807
+ Raises:
808
+ Exception: If there is any error while evaluation.
809
+
810
+ Returns:
811
+ dict: The result of the wrapped tool.
812
+
813
+ Example:
814
+ 1. Basic usage
815
+ .. code-block:: python
816
+
817
+ evaluator = AgenticEvaluator()
818
+ @evaluator.evaluate_reciprocal_rank
819
+                def agentic_tool(*args, **kwargs):
820
+ pass
821
+
822
+ 2. Usage with different thresholds and methods
823
+ .. code-block:: python
824
+
825
+ metric_1 = ReciprocalRankMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
826
+ metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
827
+
828
+ evaluator = AgenticEvaluator()
829
+ @evaluator.evaluate_reciprocal_rank(metrics=[metric_1, metric_2])
830
+                def agentic_tool(*args, **kwargs):
831
+ pass
832
+ """
833
+ return ReciprocalRankDecorator(api_client=self.api_client,
834
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
835
+ metric_results=self.__online_metric_results,
836
+ execution_counts=self.__execution_counts,
837
+ nodes_being_run=self.__nodes_being_run,
838
+ lock=update_lock,
839
+ compute_real_time=compute_real_time).evaluate_reciprocal_rank(func, configuration=configuration, metrics=metrics)
840
+
841
+ def evaluate_retrieval_precision(self,
842
+ func: Optional[Callable] = None,
843
+ *,
844
+ configuration: Optional[AgenticAIConfiguration] = None,
845
+ metrics: list[GenAIMetric] = [],
846
+ compute_real_time: Optional[bool] = True) -> dict:
847
+ """
848
+ An evaluation decorator for computing retrieval precision metric on an agentic tool.
849
+        This metric is derived from context relevance values, so the context relevance metric is computed as a prerequisite.
850
+
851
+ For more details, see :class:`ibm_watsonx_gov.metrics.RetrievalPrecisionMetric`
852
+
853
+ Args:
854
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
855
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
856
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ RetrievalPrecisionMetric() ].
857
+
858
+ Raises:
859
+ Exception: If there is any error while evaluation.
860
+
861
+ Returns:
862
+ dict: The result of the wrapped tool.
863
+
864
+ Example:
865
+ 1. Basic usage
866
+ .. code-block:: python
867
+
868
+ evaluator = AgenticEvaluator()
869
+ @evaluator.evaluate_retrieval_precision
870
+                def agentic_tool(*args, **kwargs):
871
+ pass
872
+
873
+ 2. Usage with different thresholds and methods
874
+ .. code-block:: python
875
+
876
+                metric_1 = RetrievalPrecisionMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
877
+ metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
878
+
879
+ evaluator = AgenticEvaluator()
880
+ @evaluator.evaluate_retrieval_precision(metrics=[metric_1, metric_2])
881
+                def agentic_tool(*args, **kwargs):
882
+ pass
883
+ """
884
+ return RetrievalPrecisionDecorator(api_client=self.api_client,
885
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
886
+ metric_results=self.__online_metric_results,
887
+ execution_counts=self.__execution_counts,
888
+ nodes_being_run=self.__nodes_being_run,
889
+ lock=update_lock,
890
+ compute_real_time=compute_real_time).evaluate_retrieval_precision(func, configuration=configuration, metrics=metrics)
891
+
892
+ def evaluate_hit_rate(self,
893
+ func: Optional[Callable] = None,
894
+ *,
895
+ configuration: Optional[AgenticAIConfiguration] = None,
896
+ metrics: list[GenAIMetric] = [],
897
+ compute_real_time: Optional[bool] = True) -> dict:
898
+ """
899
+ An evaluation decorator for computing hit rate metric on an agentic tool.
900
+        This metric is derived from context relevance values, so the context relevance metric is computed as a prerequisite.
901
+
902
+ For more details, see :class:`ibm_watsonx_gov.metrics.HitRateMetric`
903
+
904
+ Args:
905
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
906
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
907
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ HitRateMetric() ].
908
+
909
+ Raises:
910
+ Exception: If there is any error while evaluation.
911
+
912
+ Returns:
913
+ dict: The result of the wrapped tool.
914
+
915
+ Example:
916
+ 1. Basic usage
917
+ .. code-block:: python
918
+
919
+ evaluator = AgenticEvaluator()
920
+ @evaluator.evaluate_hit_rate
921
+                def agentic_tool(*args, **kwargs):
922
+ pass
923
+
924
+ 2. Usage with different thresholds and methods
925
+ .. code-block:: python
926
+
927
+ metric_1 = HitRateMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
928
+ metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
929
+
930
+ evaluator = AgenticEvaluator()
931
+ @evaluator.evaluate_hit_rate(metrics=[metric_1, metric_2])
932
+                def agentic_tool(*args, **kwargs):
933
+ pass
934
+ """
935
+ return HitRateDecorator(api_client=self.api_client,
936
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
937
+ metric_results=self.__online_metric_results,
938
+ execution_counts=self.__execution_counts,
939
+ nodes_being_run=self.__nodes_being_run,
940
+ lock=update_lock,
941
+ compute_real_time=compute_real_time).evaluate_hit_rate(func, configuration=configuration, metrics=metrics)
942
+
943
+ def evaluate_answer_similarity(self,
944
+ func: Optional[Callable] = None,
945
+ *,
946
+ configuration: Optional[AgenticAIConfiguration] = None,
947
+ metrics: list[GenAIMetric] = [],
948
+ compute_real_time: Optional[bool] = True) -> dict:
949
+ """
950
+ An evaluation decorator for computing answer similarity metric on an agentic node.
951
+
952
+ For more details, see :class:`ibm_watsonx_gov.metrics.AnswerSimilarityMetric`
953
+
954
+ Args:
955
+ func (Optional[Callable], optional): The node on which the metric is to be computed.
956
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
957
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ AnswerSimilarityMetric() ].
958
+ compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
959
+
960
+ Raises:
961
+ Exception: If there is any error while evaluation.
962
+
963
+ Returns:
964
+ dict: The result of the wrapped node.
965
+
966
+ Examples:
967
+ 1. Basic usage
968
+ .. code-block:: python
969
+
970
+ evaluator = AgenticEvaluator()
971
+ @evaluator.evaluate_answer_similarity
972
+                def agentic_node(*args, **kwargs):
973
+ pass
974
+
975
+
976
+ 2. Usage with different thresholds and methods
977
+ .. code-block:: python
978
+
979
+ metric_1 = AnswerSimilarityMetric(
980
+ method="token_k_precision", threshold=MetricThreshold(type="lower_limit", value=0.5))
981
+ metric_2 = AnswerSimilarityMetric(
982
+ method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
983
+
984
+ evaluator = AgenticEvaluator()
985
+ @evaluator.evaluate_answer_similarity(metrics=[metric_1, metric_2])
986
+                def agentic_node(*args, **kwargs):
987
+ pass
988
+ """
989
+
990
+ return AnswerSimilarityDecorator(api_client=self.api_client,
991
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
992
+ metric_results=self.__online_metric_results,
993
+ execution_counts=self.__execution_counts,
994
+ nodes_being_run=self.__nodes_being_run,
995
+ lock=update_lock,
996
+ compute_real_time=compute_real_time).evaluate_answer_similarity(func, configuration=configuration, metrics=metrics)
997
+
998
+ def evaluate_faithfulness(self,
999
+ func: Optional[Callable] = None,
1000
+ *,
1001
+ configuration: Optional[AgenticAIConfiguration] = None,
1002
+ metrics: list[GenAIMetric] = [],
1003
+ compute_real_time: Optional[bool] = True) -> dict:
1004
+ """
1005
+ An evaluation decorator for computing faithfulness metric on an agentic node.
1006
+
1007
+ For more details, see :class:`ibm_watsonx_gov.metrics.FaithfulnessMetric`
1008
+
1009
+ Args:
1010
+ func (Optional[Callable], optional): The node on which the metric is to be computed.
1011
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1012
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ FaithfulnessMetric() ].
1013
+ compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
1014
+
1015
+ Raises:
1016
+ Exception: If there is any error while evaluation.
1017
+
1018
+ Returns:
1019
+ dict: The result of the wrapped node.
1020
+
1021
+ Examples:
1022
+ 1. Basic usage
1023
+ .. code-block:: python
1024
+
1025
+ evaluator = AgenticEvaluator()
1026
+ @evaluator.evaluate_faithfulness
1027
+                def agentic_node(*args, **kwargs):
1028
+ pass
1029
+
1030
+ 2. Usage with different thresholds and methods
1031
+ .. code-block:: python
1032
+
1033
+ metric_1 = FaithfulnessMetric(method="token_k_precision", threshold=MetricThreshold(type="lower_limit", value=0.5))
1034
+ metric_2 = FaithfulnessMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
1035
+
1036
+ evaluator = AgenticEvaluator()
1037
+ @evaluator.evaluate_faithfulness(metrics=[metric_1, metric_2])
1038
+                def agentic_node(*args, **kwargs):
1039
+ pass
1040
+ """
1041
+
1042
+ return FaithfulnessDecorator(api_client=self.api_client,
1043
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1044
+ metric_results=self.__online_metric_results,
1045
+ execution_counts=self.__execution_counts,
1046
+ nodes_being_run=self.__nodes_being_run,
1047
+ lock=update_lock,
1048
+ compute_real_time=compute_real_time).evaluate_faithfulness(func, configuration=configuration, metrics=metrics)
1049
+
1050
+ def evaluate_unsuccessful_requests(self,
1051
+ func: Optional[Callable] = None,
1052
+ *,
1053
+ configuration: Optional[AgenticAIConfiguration] = None,
1054
+ metrics: list[GenAIMetric] = [],
1055
+ compute_real_time: Optional[bool] = True) -> dict:
1056
+ """
1057
+ An evaluation decorator for computing unsuccessful requests metric on an agentic tool.
1058
+
1059
+ For more details, see :class:`ibm_watsonx_gov.metrics.UnsuccessfulRequestsMetric`
1060
+
1061
+ Args:
1062
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1063
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1064
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ UnsuccessfulRequestsMetric() ].
1065
+
1066
+ Raises:
1067
+ Exception: If there is any error while evaluation.
1068
+
1069
+ Returns:
1070
+ dict: The result of the wrapped tool.
1071
+
1072
+ Example:
1073
+ 1. Basic usage
1074
+ .. code-block:: python
1075
+
1076
+ evaluator = AgenticEvaluator()
1077
+ @evaluator.evaluate_unsuccessful_requests
1078
+                def agentic_tool(*args, **kwargs):
1079
+ pass
1080
+
1081
+ 2. Usage with different thresholds and methods
1082
+ .. code-block:: python
1083
+
1084
+ metric_1 = UnsuccessfulRequestsMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
1085
+
1086
+ evaluator = AgenticEvaluator()
1087
+ @evaluator.evaluate_unsuccessful_requests(metrics=[metric_1])
1088
+                def agentic_tool(*args, **kwargs):
1089
+ pass
1090
+ """
1091
+
1092
+ return UnsuccessfulRequestsDecorator(api_client=self.api_client,
1093
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1094
+ metric_results=self.__online_metric_results,
1095
+ execution_counts=self.__execution_counts,
1096
+ nodes_being_run=self.__nodes_being_run,
1097
+ lock=update_lock,
1098
+ compute_real_time=compute_real_time).evaluate_unsuccessful_requests(func, configuration=configuration, metrics=metrics)
1099
+
1100
+ def evaluate_answer_relevance(self,
1101
+ func: Optional[Callable] = None,
1102
+ *,
1103
+ configuration: Optional[AgenticAIConfiguration] = None,
1104
+ metrics: list[GenAIMetric] = [],
1105
+ compute_real_time: Optional[bool] = True) -> dict:
1106
+ """
1107
+ An evaluation decorator for computing answer relevance metric on an agentic tool.
1108
+
1109
+ For more details, see :class:`ibm_watsonx_gov.metrics.AnswerRelevanceMetric`
1110
+
1111
+ Args:
1112
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1113
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1114
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ AnswerRelevanceMetric() ].
1115
+
1116
+ Raises:
1117
+ Exception: If there is any error while evaluation.
1118
+
1119
+ Returns:
1120
+ dict: The result of the wrapped tool.
1121
+
1122
+ Example:
1123
+ 1. Basic usage
1124
+ .. code-block:: python
1125
+
1126
+ evaluator = AgenticEvaluator()
1127
+ @evaluator.evaluate_answer_relevance
1128
+                def agentic_tool(*args, **kwargs):
1129
+ pass
1130
+
1131
+ 2. Usage with different thresholds and methods
1132
+ .. code-block:: python
1133
+
1134
+ metric_1 = AnswerRelevanceMetric(method="token_recall", thresholds=[MetricThreshold(type="lower_limit", value=0.5)])
1135
+ metric_2 = AnswerRelevanceMetric(method="granite_guardian", thresholds=[MetricThreshold(type="lower_limit", value=0.5)])
1136
+
1137
+ evaluator = AgenticEvaluator()
1138
+ @evaluator.evaluate_answer_relevance(metrics=[metric_1, metric_2])
1139
+                def agentic_tool(*args, **kwargs):
1140
+ pass
1141
+ """
1142
+
1143
+ return AnswerRelevanceDecorator(api_client=self.api_client,
1144
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1145
+ metric_results=self.__online_metric_results,
1146
+ execution_counts=self.__execution_counts,
1147
+ nodes_being_run=self.__nodes_being_run,
1148
+ lock=update_lock,
1149
+ compute_real_time=compute_real_time).evaluate_answer_relevance(func, configuration=configuration, metrics=metrics)
1150
+
1151
+ def evaluate_general_quality_with_llm(self,
1152
+ func: Optional[Callable] = None,
1153
+ *,
1154
+ configuration: Optional[AgenticAIConfiguration] = None,
1155
+ metrics: list[GenAIMetric] = [],
1156
+ compute_real_time: Optional[bool] = True) -> dict:
1157
+ """
1158
+        An evaluation decorator for computing LLM validation metric on an agentic node.
1159
+
1160
+ For more details, see :class:`ibm_watsonx_gov.metrics.LLMValidationMetric`
1161
+
1162
+ Args:
1163
+ func (Optional[Callable], optional): The node on which the metric is to be computed.
1164
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1165
+ metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
1166
+ compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
1167
+                When compute_real_time is set to False, the evaluate_metrics method should be invoked on the AgenticEvaluator to compute the metric.
1168
+
1169
+ Raises:
1170
+ Exception: If there is any error while evaluation.
1171
+
1172
+ Returns:
1173
+ dict: The result of the wrapped node.
1174
+
1175
+ Examples:
1176
+ 1. Basic usage
1177
+ .. code-block:: python
1178
+
1179
+ evaluator = AgenticEvaluator()
1180
+ @evaluator.evaluate_general_quality_with_llm
1181
+                def agentic_node(*args, **kwargs):
1182
+ pass
1183
+ """
1184
+ return LLMValidationDecorator(api_client=self.api_client,
1185
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1186
+ metric_results=self.__online_metric_results,
1187
+ execution_counts=self.__execution_counts,
1188
+ nodes_being_run=self.__nodes_being_run,
1189
+ lock=update_lock,
1190
+ compute_real_time=compute_real_time).evaluate_general_quality_with_llm(func,
1191
+ configuration=configuration,
1192
+ metrics=metrics)
1193
+
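The compute_real_time flag behaves the same way for all of the decorators in this class. A sketch of the deferred flow, assuming evaluate_metrics() can be called without arguments as the docstring above suggests (its exact signature is not part of this excerpt):

.. code-block:: python

    evaluator = AgenticEvaluator()

    # Record inputs and outputs during node execution, but defer metric computation.
    @evaluator.evaluate_answer_relevance(compute_real_time=False)
    def generation_node(state: dict) -> dict:
        return {"generated_text": "..."}

    # ... run the agentic application so the decorated node executes ...

    # Compute the recorded metrics afterwards on the same evaluator instance.
    results = evaluator.evaluate_metrics()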
1194
+ def evaluate_tool_call_parameter_accuracy(self,
1195
+ func: Optional[Callable] = None,
1196
+ *,
1197
+ configuration: Optional[AgenticAIConfiguration] = None,
1198
+ metrics: list[GenAIMetric] = [],
1199
+ compute_real_time: Optional[bool] = True) -> dict:
1200
+ """
1201
+ An evaluation decorator for computing tool_call_parameter_accuracy metric on an agentic tool.
1202
+
1203
+ For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallParameterAccuracyMetric`
1204
+
1205
+ Args:
1206
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1207
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1208
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallParameterAccuracyMetric() ].
1209
+
1210
+ Raises:
1211
+ Exception: If there is any error while evaluation.
1212
+
1213
+ Returns:
1214
+ dict: The result of the wrapped tool.
1215
+
1216
+ Example:
1217
+ 1. Basic usage
1218
+ .. code-block:: python
1219
+
1220
+ evaluator = AgenticEvaluator()
1221
+ tool_calls_metric_config={
1222
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1223
+ }
1224
+ llm_judge = LLMJudge(
1225
+ model=WxAIFoundationModel(
1226
+ model_id="meta-llama/llama-3-3-70b-instruct",
1227
+ project_id=os.getenv("WATSONX_PROJECT_ID"),
1228
+ )
1229
+ )
1230
+ metric_1 = ToolCallParameterAccuracyMetric(llm_judge=llm_judge)
1231
+ @evaluator.evaluate_tool_call_parameter_accuracy(configuration=AgenticAIConfiguration(**tool_calls_metric_config), metrics=[metric_1])
1232
+                def agentic_tool(*args, **kwargs):
1233
+ pass
1234
+
1235
+ 2. Usage with custom tool calls field
1236
+ .. code-block:: python
1237
+
1238
+ evaluator = AgenticEvaluator()
1239
+ tool_calls_metric_config={
1240
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1241
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1242
+ }
1243
+ llm_judge = LLMJudge(
1244
+ model=WxAIFoundationModel(
1245
+ model_id="meta-llama/llama-3-3-70b-instruct",
1246
+ project_id=os.getenv("WATSONX_PROJECT_ID"),
1247
+ )
1248
+ )
1249
+ metric_1 = ToolCallParameterAccuracyMetric(llm_judge=llm_judge)
1250
+ @evaluator.evaluate_tool_call_parameter_accuracy(configuration=AgenticAIConfiguration(**tool_calls_metric_config), metrics=[metric_1])
1251
+                def agentic_tool(*args, **kwargs):
1252
+ pass
1253
+
1254
+ 3. Usage with different thresholds
1255
+ .. code-block:: python
1256
+
1257
+ llm_judge = LLMJudge(
1258
+ model=WxAIFoundationModel(
1259
+ model_id="meta-llama/llama-3-3-70b-instruct",
1260
+ project_id=os.getenv("WATSONX_PROJECT_ID"),
1261
+ )
1262
+ )
1263
+ metric_1 = ToolCallParameterAccuracyMetric(llm_judge=llm_judge, threshold=MetricThreshold(type="upper_limit", value=0.7))
1264
+ evaluator = AgenticEvaluator()
1265
+ tool_calls_metric_config={
1266
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1267
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1268
+ }
1269
+ @evaluator.evaluate_tool_call_parameter_accuracy(configuration=AgenticAIConfiguration(**tool_calls_metric_config),metrics=[metric_1])
1270
+                def agentic_tool(*args, **kwargs):
1271
+ pass
1272
+ """
1273
+
1274
+ return ToolCallParameterAccuracyDecorator(api_client=self.api_client,
1275
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1276
+ metric_results=self.__online_metric_results,
1277
+ execution_counts=self.__execution_counts,
1278
+ nodes_being_run=self.__nodes_being_run,
1279
+ lock=update_lock,
1280
+ compute_real_time=compute_real_time).evaluate_tool_call_parameter_accuracy(func, configuration=configuration, metrics=metrics)
1281
+
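The tool-call examples above and below refer to get_weather and fetch_stock_price without defining them. A sketch of what such tools could look like as plain Python callables; whether the library also accepts framework-specific tool objects (for example LangChain tools) is not shown in this diff:

.. code-block:: python

    def get_weather(city: str) -> str:
        """Return a short weather summary for the given city."""
        # Placeholder implementation for illustration only.
        return f"Weather in {city}: 22 degrees Celsius, clear."

    def fetch_stock_price(symbol: str) -> float:
        """Return the latest price for the given stock symbol."""
        # Placeholder implementation for illustration only.
        return 123.45

    tool_calls_metric_config = {
        "tools": [get_weather, fetch_stock_price],  # tools available to the agent
        "tool_calls_field": "tool_calls",           # graph state field holding the tool calls
    }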
1282
+ def evaluate_tool_call_relevance(self,
1283
+ func: Optional[Callable] = None,
1284
+ *,
1285
+ configuration: Optional[AgenticAIConfiguration] = None,
1286
+ metrics: list[GenAIMetric] = [],
1287
+ compute_real_time: Optional[bool] = True) -> dict:
1288
+ """
1289
+ An evaluation decorator for computing tool_call_relevance metric on an agent tool.
1290
+
1291
+ For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallRelevanceMetric`
1292
+
1293
+ Args:
1294
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1295
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1296
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallRelevanceMetric() ].
1297
+
1298
+ Raises:
1299
+ Exception: If there is any error while evaluation.
1300
+
1301
+ Returns:
1302
+ dict: The result of the wrapped tool.
1303
+
1304
+ Example:
1305
+ 1. Basic usage
1306
+ .. code-block:: python
1307
+
1308
+ evaluator = AgenticEvaluator()
1309
+ tool_call_relevance_config={
1310
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1311
+ }
1312
+ llm_judge = LLMJudge(
1313
+ model=WxAIFoundationModel(
1314
+ model_id="meta-llama/llama-3-3-70b-instruct",
1315
+ project_id=os.getenv("WATSONX_PROJECT_ID"),
1316
+ )
1317
+ )
1318
+ metric_1 = ToolCallRelevanceMetric(llm_judge=llm_judge)
1319
+ @evaluator.evaluate_tool_call_relevance(configuration=AgenticAIConfiguration(**tool_call_relevance_config), metrics=[metric_1])
1320
+                def agentic_tool(*args, **kwargs):
1321
+ pass
1322
+
1323
+ 2. Usage with custom tool calls field
1324
+ .. code-block:: python
1325
+
1326
+ evaluator = AgenticEvaluator()
1327
+ tool_call_relevance_config={
1328
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1329
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1330
+ }
1331
+ llm_judge = LLMJudge(
1332
+ model=WxAIFoundationModel(
1333
+ model_id="meta-llama/llama-3-3-70b-instruct",
1334
+ project_id=os.getenv("WATSONX_PROJECT_ID"),
1335
+ )
1336
+ )
1337
+ metric_1 = ToolCallRelevanceMetric(llm_judge=llm_judge)
1338
+ @evaluator.evaluate_tool_call_relevance(configuration=AgenticAIConfiguration(**tool_call_relevance_config), metrics=[metric_1])
1339
+                def agentic_tool(*args, **kwargs):
1340
+ pass
1341
+
1342
+ 3. Usage with different thresholds
1343
+ .. code-block:: python
1344
+
1345
+ llm_judge = LLMJudge(
1346
+ model=WxAIFoundationModel(
1347
+ model_id="meta-llama/llama-3-3-70b-instruct",
1348
+ project_id=os.getenv("WATSONX_PROJECT_ID"),
1349
+ )
1350
+ )
1351
+ metric_1 = ToolCallRelevanceMetric(llm_judge=llm_judge, threshold=MetricThreshold(type="upper_limit", value=0.7))
1352
+ evaluator = AgenticEvaluator()
1353
+ tool_call_relevance_config={
1354
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1355
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1356
+ }
1357
+ @evaluator.evaluate_tool_call_relevance(configuration=AgenticAIConfiguration(**tool_call_relevance_config),metrics=[metric_1])
1358
+                def agentic_tool(*args, **kwargs):
1359
+ pass
1360
+ """
1361
+
1362
+ return ToolCallRelevanceDecorator(api_client=self.api_client,
1363
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1364
+ metric_results=self.__online_metric_results,
1365
+ execution_counts=self.__execution_counts,
1366
+ nodes_being_run=self.__nodes_being_run,
1367
+ lock=update_lock,
1368
+ compute_real_time=compute_real_time).evaluate_tool_call_relevance(func, configuration=configuration, metrics=metrics)
1369
+
1370
+ def evaluate_tool_call_syntactic_accuracy(self,
1371
+ func: Optional[Callable] = None,
1372
+ *,
1373
+ configuration: Optional[AgenticAIConfiguration] = None,
1374
+ metrics: list[GenAIMetric] = [],
1375
+ compute_real_time: Optional[bool] = True) -> dict:
1376
+ """
1377
+ An evaluation decorator for computing tool_call_syntactic_accuracy metric on an agent tool.
1378
+
1379
+ For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallSyntacticAccuracyMetric`
1380
+
1381
+ Args:
1382
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1383
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1384
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallSyntacticAccuracyMetric() ].
1385
+
1386
+ Raises:
1387
+ Exception: If there is any error while evaluation.
1388
+
1389
+ Returns:
1390
+ dict: The result of the wrapped tool.
1391
+
1392
+ Example:
1393
+ 1. Basic usage
1394
+ .. code-block:: python
1395
+
1396
+ evaluator = AgenticEvaluator()
1397
+ tool_call_syntactic_metric_config={
1398
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1399
+ }
1400
+ @evaluator.evaluate_tool_call_syntactic_accuracy(configuration=AgenticAIConfiguration(**tool_call_syntactic_metric_config))
1401
+                def agentic_tool(*args, **kwargs):
1402
+ pass
1403
+
1404
+ 2. Usage with custom tool calls field
1405
+ .. code-block:: python
1406
+
1407
+ evaluator = AgenticEvaluator()
1408
+ tool_call_syntactic_metric_config={
1409
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1410
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1411
+ }
1412
+ @evaluator.evaluate_tool_call_syntactic_accuracy(configuration=AgenticAIConfiguration(**tool_call_syntactic_metric_config))
1413
+                def agentic_tool(*args, **kwargs):
1414
+ pass
1415
+
1416
+ 3. Usage with different thresholds
1417
+ .. code-block:: python
1418
+
1419
+ metric_1 = ToolCallSyntacticAccuracyMetric(threshold=MetricThreshold(type="upper_limit", value=0.7))
1420
+ evaluator = AgenticEvaluator()
1421
+ tool_call_syntactic_metric_config={
1422
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1423
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1424
+ }
1425
+ @evaluator.evaluate_tool_call_syntactic_accuracy(configuration=AgenticAIConfiguration(**tool_call_syntactic_metric_config),metrics=[metric_1])
1426
+                def agentic_tool(*args, **kwargs):
1427
+ pass
1428
+ """
1429
+ return ToolCallSyntacticAccuracyDecorator(api_client=self.api_client,
1430
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1431
+ metric_results=self.__online_metric_results,
1432
+ execution_counts=self.__execution_counts,
1433
+ nodes_being_run=self.__nodes_being_run,
1434
+ lock=update_lock,
1435
+ compute_real_time=compute_real_time).evaluate_tool_call_syntactic_accuracy(func, configuration=configuration, metrics=metrics)
1436
+
1437
+ def evaluate_tool_call_accuracy(self,
1438
+ func: Optional[Callable] = None,
1439
+ *,
1440
+ configuration: Optional[AgenticAIConfiguration] = None,
1441
+ metrics: list[GenAIMetric] = [],
1442
+ compute_real_time: Optional[bool] = True) -> dict:
1443
+ """
1444
+ An evaluation decorator for computing tool_call_accuracy metric on an agent tool.
1445
+
1446
+ For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallAccuracyMetric`
1447
+
1448
+ Args:
1449
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1450
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1451
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallAccuracyMetric() ].
1452
+
1453
+ Raises:
1454
+ Exception: If there is any error while evaluation.
1455
+
1456
+ Returns:
1457
+ dict: The result of the wrapped tool.
1458
+
1459
+ Example:
1460
+ 1. Basic usage
1461
+ .. code-block:: python
1462
+
1463
+ evaluator = AgenticEvaluator()
1464
+ tool_call_metric_config={
1465
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1466
+ }
1467
+ @evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config))
1468
+                def agentic_tool(*args, **kwargs):
1469
+ pass
1470
+
1471
+ 2. Usage with custom tool calls field
1472
+ .. code-block:: python
1473
+
1474
+ evaluator = AgenticEvaluator()
1475
+ tool_call_metric_config={
1476
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1477
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1478
+ }
1479
+                @evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config))
1479
+                def agentic_tool(*args, **kwargs):
1481
+ pass
1482
+
1483
+ 3. Usage with different thresholds
1484
+ .. code-block:: python
1485
+
1486
+ metric_1 = ToolCallAccuracyMetric(threshold=MetricThreshold(type="upper_limit", value=0.7))
1487
+ metric_2 = ToolCallAccuracyMetric(threshold=MetricThreshold(type="upper_limit", value=0.9))
1488
+ evaluator = AgenticEvaluator()
1489
+ tool_call_metric_config={
1490
+ "tools":[get_weather, fetch_stock_price], # List of tools available to the agent
1491
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1492
+ }
1493
+ @evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config),metrics=[metric_1, metric_2])
1494
+                def agentic_tool(*args, **kwargs):
1495
+ pass
1496
+
1497
+ 4. Usage with a list of dictionary items as tools
1498
+ .. code-block:: python
1499
+
+                    available_tools = [{
+                        "type": "function",
+                        "function": {
+                            "name": "f1_name",
+                            "description": "f1_description.",
+                            "parameters": {
+                                "parameter1": {
+                                    "description": "parameter_description",
+                                    "type": "parameter_type",
+                                    "default": "default_value"
+                                }
+                            }
+                        }
+                    }]
1500
+ tool_call_metric_config={
1501
+ "tools":available_tools, # List of tools available to the agent
1502
+ "tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
1503
+ }
1504
+ metric = ToolCallAccuracyMetric()
1505
+ evaluator = AgenticEvaluator()
1506
+ @evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config),metrics=[metric])
1507
+                    def agentic_tool(*args, **kwargs):
1508
+ pass
1509
+ """
1510
+ return ToolCallAccuracyDecorator(api_client=self.api_client,
1511
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1512
+ metric_results=self.__online_metric_results,
1513
+ execution_counts=self.__execution_counts,
1514
+ nodes_being_run=self.__nodes_being_run,
1515
+ lock=update_lock,
1516
+ compute_real_time=compute_real_time).evaluate_tool_call_accuracy(func, configuration=configuration, metrics=metrics)
1517
+
1518
+ def evaluate_prompt_safety_risk(self,
1519
+ func: Optional[Callable] = None,
1520
+ *,
1521
+ configuration: Optional[AgenticAIConfiguration] = None,
1522
+ metrics: list[GenAIMetric],
1523
+ compute_real_time: Optional[bool] = True,
1524
+ ) -> dict:
1525
+ """
1526
+ An evaluation decorator for computing prompt safety risk metric on an agentic tool.
1527
+
1528
+ For more details, see :class:`ibm_watsonx_gov.metrics.PromptSafetyRiskMetric`
1529
+
1530
+ Args:
1531
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1532
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1533
+ metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
1534
+
1535
+ Raises:
1536
+ Exception: If there is any error while evaluation.
1537
+
1538
+ Returns:
1539
+ dict: The result of the wrapped tool.
1540
+
1541
+ Example:
1542
+ 1. Create evaluate_prompt_safety_risk decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1543
+ .. code-block:: python
1544
+
1545
+ evaluator = AgenticEvaluator()
1546
+ @evaluator.evaluate_prompt_safety_risk(metrics=[PromptSafetyRiskMetric(system_prompt="...")])
1547
+                def agentic_tool(*args, **kwargs):
1548
+ pass
1549
+
1550
+ 2. Create evaluate_prompt_safety_risk decorator with thresholds and configuration
1551
+ .. code-block:: python
1552
+
1553
+ metric = PromptSafetyRiskMetric(system_prompt="...", thresholds=MetricThreshold(type="lower_limit", value=0.7))
1554
+ config = {"input_fields": ["input"]}
1555
+ configuration = AgenticAIConfiguration(**config)
1556
+ evaluator = AgenticEvaluator()
1557
+ @evaluator.evaluate_prompt_safety_risk(metrics=[metric], configuration=configuration)
1558
+                def agentic_tool(*args, **kwargs):
1559
+ pass
1560
+ """
1561
+ return PromptSafetyRiskDecorator(api_client=self.api_client,
1562
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1563
+ metric_results=self.__online_metric_results,
1564
+ execution_counts=self.__execution_counts,
1565
+ nodes_being_run=self.__nodes_being_run,
1566
+ lock=update_lock,
1567
+ compute_real_time=compute_real_time).evaluate_prompt_safety_risk(func, configuration=configuration, metrics=metrics)
1568
+
1569
+ def evaluate_hap(self,
1570
+ func: Optional[Callable] = None,
1571
+ *,
1572
+ configuration: Optional[AgenticAIConfiguration] = None,
1573
+ metrics: list[GenAIMetric] = [],
1574
+ compute_real_time: Optional[bool] = True,
1575
+ ) -> dict:
1576
+ """
1577
+ An evaluation decorator for computing HAP metric on an agentic tool.
1578
+
1579
+ For more details, see :class:`ibm_watsonx_gov.metrics.HAPMetric`
1580
+
1581
+ Args:
1582
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1583
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1584
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [HAPMetric()].
1585
+
1586
+ Raises:
1587
+ Exception: If there is any error while evaluation.
1588
+
1589
+ Returns:
1590
+ dict: The result of the wrapped tool.
1591
+
1592
+ Example:
1593
+ 1. Create evaluate_hap decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1594
+ .. code-block:: python
1595
+
1596
+ evaluator = AgenticEvaluator()
1597
+ @evaluator.evaluate_hap
1598
+                def agentic_tool(*args, **kwargs):
1599
+ pass
1600
+
1601
+ 2. Create evaluate_hap decorator with thresholds and configuration
1602
+ .. code-block:: python
1603
+
1604
+ metric = HAPMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1605
+ config = {"input_fields": ["input"]}
1606
+ configuration = AgenticAIConfiguration(**config)
1607
+ evaluator = AgenticEvaluator()
1608
+ @evaluator.evaluate_hap(metrics=[metric], configuration=configuration)
1609
+                def agentic_tool(*args, **kwargs):
1610
+ pass
1611
+ """
1612
+ return HAPDecorator(api_client=self.api_client,
1613
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1614
+ metric_results=self.__online_metric_results,
1615
+ execution_counts=self.__execution_counts,
1616
+ nodes_being_run=self.__nodes_being_run,
1617
+ lock=update_lock,
1618
+ compute_real_time=compute_real_time).evaluate_hap(func, configuration=configuration, metrics=metrics)
1619
+
1620
+ def evaluate_pii(self,
1621
+ func: Optional[Callable] = None,
1622
+ *,
1623
+ configuration: Optional[AgenticAIConfiguration] = None,
1624
+ metrics: list[GenAIMetric] = [],
1625
+ compute_real_time: Optional[bool] = True,
1626
+ ) -> dict:
1627
+ """
1628
+ An evaluation decorator for computing PII metric on an agentic tool.
1629
+
1630
+ For more details, see :class:`ibm_watsonx_gov.metrics.PIIMetric`
1631
+
1632
+ Args:
1633
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1634
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1635
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [PIIMetric()].
1636
+
1637
+ Raises:
1638
+ Exception: If there is any error while evaluation.
1639
+
1640
+ Returns:
1641
+ dict: The result of the wrapped tool.
1642
+
1643
+ Example:
1644
+ 1. Create evaluate_pii decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1645
+ .. code-block:: python
1646
+
1647
+ evaluator = AgenticEvaluator()
1648
+ @evaluator.evaluate_pii
1649
+                def agentic_tool(*args, **kwargs):
1650
+ pass
1651
+
1652
+ 2. Create evaluate_pii decorator with thresholds and configuration
1653
+ .. code-block:: python
1654
+
1655
+ metric = PIIMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1656
+ config = {"input_fields": ["input"]}
1657
+ configuration = AgenticAIConfiguration(**config)
1658
+ evaluator = AgenticEvaluator()
1659
+ @evaluator.evaluate_pii(metrics=[metric], configuration=configuration)
1660
+                def agentic_tool(*args, **kwargs):
1661
+ pass
1662
+ """
1663
+ return PIIDecorator(api_client=self.api_client,
1664
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1665
+ metric_results=self.__online_metric_results,
1666
+ execution_counts=self.__execution_counts,
1667
+ nodes_being_run=self.__nodes_being_run,
1668
+ lock=update_lock,
1669
+ compute_real_time=compute_real_time).evaluate_pii(func, configuration=configuration, metrics=metrics)
1670
+
1671
+ def evaluate_harm(self,
1672
+ func: Optional[Callable] = None,
1673
+ *,
1674
+ configuration: Optional[AgenticAIConfiguration] = None,
1675
+ metrics: list[GenAIMetric] = [],
1676
+ compute_real_time: Optional[bool] = True,
1677
+ ) -> dict:
1678
+ """
1679
+ An evaluation decorator for computing harm risk on an agentic tool via granite guardian.
1680
+
1681
+ For more details, see :class:`ibm_watsonx_gov.metrics.HarmMetric`
1682
+
1683
+ Args:
1684
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1685
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1686
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ HarmMetric() ]
1687
+
1688
+ Raises:
1689
+ Exception: If there is any error while evaluation.
1690
+
1691
+ Returns:
1692
+ dict: The result of the wrapped tool.
1693
+
1694
+ Example:
1695
+ 1. Create evaluate_harm decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1696
+ .. code-block:: python
1697
+
1698
+ evaluator = AgenticEvaluator()
1699
+ @evaluator.evaluate_harm
1700
+                def agentic_tool(*args, **kwargs):
1701
+ pass
1702
+
1703
+ 2. Create evaluate_harm decorator with thresholds and configuration
1704
+ .. code-block:: python
1705
+
1706
+ metric = HarmMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1707
+ config = {"input_fields": ["input"]}
1708
+ configuration = AgenticAIConfiguration(**config)
1709
+ evaluator = AgenticEvaluator()
1710
+ @evaluator.evaluate_harm(metrics=[metric], configuration=configuration)
1711
+                def agentic_tool(*args, **kwargs):
1712
+ pass
1713
+ """
1714
+ return HarmDecorator(api_client=self.api_client,
1715
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1716
+ metric_results=self.__online_metric_results,
1717
+ execution_counts=self.__execution_counts,
1718
+ nodes_being_run=self.__nodes_being_run,
1719
+ lock=update_lock,
1720
+ compute_real_time=compute_real_time).evaluate_harm(func, configuration=configuration, metrics=metrics)
1721
+
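The granite guardian based detectors that follow (social bias, profanity, sexual content, unethical behavior, violence, harm engagement, evasiveness) share the calling pattern shown for evaluate_harm. A sketch of screening user input with two of them on separate nodes; the "input_text" field name follows the default mentioned in the docstrings, and the node bodies are placeholders:

.. code-block:: python

    evaluator = AgenticEvaluator()

    @evaluator.evaluate_harm
    def harm_check_node(state: dict) -> dict:
        # Pass the user input through unchanged; the detector reads
        # "input_text" from the graph state by default.
        return {"input_text": state["input_text"]}

    @evaluator.evaluate_social_bias
    def bias_check_node(state: dict) -> dict:
        return {"input_text": state["input_text"]}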
1722
+ def evaluate_social_bias(self,
1723
+ func: Optional[Callable] = None,
1724
+ *,
1725
+ configuration: Optional[AgenticAIConfiguration] = None,
1726
+ metrics: list[GenAIMetric] = [],
1727
+ compute_real_time: Optional[bool] = True,
1728
+ ) -> dict:
1729
+ """
1730
+ An evaluation decorator for computing social bias on an agentic tool via granite guardian.
1731
+
1732
+ For more details, see :class:`ibm_watsonx_gov.metrics.SocialBiasMetric`
1733
+
1734
+ Args:
1735
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1736
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1737
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ SocialBiasMetric() ]
1738
+
1739
+ Raises:
1740
+ Exception: If there is any error while evaluation.
1741
+
1742
+ Returns:
1743
+ dict: The result of the wrapped tool.
1744
+
1745
+ Example:
1746
+ 1. Create evaluate_social_bias decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1747
+ .. code-block:: python
1748
+
1749
+ evaluator = AgenticEvaluator()
1750
+ @evaluator.evaluate_social_bias
1751
+                def agentic_tool(*args, **kwargs):
1752
+ pass
1753
+
1754
+ 2. Create evaluate_social_bias decorator with thresholds and configuration
1755
+ .. code-block:: python
1756
+
1757
+ metric = SocialBiasMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1758
+ config = {"input_fields": ["input"]}
1759
+ configuration = AgenticAIConfiguration(**config)
1760
+ evaluator = AgenticEvaluator()
1761
+ @evaluator.evaluate_social_bias(metrics=[metric], configuration=configuration)
1762
+                def agentic_tool(*args, **kwargs):
1763
+ pass
1764
+ """
1765
+ return SocialBiasDecorator(api_client=self.api_client,
1766
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1767
+ metric_results=self.__online_metric_results,
1768
+ execution_counts=self.__execution_counts,
1769
+ nodes_being_run=self.__nodes_being_run,
1770
+ lock=update_lock,
1771
+ compute_real_time=compute_real_time).evaluate_social_bias(func, configuration=configuration, metrics=metrics)
1772
+
1773
+ def evaluate_profanity(self,
1774
+ func: Optional[Callable] = None,
1775
+ *,
1776
+ configuration: Optional[AgenticAIConfiguration] = None,
1777
+ metrics: list[GenAIMetric] = [],
1778
+ compute_real_time: Optional[bool] = True,
1779
+ ) -> dict:
1780
+ """
1781
+ An evaluation decorator for computing profanity on an agentic tool via granite guardian.
1782
+
1783
+ For more details, see :class:`ibm_watsonx_gov.metrics.ProfanityMetric`
1784
+
1785
+ Args:
1786
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1787
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1788
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ProfanityMetric() ]
1789
+
1790
+ Raises:
1791
+ Exception: If there is any error while evaluation.
1792
+
1793
+ Returns:
1794
+ dict: The result of the wrapped tool.
1795
+
1796
+ Example:
1797
+ 1. Create evaluate_profanity decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1798
+ .. code-block:: python
1799
+
1800
+ evaluator = AgenticEvaluator()
1801
+ @evaluator.evaluate_profanity
1802
+                def agentic_tool(*args, **kwargs):
1803
+ pass
1804
+
1805
+ 2. Create evaluate_profanity decorator with thresholds and configuration
1806
+ .. code-block:: python
1807
+
1808
+ metric = ProfanityMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1809
+ config = {"input_fields": ["input"]}
1810
+ configuration = AgenticAIConfiguration(**config)
1811
+ evaluator = AgenticEvaluator()
1812
+ @evaluator.evaluate_profanity(metrics=[metric], configuration=configuration)
1813
+                def agentic_tool(*args, **kwargs):
1814
+ pass
1815
+ """
1816
+ return ProfanityDecorator(api_client=self.api_client,
1817
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1818
+ metric_results=self.__online_metric_results,
1819
+ execution_counts=self.__execution_counts,
1820
+ nodes_being_run=self.__nodes_being_run,
1821
+ lock=update_lock,
1822
+ compute_real_time=compute_real_time).evaluate_profanity(func, configuration=configuration, metrics=metrics)
1823
+
1824
+ def evaluate_sexual_content(self,
1825
+ func: Optional[Callable] = None,
1826
+ *,
1827
+ configuration: Optional[AgenticAIConfiguration] = None,
1828
+ metrics: list[GenAIMetric] = [],
1829
+ compute_real_time: Optional[bool] = True,
1830
+ ) -> dict:
1831
+ """
1832
+ An evaluation decorator for computing sexual content on an agentic tool via granite guardian.
1833
+
1834
+ For more details, see :class:`ibm_watsonx_gov.metrics.SexualContentMetric`
1835
+
1836
+ Args:
1837
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1838
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1839
+                metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ SexualContentMetric() ]
1840
+
1841
+ Raises:
1842
+ Exception: If there is any error while evaluation.
1843
+
1844
+ Returns:
1845
+ dict: The result of the wrapped tool.
1846
+
1847
+ Example:
1848
+ 1. Create evaluate_sexual_content decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1849
+ .. code-block:: python
1850
+
1851
+ evaluator = AgenticEvaluator()
1852
+ @evaluator.evaluate_sexual_content
1853
+                def agentic_tool(*args, **kwargs):
1854
+ pass
1855
+
1856
+ 2. Create evaluate_sexual_content decorator with thresholds and configuration
1857
+ .. code-block:: python
1858
+
1859
+ metric = SexualContentMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1860
+ config = {"input_fields": ["input"]}
1861
+ configuration = AgenticAIConfiguration(**config)
1862
+ evaluator = AgenticEvaluator()
1863
+ @evaluator.evaluate_sexual_content(metrics=[metric], configuration=configuration)
1864
+                def agentic_tool(*args, **kwargs):
1865
+ pass
1866
+ """
1867
+ return SexualContentDecorator(api_client=self.api_client,
1868
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1869
+ metric_results=self.__online_metric_results,
1870
+ execution_counts=self.__execution_counts,
1871
+ nodes_being_run=self.__nodes_being_run,
1872
+ lock=update_lock,
1873
+ compute_real_time=compute_real_time).evaluate_sexual_content(func, configuration=configuration, metrics=metrics)
1874
+
1875
+ def evaluate_unethical_behavior(self,
1876
+ func: Optional[Callable] = None,
1877
+ *,
1878
+ configuration: Optional[AgenticAIConfiguration] = None,
1879
+ metrics: list[GenAIMetric] = [],
1880
+ compute_real_time: Optional[bool] = True,
1881
+ ) -> dict:
1882
+ """
1883
+ An evaluation decorator for computing unethical behavior on an agentic tool via granite guardian.
1884
+
1885
+ For more details, see :class:`ibm_watsonx_gov.metrics.UnethicalBehaviorMetric`
1886
+
1887
+ Args:
1888
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1889
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1890
+                metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ UnethicalBehaviorMetric() ]
1891
+
1892
+ Raises:
1893
+ Exception: If there is any error while evaluation.
1894
+
1895
+ Returns:
1896
+ dict: The result of the wrapped tool.
1897
+
1898
+ Example:
1899
+ 1. Create evaluate_unethical_behavior decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1900
+ .. code-block:: python
1901
+
1902
+ evaluator = AgenticEvaluator()
1903
+ @evaluator.evaluate_unethical_behavior
1904
+                def agentic_tool(*args, **kwargs):
1905
+ pass
1906
+
1907
+ 2. Create evaluate_unethical_behavior decorator with thresholds and configuration
1908
+ .. code-block:: python
1909
+
1910
+ metric = UnethicalBehaviorMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1911
+ config = {"input_fields": ["input"]}
1912
+ configuration = AgenticAIConfiguration(**config)
1913
+ evaluator = AgenticEvaluator()
1914
+ @evaluator.evaluate_unethical_behavior(metrics=[metric], configuration=configuration)
1915
+                def agentic_tool(*args, **kwargs):
1916
+ pass
1917
+ """
1918
+
1919
+ return UnethicalBehaviorDecorator(api_client=self.api_client,
1920
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1921
+ metric_results=self.__online_metric_results,
1922
+ execution_counts=self.__execution_counts,
1923
+ nodes_being_run=self.__nodes_being_run,
1924
+ lock=update_lock,
1925
+ compute_real_time=compute_real_time).evaluate_unethical_behavior(func, configuration=configuration, metrics=metrics)
1926
+
1927
+ def evaluate_violence(self,
1928
+ func: Optional[Callable] = None,
1929
+ *,
1930
+ configuration: Optional[AgenticAIConfiguration] = None,
1931
+ metrics: list[GenAIMetric] = [],
1932
+ compute_real_time: Optional[bool] = True,
1933
+ ) -> dict:
1934
+ """
1935
+ An evaluation decorator for computing violence on an agentic tool via granite guardian.
1936
+
1937
+ For more details, see :class:`ibm_watsonx_gov.metrics.ViolenceMetric`
1938
+
1939
+ Args:
1940
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1941
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1942
+                metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ViolenceMetric() ]
1943
+
1944
+ Raises:
1945
+ Exception: If there is any error while evaluation.
1946
+
1947
+ Returns:
1948
+ dict: The result of the wrapped tool.
1949
+
1950
+ Example:
1951
+ 1. Create evaluate_violence decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
1952
+ .. code-block:: python
1953
+
1954
+ evaluator = AgenticEvaluator()
1955
+ @evaluator.evaluate_violence
1956
+                def agentic_tool(*args, **kwargs):
1957
+ pass
1958
+
1959
+ 2. Create evaluate_violence decorator with thresholds and configuration
1960
+ .. code-block:: python
1961
+
1962
+ metric = ViolenceMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
1963
+ config = {"input_fields": ["input"]}
1964
+ configuration = AgenticAIConfiguration(**config)
1965
+ evaluator = AgenticEvaluator()
1966
+ @evaluator.evaluate_violence(metrics=[metric], configuration=configuration)
1967
+                def agentic_tool(*args, **kwargs):
1968
+ pass
1969
+ """
1970
+ return ViolenceDecorator(api_client=self.api_client,
1971
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
1972
+ metric_results=self.__online_metric_results,
1973
+ execution_counts=self.__execution_counts,
1974
+ nodes_being_run=self.__nodes_being_run,
1975
+ lock=update_lock,
1976
+ compute_real_time=compute_real_time).evaluate_violence(func, configuration=configuration, metrics=metrics)
1977
+
1978
+ def evaluate_harm_engagement(self,
1979
+ func: Optional[Callable] = None,
1980
+ *,
1981
+ configuration: Optional[AgenticAIConfiguration] = None,
1982
+ metrics: list[GenAIMetric] = [],
1983
+ compute_real_time: Optional[bool] = True,
1984
+ ) -> dict:
1985
+ """
1986
+ An evaluation decorator for computing the harm engagement metric on an agentic tool using Granite Guardian.
1987
+
1988
+ For more details, see :class:`ibm_watsonx_gov.metrics.HarmEngagementMetric`
1989
+
1990
+ Args:
1991
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
1992
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
1993
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [HarmEngagementMetric()].
1994
+
1995
+ Raises:
1996
+ Exception: If there is any error during evaluation.
1997
+
1998
+ Returns:
1999
+ dict: The result of the wrapped tool.
2000
+
2001
+ Example:
2002
+ 1. Create evaluate_harm_engagement decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
2003
+ .. code-block:: python
2004
+
2005
+ evaluator = AgenticEvaluator()
2006
+ @evaluator.evaluate_harm_engagement
2007
+ def agentic_tool(*args, **kwargs):
2008
+ pass
2009
+
2010
+ 2. Create evaluate_harm_engagement decorator with thresholds and configuration
2011
+ .. code-block:: python
2012
+
2013
+ metric = HarmEngagementMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
2014
+ config = {"input_fields": ["input"]}
2015
+ configuration = AgenticAIConfiguration(**config)
2016
+ evaluator = AgenticEvaluator()
2017
+ @evaluator.evaluate_harm_engagement(metrics=[metric], configuration=configuration)
2018
+ def agentic_tool(*args, **kwargs):
2019
+ pass
2020
+ """
2021
+ return HarmEngagementDecorator(api_client=self.api_client,
2022
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2023
+ metric_results=self.__online_metric_results,
2024
+ execution_counts=self.__execution_counts,
2025
+ nodes_being_run=self.__nodes_being_run,
2026
+ lock=update_lock,
2027
+ compute_real_time=compute_real_time).evaluate_harm_engagement(func, configuration=configuration, metrics=metrics)
2028
+
2029
+ def evaluate_evasiveness(self,
2030
+ func: Optional[Callable] = None,
2031
+ *,
2032
+ configuration: Optional[AgenticAIConfiguration] = None,
2033
+ metrics: list[GenAIMetric] = [],
2034
+ compute_real_time: Optional[bool] = True,
2035
+ ) -> dict:
2036
+ """
2037
+ An evaluation decorator for computing the evasiveness metric on an agentic tool using Granite Guardian.
2038
+
2039
+ For more details, see :class:`ibm_watsonx_gov.metrics.EvasivenessMetric`
2040
+
2041
+ Args:
2042
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2043
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2044
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [EvasivenessMetric()].
2045
+
2046
+ Raises:
2047
+ Exception: If there is any error during evaluation.
2048
+
2049
+ Returns:
2050
+ dict: The result of the wrapped tool.
2051
+
2052
+ Example:
2053
+ 1. Create evaluate_evasiveness decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
2054
+ .. code-block:: python
2055
+
2056
+ evaluator = AgenticEvaluator()
2057
+ @evaluator.evaluate_evasiveness
2058
+ def agentic_tool(*args, **kwargs):
2059
+ pass
2060
+
2061
+ 2. Create evaluate_evasiveness decorator with thresholds and configuration
2062
+ .. code-block:: python
2063
+
2064
+ metric = EvasivenessMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
2065
+ config = {"input_fields": ["input"]}
2066
+ configuration = AgenticAIConfiguration(**config)
2067
+ evaluator = AgenticEvaluator()
2068
+ @evaluator.evaluate_evasiveness(metrics=[metric], configuration=configuration)
2069
+ def agentic_tool(*args, **kwargs):
2070
+ pass
2071
+ """
2072
+ return EvasivenessDecorator(api_client=self.api_client,
2073
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2074
+ metric_results=self.__online_metric_results,
2075
+ execution_counts=self.__execution_counts,
2076
+ nodes_being_run=self.__nodes_being_run,
2077
+ lock=update_lock,
2078
+ compute_real_time=compute_real_time).evaluate_evasiveness(func, configuration=configuration, metrics=metrics)
2079
+
2080
+ def evaluate_jailbreak(self,
2081
+ func: Optional[Callable] = None,
2082
+ *,
2083
+ configuration: Optional[AgenticAIConfiguration] = None,
2084
+ metrics: list[GenAIMetric] = [],
2085
+ compute_real_time: Optional[bool] = True,
2086
+ ) -> dict:
2087
+ """
2088
+ An evaluation decorator for computing the jailbreak metric on an agentic tool using Granite Guardian.
2089
+
2090
+ For more details, see :class:`ibm_watsonx_gov.metrics.JailbreakMetric`
2091
+
2092
+ Args:
2093
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2094
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2095
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [JailbreakMetric()].
2096
+
2097
+ Raises:
2098
+ Exception: If there is any error during evaluation.
2099
+
2100
+ Returns:
2101
+ dict: The result of the wrapped tool.
2102
+
2103
+ Example:
2104
+ 1. Create evaluate_jailbreak decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
2105
+ .. code-block:: python
2106
+
2107
+ evaluator = AgenticEvaluator()
2108
+ @evaluator.evaluate_jailbreak
2109
+ def agentic_tool(*args, **kwargs):
2110
+ pass
2111
+
2112
+ 2. Create evaluate_jailbreak decorator with thresholds and configuration
2113
+ .. code-block:: python
2114
+
2115
+ metric = JailbreakMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
2116
+ config = {"input_fields": ["input"]}
2117
+ configuration = AgenticAIConfiguration(**config)
2118
+ evaluator = AgenticEvaluator()
2119
+ @evaluator.evaluate_jailbreak(metrics=[metric], configuration=configuration)
2120
+ def agentic_tool(*args, *kwargs):
2121
+ pass
2122
+ """
2123
+ return JailbreakDecorator(api_client=self.api_client,
2124
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2125
+ metric_results=self.__online_metric_results,
2126
+ execution_counts=self.__execution_counts,
2127
+ nodes_being_run=self.__nodes_being_run,
2128
+ lock=update_lock,
2129
+ compute_real_time=compute_real_time).evaluate_jailbreak(func, configuration=configuration, metrics=metrics)
2130
+
2131
+ def evaluate_topic_relevance(self,
2132
+ func: Optional[Callable] = None,
2133
+ *,
2134
+ configuration: Optional[AgenticAIConfiguration] = None,
2135
+ metrics: list[GenAIMetric],
2136
+ compute_real_time: Optional[bool] = True,
2137
+ ) -> dict:
2138
+ """
2139
+ An evaluation decorator for computing topic relevance on an agentic tool using the off-topic detector.
2140
+
2141
+ For more details, see :class:`ibm_watsonx_gov.metrics.TopicRelevanceMetric`
2142
+
2143
+ Args:
2144
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2145
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2146
+ metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
2147
+
2148
+ Raises:
2149
+ Exception: If there is any error during evaluation.
2150
+
2151
+ Returns:
2152
+ dict: The result of the wrapped tool.
2153
+
2154
+ Example:
2155
+ 1. Create evaluate_topic_relevance decorator with the required metric. By default, the metric uses the "input_text" from the graph state as the input.
2156
+ .. code-block:: python
2157
+
2158
+ metric = TopicRelevanceMetric(system_prompt="...")
2159
+ evaluator = AgenticEvaluator()
2160
+ @evaluator.evaluate_topic_relevance(metrics=[metric])
2161
+ def agentic_tool(*args, **kwargs):
2162
+ pass
2163
+
2164
+ 2. Create evaluate_topic_relevance decorator with thresholds and configuration
2165
+ .. code-block:: python
2166
+
2167
+ metric = TopicRelevanceMetric(system_prompt="...", thresholds=MetricThreshold(type="lower_limit", value=0.7))
2168
+ evaluator = AgenticEvaluator()
2169
+ config = {"input_fields": ["input"]}
2170
+ configuration = AgenticAIConfiguration(**config)
2171
+ @evaluator.evaluate_topic_relevance(metrics=[metric], configuration=configuration)
2172
+ def agentic_tool(*args, **kwargs):
2173
+ pass
2174
+ """
2175
+ return TopicRelevanceDecorator(api_client=self.api_client,
2176
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2177
+ metric_results=self.__online_metric_results,
2178
+ execution_counts=self.__execution_counts,
2179
+ nodes_being_run=self.__nodes_being_run,
2180
+ lock=update_lock,
2181
+ compute_real_time=compute_real_time).evaluate_topic_relevance(func, configuration=configuration, metrics=metrics)
2182
+
2183
+ def evaluate_answer_quality(self,
2184
+ func: Optional[Callable] = None,
2185
+ *,
2186
+ configuration: Optional[AgenticAIConfiguration] = None,
2187
+ metrics: list[GenAIMetric] = [],
2188
+ compute_real_time: Optional[bool] = True
2189
+ ) -> dict:
2190
+ """
2191
+ An evaluation decorator for computing answer quality metrics on an agentic tool.
2192
+ Answer Quality metrics include Answer Relevance, Faithfulness, Answer Similarity, and Unsuccessful Requests.
2193
+
2194
+ For more details, see :class:`ibm_watsonx_gov.metrics.AnswerRelevanceMetric`, :class:`ibm_watsonx_gov.metrics.FaithfulnessMetric`,
2195
+ :class:`ibm_watsonx_gov.metrics.UnsuccessfulRequestsMetric`, and :class:`ibm_watsonx_gov.metrics.AnswerSimilarityMetric`.
2196
+
2197
+ Args:
2198
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2199
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2200
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.ANSWER_QUALITY.get_metrics().
2201
+
2202
+ Raises:
2203
+ Exception: If there is any error during evaluation.
2204
+
2205
+ Returns:
2206
+ dict: The result of the wrapped tool.
2207
+
2208
+ Example:
2209
+ 1. Basic usage
2210
+ .. code-block:: python
2211
+
2212
+ evaluator = AgenticEvaluator()
2213
+ @evaluator.evaluate_answer_quality
2214
+ def agentic_tool(*args, **kwargs):
2215
+ pass
2216
+
2217
+ 2. Usage with different thresholds and methods for some of the metrics in the group
2218
+ .. code-block:: python
2219
+
2220
+ metric_1 = FaithfulnessMetric(thresholds=MetricThreshold(type="lower_limit", value=0.5))
2221
+ metric_2 = AnswerRelevanceMetric(method="token_recall", thresholds=MetricThreshold(type="lower_limit", value=0.5))
2222
+
2223
+ evaluator = AgenticEvaluator()
2224
+ @evaluator.evaluate_answer_quality(metrics=[metric_1, metric_2])
2225
+ def agentic_tool(*args, **kwargs):
2226
+ pass
2227
+ """
2228
+ return AnswerQualityDecorator(api_client=self.api_client,
2229
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2230
+ metric_results=self.__online_metric_results,
2231
+ execution_counts=self.__execution_counts,
2232
+ nodes_being_run=self.__nodes_being_run,
2233
+ lock=update_lock,
2234
+ compute_real_time=compute_real_time).evaluate_answer_quality(func, configuration=configuration, metrics=metrics)
2235
+
2236
+ def evaluate_content_safety(self,
2237
+ func: Optional[Callable] = None,
2238
+ *,
2239
+ configuration: Optional[AgenticAIConfiguration] = None,
2240
+ metrics: list[GenAIMetric] = [],
2241
+ compute_real_time: Optional[bool] = True
2242
+ ) -> dict:
2243
+ """
2244
+ An evaluation decorator for computing content safety metrics on an agentic tool.
2245
+ Content Safety metrics include HAP, PII, Evasiveness, Harm, Harm Engagement, Jailbreak, Profanity, Sexual Content, Social Bias, Unethical Behavior, and Violence.
2246
+
2247
+ For more details, see :class:`ibm_watsonx_gov.metrics.HAPMetric`,
2248
+ :class:`ibm_watsonx_gov.metrics.PIIMetric`, :class:`ibm_watsonx_gov.metrics.EvasivenessMetric`, :class:`ibm_watsonx_gov.metrics.HarmMetric`,
2249
+ :class:`ibm_watsonx_gov.metrics.HarmEngagementMetric`, :class:`ibm_watsonx_gov.metrics.JailbreakMetric`, :class:`ibm_watsonx_gov.metrics.ProfanityMetric`,
2250
+ :class:`ibm_watsonx_gov.metrics.SexualContentMetric`, :class:`ibm_watsonx_gov.metrics.SocialBiasMetric`, :class:`ibm_watsonx_gov.metrics.UnethicalBehaviorMetric`,
2251
+ :class:`ibm_watsonx_gov.metrics.ViolenceMetric`
+
2252
+ Args:
2253
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2254
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2255
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.CONTENT_SAFETY.get_metrics().
2256
+
2257
+ Raises:
2258
+ Exception: If there is any error during evaluation.
2259
+
2260
+ Returns:
2261
+ dict: The result of the wrapped tool.
2262
+
2263
+ Example:
2264
+ 1. Basic usage
2265
+ .. code-block:: python
2266
+
2267
+ evaluator = AgenticEvaluator()
2268
+ @evaluator.evaluate_content_safety
2269
+ def agentic_tool(*args, **kwargs):
2270
+ pass
2271
+
2272
+ 2. Usage with different thresholds and methods for some of the metrics in the group
2273
+ .. code-block:: python
2274
+
2275
+ metric_1 = PIIMetric(thresholds=MetricThreshold(type="lower_limit", value=0.5))
2276
+ metric_2 = HAPMetric(thresholds=MetricThreshold(type="lower_limit", value=0.5))
2277
+
2278
+ evaluator = AgenticEvaluator()
2279
+ @evaluator.evaluate_content_safety(metrics=[metric_1, metric_2])
2280
+ def agentic_tool(*args, **kwargs):
2281
+ pass
2282
+ """
2283
+ return ContentSafetyDecorator(api_client=self.api_client,
2284
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2285
+ metric_results=self.__online_metric_results,
2286
+ execution_counts=self.__execution_counts,
2287
+ nodes_being_run=self.__nodes_being_run,
2288
+ lock=update_lock,
2289
+ compute_real_time=compute_real_time).evaluate_content_safety(func, configuration=configuration, metrics=metrics)
2290
+
2291
+ def evaluate_retrieval_quality(self,
2292
+ func: Optional[Callable] = None,
2293
+ *,
2294
+ configuration: Optional[AgenticAIConfiguration] = None,
2295
+ metrics: list[GenAIMetric] = [],
2296
+ compute_real_time: Optional[bool] = True
2297
+ ) -> dict:
2298
+ """
2299
+ An evaluation decorator for computing retrieval quality metrics on an agentic tool.
2300
+ Retrieval Quality metrics include Context Relevance, Retrieval Precision, Average Precision, Hit Rate, Reciprocal Rank, and NDCG.
2301
+
2302
+ For more details, see :class:`ibm_watsonx_gov.metrics.ContextRelevanceMetric`, :class:`ibm_watsonx_gov.metrics.RetrievalPrecisionMetric`,
2303
+ :class:`ibm_watsonx_gov.metrics.AveragePrecisionMetric`, :class:`ibm_watsonx_gov.metrics.ReciprocalRankMetric`, :class:`ibm_watsonx_gov.metrics.HitRateMetric`,
2304
+ :class:`ibm_watsonx_gov.metrics.NDCGMetric`
2305
+
2306
+ Args:
2307
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2308
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2309
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.RETRIEVAL_QUALITY.get_metrics().
2310
+
2311
+ Raises:
2312
+ Exception: If there is any error during evaluation.
2313
+
2314
+ Returns:
2315
+ dict: The result of the wrapped tool.
2316
+
2317
+ Example:
2318
+ 1. Basic usage
2319
+ .. code-block:: python
2320
+
2321
+ evaluator = AgenticEvaluator()
2322
+ @evaluator.evaluate_retrieval_quality
2323
+ def agentic_tool(*args, **kwargs):
2324
+ pass
2325
+
2326
+ 2. Usage with different thresholds and methods for some of the metrics in the group
2327
+ .. code-block:: python
2328
+
2329
+ metric_1 = NDCGMetric(thresholds=MetricThreshold(type="lower_limit", value=0.5))
2330
+ metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", thresholds=MetricThreshold(type="lower_limit", value=0.6))
2331
+
2332
+ evaluator = AgenticEvaluator()
2333
+ @evaluator.evaluate_retrieval_quality(metrics=[metric_1, metric_2])
2334
+ def agentic_tool(*args, **kwargs):
2335
+ pass
2336
+ """
2337
+ return RetrievalQualityDecorator(api_client=self.api_client,
2338
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2339
+ metric_results=self.__online_metric_results,
2340
+ execution_counts=self.__execution_counts,
2341
+ nodes_being_run=self.__nodes_being_run,
2342
+ lock=update_lock,
2343
+ compute_real_time=compute_real_time).evaluate_retrieval_quality(func, configuration=configuration, metrics=metrics)
2344
+
2345
+ def evaluate_text_grade_level(self,
2346
+ func: Optional[Callable] = None,
2347
+ *,
2348
+ configuration: Optional[AgenticAIConfiguration] = None,
2349
+ metrics: list[GenAIMetric] = [],
2350
+ compute_real_time: Optional[bool] = True,
2351
+ ) -> dict:
2352
+ """
2353
+ An evaluation decorator for computing the text grade level metric on an agentic tool.
2354
+
2355
+ For more details, see :class:`ibm_watsonx_gov.metrics.TextGradeLevelMetric`
2356
+
2357
+ Args:
2358
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2359
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2360
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [TextGradeLevelMetric()].
2361
+
2362
+ Raises:
2363
+ Exception: If there is any error during evaluation.
2364
+
2365
+ Returns:
2366
+ dict: The result of the wrapped tool.
2367
+
2368
+ Example:
2369
+ 1. Basic usage
2370
+ .. code-block:: python
2371
+
2372
+ evaluator = AgenticEvaluator()
2373
+ @evaluator.evaluate_text_grade_level
2374
+ def agentic_tool(*args, **kwargs):
2375
+ pass
2376
+
2377
+ 2. Create evaluate_text_grade_level decorator with thresholds and configuration
2378
+ .. code-block:: python
2379
+
2380
+ metric = TextGradeLevelMetric(thresholds=[MetricThreshold(type="lower_limit", value=6)])
2381
+ config = {"output_fields": ["generated_text"]}
2382
+ configuration = AgenticAIConfiguration(**config)
2383
+ evaluator = AgenticEvaluator()
2384
+ @evaluator.evaluate_text_grade_level(metrics=[metric], configuration=configuration)
2385
+ def agentic_tool(*args, **kwargs):
2386
+ pass
2387
+ """
2388
+ return TextGradeLevelDecorator(api_client=self.api_client,
2389
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2390
+ metric_results=self.__online_metric_results,
2391
+ execution_counts=self.__execution_counts,
2392
+ nodes_being_run=self.__nodes_being_run,
2393
+ lock=update_lock,
2394
+ compute_real_time=compute_real_time).evaluate_text_grade_level(func, configuration=configuration, metrics=metrics)
2395
+
2396
+ def evaluate_text_reading_ease(self,
2397
+ func: Optional[Callable] = None,
2398
+ *,
2399
+ configuration: Optional[AgenticAIConfiguration] = None,
2400
+ metrics: list[GenAIMetric] = [],
2401
+ compute_real_time: Optional[bool] = True,
2402
+ ) -> dict:
2403
+ """
2404
+ An evaluation decorator for computing the text reading ease metric on an agentic tool.
2405
+
2406
+ For more details, see :class:`ibm_watsonx_gov.metrics.TextReadingEaseMetric`
2407
+
2408
+ Args:
2409
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2410
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2411
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [TextReadingEaseMetric()].
2412
+
2413
+ Raises:
2414
+ Exception: If there is any error during evaluation.
2415
+
2416
+ Returns:
2417
+ dict: The result of the wrapped tool.
2418
+
2419
+ Example:
2420
+ 1. Basic usage
2421
+ .. code-block:: python
2422
+
2423
+ evaluator = AgenticEvaluator()
2424
+ @evaluator.evaluate_text_reading_ease
2425
+ def agentic_tool(*args, **kwargs):
2426
+ pass
2427
+
2428
+ 2. Create evaluate_text_reading_ease decorator with thresholds and configuration
2429
+ .. code-block:: python
2430
+
2431
+ metric = TextReadingEaseMetric(thresholds=[MetricThreshold(type="lower_limit", value=70)])
2432
+ config = {"output_fields": ["generated_text"]}
2433
+ configuration = AgenticAIConfiguration(**config)
2434
+ evaluator = AgenticEvaluator()
2435
+ @evaluator.evaluate_text_reading_ease(metrics=[metric], configuration=configuration)
2436
+ def agentic_tool(*args, **kwargs):
2437
+ pass
2438
+ """
2439
+ return TextReadingEaseDecorator(api_client=self.api_client,
2440
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2441
+ metric_results=self.__online_metric_results,
2442
+ execution_counts=self.__execution_counts,
2443
+ nodes_being_run=self.__nodes_being_run,
2444
+ lock=update_lock,
2445
+ compute_real_time=compute_real_time).evaluate_text_reading_ease(func, configuration=configuration, metrics=metrics)
2446
+
2447
+ def evaluate_readability(self,
2448
+ func: Optional[Callable] = None,
2449
+ *,
2450
+ configuration: Optional[AgenticAIConfiguration] = None,
2451
+ metrics: list[GenAIMetric] = [],
2452
+ compute_real_time: Optional[bool] = True
2453
+ ) -> dict:
2454
+ """
2455
+ An evaluation decorator for computing answer readability metrics on an agentic tool.
2456
+ Readability metrics include TextReadingEaseMetric and TextGradeLevelMetric
2457
+
2458
+ For more details, see :class:`ibm_watsonx_gov.metrics.TextReadingEaseMetric`, :class:`ibm_watsonx_gov.metrics.TextGradeLevelMetric`
2459
+
2460
+ Args:
2461
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2462
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2463
+ metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.READABILITY.get_metrics().
2464
+
2465
+ Raises:
2466
+ Exception: If there is any error during evaluation.
2467
+
2468
+ Returns:
2469
+ dict: The result of the wrapped tool.
2470
+
2471
+ Example:
2472
+ 1. Basic usage
2473
+ .. code-block:: python
2474
+
2475
+ evaluator = AgenticEvaluator()
2476
+ @evaluator.evaluate_readability
2477
+ def agentic_tool(*args, **kwargs):
2478
+ pass
2479
+
2480
+ 2. Usage with different thresholds and methods for some of the metrics in the group
2481
+ .. code-block:: python
2482
+
2483
+ metric_1 = TextGradeLevelMetric(thresholds=[MetricThreshold(type="lower_limit", value=6)])
2484
+ metric_2 = TextReadingEaseMetric(thresholds=[MetricThreshold(type="lower_limit", value=70)])
2485
+ config = {"output_fields": ["generated_text"]}
2486
+ configuration = AgenticAIConfiguration(**config)
2487
+ evaluator = AgenticEvaluator()
2488
+ @evaluator.evaluate_readability(metrics=[metric_1, metric_2], configuration=configuration)
2489
+ def agentic_tool(*args, **kwargs):
2490
+ pass
2491
+ """
2492
+ return ReadabilityDecorator(api_client=self.api_client,
2493
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2494
+ metric_results=self.__online_metric_results,
2495
+ execution_counts=self.__execution_counts,
2496
+ nodes_being_run=self.__nodes_being_run,
2497
+ lock=update_lock,
2498
+ compute_real_time=compute_real_time).evaluate_readability(func, configuration=configuration, metrics=metrics)
2499
+
2500
+ def evaluate_keyword_detection(self,
2501
+ func: Optional[Callable] = None,
2502
+ *,
2503
+ configuration: Optional[AgenticAIConfiguration] = None,
2504
+ metrics: list[GenAIMetric],
2505
+ compute_real_time: Optional[bool] = True,
2506
+ ) -> dict:
2507
+ """
2508
+ An evaluation decorator for computing keyword detection on an agentic tool.
2509
+
2510
+ For more details, see :class:`ibm_watsonx_gov.metrics.KeywordDetectionMetric`
2511
+
2512
+ Args:
2513
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2514
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2515
+ metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
2516
+
2517
+ Raises:
2518
+ Exception: If there is any error during evaluation.
2519
+
2520
+ Returns:
2521
+ dict: The result of the wrapped tool.
2522
+
2523
+ Example:
2524
+ 1. Create evaluate_keyword_detection decorator with the required metric. By default, the metric uses the "input_text" from the graph state as the input.
2525
+ .. code-block:: python
+
2526
+ metric = KeywordDetectionMetric(keywords=["..."])
2527
+ evaluator = AgenticEvaluator()
2528
+ @evaluator.evaluate_keyword_detection(metrics=[metric])
2529
+ def agentic_tool(*args, **kwargs):
2530
+ pass
2531
+
2532
+ 2. Create evaluate_keyword_detection decorator with thresholds and configuration
2533
+ .. code-block:: python
2534
+
2535
+ metric = KeywordDetectionMetric(thresholds=MetricThreshold(type="upper_limit", value=0), keywords=["..."])
2536
+ config = {"input_fields": ["input"]}
2537
+ configuration = AgenticAIConfiguration(**config)
2538
+ evaluator = AgenticEvaluator()
2539
+ @evaluator.evaluate_keyword_detection(metrics=[metric], configuration=configuration)
2540
+ def agentic_tool(*args, **kwargs):
2541
+ pass
2542
+ """
2543
+ return KeywordDetectionDecorator(api_client=self.api_client,
2544
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2545
+ metric_results=self.__online_metric_results,
2546
+ execution_counts=self.__execution_counts,
2547
+ nodes_being_run=self.__nodes_being_run,
2548
+ lock=update_lock,
2549
+ compute_real_time=compute_real_time).evaluate_keyword_detection(func, configuration=configuration, metrics=metrics)
2550
+
2551
+ def evaluate_regex(self,
2552
+ func: Optional[Callable] = None,
2553
+ *,
2554
+ configuration: Optional[AgenticAIConfiguration] = None,
2555
+ metrics: list[GenAIMetric],
2556
+ compute_real_time: Optional[bool] = True,
2557
+ ) -> dict:
2558
+ """
2559
+ An evaluation decorator for computing regex detection on an agentic tool.
2560
+
2561
+ For more details, see :class:`ibm_watsonx_gov.metrics.RegexDetectionMetric`
2562
+
2563
+ Args:
2564
+ func (Optional[Callable], optional): The tool on which the metric is to be computed.
2565
+ configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
2566
+ metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
2567
+
2568
+ Raises:
2569
+ Exception: If there is any error during evaluation.
2570
+
2571
+ Returns:
2572
+ dict: The result of the wrapped tool.
2573
+
2574
+ Example:
2575
+ 1. Create evaluate_regex decorator with the required metric. By default, the metric uses the "input_text" from the graph state as the input.
2576
+ .. code-block:: python
+
2577
+ metric = RegexDetectionMetric(regex_patterns=["..."])
2578
+ evaluator = AgenticEvaluator()
2579
+ @evaluator.evaluate_regex(metrics=[metric])
2580
+ def agentic_tool(*args, **kwargs):
2581
+ pass
2582
+
2583
+ 2. Create evaluate_regex decorator with thresholds and configuration
2584
+ .. code-block:: python
+
2585
+ metric = RegexDetectionMetric(thresholds=MetricThreshold(type="upper_limit", value=0), regex_patterns=["..."])
2586
+ config = {"input_fields": ["input"]}
2587
+ configuration = AgenticAIConfiguration(**config)
2588
+ evaluator = AgenticEvaluator()
2589
+ @evaluator.evaluate_regex(metrics=[metric], configuration=configuration)
2590
+ def agentic_tool(*args, **kwargs):
2591
+ pass
2592
+ """
2593
+ return RegexDetectionDecorator(api_client=self.api_client,
2594
+ configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
2595
+ metric_results=self.__online_metric_results,
2596
+ execution_counts=self.__execution_counts,
2597
+ nodes_being_run=self.__nodes_being_run,
2598
+ lock=update_lock,
2599
+ compute_real_time=compute_real_time).evaluate_regex(func, configuration=configuration, metrics=metrics)
2600
+
2601
+ def generate_insights(self,
2602
+ applies_to: list[str] = AGENTIC_RESULT_COMPONENTS,
2603
+ top_k: int = 3,
2604
+ llm_model=None,
2605
+ output_format: str = "html",
2606
+ percentile_threshold: float = 95.0,
2607
+ metric_group_weights: Optional[dict] = None,
2608
+ metric_weights: Optional[dict] = None):
2609
+ """
2610
+ Generate top k insights from evaluation metrics based on their significance.
2611
+
2612
+ This method analyzes the evaluation results and identifies the most significant metrics
2613
+ based on their values and thresholds. It can optionally generate a natural language
2614
+ report of these insights using a provided LLM model.
2615
+
2616
+ Args:
2617
+ applies_to (list[str]): The component levels at which insights should be computed.
2618
+ Can include "conversation", "message", and/or "node".
2619
+ Defaults to all three levels.
2620
+ top_k (int): The number of top insights to generate. Defaults to 3.
2621
+ llm_model (optional): A language model used to generate a natural language report
2622
+ of the insights. If not provided, only structured insights
2623
+ will be returned.
2624
+ output_format (str): The format for the output. Defaults to "html".
2625
+ percentile_threshold (float): Percentile to use as threshold for cost/latency metrics.
2626
+ Defaults to 95.0. Higher values indicate worse performance
2627
+ for these metrics. For example, 95.0 means values above the
2628
+ 95th percentile are considered violations.
2629
+ metric_group_weights (dict, optional): Custom weights for metric groups.
2630
+ Keys are group names, values are weights (1.0-5.0).
2631
+ 1.0 is the minimum weight, 5.0 is the maximum weight.
2632
+ Example: {"answer_quality": 2.0, "content_safety": 1.5}
2633
+ metric_weights (dict, optional): Custom weights for individual metrics.
2634
+ Keys are metric names, values are weights (1.0-5.0).
2635
+ 1.0 is the minimum weight, 5.0 is the maximum weight.
2636
+ Example: {"answer_relevance": 2.0, "faithfulness": 1.8}
2637
+
2638
+ Returns:
2639
+ List[dict]: A list of the top k insights, each containing:
2640
+ - metric_name: Name of the metric
2641
+ - applies_to: Component level the metric applies to
2642
+ - group: The metric group to which the metric belongs
2643
+ - violations_count: The number of times the metric value violated the threshold
2644
+ - node_name: Name of the node (if applies_to is "node")
2645
+ - value: The metric value
2646
+ - threshold: The threshold dictionary containing value and type (if applicable)
2647
+ - mmr_score: A score indicating the significance of this insight
2648
+
2649
+ Examples:
2650
+ 1. Generate top 3 insights across all component levels
2651
+ .. code-block:: python
2652
+
2653
+ evaluator = AgenticEvaluator()
2654
+ # ... run evaluation ...
2655
+ insights = evaluator.generate_insights()
2656
+
2657
+
2658
+ 2. Generate top 5 insights for node-level metrics only
2659
+ .. code-block:: python
2660
+
2661
+ insights = evaluator.generate_insights(
2662
+ applies_to=["node"],
2663
+ top_k=5
2664
+ )
2665
+
2666
+ 3. Generate insights with natural language explanations
2667
+ .. code-block:: python
2668
+
2669
+
2670
+ from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel
2671
+
2672
+ llm = WxAIFoundationModel(
2673
+ model_id="meta-llama/llama-3-70b-instruct",
2674
+ project_id="your-project-id"
2675
+ )
2676
+
2677
+ insights = evaluator.generate_insights(
2678
+ top_k=3,
2679
+ llm_model=llm
2680
+ )
2681
+
2682
+
2683
+ 4. Generate insights with custom metric weights
2684
+ .. code-block:: python
2685
+
2686
+
2687
+ insights = evaluator.generate_insights(
2688
+ top_k=3,
2689
+ metric_group_weights={"retrieval_quality": 2.0, "content_safety": 1.5},
2690
+ metric_weights={"answer_relevance": 2.5, "faithfulness": 2.0}
2691
+ )
2692
+
2693
+ """
2694
+ from ibm_watsonx_gov.utils.insights_generator import InsightsGenerator
2695
+
2696
+ # Get the evaluation result
2697
+ eval_result = self.get_result()
2698
+ if not eval_result:
2699
+ logger.warning(
2700
+ "No evaluation results available. Please run evaluation first.")
2701
+ return []
2702
+
2703
+ # Get aggregated metrics results for the specified component levels
2704
+ # Include individual results to compute violations_count for percentile-based metrics
2705
+ aggregated_metrics = eval_result.get_aggregated_metrics_results(
2706
+ applies_to=applies_to,
2707
+ include_individual_results=True
2708
+ )
2709
+
2710
+ # Use the InsightsGenerator to select top k metrics based on significance
2711
+ insights_generator = InsightsGenerator(
2712
+ top_k=top_k, applies_to=applies_to, metrics=aggregated_metrics, llm_model=llm_model,
2713
+ percentile_threshold=percentile_threshold,
2714
+ metric_group_weights=metric_group_weights, metric_weights=metric_weights)
2715
+ top_k_metrics = insights_generator.select_top_k_metrics()
2716
+
2717
+ # Generate natural language insights if a model is provided
2718
+ if llm_model and top_k_metrics:
2719
+ result = insights_generator.generate_structured_insights(
2720
+ top_metrics=top_k_metrics,
2721
+ output_format=output_format
2722
+ )
2723
+ return result
2724
+
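+ # Reached when no LLM model was provided or no significant metrics were selected:
+ # return the top k metrics without a natural language report.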
2725
+ return top_k_metrics
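+
+
+ # --------------------------------------------------------------------------------------------------
+ # Illustrative usage sketch (defined but never called): decorate a tool, invoke it, then summarise the
+ # recorded metrics. The tool, its state fields, and the question text are hypothetical; only APIs
+ # defined or documented in this module (evaluate_content_safety, generate_insights) are used. In a
+ # real agent the decorated tool is normally invoked by the graph runtime, and additional experiment
+ # or run tracking calls may be required depending on your setup.
+ # --------------------------------------------------------------------------------------------------
+ def _illustrative_usage():
+     evaluator = AgenticEvaluator()
+
+     @evaluator.evaluate_content_safety
+     def answer_question(state: dict) -> dict:
+         # Hypothetical tool: a real implementation would call an LLM or a retriever here.
+         return {"generated_text": f"You asked: {state['input_text']}"}
+
+     # Computes the content safety metrics on the default "input_text" field of the graph state.
+     answer_question({"input_text": "What is the refund policy?"})
+
+     # Summarise the three most significant metric findings across all component levels.
+     return evaluator.generate_insights(top_k=3)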