ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,115 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from datetime import timedelta
11
+ from typing import Annotated, Dict, Optional
12
+
13
+ from pydantic import Field
14
+
15
+ from ibm_watsonx_gov.entities.agentic_app import AgenticApp
16
+ from ibm_watsonx_gov.entities.enums import MessageStatus, MetricGroup, MetricValueType
17
+ from ibm_watsonx_gov.entities.evaluation_result import AgentMetricResult
18
+ from ibm_watsonx_gov.entities.mapping import Mapping
19
+ from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
20
+ from ibm_watsonx_gov.evaluators.impl.evaluate_metrics_impl import _evaluate_metrics
21
+ from ibm_watsonx_gov.traces.span_util import flatten_attributes
22
+ from ibm_watsonx_gov.traces.trace_utils import TraceUtils
23
+
24
+
25
class AgenticTracesEvaluator(BaseEvaluator):
    """
    The class to evaluate agentic applications based on the traces generated.
    """
    agentic_app: Annotated[Optional[AgenticApp], Field(
        title="Agentic application configuration details",
        description="The agentic application configuration details.",
        default=None)]

    def compute_metrics(self, spans: list[dict], mapping: Mapping, **kwargs) -> list[AgentMetricResult]:
        """
        Computes the agentic metrics based on the spans/traces provided as a list.

        Span trees are built from the given spans, and only the trees whose root
        span has the attribute ``traceloop.span.kind`` equal to ``"workflow"``
        are evaluated; every other tree is skipped.

        Args:
            spans (list[dict]): The spans on which the metrics need to be computed
            mapping (Mapping): The various mappings for finding the metric inputs.
            **kwargs: Forwarded unchanged to ``compute_message_level_metrics``.

        Returns:
            list[AgentMetricResult]: The computed metric results

        Raises:
            ValueError: If the values extracted for a span tree are missing
                the start/end time or the message/conversation identifiers.
        """
        span_trees = TraceUtils.build_span_trees(
            spans=spans, agentic_app=self.agentic_app)

        metrics_result = []
        for span_tree in span_trees:
            # Process only the spans that are associated with the agent application
            attrs = flatten_attributes(span_tree.span.attributes)
            if attrs.get("traceloop.span.kind") != "workflow":
                continue

            data = span_tree.get_values(mapping)
            metrics_result.extend(
                self.compute_message_level_metrics(data, **kwargs))

        return metrics_result

    def compute_message_level_metrics(self, data: Dict, **kwargs) -> list[AgentMetricResult]:
        """
        Computes the message level metrics for the values extracted from one span tree.

        The ``duration`` and ``status`` metrics are always produced. When an
        agentic application is configured on the evaluator, the metrics in its
        metrics configuration are evaluated as well and appended to the result.

        Args:
            data (Dict): The values extracted from a span tree. ``start_time``,
                ``end_time``, ``message_id`` and ``conversation_id`` are required;
                ``status`` is optional.
            **kwargs: Optional arguments. Only ``api_client`` is read here and
                passed on to the metrics evaluation.

        Returns:
            list[AgentMetricResult]: The computed metric results

        Raises:
            ValueError: If ``start_time``/``end_time`` or
                ``message_id``/``conversation_id`` are missing from ``data``.
        """
        start_time = data.get("start_time")
        end_time = data.get("end_time")
        if start_time is None or end_time is None:
            raise ValueError("start_time and/or end_time are missing.")

        message_id = data.get("message_id")
        conversation_id = data.get("conversation_id")
        if message_id is None or conversation_id is None:
            raise ValueError(
                "message_id and/or conversation_id are missing.")

        # Keep the timedelta and its value in seconds under separate names
        # instead of rebinding a single variable to two different types.
        elapsed: timedelta = end_time - start_time
        duration_seconds = elapsed.total_seconds()

        metric_results = [
            AgentMetricResult(name="duration",
                              display_name="Message Duration",
                              value=duration_seconds,
                              group=MetricGroup.PERFORMANCE,
                              applies_to="message",
                              message_id=message_id,
                              conversation_id=conversation_id),
            AgentMetricResult(name="status",
                              display_name="Message Status",
                              value_type=MetricValueType.CATEGORICAL.value,
                              value=data.get(
                                  "status", MessageStatus.UNKNOWN.value),
                              group=MetricGroup.MESSAGE_COMPLETION,
                              applies_to="message",
                              message_id=message_id,
                              conversation_id=conversation_id),
        ]

        # Without an agentic application there is no metrics configuration to evaluate.
        if not self.agentic_app:
            return metric_results

        metrics_configuration = self.agentic_app.metrics_configuration
        metric_result = _evaluate_metrics(configuration=metrics_configuration.configuration,
                                          data=data,
                                          metrics=metrics_configuration.metrics,
                                          metric_groups=metrics_configuration.metric_groups,
                                          api_client=kwargs.get("api_client"),
                                          ignore_validation_errors=True).to_dict()
        for mr in metric_result:
            # The evaluated values are unpacked last, so a key present in mr
            # takes precedence over the message level defaults set here.
            node_result = {
                "applies_to": "message",
                "message_id": message_id,
                "conversation_id": conversation_id,
                **mr
            }
            metric_results.append(AgentMetricResult(**node_result))

        return metric_results
@@ -0,0 +1,22 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from pydantic import BaseModel, ConfigDict, Field
11
+ from typing_extensions import Annotated
12
+
13
+ from ibm_watsonx_gov.clients.api_client import APIClient
14
+
15
+
16
class BaseEvaluator(BaseModel):
    """
    The base class for all evaluators.

    Declares the optional ``api_client`` field that is shared by the evaluators
    deriving from this class.
    """
    # Lets pydantic accept field types it has no schema for
    # (presumably required for APIClient - confirm against its definition).
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # "name" is not a Field parameter; pydantic v2 treats it as a deprecated
    # extra keyword, so the text is supplied through title/description instead.
    api_client: Annotated[APIClient | None,
                          Field(title="API client",
                                description="The IBM watsonx.governance client.",
                                default=None)]
File without changes
@@ -0,0 +1,187 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ import pandas as pd
11
+ from pydantic import Field, PrivateAttr
12
+ from typing_extensions import Annotated
13
+
14
+ from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
15
+ from ibm_watsonx_gov.entities.enums import MetricGroup
16
+ from ibm_watsonx_gov.entities.evaluation_result import MetricsEvaluationResult
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
19
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
20
+
21
+
22
class MetricsEvaluator(BaseEvaluator):
    """
    The class to evaluate the metrics and display the results.

    Examples:
        1. Evaluate metrics by passing data as a dataframe and default configuration
            .. code-block:: python

                os.environ["WATSONX_APIKEY"] = "..."

                evaluator = MetricsEvaluator()
                df = pd.read_csv("")
                metrics = [AnswerSimilarityMetric()]

                result = evaluator.evaluate(data=df, metrics=metrics)

        2. Evaluate metrics by passing data as a json and default configuration
            .. code-block:: python

                os.environ["WATSONX_APIKEY"] = "..."

                evaluator = MetricsEvaluator()
                json_data = {"input_text": "..."}
                metrics=[HAPMetric()]

                result = evaluator.evaluate(data=json_data, metrics=metrics)

        3. Evaluate metrics by passing configuration and api_client
            .. code-block:: python

                config = GenAIConfiguration(input_fields=["question"],
                                            context_fields=["context"],
                                            output_fields=["generated_text"],
                                            reference_fields=["reference_answer"])
                wxgov_client = APIClient(credentials=Credentials(api_key=""))
                evaluator = MetricsEvaluator(configuration=config, api_client=wxgov_client)
                df = pd.read_csv("")
                metrics = [AnswerSimilarityMetric()]

                result = evaluator.evaluate(data=df, metrics=metrics)

        4. Evaluate metrics by passing metric groups
            .. code-block:: python

                os.environ["WATSONX_APIKEY"] = "..."

                evaluator = MetricsEvaluator()
                df = pd.read_csv("")
                metrics = [AnswerSimilarityMetric()]
                metric_groups = [MetricGroup.RETRIEVAL_QUALITY]

                result = evaluator.evaluate(data=df, metrics=metrics, metric_groups=metric_groups)

        5. Display the results
            .. code-block:: python

                # Get the results in the required format from the output of the evaluate method
                result.to_json()
                result.to_df()
                result.to_dict()

                # Display the results
                evaluator.display_table()
                evaluator.display_insights()

    """
    configuration: Annotated[GenAIConfiguration,
                             Field(title="Generative AI Configuration",
                                   description="The configuration for metrics evaluation.",
                                   default=GenAIConfiguration())]
    # State captured by the most recent evaluate()/evaluate_async() call; the
    # display_* methods read it back.
    _data: Annotated[pd.DataFrame | dict | None,
                     PrivateAttr(default=None)]
    _metrics: Annotated[list[GenAIMetric] | None,
                        PrivateAttr(default=None)]
    _metric_groups: Annotated[list[MetricGroup] | None,
                              PrivateAttr(default=None)]
    _result: Annotated[MetricsEvaluationResult | None,
                       PrivateAttr(default=None)]

    def evaluate(
            self,
            data: pd.DataFrame | dict,
            metrics: list[GenAIMetric] = [],
            metric_groups: list[MetricGroup] = [],
            **kwargs) -> MetricsEvaluationResult:
        """
        Evaluate the metrics for the given data.

        Synchronous entry point: runs :meth:`evaluate_async` through
        ``run_in_event_loop`` with the same arguments.

        Args:
            data (pd.DataFrame | dict): The data to be evaluated.
            metrics (list[GenAIMetric], optional): The metrics to be evaluated. Defaults to [].
            metric_groups (list[MetricGroup], optional): The metric groups to be evaluated. Defaults to [].
            **kwargs: Additional keyword arguments.

        Returns:
            MetricsEvaluationResult: The result of the evaluation.
        """
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            metrics=metrics,
            metric_groups=metric_groups,
            **kwargs,
        )

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        metrics: list[GenAIMetric] = [],
        metric_groups: list[MetricGroup] = [],
        **kwargs
    ) -> MetricsEvaluationResult:
        """
        Asynchronously evaluate the metrics for the given data.

        The data, the resolved metrics, the metric groups and the result are
        stored on the evaluator so that the display methods can use them.

        Args:
            data (pd.DataFrame | dict): The data to be evaluated.
            metrics (list[GenAIMetric], optional): The metrics to be evaluated. Defaults to [].
            metric_groups (list[MetricGroup], optional): The metric groups to be evaluated. Defaults to [].
            **kwargs: Additional keyword arguments, forwarded to the metrics evaluation.

        Returns:
            MetricsEvaluationResult: The result of the evaluation.
        """
        from ..evaluators.impl.evaluate_metrics_impl import (
            _evaluate_metrics_async, _resolve_metric_dependencies)
        self._data = data
        # The metric groups are folded into the metric list here, which is why
        # only ``metrics`` is passed to the evaluation call below.
        self._metrics = _resolve_metric_dependencies(
            metrics=metrics, metric_groups=metric_groups
        )
        self._metric_groups = metric_groups
        self._result = await _evaluate_metrics_async(
            configuration=self.configuration,
            data=data,
            metrics=self._metrics,
            api_client=self.api_client,
            **kwargs,
        )
        return self._result

    def display_table(self):
        """
        Display the metrics result as a table.

        Raises:
            ImportError: If the optional visualization dependencies are not installed.
        """
        try:
            from ibm_watsonx_gov.visualizations import display_table
        except ImportError as e:
            # Raise the hint instead of merely constructing it; otherwise the
            # call below fails with an unrelated unbound-name error.
            raise ImportError(
                "Please install the required dependencies 'ibm-watsonx-gov[visualization]' to display the results.") from e
        display_table(self._result.to_df(data=self._data))

    def display_insights(self):
        """
        Display the metrics result in a venn diagram based on the metrics threshold.

        Raises:
            ImportError: If the optional visualization dependencies are not installed.
        """
        try:
            from ibm_watsonx_gov.visualizations import ModelInsights
        except ImportError as e:
            # Raise the hint instead of merely constructing it; otherwise the
            # call below fails with an unrelated unbound-name error.
            raise ImportError(
                "Please install the required dependencies 'ibm-watsonx-gov[visualization]' to display the results.") from e
        model_insights = ModelInsights(
            configuration=self.configuration, metrics=self._metrics)
        model_insights.display_metrics(
            metrics_result=self._result.to_df(data=self._data))
@@ -0,0 +1,89 @@
1
+
2
+ # ----------------------------------------------------------------------------------------------------
3
+ # IBM Confidential
4
+ # Licensed Materials - Property of IBM
5
+ # 5737-H76, 5900-A3Q
6
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
7
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
8
+ # GSA ADPSchedule Contract with IBM Corp.
9
+ # ----------------------------------------------------------------------------------------------------
10
+
11
+
12
+ from ibm_watsonx_gov.clients.api_client import APIClient
13
+ from ibm_watsonx_gov.config.model_risk_configuration import \
14
+ ModelRiskConfiguration
15
+ from ibm_watsonx_gov.entities.model_risk_result import ModelRiskResult
16
+ from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
17
+ from IPython.display import display
18
+ from pydantic import Field, PrivateAttr
19
+ from typing_extensions import Annotated
20
+
21
+
22
class ModelRiskEvaluator(BaseEvaluator):
    """
    The class to evaluate the foundational model risk and display the results.

    Example:
        1. Basic usage
            .. code-block:: python

                configuration = ModelRiskConfiguration(
                    model_details=model_details,
                    risk_dimensions=risk_dimensions,
                    max_sample_size=max_sample_size,
                    pdf_report_output_path=pdf_report_output_path
                )
                wxgov_client = APIClient(credentials=Credentials(api_key=""))
                evaluator = ModelRiskEvaluator(
                    configuration=configuration, api_client=wxgov_client)

                result = evaluator.evaluate()

                # Get the results in the required format
                result.to_json()

                # Display the results
                evaluator.display_table()
                evaluator.download_model_risk_report()
    """
    # ``title``/``description`` are the supported Field arguments; the previous
    # ``name=`` keyword went through pydantic's deprecated extra-kwargs path.
    configuration: Annotated[ModelRiskConfiguration,
                             Field(title="Model risk configuration",
                                   description="The configuration for model risk evaluation.")]
    api_client: Annotated[APIClient | None,
                          Field(title="API client",
                                description="The IBM watsonx.governance client.",
                                default=None)]

    # Result of the most recent evaluate() call; read by the display/download methods.
    _result: Annotated[ModelRiskResult | None,
                       PrivateAttr(default=None)]

    def evaluate(self) -> ModelRiskResult:
        """
        Evaluates the risk of a Foundation model.

        The result is also stored on the evaluator for later display or download.

        Returns:
            ModelRiskResult: The result of the model risk evaluation.
        """
        from ibm_watsonx_gov.evaluators.impl.evaluate_model_risk_impl import \
            _evaluate_model_risk

        self._result = _evaluate_model_risk(
            self.configuration,
            self.api_client,
        )

        return self._result

    def display_table(self):
        """
        Print every risk and benchmark name of the stored result and display
        the output of each benchmark's ``get_metric_df()``.

        Reads the result stored by :meth:`evaluate`.
        """
        for risk in self._result.risks:
            print(f"\n--- Risk: {risk.name} ---")
            for benchmark in risk.benchmarks:
                print(f"Benchmark: {benchmark.name}")
                display(benchmark.get_metric_df())

    def download_model_risk_report(self):
        """
        Downloads the model risk report and returns the download link.

        The link is created for the ``output_file_path`` of the stored result.
        """
        from ibm_wos_utils.joblib.utils.notebook_utils import \
            create_download_link_for_file

        return create_download_link_for_file(
            self._result.output_file_path)
@@ -0,0 +1,93 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated
11
+
12
+ from pydantic import Field, PrivateAttr
13
+
14
+ from ibm_watsonx_gov.entities.agentic_app import AgenticApp, Node
15
+ from ibm_watsonx_gov.entities.agentic_evaluation_result import \
16
+ AgenticEvaluationResult
17
+ from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
18
+ from ibm_watsonx_gov.traces.span_util import flatten_attributes
19
+ from ibm_watsonx_gov.traces.trace_utils import TraceUtils
20
+ from ibm_watsonx_gov.utils.aggregation_util import \
21
+ get_agentic_evaluation_result
22
+ from ibm_watsonx_gov.utils.async_util import (gather_with_concurrency,
23
+ run_in_event_loop)
24
+ from ibm_watsonx_gov.utils.python_utils import add_if_unique
25
+
26
+
27
class TracesEvaluator(BaseEvaluator):
    """
    Evaluator that computes metrics for an agentic application from its traces.
    """
    agentic_app: Annotated[AgenticApp,
                           Field(title="Agentic application configuration details",
                                 description="The agentic application configuration details.")]
    __nodes: Annotated[list[Node], PrivateAttr(default=[])]

    def evaluate(self, spans: list[dict], **kwargs) -> AgenticEvaluationResult:
        """
        Compute the agentic metrics for the given spans/traces.

        Span trees are built from the spans; only trees whose root span has
        the attribute ``traceloop.span.kind`` equal to ``"workflow"`` are
        evaluated. Nodes reported by the evaluation are added to the
        evaluator's private node list, which this method does not clear.

        Args:
            spans (list[dict]): The spans on which the metrics need to be computed.
            **kwargs: ``max_concurrency`` (default 10) limits how many trace
                evaluations run at once. All keyword arguments are also
                forwarded to the per-trace metric computation.

        Returns:
            AgenticEvaluationResult: The aggregated result, including the
            messages data, nodes data, metrics mapping data and nodes.
        """
        concurrency_limit = kwargs.get("max_concurrency", 10)
        trees = TraceUtils.build_span_trees(
            spans=spans, agentic_app=self.agentic_app)

        # One coroutine per span tree that belongs to the agent application.
        pending = [
            TraceUtils.compute_metrics_from_trace_async_v2(
                span_tree=tree,
                message_io_mapping=self.agentic_app.message_io_mapping,
                metrics_configuration=self.agentic_app.metrics_configuration,
                api_client=self.api_client,
                **kwargs,
            )
            for tree in trees
            if flatten_attributes(tree.span.attributes).get(
                "traceloop.span.kind") == "workflow"
        ]

        # Run all coroutines with concurrency control.
        outcomes = run_in_event_loop(
            gather_with_concurrency,
            coros=pending,
            max_concurrency=concurrency_limit)

        collected_metrics = []
        message_rows = []
        node_rows = []
        mapping_rows = []
        for trace_metrics, message_row, trace_nodes_data, mapping_row, trace_nodes in outcomes:
            collected_metrics.extend(trace_metrics)
            message_rows.append(message_row)
            node_rows.extend(trace_nodes_data)
            mapping_rows.append(mapping_row)

            for node in trace_nodes:
                add_if_unique(node, self.__nodes,
                              ["name", "func_name"],
                              ["foundation_models"])

        evaluation = get_agentic_evaluation_result(
            metrics_result=collected_metrics, nodes=self.__nodes)
        evaluation.messages_data = message_rows
        evaluation.nodes_data = node_rows
        evaluation.metrics_mapping_data = mapping_rows
        evaluation.nodes = self.__nodes

        return evaluation
@@ -0,0 +1,66 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from functools import partial
11
+ from typing import Callable, Optional
12
+
13
+ from wrapt import decorator
14
+
15
+ from ibm_watsonx_gov.config import AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics import (AnswerRelevanceMetric,
19
+ AnswerSimilarityMetric,
20
+ FaithfulnessMetric,
21
+ UnsuccessfulRequestsMetric)
22
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
23
+
24
+
25
class AnswerQualityDecorator(BaseMetricDecorator):
    def evaluate_answer_quality(self,
                                func: Optional[Callable] = None,
                                *,
                                configuration: Optional[AgenticAIConfiguration] = None,
                                metrics: list[GenAIMetric] = []
                                ) -> dict:
        """
        Decorator that computes answer quality metrics on an agentic node.

        Works both bare and with keyword arguments: when ``func`` is None a
        partial of this method carrying the given arguments is returned, to
        be applied to the function afterwards.

        Args:
            func (Optional[Callable]): The node function to wrap.
            configuration (Optional[AgenticAIConfiguration]): The configuration
                passed on to the metric computation.
            metrics (list[GenAIMetric]): The metrics to compute. When empty,
                the metrics of ``MetricGroup.ANSWER_QUALITY`` are used.

        Raises:
            Exception: Raised by the wrapped function when validation or
                computation fails; the original error is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_answer_quality, configuration=configuration, metrics=metrics)

        # Fall back to the whole answer quality group when no metric is given.
        selected_metrics = metrics or MetricGroup.ANSWER_QUALITY.get_metrics()

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(
                    func=func,
                    metrics=selected_metrics,
                    valid_metric_types=(AnswerRelevanceMetric,
                                        FaithfulnessMetric,
                                        UnsuccessfulRequestsMetric,
                                        AnswerSimilarityMetric))

                return self.compute_helper(
                    func=func, args=args, kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[EvaluatorFields.INPUT_FIELDS,
                                   EvaluatorFields.CONTEXT_FIELDS],
                    metric_outputs=[EvaluatorFields.OUTPUT_FIELDS],
                    metric_references=[EvaluatorFields.REFERENCE_FIELDS],
                    metric_groups=[MetricGroup.ANSWER_QUALITY])
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating answer quality metrics on {func.__name__},") from ex

        return wrapper(func)
@@ -0,0 +1,76 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from functools import partial
11
+ from typing import Callable, Optional
12
+
13
+ from wrapt import decorator
14
+
15
+ from ibm_watsonx_gov.config import AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics import (HAPMetric, PIIMetric,
19
+ PromptSafetyRiskMetric)
20
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
21
+ from ibm_watsonx_gov.metrics.evasiveness.evasiveness_metric import \
22
+ EvasivenessMetric
23
+ from ibm_watsonx_gov.metrics.harm.harm_metric import HarmMetric
24
+ from ibm_watsonx_gov.metrics.harm_engagement.harm_engagement_metric import \
25
+ HarmEngagementMetric
26
+ from ibm_watsonx_gov.metrics.jailbreak.jailbreak_metric import JailbreakMetric
27
+ from ibm_watsonx_gov.metrics.profanity.profanity_metric import ProfanityMetric
28
+ from ibm_watsonx_gov.metrics.sexual_content.sexual_content_metric import \
29
+ SexualContentMetric
30
+ from ibm_watsonx_gov.metrics.social_bias.social_bias_metric import \
31
+ SocialBiasMetric
32
+ from ibm_watsonx_gov.metrics.unethical_behavior.unethical_behavior_metric import \
33
+ UnethicalBehaviorMetric
34
+ from ibm_watsonx_gov.metrics.violence.violence_metric import ViolenceMetric
35
+
36
+
37
class ContentSafetyDecorator(BaseMetricDecorator):
    def evaluate_content_safety(self,
                                func: Optional[Callable] = None,
                                *,
                                configuration: Optional[AgenticAIConfiguration] = None,
                                metrics: list[GenAIMetric] = []
                                ) -> dict:
        """
        Decorator that computes content safety metrics on an agentic node.

        Works both bare and with keyword arguments: when ``func`` is None a
        partial of this method carrying the given arguments is returned, to
        be applied to the function afterwards.

        Args:
            func (Optional[Callable]): The node function to wrap.
            configuration (Optional[AgenticAIConfiguration]): The configuration
                passed on to the metric computation.
            metrics (list[GenAIMetric]): The metrics to compute. When empty,
                the metrics of ``MetricGroup.CONTENT_SAFETY`` are used.

        Raises:
            Exception: Raised by the wrapped function when validation or
                computation fails; the original error is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_content_safety, configuration=configuration, metrics=metrics)

        # Fall back to the whole content safety group when no metric is given.
        selected_metrics = metrics or MetricGroup.CONTENT_SAFETY.get_metrics()

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(
                    func=func,
                    metrics=selected_metrics,
                    valid_metric_types=(PromptSafetyRiskMetric, HAPMetric, PIIMetric,
                                        HarmMetric, SocialBiasMetric, ProfanityMetric,
                                        SexualContentMetric, UnethicalBehaviorMetric,
                                        ViolenceMetric, HarmEngagementMetric,
                                        EvasivenessMetric, JailbreakMetric))

                # Only the input fields are handed to the helper; no output fields.
                return self.compute_helper(
                    func=func, args=args, kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[EvaluatorFields.INPUT_FIELDS],
                    metric_outputs=[],
                    metric_groups=[MetricGroup.CONTENT_SAFETY])
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating content safety metrics on {func.__name__},") from ex

        return wrapper(func)