ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,1304 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ import ast
10
+
11
+ import ipywidgets as widgets
12
+ import matplotlib.pyplot as plt
13
+ import pandas as pd
14
+ from IPython.display import HTML, display
15
+ from itables.widget import ITable
16
+ from matplotlib.axes import Axes
17
+ from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles
18
+ from matplotlib_venn.layout.venn2 import \
19
+ DefaultLayoutAlgorithm as Venn2DefaultLayoutAlgorithm
20
+ from matplotlib_venn.layout.venn3 import \
21
+ DefaultLayoutAlgorithm as Venn3DefaultLayoutAlgorithm
22
+
23
+ from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
24
+ from ibm_watsonx_gov.entities.enums import TaskType
25
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
26
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
27
+
28
+ from .metric_descriptions import metric_description_mapping
29
+
30
+
31
+ class ModelInsights():
32
+ """
33
+ Class to display venn diagrams using metric violations
34
+ NOTE: For venn diagram interactivity, `ipympl` (jupyter-matplotlib) Jupyter extension needs to be installed
35
+ """
36
+ # Color constants used to style the circles
37
+ PURPLE = "#8A3FFC"
38
+ CYAN = "#1192E8"
39
+ TEAL = "#009D9A"
40
+ COLORS = [PURPLE, CYAN, TEAL]
41
+
42
+ # general constants
43
+ MAX_METRIC_GROUP_SIZE = 3
44
+ DEFAULT_SELECTED_METRICS_COUNT = 9
45
+
46
+ def __init__(
47
+ self,
48
+ configuration: GenAIConfiguration,
49
+ metrics: list[GenAIMetric],
50
+ ) -> None:
51
+ """
52
+ ModelInsights construction. This will parse and validate the configuration
53
+
54
+ Notes:
55
+ - The visualization and interactivity features in the module are not supported
56
+ by the jupyter notebook within VS Code. It is recommended to use Jupyter notebook
57
+ or Jupyter lab from the web browser to take advantage of the features of this module
58
+ - Supported task types: 'question_answering', 'classification', 'summarization',
59
+ 'generation', 'extraction', 'retrieval_augmented_generation'
60
+
61
+ Args:
62
+ configuration (GenAIConfiguration): Metric evaluation configuration
63
+ metrics (list[GenAIMetric]): List of metrics to visualize
64
+ """
65
+ self.logger = GovSDKLogger.get_logger(__name__)
66
+ self.configuration: GenAIConfiguration = configuration
67
+ self.metrics: list[GenAIMetric] = metrics
68
+ self.metric_config = self.__parse_metrics_object(self.metrics)
69
+ self.df: pd.DataFrame = None
70
+ self.violation_sets: set = {}
71
+ self.violations = pd.DataFrame()
72
+ self.config_metric_ids = []
73
+ self.selected_patch_id = None
74
+ self.venn_diagram_callback_id = None
75
+ self.violation_summary_and_table_output = widgets.Output()
76
+ self.faithfulness_attributions_output = widgets.Output()
77
+ self.metric_groups = []
78
+
79
+ self.__init_stylesheet()
80
+
81
+ def __parse_metrics_object(self, metrics: list[GenAIMetric]):
82
+ parsed_metrics = {}
83
+ for metric in metrics:
84
+ metric_name = metric.name
85
+ if metric.method:
86
+ metric_name += f".{metric.method}"
87
+ for metric_threshold in metric.thresholds:
88
+ parsed_metrics[metric_name] = {
89
+ "type": metric_threshold.type,
90
+ "threshold": metric_threshold.value,
91
+ }
92
+ return parsed_metrics
93
+
94
+ def __reset_state(self):
95
+ """
96
+ Helper to reset the object state.
97
+ """
98
+ self.violation_sets = {}
99
+ self.violations = pd.DataFrame()
100
+ self.config_metric_ids = []
101
+ self.selected_patch_id = None
102
+ self.venn_diagram_callback_id = None
103
+ self.metric_groups = []
104
+
105
+ def __init_stylesheet(self):
106
+ """
107
+ Helper to initialize all needed custom css for the html components
108
+ """
109
+ styles = HTML(
110
+ """
111
+ <style>
112
+ .reset_input_style > input {
113
+ border: unset !important;
114
+ background: unset !important;
115
+ }
116
+
117
+ .violations_table td {
118
+ white-space: nowrap; text-overflow:ellipsis; overflow: hidden; max-width:1px;
119
+ }
120
+
121
+ .tooltip {
122
+ position: relative;
123
+ }
124
+ .tooltip .tooltiptext {
125
+ visibility: hidden;
126
+ width: 120px;
127
+ background-color: #555;
128
+ color: #fff;
129
+ text-align: center;
130
+ border-radius: 6px;
131
+ padding: 5px 0;
132
+ position: absolute;
133
+ z-index: 1;
134
+ bottom: 125%;
135
+ left: 50%;
136
+ margin-left: -60px;
137
+ opacity: 0;
138
+ transition: opacity 0.3s;
139
+ }
140
+ .tooltip .tooltiptext::after {
141
+ content: "";
142
+ position: absolute;
143
+ top: 100%;
144
+ left: 50%;
145
+ margin-left: -5px;
146
+ border-width: 5px;
147
+ border-style: solid;
148
+ border-color: #555 transparent transparent transparent;
149
+ }
150
+ .tooltip:hover .tooltiptext {
151
+ visibility: visible;
152
+ opacity: 1;
153
+ }
154
+ mark:hover {
155
+ background-color: white !important;
156
+ }
157
+ </style>
158
+ """
159
+ )
160
+
161
+ try:
162
+ display(styles)
163
+ except Exception as e:
164
+ message = f"Failed to inject css styling. {e}"
165
+ self.logger.error(message)
166
+ raise (message)
167
+
168
    def __process_df(self, metric_df: pd.DataFrame):
        """
        Parse the dataframe based on the provided config.

        Validates required columns (RAG task type only), computes the set of
        violated rows per configured metric, and records per-metric violation
        counts for later visualization.
        """
        self.logger.info(
            f"processing the input metrics dataframe with {metric_df.columns}")

        # Check if the required columns exist based on the task_type
        required_columns = []
        if self.configuration.task_type == TaskType.RAG.value:
            required_columns.extend(
                [*self.configuration.output_fields, *self.configuration.input_fields,
                 *self.configuration.context_fields]
            )
        else:
            # Non-RAG task types are accepted without column validation
            self.logger.info(
                f"Dataframe columns were not validated for task_type: '{self.configuration.task_type}'"
            )

        missing_columns = set(required_columns) - set(metric_df.columns)
        if len(missing_columns) > 0:
            message = f"Missing columns from the dataframe. {missing_columns}"
            self.logger.error(message)
            raise Exception(message)

        for metric in self.metrics:
            # metric column name: "<name>.<method>" when a method is set
            metric_id = f"{metric.name}.{metric.method}" if metric.method else metric.name
            self.logger.info(
                f"metric_id: {metric_id}, config: {metric.thresholds}")

            # Skip metrics that were configured but not computed in this dataframe
            if metric_id not in metric_df.columns:
                self.logger.warning(
                    f"metric_id {metric_id} is not present in the dataframe"
                )
                continue

            if len(metric.thresholds) == 1:
                # Single threshold: a lower_limit flags rows below it,
                # any other type flags rows above it.
                if metric.thresholds[0].type == "lower_limit":
                    violated_records = metric_df[metric_df[metric_id]
                                                 < metric.thresholds[0].value]
                else:
                    violated_records = metric_df[metric_df[metric_id]
                                                 > metric.thresholds[0].value]
            else:
                # Two thresholds expected: exactly one lower_limit and one upper_limit
                lower_limit = None
                upper_limit = None

                for threshold in metric.thresholds:
                    if threshold.type == "lower_limit":
                        lower_limit = threshold.value
                    else:
                        upper_limit = threshold.value

                if lower_limit is None or upper_limit is None:
                    message = f"Invalid metrics thresholds. duplicated threshold type. {metric.thresholds}"
                    self.logger.error(message)
                    raise Exception(message)

                # NOTE(review): `&` requires a value to be simultaneously above
                # upper_limit AND below lower_limit, which no value satisfies,
                # so this selection is always empty. An out-of-range check with
                # `|` looks intended — confirm before changing.
                violated_records = metric_df[(metric_df[metric_id] > upper_limit) & (
                    metric_df[metric_id] < lower_limit)]

            self.violation_sets[metric_id] = set(violated_records.index)
            self.violations = pd.concat(
                [self.violations, violated_records])

            self.config_metric_ids.append(
                {
                    "metric_id": metric_id,
                    # NOTE(review): `violated_records.keys()` is the slice's
                    # columns, and metric_id is always one of them, so the
                    # `else 0` arm is unreachable — presumably an emptiness
                    # check on the violation set was intended; verify.
                    "violation_count": (
                        len(self.violation_sets[metric_id])
                        if metric_id in violated_records.keys()
                        else 0
                    ),
                }
            )

        # Compute the default metric grouping
        self.df = metric_df
        self.__find_metric_grouping()
247
+
248
+ self.logger.info(
249
+ f"Finished processing input dataframe. {self.config_metric_ids}"
250
+ )
251
+
252
+ def __metric_overlaps(self, metric_id: str, config_filter=None):
253
+ """
254
+ Helper method to check for violations overlap between metrics. this will return a list of the provided
255
+ metric id and the top two metric ids with the largest overlap.
256
+ """
257
+ self.logger.info(
258
+ f"getting metric overlap for metric_id {metric_id}. filters: {config_filter}"
259
+ )
260
+
261
+ if metric_id not in self.violation_sets:
262
+ # no violations for this metric id, we can skip it
263
+ self.logger.info("No violations for {metric_id}. Skipping")
264
+ return
265
+
266
+ intersections = [] # list to store a tuple of metric id and intersection size
267
+ current_set = self.violation_sets[metric_id]
268
+ for violation, v in self.violation_sets.items():
269
+ if violation == metric_id:
270
+ # skip comparing to self
271
+ continue
272
+ if config_filter is not None and violation not in config_filter:
273
+ # skip comparing with metrics that are selected
274
+ continue
275
+
276
+ # check if the metric id already added to a group already
277
+ is_used = False
278
+ for i in range(len(self.metric_groups)):
279
+ for j in range(len(self.metric_groups[i])):
280
+ if violation == self.metric_groups[i][j]:
281
+ is_used = True
282
+ break
283
+ if is_used:
284
+ break
285
+ if is_used:
286
+ continue
287
+
288
+ intersections.append((violation, len(v.intersection(current_set))))
289
+
290
+ # sort the metrics by the size of the intersection
291
+ intersections = sorted(intersections, key=lambda x: x[1], reverse=True)
292
+ self.logger.info(
293
+ f"sorted overlaps with metric_id {metric_id} = {intersections}"
294
+ )
295
+
296
+ # return a list of the current metric id and the top two metrics by intersection size
297
+ return [metric_id] + [
298
+ intersections[i][0] for i in range(min(len(intersections), 2))
299
+ ]
300
+
301
    def __find_metric_grouping(self, config_filter=None):
        """
        Find the metric groupings used for generating the venn diagrams.

        The metric with the most violations is grouped with the metrics whose
        violation sets overlap it the most (via ``__metric_overlaps``); the
        process repeats over the remaining metrics until every violating
        metric is grouped or ``MAX_METRIC_GROUP_SIZE`` groups exist. Groups
        are appended to ``self.metric_groups``.

        Args:
            config_filter: Optional collection of metric ids; when provided,
                only metrics in this collection are grouped.
        """
        self.logger.info(
            f"building metric grouping. filter {config_filter}")

        # Sort the metric ids descending by the number of the violations
        sorted_metrics = sorted(
            self.config_metric_ids, key=lambda d: d["violation_count"], reverse=True
        )

        # temporary list to keep track of metric ids that we already grouped
        used_metric_ids = []
        for i in range(len(sorted_metrics)):
            self.logger.info(
                f"Checking metric grouping for {sorted_metrics[i]}")

            # The list is sorted descending, so a violation count of 0 means
            # all remaining metrics have no violations and we can stop early
            if sorted_metrics[i]["violation_count"] == 0:
                self.logger.info(
                    "Metric does not have any violation -- metric grouping is done"
                )
                break

            # Check if we already included this metric in another group
            if sorted_metrics[i]["metric_id"] in used_metric_ids:
                self.logger.info("Metric already used. skipping")
                continue

            # In case the current metric id is not in the config filter (not selected) we can skip this iteration
            if (
                config_filter is not None
                and sorted_metrics[i]["metric_id"] not in config_filter
            ):
                self.logger.info(
                    "Metric is not included in the filter. skipping")
                continue

            # Group the current metric with the unused metrics that have the
            # most violation overlap with it
            self.metric_groups.append(
                self.__metric_overlaps(
                    sorted_metrics[i]["metric_id"], config_filter)
            )

            # Mark the current metric id and the metrics grouped with it as used
            used_metric_ids.extend(self.metric_groups[-1])

            # Stop once we reached the configured maximum number of groups
            if len(self.metric_groups) == self.MAX_METRIC_GROUP_SIZE:
                self.logger.info(
                    f"Reached the maximum group size: {self.MAX_METRIC_GROUP_SIZE} -- metric grouping is done"
                )
                break

        self.logger.info(
            f"Finished finding metric grouping. metric groups: {self.metric_groups}"
        )
362
+
363
+ def __is_in_circle(
364
+ self,
365
+ circle_center_x: float,
366
+ circle_center_y: float,
367
+ circle_r: float,
368
+ x: float,
369
+ y: float,
370
+ ):
371
+ """
372
+ Helper to identify if a given point is in a circle.
373
+ """
374
+ if (x - circle_center_x) * (x - circle_center_x) + (y - circle_center_y) * (
375
+ y - circle_center_y
376
+ ) <= circle_r * circle_r:
377
+ return True
378
+ else:
379
+ return False
380
+
381
    def render_venn_diagrams(self, group_index=None, filters=None):
        """
        Render multiple interactive venn diagrams, one per metric group.

        Clicking a diagram patch updates the violation summary and the
        violations table for the records in that patch.

        Args:
            group_index: Optional index into ``self.metric_groups``; when
                given, only that group is rendered (subject to ``filters``).
            filters: Optional dict of metric_id -> bool controlling which
                metrics of the group are shown.

        NOTE(review): interactivity relies on a matplotlib backend that
        delivers click events in the notebook (e.g. ipympl) — see
        ``show_all_metrics_dropdown`` — confirm in the target environment.
        """
        self.logger.info(
            f"Rendering venn diagrams. group_index: {group_index}, filters: {filters}"
        )

        # Reset the context of matplotlib, this ensures we start with an empty figure
        plt.clf()
        plt.close("all")

        # If we have the group index, we need to check if at least one item is selected in the filters
        if group_index is not None:
            num_of_diagrams = 1 if any(list(filters.values())) else 0
        else:
            # Check how many venn diagrams (plots) to draw
            num_of_diagrams = len(self.metric_groups)

        self.logger.info(
            f"Number of venn diagrams to render is {num_of_diagrams}")

        if num_of_diagrams == 0:
            self.logger.warning("No venn diagrams to render.")
            print("There are no diagrams to display.")
            return

        # Set up the diagrams layout
        # align diagrams horizontally
        fig, axes = plt.subplots(1, num_of_diagrams)
        plt.tight_layout()

        # Each entry is (axes, venn diagram object, list of metric id labels)
        diagram_list = []

        # 2 or more venn diagrams
        if num_of_diagrams > 1:
            fig.set_figwidth(fig.get_figwidth() * num_of_diagrams * 0.8)
            for i in range(num_of_diagrams):
                self.logger.info(
                    f"building venn diagram #{i} out of {num_of_diagrams}"
                )
                # set the config for each of the filters
                metric_filters = {}
                for metric in self.metric_groups[i]:
                    metric_filters[metric] = True
                diagram_list.append(
                    (
                        axes[i],
                        self.__build_venn(filters=metric_filters, ax=axes[i]),
                        self.metric_groups[i],
                    )
                )

        # One venn diagram only
        elif num_of_diagrams == 1:
            metric_filters = {}

            # Check if the metric id filter is provided, otherwise use all metrics in the group
            if filters is not None:
                for metric_id, is_used in filters.items():
                    if is_used is True:
                        metric_filters[metric_id] = is_used
            else:
                for metric in self.metric_groups[
                    0 if group_index is None else group_index
                ]:
                    metric_filters[metric] = True

            diagram_list.append(
                (
                    # with a single subplot, plt.subplots returns a bare Axes
                    axes,
                    self.__build_venn(filters=metric_filters, ax=axes),
                    list(metric_filters.keys()),
                )
            )

        @self.violation_summary_and_table_output.capture()
        def venn_callback(event):
            """
            On click handler for venn diagrams. This will determine which venn diagram got clicked
            and update the violation summary and table to reflect the patch that got selected.
            """
            self.logger.info(f"Handling venn diagram click event: {event}")
            self.logger.info(f"Diagrams to be processed: {diagram_list}")
            self.logger.info(
                f"Selected patch_id: {self.selected_patch_id}")
            # Start by clearing the UI, this includes the violation summary and violation table
            self.violation_summary_and_table_output.clear_output()
            self.faithfulness_attributions_output.clear_output()

            # Check if we have a selected patch already and reset the style
            if self.selected_patch_id is not None:
                # go over all the venn diagrams and set the opacity back to default
                for ax, venn, _ in diagram_list:
                    for patch in venn.patches:
                        if patch is not None:
                            patch.set_alpha(0.25)

            # Identify the clicked diagram, set the patch opacity, and determine which records to display
            for ax, venn, labels in diagram_list:

                # If the event is not in this venn diagram, skip to the next one
                if not ax.in_axes(event):
                    continue

                # Determine which circles are located on the clicked coordinates, this ensures we consider
                # the intersection between circles
                clicked_metric_ids = {}  # dict to store which metric ids got clicked
                for i in range(len(venn.centers)):
                    if i >= len(labels):
                        # padded circle with no metric label (single-set case)
                        clicked_metric_ids[""] = False
                        break
                    clicked_metric_ids[labels[i]] = self.__is_in_circle(
                        venn.centers[i].x,
                        venn.centers[i].y,
                        venn.radii[i],
                        event.xdata,
                        event.ydata,
                    )

                # Determine the patch id, e.g. "110" = inside circles A and B,
                # outside circle C (matplotlib-venn patch id convention)
                patch_id = ""
                for _, is_selected in clicked_metric_ids.items():
                    patch_id = patch_id + ("1" if is_selected is True else "0")

                # The click event was not on any patch, no further actions need to be done
                if patch_id in ["00", "000"]:
                    return

                # reduce the opacity of all patches
                for patch in venn.patches:
                    if patch is not None:
                        patch.set_alpha(0.10)

                # set the opacity of the selected patch
                patch = venn.get_patch_by_id(patch_id)
                patch.set_alpha(1)
                self.selected_patch_id = (ax, patch_id)

                # Determine the selected record ids based on the patch id:
                # intersect the sets of all "1" positions...
                violated_record_ids = set()
                for i in range(min(len(patch_id), len(labels))):
                    if patch_id[i] == "1":
                        if len(violated_record_ids) == 0:  # First record to be added
                            violated_record_ids = self.violation_sets.get(
                                labels[i], set()
                            )
                        else:
                            violated_record_ids = violated_record_ids.intersection(
                                self.violation_sets.get(labels[i], set())
                            )

                # ...then subtract the sets of all "0" positions
                for i in range(min(len(patch_id), len(labels))):
                    if patch_id[i] == "0":
                        violated_record_ids = (
                            violated_record_ids
                            - self.violation_sets.get(labels[i], set())
                        )

                # Check how many violated records under each metric id from the clicked venn diagram
                metric_ids_violation_count = {}
                for metric_id in labels:
                    metric_ids_violation_count[metric_id] = len(
                        violated_record_ids.intersection(
                            self.violation_sets[metric_id])
                    )

                self.logger.info(
                    f"Updated venn diagram. selected_patch_id: {self.selected_patch_id}, metric_ids_violation_count: {metric_ids_violation_count}"
                )

                # Update the UI based on the clicked section of the venn diagram
                self.print_violation_summary(metric_ids_violation_count)
                self.show_violations_table_by_violation_ids(
                    list(violated_record_ids))

        # Register matplotlib callback to handle all clicks on the plots
        self.venn_diagram_callback_id = plt.gcf().canvas.mpl_connect(
            "button_press_event", venn_callback
        )

        plt.show()
563
+
564
    def __build_venn(self, filters: dict[str, any], ax: Axes):
        """
        Generate a single venn diagram on the given axes and apply its styling.

        Args:
            filters: Dict of metric_id -> bool; only metrics that are True,
                exist in ``self.violation_sets`` and have at least one
                violation are included. 1 to 3 metrics are supported.
            ax: The matplotlib axes to draw into.

        Returns:
            The matplotlib-venn diagram object, or ``None`` when no metric
            was selected (or diagram construction was skipped).

        Raises:
            Exception: If the venn diagram construction fails.
        """
        self.logger.info(
            f"Building venn diagram. filters: {filters}, ax: {ax}")

        # Check the filters and processed violation sets to determine what violations we would add to the venn diagrams
        # items from filters object will be ignored if the metric id does not exist in the config, dataframe, or has no violations
        sets = []
        labels = []
        for key, value in filters.items():
            if key in self.violation_sets.keys() and value is True:
                if len(self.violation_sets[key]) > 0:
                    sets.append(self.violation_sets.get(key, set()))
                    labels.append(key)
        venn = None
        circles = []  # Store circles object to be able to style the borders
        try:
            if len(sets) == 1:
                # matplotlib_venn does not support diagrams with 1 set only. We need to
                # add an empty set and hide it in this case
                venn = venn2(
                    [sets[0], set()],
                    set_labels=labels,
                    set_colors=self.COLORS[0:2],
                    alpha=0.25,
                    ax=ax,
                )
                circles = venn2_circles(
                    subsets=[sets[0], set()], linewidth=1, ax=ax)

                # hide the 0 from the empty set and move the label to the center
                venn.hide_zeroes()
                label = venn.get_label_by_id("A")
                label.set_horizontalalignment("center")
            elif len(sets) == 2:
                # Fixed subset sizes keep the circles equally sized regardless
                # of the actual violation counts
                venn = venn2(
                    sets,
                    set_labels=labels,
                    set_colors=self.COLORS[0:2],
                    alpha=0.25,
                    ax=ax,
                    layout_algorithm=Venn2DefaultLayoutAlgorithm(
                        fixed_subset_sizes=(1, 1, 1)
                    ),
                )
                circles = venn2_circles(
                    subsets=sets,
                    linewidth=1,
                    ax=ax,
                    layout_algorithm=Venn2DefaultLayoutAlgorithm(
                        fixed_subset_sizes=(1, 1, 1)
                    ),
                )
            elif len(sets) == 3:
                venn = venn3(
                    sets,
                    set_labels=labels,
                    set_colors=self.COLORS,
                    alpha=0.25,
                    ax=ax,
                    layout_algorithm=Venn3DefaultLayoutAlgorithm(
                        fixed_subset_sizes=(1, 1, 1, 1, 1, 1, 1)
                    ),
                )
                circles = venn3_circles(
                    subsets=sets,
                    linewidth=1,
                    ax=ax,
                    layout_algorithm=Venn3DefaultLayoutAlgorithm(
                        fixed_subset_sizes=(1, 1, 1, 1, 1, 1, 1)
                    ),
                )
            else:
                self.logger.warning(
                    "No metrics were selected for the venn diagram")
                print("you must select 1 to 3 metrics to display the venn diagram")

            # Set the circles borders to match the fill colors
            for circle, color in zip(circles, self.COLORS):
                circle.set_edgecolor(color)
        except Exception as e:
            message = f"Failed to build venn diagrams. {e}"
            self.logger.error(message)
            raise Exception(message)

        return venn
652
+
653
+ def __get_faithfulness_highlight(self, score: float):
654
+ """
655
+ Helper to translate the faithfulness score to text
656
+ """
657
+ if score >= 0.75:
658
+ return "Faithful"
659
+ if score < 0.75 and score >= 0.3:
660
+ return "Somewhat faithful"
661
+ return "Unfaithful"
662
+
663
+ def __highlight_faithfulness(self, input: str, attributions: list[tuple[str, float]]):
664
+ """
665
+ Helper to highlight sections of the input based on a list of substrings and their scores.
666
+ This is intended to highlight the faithfulness attributions in both answers and contexts.
667
+ Note: this helper does not handle attributions overlapping.
668
+ """
669
+ # Remove unwanted whitespaces
670
+ result = " ".join(input.split())
671
+
672
+ # Go over each attribution and highlight in the context based on its score
673
+ for attribution in attributions:
674
+ # Remove unwanted whitespaces
675
+ attribution_value = " ".join(attribution[0].split())
676
+
677
+ # Determine the highlight color
678
+ color = ""
679
+ if attribution[1] >= 0.75:
680
+ color = "green"
681
+ elif attribution[1] < 0.75 and attribution[1] >= 0.3:
682
+ color = "yellow"
683
+ else:
684
+ color = "red"
685
+
686
+ # Find the attribution in the context and highlight
687
+ result = result.replace(
688
+ attribution_value,
689
+ f"""
690
+ <mark style='background-color: {color}' class='tooltip'>{attribution[0]}<span class='tooltiptext'>faithfulness score: {attribution[1]}</span></mark>
691
+ """
692
+ )
693
+ return result
694
+
695
    def render_faithfulness_attributions(self, selected_violation):
        """
        Render a table of each faithfulness attribution of the answer with its
        score. When a row is selected, the contexts are listed with each
        attribution highlighted and color coded based on its score.

        Args:
            selected_violation: Row of the violations dataframe; must contain
                a 'faithfulness_attributions' entry (dict, or its string
                repr when the dataframe was loaded from CSV) plus the
                configured context columns.
        """
        # The object is converted to a string in the dataframe if it was loaded as a csv, we need to parse it back to a dict
        if isinstance(selected_violation["faithfulness_attributions"], str):
            faithfulness_attributions = ast.literal_eval(
                selected_violation["faithfulness_attributions"]
            )
        else:
            faithfulness_attributions = selected_violation["faithfulness_attributions"]

        attributions_df = pd.DataFrame.from_dict(faithfulness_attributions)

        attributions_table = ITable(
            # only display certain columns
            df=attributions_df[["output_text", "faithfulness_score"]],
            caption="Faithfulness attributions",
            classes="display wrap compact",
            select="single",
        )
        attributions_output = widgets.Output()

        @attributions_output.capture()
        def on_row_clicked(change):
            """
            Callback handler when a row is selected. It will list all the context with highlighting which sections of the context
            attributed to the answer and its faithfulness score.
            """
            attributions_output.clear_output()

            try:
                # Check if we do not need to render the attributions, this would be in these cases:
                # - The update is to deselect a record
                # - The faithfulness attributions is not provided in the dataframe
                if (
                    len(change["new"]) < 1
                    or "faithfulness_attributions" not in self.df.columns
                ):
                    return

                # Go over all the attributions and build a dict for the data that will be rendered
                # NOTE(review): each attribution is presumed to carry
                # 'feature_name', 'feature_values' and 'faithfulness_scores'
                # keys — schema not visible here, confirm against producer
                attributions_data = {}
                for attribution in faithfulness_attributions[change["new"][0]]["attributions"]:
                    attributions_data[attribution["feature_name"]
                                      ] = selected_violation[attribution["feature_name"]]

                    # Create a list of tuples that contain the attribution text and its score, this will be used to
                    # highlight the sections in the context
                    attrib_tuple = []
                    for feature_value, faithfulness_score in zip(attribution["feature_values"], attribution["faithfulness_scores"]):
                        attrib_tuple.append(
                            (feature_value, faithfulness_score))

                    attributions_data[attribution["feature_name"]] = self.__highlight_faithfulness(
                        attributions_data[attribution["feature_name"]], attrib_tuple)

                # Build one section per configured context column, falling
                # back to the raw context when it has no attributions
                html = ""
                for context_column in self.configuration.context_fields:
                    context = attributions_data.get(
                        context_column, selected_violation[context_column])
                    html += f"<h3>{context_column}</h3>"
                    html += f"<p>{context}</p>"

                display(HTML(html))
            except Exception as e:
                message = f"Failed to render faithfulness attributions. {e}"
                self.logger.error(message)
                raise Exception(message)

        # Connect row selection callback
        attributions_table.observe(on_row_clicked, names=["selected_rows"])

        display(attributions_table, attributions_output)
770
+
771
+ def render_question_and_answer_faithfulness(self, selected_violation):
772
+ """
773
+ Function to parse the faithfulness attributions, build html code, and display it
774
+ """
775
+ self.logger.info(
776
+ f"Rendering question and answer faithfulness. Selected violation: {selected_violation}"
777
+ )
778
+
779
+ try:
780
+ display(
781
+ HTML(
782
+ f"""
783
+ <div>
784
+ <h2>Question</h2>
785
+ <p>{selected_violation[self.configuration.input_fields[0]]}</p>
786
+ <h2>Answer</h2>
787
+ <ul>
788
+ <li>{selected_violation[self.configuration.output_fields[0]]}</li>
789
+ <li>{self.__get_faithfulness_highlight(selected_violation['faithfulness'])} {selected_violation['faithfulness']}</li>
790
+ </ul>
791
+ <div>
792
+ """
793
+ )
794
+ )
795
+ except Exception as e:
796
+ message = f"Failed to render faithfulness attributions. {e}"
797
+ self.logger.error(message)
798
+ raise Exception(message)
799
+
800
    def show_violations_table_by_violation_ids(self, violation_ids: list[int]):
        """
        Display the violated records whose dataframe index is in the ids list.

        Selecting a row renders the question/answer faithfulness section and
        the faithfulness attributions for that record (when available).

        Args:
            violation_ids: Dataframe index values of the records to display.

        Raises:
            Exception: If the table cannot be created or displayed.
        """
        self.logger.info(
            f"Displaying violation table by violation ids. Total violations: {len(violation_ids)}"
        )

        try:
            violations_table = ITable(
                # Select violated records by id
                df=self.df[self.df.index.isin(violation_ids)],
                caption="Violated Records",
                buttons=[{"extend": "csvHtml5", "text": "Download"}],
                classes="display nowrap compact violations_table",
                select="single",
            )
        except Exception as e:
            message = f"Failed to create violation table. {e}"
            self.logger.error(message)
            raise Exception(message)

        @self.faithfulness_attributions_output.capture()
        def on_row_clicked(change):
            """
            Callback handler when a row is selected. This will display the record faithfulness attribution if it exists.
            """
            # Reset the faithfulness attributions section
            self.faithfulness_attributions_output.clear_output()

            self.logger.info(
                f"Violation table row selected. Event: {change}")

            # Check if we do not need to render the attributions, this would be in these cases:
            # - The update is to deselect a record
            # - The faithfulness attributions is not provided in the dataframe
            if (
                len(change["new"]) < 1
                or "faithfulness_attributions" not in self.df.columns
            ):
                return

            # Pass all columns of the selected row to be rendered in the attributions section
            self.render_question_and_answer_faithfulness(
                violations_table.df.iloc[change["new"][0]]
            )
            self.render_faithfulness_attributions(
                violations_table.df.iloc[change["new"][0]]
            )

        # Connect row selection callback
        violations_table.observe(on_row_clicked, names=["selected_rows"])

        # Display the table and the faithfulness attributions below it
        try:
            display(violations_table, self.faithfulness_attributions_output)
        except Exception as e:
            message = f"Failed to render violation table. {e}"
            self.logger.error(message)
            raise Exception(message)
860
+
861
+ def __reset_venn_diagram(self):
862
+ """
863
+ Resets the diagram by clearing matplotlib, disconnecting on click callback, and clearing the selected patch
864
+ """
865
+ self.logger.info("Resetting Venn Diagrams.")
866
+ plt.clf()
867
+ plt.gcf().canvas.mpl_disconnect(self.venn_diagram_callback_id)
868
+ self.selected_patch_id = None
869
+
870
    def print_violation_summary(self, metric_ids_violation_count):
        """
        Format and display the violated records summary as an HTML list.

        For each metric this highlights:
        - metric id
        - configured threshold
        - number of violated records

        Args:
            metric_ids_violation_count: Dict of metric_id -> number of
                violated records.

        Raises:
            Exception: If displaying the HTML fails.
        """
        self.logger.info(
            f"Printing violation summary. Metric ids violation count: {metric_ids_violation_count}"
        )

        # NOTE(review): `self.metric_config` is not populated anywhere in this
        # view — confirm it maps metric_id -> {'threshold': ...} and stays in
        # sync with `self.metrics`.
        html_violations_list = []
        for metric_id, count in metric_ids_violation_count.items():
            html_violations_list.append(
                f"""
                <li>{metric_id} ({self.metric_config[metric_id]['threshold']})
                    <ul>
                        <li>{count} violated records</li>
                    </ul>
                </li>
                """
            )

        try:
            display(
                HTML(
                    f"""
                    <div>
                        <h3>Violations:</h3>
                        <ul>
                            {''.join(html_violations_list)}
                        </ul>
                    </div>
                    """
                )
            )
        except Exception as e:
            message = f"Failed to render violation summary. {e}"
            self.logger.error(message)
            raise Exception(message)
910
+
911
    def __print_rca(self, metric_ids_violation_count: dict[str, int]):
        """
        Print the root cause analysis (RCA) to the user.
        Note: This depends on ibm_metrics_plugin.

        Args:
            metric_ids_violation_count: Dict of metric_id -> number of
                violated records used to build the RCA request.

        Raises:
            Exception: Always, immediately — RCA is currently disabled.
        """
        # NOTE: RCA is deliberately disabled; every statement below this
        # raise is unreachable and kept only for a future re-enablement.
        raise Exception("RCA is not supported.")
        self.logger.info(
            f"Printing RCA. Metric ids violation count: {metric_ids_violation_count}"
        )
        # Based on the count, build the argument generate the RCA and build the html metric RCA list
        evaluation_analysis_argument = ""
        rca_metrics_html = ""
        for metric_id, count in metric_ids_violation_count.items():
            evaluation_analysis_argument += (
                f"{metric_id}:eq:{'low' if count > 0 else 'high'},"
            )
            rca_metrics_html += (
                f"<li>{'Low' if count > 0 else 'High'}: {metric_id}</li>"
            )

        try:
            # Generate the RCA using the metrics plugin
            rca = EvalAnalysisProvider().get_metrics_eval_analysis(
                evaluation_analysis_argument
            )
        except Exception as e:
            message = f"Failed to get metric evaluation analysis. {e}"
            self.logger.error(message)
            raise Exception(message)

        # Build the html based on the generated RCA values
        causes_html = ""
        for cause in rca["causes"]:
            causes_html += f"<li>{cause}</li>"

        # Build the accordion for the recommendations section, this needs to be added into
        # an output widget to then be displayed in the accordion
        recommendations_html = ""
        for recommendation in rca["recommendations"]:
            recommendations_html += f"<li>{recommendation}</li>"
        recommendations_output = widgets.Output()
        with recommendations_output:
            try:
                display(
                    HTML(
                        f"""
                        <h2>Recommendations</h2>
                        <ul>
                            {recommendations_html}
                        </ul>
                        """
                    )
                )
            except Exception as e:
                message = f"Failed to render recommendations. {e}"
                self.logger.error(message)
                raise Exception(message)

        recommendations_accordion = widgets.Accordion(
            children=[recommendations_output], titles=[
                "See recommended actions"]
        )

        try:
            display(
                HTML(
                    f"""
                    <h1>Root cause analysis</h1>
                    <ul>
                        {rca_metrics_html}
                    </ul>
                    <h3>What does this mean?</h3>
                    <p>{rca['description']}</p>
                    <h3>What could be the cause?</h3>
                    <ul>
                        {causes_html}
                    </ul>"""
                ),
                recommendations_accordion,
            )
        except Exception as e:
            message = f"failed to render RCA. {e}"
            self.logger.error(message)
            raise Exception(message)
995
+
996
+ def __get_metric_id_description(self, metric_id: str) -> widgets.Output:
997
+ """
998
+ Helper to create an icon with metric id description
999
+ """
1000
+ output = widgets.Output(layout={'align_self': 'center'})
1001
+ metric_description = metric_description_mapping.get(metric_id, None)
1002
+
1003
+ # If the metric id description exist, populate the output widget, otherwise keep it empty
1004
+ if metric_description:
1005
+ with output:
1006
+ metric_description_icon = widgets.Text(
1007
+ value="\u24D8", tooltip=metric_description)
1008
+ metric_description_icon.add_class("reset_input_style")
1009
+ metric_description_icon.disabled = True
1010
+ metric_description_icon.layout = widgets.Layout(width='35px')
1011
+ display(metric_description_icon)
1012
+
1013
+ return output
1014
+
1015
+ def show_all_metrics_dropdown(self):
1016
+ """
1017
+ Function to render the widget UI. This will render the following:
1018
+ - Dropdown component to select metrics
1019
+ - Default selected metrics based on the top metrics with violated records
1020
+ - Venn diagrams of the selected metrics
1021
+
1022
+ Note: For the venn diagrams to be interactive `ipympl` backend should by enabled, this can be done by:
1023
+ - installing ipympl Jupyter extension
1024
+ - explicitly enable `ipympl` backend by adding this line to the notebook `%matplotlib ipympl`
1025
+ """
1026
+ self.logger.info("Displaying interactive metric id drop down view")
1027
+
1028
+ # Create an output widget for each component, this helps in customizing the layout of the ui
1029
+ dropdown_output = widgets.Output()
1030
+ checkbox_output = widgets.Output()
1031
+ venn_output = widgets.Output()
1032
+
1033
+ # Sort the metric based on the number of violated records
1034
+ sorted_metrics = sorted(
1035
+ self.config_metric_ids, key=lambda d: d["violation_count"], reverse=True
1036
+ )
1037
+
1038
+ # Define the dropdown widget to select the metric ids
1039
+ dropdown = widgets.Dropdown(
1040
+ options=[metric["metric_id"] for metric in sorted_metrics],
1041
+ description="Metrics",
1042
+ )
1043
+
1044
+ # Select the top metrics with violation based on the configured limit
1045
+ selected_metrics = [metric["metric_id"]
1046
+ for metric in sorted_metrics[0:3]]
1047
+
1048
+ self.logger.info(
1049
+ f"Dropdown metrics: {sorted_metrics}, selected metrics: {selected_metrics}"
1050
+ )
1051
+
1052
+ def add_to_checkboxes(metric_id: str):
1053
+ """
1054
+ Callback handler to add metrics to the checkbox list, this will be called when selecting a metric from the dropdown.
1055
+ """
1056
+ self.logger.info(
1057
+ f"Metric id: {metric_id} is being added the checkboxes list"
1058
+ )
1059
+
1060
+ checkbox_metrics = {}
1061
+ metric_descriptions = {}
1062
+ checkbox_output.clear_output()
1063
+
1064
+ # Add the metric id to the list if it is not there already
1065
+ if metric_id not in selected_metrics:
1066
+ selected_metrics.append(metric_id)
1067
+
1068
+ self.logger.info(
1069
+ f"Updated selected metrics: {selected_metrics}")
1070
+
1071
+ # create the checkbox widgets based on the selected metric ids
1072
+ for metric in selected_metrics:
1073
+ checkbox_metrics[metric] = widgets.Checkbox(
1074
+ value=True, description=metric
1075
+ )
1076
+ metric_descriptions[metric] = self.__get_metric_id_description(
1077
+ metric)
1078
+
1079
+ def on_checkbox_updated(**kwargs):
1080
+ """
1081
+ Callback handler that gets triggered by adding / removing items from the checkbox list. This handler will update the venn
1082
+ diagram on any change on the metric list
1083
+ This will be triggered by these two cases:
1084
+ - If a new metric is selected from the dropdown
1085
+ - If a metric got unselected from the checkbox
1086
+ """
1087
+ self.logger.info(
1088
+ f"Checkboxes are updated. kwargs {kwargs}")
1089
+
1090
+ # Clear the venn diagrams before updating them to show the new selection
1091
+ self.__reset_venn_diagram()
1092
+ venn_output.clear_output()
1093
+
1094
+ # Find which metrics got deselected and remove them from the UI
1095
+ for k, v in kwargs.items():
1096
+ if v is False:
1097
+ try:
1098
+ if k in selected_metrics:
1099
+ selected_metrics.remove(k)
1100
+ checkbox_metrics[k].close()
1101
+ metric_description = metric_descriptions.pop(
1102
+ k, None)
1103
+ if metric_description:
1104
+ metric_description.close()
1105
+ except Exception as e:
1106
+ message = f"Failed to remove checkbox from the list. {e}"
1107
+ self.logger.error(message)
1108
+ raise Exception(message)
1109
+
1110
+ # Reset the current groups and regenerate them based on the new selection
1111
+ self.metric_groups = []
1112
+ self.__find_metric_grouping(selected_metrics)
1113
+ with venn_output:
1114
+ self.render_venn_diagrams()
1115
+
1116
+ # Connect the call back to update the checkboxes when an item is deselected
1117
+ interactive_checkboxes = widgets.interactive_output(
1118
+ on_checkbox_updated, checkbox_metrics
1119
+ )
1120
+ with checkbox_output:
1121
+ try:
1122
+ checkboxes_list = []
1123
+ for checkbox, metric_description in zip(list(checkbox_metrics.values()), list(metric_descriptions.values())):
1124
+ checkboxes_list.append(
1125
+ widgets.HBox([checkbox, metric_description]))
1126
+ ui = widgets.VBox(checkboxes_list)
1127
+ display(ui, interactive_checkboxes)
1128
+ except Exception as e:
1129
+ message = f"Failed to display checkboxes. {e}"
1130
+ self.logger.error(message)
1131
+ raise Exception(message)
1132
+
1133
+ # Connect the callback to update the checkboxes when a metric id is selected from the dropdown
1134
+ with dropdown_output:
1135
+ widgets.interact(add_to_checkboxes, metric_id=dropdown)
1136
+
1137
+ try:
1138
+ display(
1139
+ widgets.HBox(
1140
+ [
1141
+ venn_output,
1142
+ widgets.VBox(
1143
+ [dropdown_output, checkbox_output],
1144
+ layout=widgets.Layout(margin="33px 0 0 0"),
1145
+ ),
1146
+ ]
1147
+ ),
1148
+ self.violation_summary_and_table_output,
1149
+ )
1150
+ except Exception as e:
1151
+ message = f"Failed to display dropdown menu and checkboxes. {e}"
1152
+ self.logger.error(message)
1153
+ raise Exception(message)
1154
+
1155
+ def show_checkboxes_with_venn(self, metric_group_index: int):
1156
+ """
1157
+ Display venn diagram for the selected metric group along with checkboxes to select which metrics should be shown
1158
+ """
1159
+ if metric_group_index >= len(self.metric_groups):
1160
+ message = f"Metric group index ({metric_group_index}) is out of bound"
1161
+ self.logger.error(message)
1162
+ raise Exception(message)
1163
+
1164
+ self.logger.info(
1165
+ f"Showing venn diagram with metric group index: {metric_group_index}, metric ids {self.metric_groups[metric_group_index]}"
1166
+ )
1167
+
1168
+ # create the checkbox widgets based on the selected metric ids
1169
+ checkbox_metrics = {}
1170
+ metric_descriptions = {}
1171
+ for metric in self.metric_groups[metric_group_index]:
1172
+ checkbox_metrics[metric] = widgets.Checkbox(
1173
+ value=True, description=metric)
1174
+ metric_descriptions[metric] = self.__get_metric_id_description(
1175
+ metric)
1176
+ venn_diagram_output = widgets.Output()
1177
+ checkboxes_output = widgets.Output()
1178
+
1179
+ def on_checkbox_updated(**kwargs):
1180
+ """
1181
+ Helper to handle checkboxes updates. This will trigger rerendering the venn diagram based on the new selection.
1182
+ """
1183
+ self.logger.info(f"Checkboxes updated: {kwargs}")
1184
+ venn_diagram_output.clear_output()
1185
+ with venn_diagram_output:
1186
+ self.render_venn_diagrams(metric_group_index, filters=kwargs)
1187
+
1188
+ # Connect the call back to update the checkboxes when an item is deselected
1189
+ interactive_checkboxes = widgets.interactive_output(
1190
+ on_checkbox_updated, checkbox_metrics
1191
+ )
1192
+ with checkboxes_output:
1193
+ try:
1194
+ checkboxes_list = []
1195
+ for checkbox, metric_description in zip(list(checkbox_metrics.values()), list(metric_descriptions.values())):
1196
+ checkboxes_list.append(widgets.HBox(
1197
+ [checkbox, metric_description]))
1198
+ ui = widgets.VBox(checkboxes_list)
1199
+ display(ui, interactive_checkboxes)
1200
+ except Exception as e:
1201
+ message = f"Failed to display interactive checkboxes. {e}"
1202
+ self.logger.error(message)
1203
+ raise Exception(message)
1204
+
1205
+ checkboxes_output.layout = widgets.Layout(margin="33px 0 0 0")
1206
+
1207
+ try:
1208
+ display(widgets.HBox(
1209
+ [venn_diagram_output, checkboxes_output]))
1210
+ except Exception as e:
1211
+ message = f"Failed to display venn diagram and checkboxes output. {e}"
1212
+ self.logger.error(message)
1213
+ raise Exception(message)
1214
+
1215
+ def display_metrics(self, metrics_result: pd.DataFrame):
1216
+ """Method to display ModelInsights
1217
+
1218
+ Args:
1219
+ metrics_result (pd.DataFrame): _description_
1220
+ """
1221
+ # Process the DataFrame
1222
+ self.__reset_state()
1223
+ self.__process_df(metrics_result)
1224
+
1225
+ # Check if there were no violations
1226
+ if len(self.violations) == 0:
1227
+ print("No violations were detected.")
1228
+ return
1229
+
1230
+ # Check if we need to display the custom metrics tab,
1231
+ # this is needed when we have more than one metric group
1232
+ show_custom_metrics_tab: bool = len(self.metric_groups) > 1
1233
+
1234
+ # The number of tabs should be the number of found groups, if we have more than one
1235
+ # metric group an extra tab is added for custom metric selection
1236
+ tabs_count = len(self.metric_groups) + 1 \
1237
+ if show_custom_metrics_tab else len(self.metric_groups)
1238
+
1239
+ self.logger.info(
1240
+ "Displaying venn diagrams using tabs. Total tab count: {tabs_count}"
1241
+ )
1242
+
1243
+ # create tabs with the the length of the groups
1244
+ tabs = widgets.Tab()
1245
+ tab_output = widgets.Output() # Reuse the same output for all tabs..
1246
+ tabs_content = [tab_output for _ in range(tabs_count)]
1247
+ tabs_titles = [str(i + 1) for i in range(tabs_count)]
1248
+ tabs.children = tabs_content
1249
+ tabs.titles = tabs_titles
1250
+
1251
+ # render content for the default
1252
+ with tab_output:
1253
+ self.show_checkboxes_with_venn(0)
1254
+ try:
1255
+ display(self.violation_summary_and_table_output)
1256
+ except Exception as e:
1257
+ message = f"Failed to display violation summary and table output. {e}"
1258
+ self.logger.error(message)
1259
+ raise Exception(message)
1260
+
1261
+ @tab_output.capture()
1262
+ def on_tab_change(event):
1263
+ """
1264
+ Callback handler to render the content on tab change.
1265
+ """
1266
+ self.logger.info(f"Tab changed. event {event}")
1267
+
1268
+ # We are only interested in tab change events
1269
+ if event["name"] != "selected_index":
1270
+ return
1271
+
1272
+ # Clear all the content of the tab
1273
+ tab_output.clear_output()
1274
+ self.violation_summary_and_table_output.clear_output()
1275
+ self.faithfulness_attributions_output.clear_output()
1276
+ self.__reset_venn_diagram()
1277
+
1278
+ # If the last tab is selected and we have custom metric tab then render it
1279
+ if show_custom_metrics_tab and event["new"] == tabs_count - 1:
1280
+ self.show_all_metrics_dropdown()
1281
+ else:
1282
+ # If the previous tab was the custom tab, re compute the metric groups
1283
+ if show_custom_metrics_tab and event["old"] == tabs_count - 1:
1284
+ self.metric_groups = []
1285
+ self.__find_metric_grouping()
1286
+
1287
+ # Render the venn diagram based which metric group corresponds to the selected tab
1288
+ self.show_checkboxes_with_venn(event["new"])
1289
+ try:
1290
+ display(self.violation_summary_and_table_output)
1291
+ except Exception as e:
1292
+ message = f"Failed to display violation summary and table output. {e}"
1293
+ self.logger.error(message)
1294
+ raise Exception(message)
1295
+
1296
+ # Register callback handler for tabs events
1297
+ tabs.observe(on_tab_change)
1298
+
1299
+ try:
1300
+ display(tabs)
1301
+ except Exception as e:
1302
+ message = f"Failed to display tabs. {e}"
1303
+ self.logger.error(message)
1304
+ raise Exception(message)