ibm_watsonx_gov-1.3.3-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,1285 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ """
+ Insights Generator Module
+
+ This module provides utilities for generating insights from evaluation metrics.
+ It includes functionality to:
+ - Select the most significant metrics using relevance scoring and MMR (Maximal Marginal Relevance)
+ - Generate natural language insights reports for metrics using LLM models
+ - Calculate severity and relevance scores for metrics
+ """
+
+ import math
+ from typing import Any, Dict, List, Optional, Union
+
+ from ibm_watsonx_gov.entities.enums import ModelProviderType
+ from ibm_watsonx_gov.entities.foundation_model import (
+     AWSBedrockFoundationModel, AzureOpenAIFoundationModel,
+     CustomFoundationModel, GoogleAIStudioFoundationModel,
+     OpenAIFoundationModel, PortKeyGateway, RITSFoundationModel,
+     VertexAIFoundationModel, WxAIFoundationModel)
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
+
+
+ class InsightsReport:
+     """
+     A wrapper class for insights reports that displays properly in Jupyter notebooks.
+
+     This class ensures that text reports with newlines are rendered correctly
+     when displayed in Jupyter notebooks, rather than showing escaped \\n characters.
+     """
+
+     def __init__(self, content: str, format_type: str = "text"):
+         """
+         Initialize the InsightsReport.
+
+         Args:
+             content: The report content (text, HTML, or JSON string)
+             format_type: The format type ("text", "html", or "json")
+         """
+         self.content = content
+         self.format_type = format_type
+
+     def __str__(self) -> str:
+         """Return the content as a string."""
+         return self.content
+
+     def __repr__(self) -> str:
+         """Return a string representation of the object."""
+         return f"InsightsReport(format_type='{self.format_type}', length={len(self.content)})"
+
+     def _repr_html_(self) -> Optional[str]:
+         """
+         Return HTML representation for Jupyter notebooks.
+
+         This method is called by Jupyter to render the object.
+         For HTML format, return the HTML directly.
+         For text format, wrap in <pre> tags to preserve formatting.
+         """
+         if self.format_type == "html":
+             return self.content
+         elif self.format_type == "text":
+             # Wrap text in <pre> tags to preserve formatting and newlines
+             import html as pyhtml
+             return f"<pre>{pyhtml.escape(self.content)}</pre>"
+         return None
+
+
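The wrapper above only affects display: Jupyter calls _repr_html_() when it renders a cell result, so text reports keep their line breaks instead of showing literal \n escapes. A minimal illustrative sketch (not part of the package):

    from ibm_watsonx_gov.utils.insights_generator import InsightsReport

    report = InsightsReport("line 1\nline 2", format_type="text")
    print(report)           # plain string, newlines intact
    report._repr_html_()    # '<pre>line 1\nline 2</pre>', which Jupyter renders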
+
+ # Optional LangChain dependencies; fall back to None if not installed
+ try:
+     from langchain_ibm import ChatWatsonx
+     from langchain_openai import AzureChatOpenAI, ChatOpenAI
+ except ImportError:
+     ChatWatsonx = None
+     AzureChatOpenAI = None
+     ChatOpenAI = None
+
+ logger = GovSDKLogger.get_logger(__name__)
+
+ # Metric group weights define the relative importance of different metric categories
+ # Higher weights indicate more critical metric groups
+ default_metric_group_weights: Dict[str, float] = {
+     "business": 4.0,            # Business outcome metrics (highest priority)
+     "answer_quality": 3.0,      # Quality of generated answers
+     "content_safety": 3.0,      # Safety and ethical considerations
+     "retrieval_quality": 2.0,   # Quality of retrieved information
+     "system_reliability": 2.0,  # System reliability and availability
+     "performance": 1.75,        # Performance and latency metrics
+     "usage": 1.0,               # Resource usage metrics
+     "cost": 1.0,                # Cost-related metrics
+     "other": 1.0                # Miscellaneous metrics
+ }
+
+ # Metric weights define the relative importance of individual metrics within their groups
+ # Higher weights indicate more critical individual metrics
+ default_metric_weights: Dict[str, float] = {
+     # Business Outcome Metrics
+     "thumbs_up_rate": 4.0,
+     "thumbs_down_rate": 4.0,
+
+     # Answer Quality
+     "answer_relevance": 4.0,
+     "faithfulness": 4.0,
+     "answer_similarity": 1.5,
+
+     # Content Safety
+     "evasiveness": 2.0,
+     "hap": 4.0,
+     "harm": 4.0,
+     "harm_engagement": 4.0,
+     "jailbreak": 4.0,
+     "pii": 4.0,
+     "profanity": 4.0,
+     "sexual_content": 4.0,
+     "social_bias": 4.0,
+     "unethical_behavior": 2.0,
+     "violence": 4.0,
+
+     # Retrieval Quality
+     "ndcg": 3.0,
+     "context_relevance": 2.5,
+     "average_precision": 2.5,
+     "retrieval_precision": 2.0,
+     "hit_rate": 1.5,
+     "reciprocal_rank": 1.5,
+
+     # Cost
+     "prompt_tokens": 2.0,
+     "completion_tokens": 2.0,
+     "tool_calls_count": 2.0,
+     "total_tokens": 2.0,
+     "total_tool_calls": 2.0,
+     "cost": 2.0,
+     "input_token_count": 2.0,
+     "output_token_count": 2.0,
+
+     # Performance
+     "latency": 3.5,
+     "duration": 3.5,
+
+     # System Reliability
+     "unsuccessful_requests": 4.0,
+ }
+
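Both dictionaries are defaults that callers can selectively override through the InsightsGenerator constructor below; overrides outside the 1.0-5.0 range are rejected. An illustrative sketch, assuming the toy metric shown:

    from ibm_watsonx_gov.utils.insights_generator import InsightsGenerator

    metrics = [{"name": "ndcg", "value": 0.4, "group": "retrieval_quality", "severity": 0.6}]
    # Boost retrieval quality; all custom weights must lie within [1.0, 5.0]
    generator = InsightsGenerator(
        metrics,
        metric_group_weights={"retrieval_quality": 4.5},
        metric_weights={"ndcg": 5.0},
    )
    # InsightsGenerator(metrics, metric_weights={"ndcg": 9.0}) would raise ValueError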
+
+ class InsightsGenerator:
+     """
+     A utility class for generating insights from evaluation metrics.
+
+     This class provides methods to analyze evaluation metrics and generate
+     meaningful insights, including selecting the most significant metrics
+     and optionally generating an insights report in natural language.
+
+     The class uses a combination of relevance scoring and Maximal Marginal Relevance (MMR)
+     to select diverse and significant metrics from a larger set of evaluation results.
+     """
+
+     # Metrics that should use percentile-based thresholds (cost and latency metrics)
+     PERCENTILE_BASED_METRICS = {
+         "duration", "latency",
+         "cost", "input_token_count", "output_token_count",
+         "prompt_tokens", "completion_tokens", "total_tokens"
+     }
+
+     def __init__(self, metrics: List[Any], top_k: int = 3, applies_to: Optional[Union[str, List[str]]] = None,
+                  percentile_threshold: float = 95.0,
+                  llm_model: Union[LLMJudge, WxAIFoundationModel, OpenAIFoundationModel,
+                                   AzureOpenAIFoundationModel, Any] = None,
+                  metric_group_weights: Optional[Dict[str, float]] = None,
+                  metric_weights: Optional[Dict[str, float]] = None):
+         """
+         Initialize the InsightsGenerator with the provided metrics, top_k, and applies_to.
+
+         Args:
+             metrics: List of metric dictionaries
+             top_k: Number of top metrics to select
+             applies_to: Filter by component level. Can be:
+                 - None: No filtering (default)
+                 - str: Single component level (e.g., "node", "message", "conversation")
+                 - List[str]: Multiple component levels (e.g., ["node", "message"])
+             percentile_threshold: Percentile to use as threshold for cost/latency metrics (default: 95.0).
+                 Higher values indicate worse performance for these metrics.
+             llm_model: LLM model for generating insights. Can be:
+                 - LLMJudge instance (wraps a FoundationModel)
+                 - FoundationModel instance directly (e.g., WxAIFoundationModel)
+                 - Any object with a generate() method
+             metric_group_weights: Optional custom weights for metric groups.
+                 If provided, these will override the default weights for the specified groups.
+                 Each weight must be a float between 1.0 and 5.0 (inclusive).
+             metric_weights: Optional custom weights for individual metrics.
+                 If provided, these will override the default weights for the specified metrics.
+                 Each weight must be a float between 1.0 and 5.0 (inclusive).
+
+         Raises:
+             ValueError: If any custom weight is not between 1.0 and 5.0
+
+         Examples:
+             >>> # Using WxAIFoundationModel directly
+             >>> model = WxAIFoundationModel(
+             ...     model_id="ibm/granite-3-3-8b-instruct",
+             ...     project_id=PROJECT_ID
+             ... )
+             >>> generator = InsightsGenerator(metrics, top_k=3, llm_model=model)
+
+             >>> # Using LLMJudge wrapper
+             >>> llm_judge = LLMJudge(model=model)
+             >>> generator = InsightsGenerator(metrics, top_k=3, llm_model=llm_judge)
+         """
+         self.metrics = metrics
+         self.k = top_k
+         self.percentile_threshold = percentile_threshold
+         self.llm_model = llm_model
+
+         # Validate and merge custom weights with default weights
+         self.metric_group_weights = default_metric_group_weights.copy()
+         if metric_group_weights:
+             self._validate_weights(metric_group_weights, "metric group")
+             self.metric_group_weights.update(metric_group_weights)
+             logger.info(
+                 f"Applied custom metric group weights: {metric_group_weights}")
+
+         self.metric_weights = default_metric_weights.copy()
+         if metric_weights:
+             self._validate_weights(metric_weights, "metric")
+             self.metric_weights.update(metric_weights)
+             logger.info(
+                 f"Applied custom metric weights: {metric_weights}")
+
+         # Normalize applies_to to always be a list or None
+         if applies_to is None:
+             self.applies_to = None
+         elif isinstance(applies_to, str):
+             self.applies_to = [applies_to]
+         elif isinstance(applies_to, list):
+             self.applies_to = applies_to
+         else:
+             raise TypeError(
+                 f"applies_to must be None, str, or List[str], got {type(applies_to).__name__}")
+
+     @staticmethod
+     def _validate_weights(weights: Dict[str, float], weight_type: str) -> None:
+         """
+         Validate that all weights are between 1.0 and 5.0 (inclusive).
+
+         Args:
+             weights: Dictionary of weights to validate
+             weight_type: Type of weight for error message (e.g., "metric", "metric group")
+
+         Raises:
+             ValueError: If any weight is not between 1.0 and 5.0
+         """
+         for name, weight in weights.items():
+             if not isinstance(weight, (int, float)):
+                 raise ValueError(
+                     f"Invalid {weight_type} weight for '{name}': {weight}. "
+                     f"Weight must be a number between 1.0 and 5.0."
+                 )
+             if weight < 1.0 or weight > 5.0:
+                 raise ValueError(
+                     f"Invalid {weight_type} weight for '{name}': {weight}. "
+                     f"Weight must be between 1.0 and 5.0 (inclusive). "
+                     f"1.0 is the minimum weight and 5.0 is the maximum weight."
+                 )
+
+     def select_top_k_metrics(self) -> List[Any]:
+         """
+         Select the top k most significant metrics from the provided list using the MMR algorithm.
+
+         This method uses a greedy selection approach that balances relevance and diversity:
+         1. The first metric is selected based purely on relevance score
+         2. Subsequent metrics are selected using MMR to ensure diversity
+
+         Returns:
+             List[dict]: Top k metrics with their original data intact, sorted by
+             MMR score in descending order (most significant first).
+
+         Raises:
+             ValueError: If k is not a positive integer
+             TypeError: If metrics is not a list
+
+         Examples:
+             >>> metrics = [
+             ...     {"name": "faithfulness", "value": 0.85, "group": "answer_quality", "severity": 0.3},
+             ...     {"name": "hap", "value": 0.95, "group": "content_safety", "severity": 0.1}
+             ... ]
+             >>> top_metrics = InsightsGenerator(metrics, top_k=2).select_top_k_metrics()
+             >>> # Filter for node-level metrics only
+             >>> node_metrics = InsightsGenerator(metrics, top_k=2, applies_to="node").select_top_k_metrics()
+         """
+         # Input validation
+         if not isinstance(self.metrics, list):
+             raise TypeError(
+                 f"metrics must be a list, got {type(self.metrics).__name__}")
+
+         if not isinstance(self.k, int) or self.k <= 0:
+             raise ValueError(f"k must be a positive integer, got {self.k}")
+
+         if not self.metrics:
+             logger.warning(
+                 "Empty metrics list provided to select_top_k_metrics")
+             return []
+
+         # Validate metric structure
+         for i, metric in enumerate(self.metrics):
+             if not isinstance(metric, dict):
+                 logger.warning(
+                     f"Metric at index {i} is not a dictionary, skipping")
+                 continue
+             if "name" not in metric or "group" not in metric:
+                 logger.warning(
+                     f"Metric at index {i} missing required fields 'name' or 'group'")
+
+             # Calculate severity if not already set
+             if "severity" not in metric:
+                 # Check if metric has explicit thresholds
+                 if "thresholds" in metric and metric.get("thresholds"):
+                     try:
+                         sev = self._severity(
+                             metric["value"],
+                             metric["thresholds"][0]["value"],
+                             metric["thresholds"][0]["type"]
+                         )
+                         metric["severity"] = sev
+                     except (KeyError, IndexError, TypeError) as e:
+                         logger.warning(
+                             f"Could not calculate severity for metric {metric.get('name')}: {e}")
+                         metric["severity"] = 0.0
+                 # For cost/latency metrics without thresholds, use percentile-based threshold
+                 elif metric.get("name") in self.PERCENTILE_BASED_METRICS:
+                     try:
+                         threshold_val, sev = self._compute_percentile_based_severity(
+                             metric)
+                         metric["severity"] = sev
+                         metric["threshold"] = threshold_val
+                         # Also compute violations_count from individual results
+                         violations = self._compute_violations_count_from_individual_results(
+                             metric)
+                         if violations is not None:
+                             metric["violations_count"] = violations
+                         logger.debug(
+                             f"Computed percentile-based severity {sev:.4f} and violations_count {metric.get('violations_count', 0)} for {metric.get('name')}")
+                     except Exception as e:
+                         logger.warning(
+                             f"Could not calculate percentile-based severity for metric {metric.get('name')}: {e}")
+                         metric["severity"] = 0.0
+                 else:
+                     metric["severity"] = 0.0
+
+         selected: List[Any] = []
+         candidates: List[dict] = self.metrics[:]
+
+         while candidates and len(selected) < self.k:
+             if not selected:
+                 # First metric: select based on relevance score
+                 best = max(candidates, key=self._relevance_score)
+                 # Store the relevance score as MMR score for the first metric
+                 best["mmr_score"] = self._relevance_score(best)
+             else:
+                 # Apply MMR (Maximal Marginal Relevance)
+                 best = max(
+                     candidates,
+                     key=lambda c: self._compute_mmr_score(c, selected))
+                 # Store the MMR score
+                 best["mmr_score"] = self._compute_mmr_score(best, selected)
+
+             selected.append(best)
+             candidates.remove(best)
+
+         # Sort selected metrics by MMR score in descending order
+         selected.sort(key=lambda m: m.get("mmr_score", 0), reverse=True)
+
+         # Remove individual_results from the returned metrics to avoid exposing unnecessary data
+         for metric in selected:
+             if "individual_results" in metric:
+                 del metric["individual_results"]
+
+         return selected
+
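End to end, the selection loop behaves like this illustrative sketch (toy numbers, not from the package):

    from ibm_watsonx_gov.utils.insights_generator import InsightsGenerator

    metrics = [
        {"name": "faithfulness", "value": 0.45, "group": "answer_quality", "severity": 0.8},
        {"name": "answer_relevance", "value": 0.50, "group": "answer_quality", "severity": 0.7},
        {"name": "hap", "value": 0.30, "group": "content_safety", "severity": 0.6},
    ]
    top = InsightsGenerator(metrics, top_k=2).select_top_k_metrics()
    # "faithfulness" wins the first slot on pure relevance; for the second slot,
    # MMR penalizes "answer_relevance" (same group) in favor of the more diverse "hap"
    print([m["name"] for m in top])  # ['faithfulness', 'hap']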
+     @staticmethod
+     def _severity(value: float, threshold: float, direction: str) -> float:
+         """
+         Compute severity of a threshold violation based on how far the value deviates from the threshold.
+
+         The severity score grows with the relative violation and saturates at 1.0,
+         using the formula: 1 - exp(-2 * relative_violation)
+
+         Args:
+             value (float): The actual metric value
+             threshold (float): The threshold value to compare against
+             direction (str): Direction of the threshold check:
+                 - "upper_limit": value should be below threshold
+                 - "lower_limit": value should be above threshold
+
+         Returns:
+             float: Severity score between 0.0 and 1.0, where:
+                 - 0.0 indicates no violation
+                 - 1.0 indicates severe violation
+
+         Examples:
+             >>> InsightsGenerator._severity(0.9, 0.8, "upper_limit")  # 12.5% over limit
+             0.22  # Moderate severity
+             >>> InsightsGenerator._severity(0.5, 0.8, "lower_limit")  # 37.5% below limit
+             0.53  # Higher severity
+         """
+         if threshold == 0:
+             return 0.0
+
+         if direction == "upper_limit":
+             rel = max(0.0, (value - threshold) / abs(threshold))
+         else:
+             rel = max(0.0, (threshold - value) / abs(threshold))
+
+         return max(0.0, min(1.0, 1 - math.exp(-2 * rel)))
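The docstring examples follow directly from the formula: a 12.5% violation gives 1 - exp(-0.25) ≈ 0.22, and a 37.5% violation gives 1 - exp(-0.75) ≈ 0.53. A quick illustrative check:

    import math

    print(round(1 - math.exp(-2 * 0.125), 2))  # 0.22
    print(round(1 - math.exp(-2 * 0.375), 2))  # 0.53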
+
+     def _compute_percentile_based_severity(self, metric: dict) -> tuple[Optional[float], float]:
+         """
+         Compute severity for cost/latency metrics using percentile-based thresholds.
+
+         For metrics without explicit thresholds (like duration, cost, token counts),
+         this method uses the specified percentile from the metric's percentiles data
+         as a dynamic threshold. Values above this percentile are considered violations.
+
+         Args:
+             metric (dict): Metric dictionary containing:
+                 - value (float): The actual metric value
+                 - percentiles (dict, optional): Dictionary with percentile values
+                 - name (str): Metric name
+
+         Returns:
+             tuple[Optional[float], float]: The percentile threshold used (None if unavailable)
+                 and a severity score between 0.0 and 1.0, where:
+                 - 0.0 indicates value is at or below the percentile threshold
+                 - Higher values indicate increasingly severe violations
+
+         Examples:
+             >>> metric = {
+             ...     "name": "duration",
+             ...     "value": 8.5,
+             ...     "percentiles": {"95": 5.0, "99": 7.0}
+             ... }
+             >>> generator = InsightsGenerator([], top_k=3, percentile_threshold=95.0)
+             >>> generator._compute_percentile_based_severity(metric)
+             (5.0, 0.75)  # Value is 70% above the 95th percentile threshold of 5.0
+         """
+         if not isinstance(metric, dict):
+             return None, 0.0
+
+         value = metric.get("value")
+         if value is None:
+             return None, 0.0
+
+         # Get percentiles data
+         percentiles = metric.get("percentiles")
+         if not percentiles or not isinstance(percentiles, dict):
+             logger.debug(
+                 f"No percentiles data available for {metric.get('name')}, severity set to 0.0")
+             return None, 0.0
+
+         # Get the threshold percentile value (e.g., 95th percentile)
+         percentile_key = str(int(self.percentile_threshold))
+         threshold_value = percentiles.get(percentile_key)
+
+         if threshold_value is None:
+             logger.debug(
+                 f"Percentile {percentile_key} not found for {metric.get('name')}, severity set to 0.0")
+             return None, 0.0
+
+         # For cost/latency metrics, higher values are worse (upper_limit behavior)
+         # Calculate severity using the same formula as the _severity method
+         return threshold_value, self._severity(value, threshold_value, "upper_limit")
+
+     def _compute_violations_count_from_individual_results(self, metric: dict) -> Optional[int]:
+         """
+         Compute violations_count for percentile-based metrics using individual results.
+         Also updates the metric value to show the maximum violating value instead of the mean.
+
+         For metrics with percentile-based thresholds, this method counts how many
+         individual measurements exceeded the percentile threshold and replaces the
+         aggregated value with the maximum violating value for better visibility.
+
+         Args:
+             metric (dict): Metric dictionary containing:
+                 - value (float): The aggregated metric value (will be replaced with max violating value)
+                 - threshold (float): The percentile-based threshold value
+                 - individual_results (list, optional): List of individual metric measurements
+                 - name (str): Metric name
+
+         Returns:
+             Optional[int]: Number of violations, or None if individual_results are not available
+
+         Examples:
+             >>> metric = {
+             ...     "name": "duration",
+             ...     "value": 8.5,
+             ...     "threshold": 5.0,
+             ...     "individual_results": [
+             ...         {"value": 3.0}, {"value": 6.0}, {"value": 9.0}, {"value": 4.0}
+             ...     ]
+             ... }
+             >>> generator = InsightsGenerator([], top_k=3, percentile_threshold=95.0)
+             >>> generator._compute_violations_count_from_individual_results(metric)
+             2  # Two values (6.0 and 9.0) exceed the 95th percentile threshold of 5.0
+             # metric["value"] is now 9.0 (the maximum violating value)
+         """
+         if not isinstance(metric, dict):
+             return None
+
+         # Get individual results
+         individual_results = metric.get("individual_results")
+         if not individual_results or not isinstance(individual_results, list):
+             logger.debug(
+                 f"No individual_results available for {metric.get('name')}, cannot compute violations_count")
+             return None
+
+         threshold_value = metric.get("threshold")
+
+         if threshold_value is None:
+             return None
+
+         # Find all individual results that exceed the threshold
+         # For cost/latency metrics, higher values are worse (violations)
+         violating_values = [
+             result.get("value")
+             for result in individual_results
+             if isinstance(result, dict) and result.get("value") is not None
+             and result.get("value") > threshold_value
+         ]
+
+         violations_count = len(violating_values)
+
+         # If there are violations, replace the aggregated value with the maximum violating value
+         if violations_count > 0:
+             max_violating_value = max(violating_values)
+             metric["value"] = max_violating_value
+             logger.debug(
+                 f"Replaced aggregated value with max violating value {max_violating_value} for {metric.get('name')}")
+
+         logger.debug(
+             f"Computed violations_count={violations_count} for {metric.get('name')} "
+             f"from {len(individual_results)} individual results with threshold={threshold_value}")
+
+         return violations_count
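Together, the two percentile helpers give latency-style metrics a dynamic threshold: severity comes from the chosen percentile, while the violation count and worst-case value come from the raw measurements. An illustrative sketch with toy numbers:

    from ibm_watsonx_gov.utils.insights_generator import InsightsGenerator

    gen = InsightsGenerator([], top_k=3, percentile_threshold=95.0)
    metric = {"name": "latency", "value": 8.5,
              "percentiles": {"95": 5.0},
              "individual_results": [{"value": 3.0}, {"value": 6.0}, {"value": 9.0}]}
    metric["threshold"], metric["severity"] = gen._compute_percentile_based_severity(metric)
    metric["violations_count"] = gen._compute_violations_count_from_individual_results(metric)
    # threshold=5.0, severity≈0.75, violations_count=2; metric["value"] becomes 9.0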
+
+     def _relevance_score(
+             self,
+             metric: dict,
+             w_sev: float = 0.7,
+             w_frq: float = 0.3) -> float:
+         """
+         Compute the relevance score for a metric based on severity, frequency, and importance weights.
+
+         The relevance score combines:
+         1. Severity of threshold violations (weighted by w_sev)
+         2. Frequency of violations (weighted by w_frq)
+         3. Metric group importance (from metric_group_weights)
+         4. Individual metric importance (from metric_weights)
+
+         Args:
+             metric (dict): Metric dictionary containing:
+                 - name (str): Metric name
+                 - group (str): Metric group
+                 - severity (float, optional): Severity score (0-1)
+                 - violations_count (int, optional): Number of violations
+             w_sev (float, optional): Weight for severity component. Defaults to 0.7.
+             w_frq (float, optional): Weight for frequency component. Defaults to 0.3.
+
+         Returns:
+             float: Relevance score (higher values indicate more relevant/important metrics)
+
+         Note:
+             If violations_count is not present in the metric, it defaults to 0.
+             Unknown metric groups default to weight 1.0.
+             Unknown metric names default to weight 1.0.
+         """
+         if "violations_count" not in metric:
+             metric["violations_count"] = 0
+
+         base_score = (
+             w_sev * metric.get("severity", 0.0) +
+             w_frq * metric["violations_count"]
+         )
+
+         group_weight = self.metric_group_weights.get(
+             metric.get("group", "other"), 1.0)
+         metric_weight = self.metric_weights.get(metric.get("name", ""), 1.0)
+
+         return base_score * group_weight * metric_weight
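For example, with the default weights, a faithfulness metric with severity 0.8 and 2 violations scores (0.7 * 0.8 + 0.3 * 2) * 3.0 (answer_quality group) * 4.0 (faithfulness) = 13.92. An illustrative check:

    from ibm_watsonx_gov.utils.insights_generator import InsightsGenerator

    gen = InsightsGenerator([], top_k=3)
    m = {"name": "faithfulness", "group": "answer_quality",
         "severity": 0.8, "violations_count": 2}
    print(round(gen._relevance_score(m), 2))  # 13.92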
+
+     @staticmethod
+     def _similarity(
+             metric_1: dict,
+             metric_2: dict,
+             method: str = "category") -> float:
+         """
+         Compute similarity between two metrics for diversity calculation in MMR.
+
+         Args:
+             metric_1 (dict): First metric dictionary
+             metric_2 (dict): Second metric dictionary
+             method (str, optional): Similarity calculation method. Defaults to "category".
+                 - "category": Returns 1.0 if metrics are in same group, 0.0 otherwise
+                 - "euclidean": Returns similarity based on Euclidean distance of
+                   violations_count and severity
+
+         Returns:
+             float: Similarity score between 0.0 (completely different) and 1.0 (identical)
+
+         Examples:
+             >>> m1 = {"group": "answer_quality", "violations_count": 2, "severity": 0.5}
+             >>> m2 = {"group": "answer_quality", "violations_count": 3, "severity": 0.6}
+             >>> InsightsGenerator._similarity(m1, m2, "category")
+             1.0  # Same group
+             >>> m3 = {"group": "content_safety", "violations_count": 2, "severity": 0.5}
+             >>> InsightsGenerator._similarity(m1, m3, "category")
+             0.0  # Different group
+         """
+         if method == "euclidean":
+             distance = ((metric_1.get("violations_count", 0) -
+                          metric_2.get("violations_count", 0)) ** 2 +
+                         (metric_1.get("severity", 0.0) -
+                          metric_2.get("severity", 0.0)) ** 2)
+             return 1.0 / (1.0 + math.sqrt(distance))
+         elif method == "category":
+             return 1.0 if metric_1.get(
+                 "group") == metric_2.get("group") else 0.0
+         return 0.0
+
+     def _compute_mmr_score(
+             self,
+             candidate: dict,
+             selected: List[dict],
+             lambda_val: float = 0.5) -> float:
+         """
+         Compute Maximal Marginal Relevance (MMR) score for a candidate metric.
+
+         MMR balances relevance and diversity by penalizing candidates that are too similar
+         to already selected metrics. The score is computed as:
+             MMR = λ * relevance + (1 - λ) * diversity
+
+         Args:
+             candidate (dict): Candidate metric to score
+             selected (List[dict]): List of already selected metrics
+             lambda_val (float, optional): Balance parameter between relevance and diversity.
+                 Defaults to 0.5.
+                 - Higher values (closer to 1.0) favor relevance
+                 - Lower values (closer to 0.0) favor diversity
+
+         Returns:
+             float: MMR score (higher values indicate better candidates considering both
+             relevance and diversity)
+
+         Raises:
+             ValueError: If selected list is empty
+
+         Note:
+             This method is used internally by select_top_k_metrics to ensure diverse
+             metric selection.
+         """
+         if not selected:
+             raise ValueError(
+                 "selected list cannot be empty for MMR computation")
+
+         rel = self._relevance_score(metric=candidate)
+         max_sim = max(
+             InsightsGenerator._similarity(
+                 metric_1=candidate,
+                 metric_2=s) for s in selected)
+         diversity = 1.0 - max_sim
+         score = lambda_val * rel + (1 - lambda_val) * diversity
+         return score
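With the default "category" similarity, diversity is binary: a candidate whose group is already represented gets diversity 0.0, anything else gets 1.0. At lambda_val = 0.5 a candidate therefore scores half its relevance, plus a flat 0.5 bonus only if it brings a new group. An illustrative sketch (same toy numbers as the selection example above):

    from ibm_watsonx_gov.utils.insights_generator import InsightsGenerator

    gen = InsightsGenerator([], top_k=3)
    selected = [{"name": "faithfulness", "group": "answer_quality",
                 "severity": 0.8, "violations_count": 0}]
    same_group = {"name": "answer_relevance", "group": "answer_quality",
                  "severity": 0.7, "violations_count": 0}
    new_group = {"name": "hap", "group": "content_safety",
                 "severity": 0.6, "violations_count": 0}
    print(round(gen._compute_mmr_score(same_group, selected), 2))  # 0.5 * 5.88 + 0.0 = 2.94
    print(round(gen._compute_mmr_score(new_group, selected), 2))   # 0.5 * 5.04 + 0.5 = 3.02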
+
+     @staticmethod
+     def _convert_to_langchain_model(
+             llm_model: Union[WxAIFoundationModel, OpenAIFoundationModel,
+                              AzureOpenAIFoundationModel, Any]
+     ) -> Any:
+         """
+         Convert a foundation model to a LangChain-compatible model.
+
+         Args:
+             llm_model: A FoundationModel instance
+                 (e.g., WxAIFoundationModel, OpenAIFoundationModel, AzureOpenAIFoundationModel)
+
+         Returns:
+             LangChain-compatible model with invoke() method
+
+         Raises:
+             Exception: If the provider type is not supported
+
+         Examples:
+             >>> # Using WxAIFoundationModel directly
+             >>> model = WxAIFoundationModel(model_id="ibm/granite-3-3-8b-instruct", project_id=PROJECT_ID)
+             >>> langchain_model = InsightsGenerator._convert_to_langchain_model(model)
+         """
+         # llm_model is expected to already be a foundation model instance
+         foundation_model = llm_model
+
+         provider_type = foundation_model.provider.type
+
+         if provider_type == ModelProviderType.IBM_WATSONX_AI:
+             if ChatWatsonx is None:
+                 raise ImportError(
+                     "langchain_ibm is required for WatsonX models. Install it with: pip install langchain-ibm")
+
+             parameters = {
+                 "decoding_method": "greedy",
+                 "max_new_tokens": 512,
+                 "min_new_tokens": 1,
+                 "stop_sequences": [".", "<|eom_id|>"]
+             }
+             return ChatWatsonx(
+                 model_id=foundation_model.model_id,
+                 url=foundation_model.provider.credentials.url,
+                 apikey=foundation_model.provider.credentials.api_key,
+                 project_id=foundation_model.project_id,
+                 params=parameters,
+             )
+         elif provider_type == ModelProviderType.AZURE_OPENAI:
+             if AzureChatOpenAI is None:
+                 raise ImportError(
+                     "langchain_openai is required for Azure OpenAI models. Install it with: pip install langchain-openai")
+
+             credentials = foundation_model.provider.credentials
+             model_id = foundation_model.model_name
+             azure_openapi_host = credentials.url
+             api_version = credentials.api_version
+             model_base = model_id.split("/")[-1].replace(".", "-")
+             azure_endpoint = \
+                 f'{azure_openapi_host}/openai/deployments/{model_base}/chat/completions?api-version={api_version}'
+             parameters = {"temperature": 0}
+             return AzureChatOpenAI(
+                 api_key=credentials.api_key,
+                 azure_endpoint=azure_endpoint,
+                 api_version=api_version,
+                 max_retries=2,
+                 **parameters
+             )
+         elif provider_type == ModelProviderType.OPENAI:
+             if ChatOpenAI is None:
+                 raise ImportError(
+                     "langchain_openai is required for OpenAI models. Install it with: pip install langchain-openai")
+
+             model_name = foundation_model.model_name
+             return ChatOpenAI(
+                 model=model_name,
+                 max_retries=2,
+                 temperature=0.0
+             )
+         else:
+             raise Exception(
+                 f"Unsupported provider type: {provider_type}. Supported types are: IBM_WATSONX_AI, AZURE_OPENAI, OPENAI")
764
+
765
+    def generate_structured_insights(self,
+                                     top_metrics: List[Any],
+                                     output_format: str = "html",
+                                     top_k: int = 3
+                                     ) -> Union[str, InsightsReport]:
+        """
+        Generate structured insights with top insights, root causes, and recommendations.
+
+        This method analyzes metrics and generates a comprehensive report including:
+        - Top K most significant insights
+        - Likely root causes
+        - Actionable recommendations
+
+        Args:
+            top_metrics (List[Any]): List of metric dictionaries or objects.
+            output_format (str, optional): Output format ("text", "json", or "html"). Defaults to "html".
+            top_k (int, optional): Number of insights to generate. Defaults to 3.
+
+        Returns:
+            Union[str, InsightsReport]: For "text" format, returns an InsightsReport object that displays
+            properly in Jupyter notebooks. For "html" and "json" formats, returns a formatted string.
+
+        Note:
+            For text format in Jupyter notebooks, the returned InsightsReport object will automatically
+            render with proper formatting. If you need the raw string, use str(result) or result.content.
+
+        Examples:
+            >>> metrics = [
+            ...     {"name": "latency", "value": 7.21, "group": "performance", "threshold": 3.0},
+            ...     {"name": "average_precision", "value": 0.0, "group": "retrieval_quality", "threshold": 0.7}
+            ... ]
+            >>> insights = generator.generate_structured_insights(metrics)
+            >>> # Plain-text report with five insights instead of three
+            >>> report = generator.generate_structured_insights(metrics, output_format="text", top_k=5)
+        """
+        import json as json_module
+
+        # Build structured input for LLM
+        llm_input = {
+            "top_metrics": [],
+            "summary_stats": {
+                "total_metrics": len(self.metrics),
+                "metrics_by_group": {}
+            }
+        }
+
+        # Process metrics
+        for metric in top_metrics:
+            if isinstance(metric, dict):
+                metric_dict = metric
+            elif hasattr(metric, '__dict__'):
+                metric_dict = metric.__dict__
+            else:
+                continue
+
+            # Get threshold value - either explicit or from percentiles
+            threshold_value = metric_dict.get("threshold")
+            if threshold_value is None and metric_dict.get("thresholds"):
+                threshold_value = metric_dict.get(
+                    "thresholds", [{}])[0].get("value")
+
+            metric_info = {
+                "name": metric_dict.get("name", "Unknown"),
+                "value": metric_dict.get("value"),
+                "group": metric_dict.get("group", "other"),
+                "mmr_score": metric_dict.get("mmr_score"),
+                "violations_count": metric_dict.get("violations_count", 0),
+                "threshold": threshold_value,
+                "applies_to": metric_dict.get("applies_to", "unknown"),
+                "node_name": metric_dict.get("node_name", "")
+            }
+            llm_input["top_metrics"].append(metric_info)
+
+            # Update group stats
+            group = metric_info["group"]
+            if group not in llm_input["summary_stats"]["metrics_by_group"]:
+                llm_input["summary_stats"]["metrics_by_group"][group] = 0
+            llm_input["summary_stats"]["metrics_by_group"][group] += 1
+
+        # Create comprehensive prompt
+        prompt = f"""
+        You are an analyst writing for engineering and product stakeholders (including business users).
+        Using ONLY the JSON below, produce a structured analysis with three sections:
+
+        1) Top {top_k} Insights:
+        - Provide exactly {top_k} key insights based on the top_metrics list, in the same order as provided
+        - Each insight should be 1-2 sentences, business-friendly (focus on user/customer impact)
+        - Mention the metric name, group, value, and threshold (if available)
+        - Explain the significance and potential impact on users
+        - Use specific numbers from the JSON
+
+        2) Likely Root Causes:
+        - Provide 3 concise bullet points of probable causes based on the metrics data
+        - Consider patterns across multiple metrics
+        - Be specific and actionable
+
+        3) Recommendations:
+        - Provide 4-6 actionable recommendations
+        - Prioritize by impact (first = highest priority)
+        - Be specific and include concrete next steps
+
+        DO NOT invent or change numbers — use only data present in the JSON.
+        Keep the analysis concise and actionable.
+
+        Structured data (do NOT modify):
+        {json_module.dumps(llm_input, indent=2)}
+        """
+
+        try:
+            # Generate insights using LLM
+            if isinstance(self.llm_model, (WxAIFoundationModel, OpenAIFoundationModel,
+                                           AzureOpenAIFoundationModel)):
+                try:
+                    from ibm_watsonx_gov.metrics.llm_validation.llm_validation_impl import \
+                        generate_llm_response
+
+                    # Convert foundation model to LangChain-compatible model
+                    langchain_model = InsightsGenerator._convert_to_langchain_model(
+                        self.llm_model)
+
+                    system_message = "You are a helpful, concise system reliability analyst."
+                    response = generate_llm_response(
+                        langchain_model,
+                        system_message,
+                        prompt
+                    )
+                except Exception as e:
+                    logger.warning(f"Error generating insights: {str(e)}")
+                    response = InsightsGenerator._generate_fallback_insights(
+                        llm_input, top_k)
+            else:
+                # For custom models with a generate() method
+                response = self.llm_model.generate(prompt).strip()
+
+            if output_format == "html":
+                return InsightsGenerator._format_structured_as_html(
+                    response, llm_input)
+            elif output_format == "json":
+                return InsightsGenerator._format_structured_as_json(
+                    response, llm_input)
+            else:
+                # Return InsightsReport object for proper Jupyter notebook display
+                text_content = InsightsGenerator._format_structured_as_text(
+                    response, llm_input)
+                return InsightsReport(text_content, format_type="text")
+
+        except Exception as e:
+            logger.error(f"Failed to generate structured insights: {str(e)}")
+            fallback_content = InsightsGenerator._generate_fallback_insights(
+                llm_input, top_k)
+            return InsightsReport(fallback_content, format_type="text")
+
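+    # Editorial sketch of calling the method above (assumes `generator` is an
+    # initialized InsightsGenerator and `metrics` a list of metric dicts as in
+    # the docstring; not part of the released module):
+    #
+    #     html_report = generator.generate_structured_insights(
+    #         top_metrics=metrics, output_format="html")
+    #     with open("insights.html", "w", encoding="utf-8") as f:
+    #         f.write(html_report)
+    #
+    #     text_report = generator.generate_structured_insights(
+    #         top_metrics=metrics, output_format="text")
+    #     print(str(text_report))  # or text_report.content for the raw string
+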
+    @staticmethod
+    def _generate_fallback_insights(llm_input: dict, top_k: int) -> str:
+        """Generate fallback insights when LLM generation fails."""
+        fallback = f"Top {top_k} Insights (Fallback):\n\n"
+        for i, m in enumerate(llm_input["top_metrics"], 1):
+            threshold_text = f", threshold: {m['threshold']}" if m.get(
+                'threshold') is not None else ""
+            fallback += f"{i}. {m['name']} ({m['group']}): value={m['value']}{threshold_text}\n"
+        return fallback
+
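+    # Shape of the fallback string built above, with illustrative values:
+    #
+    #     Top 3 Insights (Fallback):
+    #
+    #     1. latency (performance): value=7.21, threshold: 3.0
+    #     2. average_precision (retrieval_quality): value=0.0, threshold: 0.7
+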
+    @staticmethod
+    def _extract_list_items(text: str) -> List[str]:
+        """
+        Extract list items from text by splitting on numbered items or bullet points.
+
+        Args:
+            text: Text containing numbered or bulleted list items
+
+        Returns:
+            List of cleaned text items without markers
+        """
+        import re
+
+        items = re.split(
+            r'\n\s*(?=\d+[\.\)]\s+|-\s+|\*\s+|•\s+)', text)
+        cleaned_items = []
+        for item in items:
+            # Remove leading bullet/number markers
+            cleaned = re.sub(
+                r'^\s*(?:\d+[\.\)]\s*|-\s+|\*\s+|•\s+)', '', item.strip())
+            if cleaned:
+                cleaned_items.append(cleaned)
+        return cleaned_items
+
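+    # Worked example for _extract_list_items (illustrative input, traced
+    # against the regexes above):
+    #
+    #     >>> InsightsGenerator._extract_list_items(
+    #     ...     "1. High latency\n- Check the retriever\n* Tune top_k")
+    #     ['High latency', 'Check the retriever', 'Tune top_k']
+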
+    @staticmethod
+    def _parse_insights_sections(text: str) -> dict:
+        """
+        Parse the insights text into structured sections.
+
+        Args:
+            text: Raw insights text from LLM
+
+        Returns:
+            Dictionary with keys: top_insights, root_causes, recommendations
+        """
+        import re
+
+        sections = {
+            "top_insights": [],
+            "root_causes": [],
+            "recommendations": []
+        }
+
+        # Split by common section headers
+        top_insights_match = re.search(
+            r'(?:Top \d+ Insights?:|1\)\s*Top \d+ Insights?:)(.*?)(?=(?:Likely Root Causes?:|2\)|$))',
+            text, re.DOTALL | re.IGNORECASE)
+        root_causes_match = re.search(
+            r'(?:Likely Root Causes?:|2\)\s*Likely Root Causes?:)(.*?)(?=(?:Recommendations?:|3\)|$))',
+            text, re.DOTALL | re.IGNORECASE)
+        recommendations_match = re.search(
+            r'(?:Recommendations?:|3\)\s*Recommendations?:)(.*?)$',
+            text, re.DOTALL | re.IGNORECASE)
+
+        # Extract top insights
+        if top_insights_match:
+            insights_text = top_insights_match.group(1).strip()
+            sections["top_insights"] = InsightsGenerator._extract_list_items(
+                insights_text)
+
+        # Extract root causes
+        if root_causes_match:
+            causes_text = root_causes_match.group(1).strip()
+            sections["root_causes"] = InsightsGenerator._extract_list_items(
+                causes_text)
+
+        # Extract recommendations
+        if recommendations_match:
+            recs_text = recommendations_match.group(1).strip()
+            sections["recommendations"] = InsightsGenerator._extract_list_items(
+                recs_text)
+
+        return sections
+
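+    # Worked example for _parse_insights_sections (illustrative LLM output):
+    #
+    #     raw = ("1) Top 1 Insights:\n"
+    #            "1. Latency of 7.21s exceeds the 3.0s threshold.\n"
+    #            "2) Likely Root Causes:\n"
+    #            "- Slow retriever backend\n"
+    #            "3) Recommendations:\n"
+    #            "1. Add a timeout and cache hot queries")
+    #     parsed = InsightsGenerator._parse_insights_sections(raw)
+    #     # parsed["top_insights"] == ["Latency of 7.21s exceeds the 3.0s threshold."]
+    #     # parsed["root_causes"] == ["Slow retriever backend"]
+    #     # parsed["recommendations"] == ["Add a timeout and cache hot queries"]
+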
+    @staticmethod
+    def _wrap_text(text: str, width: int = 76, indent: str = "") -> str:
+        """
+        Wrap text to a specified width with optional indentation for continuation lines.
+
+        Args:
+            text: Text to wrap
+            width: Maximum line width (default: 76)
+            indent: Indentation string for continuation lines (default: "")
+
+        Returns:
+            Wrapped text with proper line breaks
+        """
+        import textwrap
+
+        # Use textwrap to handle the wrapping
+        wrapper = textwrap.TextWrapper(
+            width=width,
+            subsequent_indent=indent,
+            break_long_words=False,
+            break_on_hyphens=False
+        )
+
+        return wrapper.fill(text)
+
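+    # Behavior note for _wrap_text: textwrap indents only continuation lines
+    # (subsequent_indent), so callers prepend the first-line marker themselves,
+    # as the text formatter further below does. Illustrative call:
+    #
+    #     wrapped = InsightsGenerator._wrap_text("word " * 40, width=20,
+    #                                            indent="    ")
+    #     # every line after the first starts with four spaces
+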
+    @staticmethod
+    def _format_structured_as_html(insights_text: str, llm_input: dict) -> str:
+        """Format structured insights as an HTML report with proper bullet points."""
+        import html as pyhtml
+        from datetime import datetime, timezone
+
+        # Parse the insights
+        parsed = InsightsGenerator._parse_insights_sections(insights_text)
+
+        html_lines = [
+            "<html><head><meta charset='utf-8'><title>AI System Insights Report</title>",
+            "<style>",
+            "body { font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; }",
+            "h1 { color: #333; }",
+            "h2 { color: #666; margin-top: 30px; }",
+            "h3 { color: #888; margin-top: 20px; }",
+            "ul, ol { line-height: 1.8; margin-left: 20px; }",
+            "li { margin-bottom: 10px; }",
+            "table { border-collapse: collapse; width: 100%; margin-top: 20px; }",
+            "th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }",
+            "th { background-color: #f2f2f2; font-weight: bold; }",
+            ".metric-value { font-weight: bold; color: #d9534f; }",
+            ".metric-group { color: #5bc0de; font-style: italic; }",
+            ".section { margin-bottom: 30px; }",
+            "</style></head><body>",
+            "<h1>AI System Insights Report</h1>",
+            f"<p><em>Generated: {datetime.now(timezone.utc).isoformat()}</em></p>",
+            "<hr>"
+        ]
+
+        # Add Top Insights section
+        if parsed["top_insights"]:
+            html_lines.append("<div class='section'>")
+            html_lines.append(
+                f"<h2>Top {len(parsed['top_insights'])} Insights</h2>")
+            html_lines.append("<ol>")
+            for insight in parsed["top_insights"]:
+                html_lines.append(f"<li>{pyhtml.escape(insight)}</li>")
+            html_lines.append("</ol>")
+            html_lines.append("</div>")
+
+        # Add Root Causes section
+        if parsed["root_causes"]:
+            html_lines.append("<div class='section'>")
+            html_lines.append("<h2>Likely Root Causes</h2>")
+            html_lines.append("<ul>")
+            for cause in parsed["root_causes"]:
+                html_lines.append(f"<li>{pyhtml.escape(cause)}</li>")
+            html_lines.append("</ul>")
+            html_lines.append("</div>")
+
+        # Add Recommendations section
+        if parsed["recommendations"]:
+            html_lines.append("<div class='section'>")
+            html_lines.append("<h2>Recommendations</h2>")
+            html_lines.append("<ol>")
+            for rec in parsed["recommendations"]:
+                html_lines.append(f"<li>{pyhtml.escape(rec)}</li>")
+            html_lines.append("</ol>")
+            html_lines.append("</div>")
+
+        # If parsing failed, fall back to raw text
+        if not any([parsed["top_insights"], parsed["root_causes"], parsed["recommendations"]]):
+            html_lines.append("<div class='section'>")
+            html_lines.append("<pre>")
+            html_lines.append(pyhtml.escape(insights_text))
+            html_lines.append("</pre>")
+            html_lines.append("</div>")
+
+        html_lines.append("<hr>")
+        html_lines.append("<h2>Summary Statistics</h2>")
+        html_lines.append(
+            f"<p>Total metrics analyzed: <strong>{llm_input['summary_stats']['total_metrics']}</strong></p>")
+        html_lines.append("<h3>Metrics by Group</h3>")
+        html_lines.append("<table><tr><th>Group</th><th>Count</th></tr>")
+
+        for group, count in llm_input['summary_stats']['metrics_by_group'].items():
+            html_lines.append(
+                f"<tr><td>{pyhtml.escape(group)}</td><td>{count}</td></tr>")
+
+        html_lines.append("</table>")
+        html_lines.append("<h3>Top Metrics Details</h3>")
+
+        # Check if any metrics have applies_to='node' to decide whether to show the node_name column
+        has_node_metrics = any(m.get('applies_to') == 'node'
+                               for m in llm_input['top_metrics'])
+
+        # Build table header based on whether we have node metrics
+        if has_node_metrics:
+            html_lines.append(
+                "<table><tr><th>Metric</th><th>Group</th><th>Node Name</th><th>Value</th><th>Threshold</th><th>Violations</th></tr>")
+        else:
+            html_lines.append(
+                "<table><tr><th>Metric</th><th>Group</th><th>Value</th><th>Threshold</th><th>Violations</th></tr>")
+
+        for m in llm_input['top_metrics']:
+            # Treat a missing or None threshold as "N/A" for display
+            threshold_val = m.get('threshold')
+            if threshold_val is None:
+                threshold_val = 'N/A'
+            applies_to = m.get('applies_to', 'unknown')
+            node_name = m.get('node_name', '')
+
+            # Build row based on whether we're showing the node_name column
+            if has_node_metrics:
+                # Only show node_name value if applies_to is 'node'
+                node_name_display = pyhtml.escape(
+                    node_name) if applies_to == 'node' and node_name else '-'
+                html_lines.append(
+                    f"<tr><td>{pyhtml.escape(str(m['name']))}</td>"
+                    f"<td class='metric-group'>{pyhtml.escape(str(m['group']))}</td>"
+                    f"<td>{node_name_display}</td>"
+                    f"<td class='metric-value'>{pyhtml.escape(str(m['value']))}</td>"
+                    f"<td>{pyhtml.escape(str(threshold_val))}</td>"
+                    f"<td>{m.get('violations_count', 0)}</td></tr>")
+            else:
+                html_lines.append(
+                    f"<tr><td>{pyhtml.escape(str(m['name']))}</td>"
+                    f"<td class='metric-group'>{pyhtml.escape(str(m['group']))}</td>"
+                    f"<td class='metric-value'>{pyhtml.escape(str(m['value']))}</td>"
+                    f"<td>{pyhtml.escape(str(threshold_val))}</td>"
+                    f"<td>{m.get('violations_count', 0)}</td></tr>")
+
+        html_lines.append("</table></body></html>")
+        return "\n".join(html_lines)
+
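+    # Editorial sketch: rendering the HTML string inline in a notebook
+    # (assumes IPython is available in the environment, and `generator` and
+    # `metrics` as in the earlier sketch):
+    #
+    #     from IPython.display import HTML, display
+    #     display(HTML(generator.generate_structured_insights(
+    #         top_metrics=metrics, output_format="html")))
+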
+    @staticmethod
+    def _format_structured_as_json(insights_text: str, llm_input: dict) -> str:
+        """Format structured insights as JSON with cleaned formatting."""
+        import json as json_module
+        from datetime import datetime, timezone
+
+        # Parse the insights
+        parsed_insights = InsightsGenerator._parse_insights_sections(
+            insights_text)
+
+        # Clean up top_metrics by removing newlines from node_name
+        cleaned_metrics = []
+        for metric in llm_input["top_metrics"]:
+            cleaned_metric = metric.copy()
+            if "node_name" in cleaned_metric and cleaned_metric["node_name"]:
+                # Replace newlines and runs of whitespace with a single space
+                cleaned_metric["node_name"] = ' '.join(
+                    cleaned_metric["node_name"].split())
+            cleaned_metrics.append(cleaned_metric)
+
+        # Create JSON structure with parsed insights
+        output = {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "insights": {
+                "top_insights": parsed_insights["top_insights"],
+                "root_causes": parsed_insights["root_causes"],
+                "recommendations": parsed_insights["recommendations"]
+            },
+            "summary_stats": llm_input["summary_stats"],
+            "top_metrics": cleaned_metrics
+        }
+
+        return json_module.dumps(output, indent=2, ensure_ascii=False)
+
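+    # Abbreviated shape of the JSON document produced above (values
+    # illustrative):
+    #
+    #     {
+    #       "generated_at": "2025-06-01T12:00:00+00:00",
+    #       "insights": {
+    #         "top_insights": ["..."],
+    #         "root_causes": ["..."],
+    #         "recommendations": ["..."]
+    #       },
+    #       "summary_stats": {"total_metrics": 2,
+    #                         "metrics_by_group": {"performance": 1, "retrieval_quality": 1}},
+    #       "top_metrics": [{"name": "latency", "value": 7.21, "...": "..."}]
+    #     }
+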
+    @staticmethod
+    def _format_structured_as_text(insights_text: str, llm_input: dict) -> str:
+        """Format structured insights as plain text with proper formatting for Jupyter notebooks."""
+        from datetime import datetime, timezone
+
+        # Parse the insights
+        parsed = InsightsGenerator._parse_insights_sections(insights_text)
+
+        lines = [
+            "=" * 80,
+            "AI System Insights Report",
+            "=" * 80,
+            f"Generated: {datetime.now(timezone.utc).isoformat()}",
+            ""
+        ]
+
+        # Add Top Insights section
+        if parsed["top_insights"]:
+            lines.append(f"1) Top {len(parsed['top_insights'])} Insights:")
+            lines.append("")
+            for i, insight in enumerate(parsed["top_insights"], 1):
+                # Wrap long lines for better readability
+                wrapped_insight = InsightsGenerator._wrap_text(
+                    insight, width=76, indent="     ")
+                lines.append(f"  {i}. {wrapped_insight}")
+            lines.append("")
+
+        # Add Root Causes section
+        if parsed["root_causes"]:
+            lines.append("2) Likely Root Causes:")
+            lines.append("")
+            for cause in parsed["root_causes"]:
+                wrapped_cause = InsightsGenerator._wrap_text(
+                    cause, width=76, indent="    ")
+                lines.append(f"  - {wrapped_cause}")
+            lines.append("")
+
+        # Add Recommendations section
+        if parsed["recommendations"]:
+            lines.append("3) Recommendations:")
+            lines.append("")
+            for i, rec in enumerate(parsed["recommendations"], 1):
+                wrapped_rec = InsightsGenerator._wrap_text(
+                    rec, width=76, indent="     ")
+                lines.append(f"  {i}) {wrapped_rec}")
+            lines.append("")
+
+        # If parsing failed, fall back to raw text
+        if not any([parsed["top_insights"], parsed["root_causes"], parsed["recommendations"]]):
+            lines.append(insights_text)
+            lines.append("")
+
+        lines.extend([
+            "=" * 80,
+            "Summary Statistics",
+            "=" * 80,
+            f"Total metrics analyzed: {llm_input['summary_stats']['total_metrics']}",
+            ""
+        ])
+
+        # Metrics by Group
+        if llm_input['summary_stats']['metrics_by_group']:
+            lines.append("Metrics by Group:")
+            for group, count in llm_input['summary_stats']['metrics_by_group'].items():
+                lines.append(f"  - {group}: {count}")
+            lines.append("")
+
+        # Top Metrics Details
+        lines.extend([
+            "Top Metrics Details:",
+            "-" * 80
+        ])
+
+        for m in llm_input['top_metrics']:
+            # Treat a missing or None threshold as "N/A" for display
+            threshold_val = m.get('threshold')
+            if threshold_val is None:
+                threshold_val = 'N/A'
+            applies_to = m.get('applies_to', 'unknown')
+            node_name = m.get('node_name', '')
+
+            # Clean node name by collapsing newlines and extra spaces
+            if node_name:
+                node_name = ' '.join(node_name.split())
+
+            # Include the node name only for node-level metrics
+            if applies_to == 'node' and node_name:
+                lines.append(
+                    f"  • {m['name']} ({m['group']}) [Node: {node_name}]:")
+            else:
+                lines.append(
+                    f"  • {m['name']} ({m['group']}):")
+            lines.append(
+                f"      value={m['value']}, threshold={threshold_val}, violations={m.get('violations_count', 0)}")
+
+        lines.append("=" * 80)
+        return "\n".join(lines)