ibm_watsonx_gov-1.3.3-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py
@@ -0,0 +1,258 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADP Schedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ import copy
+ from typing import Annotated, List, Literal
+
+ import pandas as pd
+
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+
+ from lazy_imports import LazyModule, load
+ try:
+     # Create lazy module for LangChain imports
+     langchain_imports = LazyModule(
+         "from langchain_ibm import ChatWatsonx",
+         "from langchain_openai import AzureChatOpenAI",
+         "from langchain_openai import ChatOpenAI",
+         name="lazy_langchain_imports"
+     )
+     load(langchain_imports)
+
+     # Create aliases
+     ChatWatsonx = langchain_imports.ChatWatsonx
+     AzureChatOpenAI = langchain_imports.AzureChatOpenAI
+     ChatOpenAI = langchain_imports.ChatOpenAI
+ except ImportError:
+     ChatWatsonx = None
+     AzureChatOpenAI = None
+     ChatOpenAI = None
+     import warnings
+     warnings.warn("LangChain dependencies not available")
+
+ from pydantic import Field
+
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
+ from ibm_watsonx_gov.entities.enums import ModelProviderType, TaskType
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
+                                                         RecordMetricResult)
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.metrics.llm_validation.evaluation_criteria import (
+     EvaluationCriteria, get_default_evaluation_criteria)
+ from ibm_watsonx_gov.metrics.llm_validation.llm_validation_constants import (
+     LLMValidation, LLMValidationFields)
+ from ibm_watsonx_gov.metrics.llm_validation.llm_validation_impl import (
+     generate_issues_and_map_to_records, llm_validation_per_record,
+     reverse_mapping)
+
+
+ def get_prompt_field(configuration: GenAIConfiguration, available_fields=None):
+     if available_fields is None:
+         available_fields = []
+     prompt_field = configuration.prompt_field
+     if not prompt_field:
+         prompt_field = LLMValidationFields.INPUT_FIELD.value
+     if not prompt_field:
+         raise ValueError("Model input not found in data")
+     if available_fields and prompt_field not in available_fields:
+         raise ValueError(
+             f"prompt_field {prompt_field} not found in data. available fields: {available_fields}")
+     return prompt_field
+
+
+ class LLMValidationMetric(GenAIMetric):
+     """Defines the implementation for computing the LLMValidation metric.
+
+     .. code-block:: python
+
+         from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel
+         llm_judge = LLMJudge(model=WxAIFoundationModel(model_id="model_id"))
+
+     .. code-block:: python
+
+         metric = LLMValidationMetric(llm_judge=llm_judge)
+     """
+     name: Annotated[Literal["llm_validation"],
+                     Field(default=LLMValidation)]
+     tasks: Annotated[list[TaskType], Field(
+         default=[TaskType.RAG, TaskType.SUMMARIZATION])]
+     thresholds: Annotated[list[MetricThreshold], Field(default=[MetricThreshold(
+         type="lower_limit", value=0.7)])]
+     method: Annotated[Literal["llm_as_judge"],
+                       Field(description="The method used to compute the metric.",
+                             default="llm_as_judge")]
+     llm_judge: Annotated[LLMJudge | None, Field(
+         description="The LLM judge used to compute the metric.")]
+     evaluation_criteria: Annotated[EvaluationCriteria | None, Field(
+         description="Evaluation criteria for the metric computation", default_factory=get_default_evaluation_criteria)]
+
+     def evaluate(self, data: pd.DataFrame,
+                  configuration: GenAIConfiguration | AgenticAIConfiguration,
+                  **kwargs) -> AggregateMetricResult:
+         record_level_metrics = self.get_record_level_metrics(
+             data, configuration)
+         aggregated_results = self.get_aggregated_results_from_individual_results(
+             record_level_metrics)
+         return aggregated_results
+
+     def get_record_level_metrics(self, data: pd.DataFrame | dict,
+                                  configuration: GenAIConfiguration | AgenticAIConfiguration) \
+             -> List[RecordMetricResult]:
+         # generate evaluator llm
+         llm = self.generate_evaluating_model()
+
+         # prepare the data
+         eval_df = copy.deepcopy(data)
+         prompt_field = get_prompt_field(
+             configuration, available_fields=list(eval_df.columns))
+         eval_df[LLMValidationFields.INPUT_FIELD.value] = eval_df.apply(
+             lambda r: r[prompt_field], axis=1)
+         eval_df = eval_df.fillna("")
+         eval_df[LLMValidationFields.OUTPUT_FIELD.value] = eval_df.apply(lambda r: "\n".join([r[output_field]
+                                                                                              for output_field in
+                                                                                              configuration.output_fields]),
+                                                                         axis=1)
+
+         # call the per-record evaluating function
+         eval_df = llm_validation_per_record(
+             df=eval_df,
+             llm=llm,
+             input_col=LLMValidationFields.INPUT_FIELD.value,
+             output_col=LLMValidationFields.OUTPUT_FIELD.value,
+             text_col=LLMValidationFields.TEXT_FIELD.value,
+             score_col=LLMValidationFields.SCORE_FIELD.value,
+             summary_col=LLMValidationFields.SUMMARY_FIELD.value,
+             evaluation_criteria=self.evaluation_criteria
+         )
+
+         record_level_metrics = []
+         for _, row in eval_df.iterrows():
+             record_level_metrics.append(
+                 RecordMetricResult(
+                     name=self.name,
+                     method=self.method,
+                     provider="",
+                     value=row[LLMValidationFields.SCORE_FIELD.value],
+                     record_id=row[configuration.record_id_field],
+                     additional_info={
+                         LLMValidationFields.TEXT_FIELD.value: row[LLMValidationFields.TEXT_FIELD.value],
+                         LLMValidationFields.SUMMARY_FIELD.value: row[LLMValidationFields.SUMMARY_FIELD.value],
+                         LLMValidationFields.RECURRING_ISSUE_FIELD.value: "",
+                         LLMValidationFields.RECURRING_ISSUE_IDS_FIELD.value: ""
+                     },
+                     thresholds=self.thresholds,
+                 )
+             )
+
+         return record_level_metrics
+
+     def get_aggregated_results_from_individual_results(self, record_level_metrics: List[RecordMetricResult]) \
+             -> AggregateMetricResult:
+         summaries_list = [r.additional_info[LLMValidationFields.SUMMARY_FIELD.value]
+                           # TODO: use and map only records with score < 1
+                           if r.value is not None and r.value < 1 else ""
+                           for r in record_level_metrics]
+         llm = self.generate_evaluating_model()
+         recurring_issues_to_record_ids = generate_issues_and_map_to_records(
+             summaries_list=summaries_list,
+             llm=llm,
+         )
+         recurring_issues = list(recurring_issues_to_record_ids.keys())
+         record_to_matching_issues_ids = reverse_mapping(
+             recurring_issues_to_record_ids)
+
+         for i, r in enumerate(record_level_metrics):
+             matching_issues_ids = record_to_matching_issues_ids.get(i, [])
+             matching_issues = [recurring_issues[i]
+                                for i in matching_issues_ids]
+             r.additional_info[LLMValidationFields.RECURRING_ISSUE_IDS_FIELD.value] = matching_issues_ids
+             r.additional_info[LLMValidationFields.RECURRING_ISSUE_FIELD.value] = matching_issues
+
+         values = [
+             record.value for record in record_level_metrics if record.value is not None]
+         mean = sum(values) / len(values)
+         evaluation_criteria = self.evaluation_criteria.to_dict(
+         ) if self.evaluation_criteria else {}
+         recurring_issues_count = {
+             k: len(v) for k, v in recurring_issues_to_record_ids.items()}
+
+         aggregate_result = AggregateMetricResult(
+             name=self.name,
+             method=self.method,
+             provider="",
+             value=mean,
+             total_records=len(record_level_metrics),
+             record_level_metrics=record_level_metrics,
+             min=min(values),
+             max=max(values),
+             mean=mean,
+             thresholds=self.thresholds,
+             additional_info={"recurring_issues": recurring_issues_to_record_ids,
+                              "evaluation_criteria": evaluation_criteria,
+                              "recurring_issues_count": recurring_issues_count,
+                              }
+         )
+
+         return aggregate_result
+
+     def generate_evaluating_model(self):
+         provider_type = self.llm_judge.model.provider.type
+         if provider_type == ModelProviderType.IBM_WATSONX_AI:
+             parameters = {
+                 "decoding_method": "greedy",
+                 "max_new_tokens": 512,
+                 "min_new_tokens": 1,
+                 "stop_sequences": [".", "<|eom_id|>"],
+                 "enable-auto-tool-choice": False,
+                 "tool-call-parser": False
+             }
+             return ChatWatsonx(
+                 model_id=self.llm_judge.model.model_id,
+                 url=self.llm_judge.model.provider.credentials.url,
+                 apikey=self.llm_judge.model.provider.credentials.api_key,
+                 project_id=self.llm_judge.model.project_id,
+                 params=parameters,
+             )
+         elif provider_type == ModelProviderType.AZURE_OPENAI:
+             credentials = self.llm_judge.model.provider.credentials
+             model_id = self.llm_judge.model.model_name
+             azure_openapi_host = credentials.url
+             api_version = credentials.api_version
+             model_base = model_id.split("/")[-1].replace(".", "-")
+             azure_endpoint = \
+                 f'{azure_openapi_host}/openai/deployments/{model_base}/chat/completions?api-version={api_version}'
+             parameters = {"temperature": 0}
+             return AzureChatOpenAI(api_key=credentials.api_key,
+                                    azure_endpoint=azure_endpoint,
+                                    api_version=api_version,
+                                    max_retries=2,
+                                    **parameters
+                                    )
+         elif provider_type == ModelProviderType.RITS:
+             credentials = self.llm_judge.model.provider.credentials
+             judge_model_id = self.llm_judge.model.model_name
+             model_base = judge_model_id.split("/")[-1].replace(".", "-")
+             rits_base_url = f'{credentials.hostname}/{model_base}/v1'
+             return ChatOpenAI(
+                 model=judge_model_id,
+                 api_key='/',
+                 base_url=rits_base_url,
+                 default_headers={'RITS_API_KEY': credentials.api_key},
+                 max_retries=2,
+                 temperature=0.0
+             )
+         elif provider_type == ModelProviderType.OPENAI:
+             model_name = self.llm_judge.model.model_name
+             return ChatOpenAI(
+                 model=model_name,
+                 max_retries=2,
+                 temperature=0.0
+             )
+         raise Exception(f"Unknown provider type {provider_type}.")
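Evaluation with this class is a two-step flow: per-record judging via llm_validation_per_record, then aggregation plus recurring-issue synthesis. The following is a minimal usage sketch, not taken verbatim from the package: the column names (question, generated_text, record_id) and the GenAIConfiguration constructor keywords are assumptions; only the import paths, the LLMJudge/WxAIFoundationModel setup from the docstring, and the evaluate(data=..., configuration=...) signature come from the diff itself.

# Hypothetical usage sketch of LLMValidationMetric; column names and the
# GenAIConfiguration keyword arguments below are illustrative assumptions.
import pandas as pd

from ibm_watsonx_gov.config import GenAIConfiguration
from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
from ibm_watsonx_gov.metrics.llm_validation.llm_validation_metric import LLMValidationMetric

llm_judge = LLMJudge(model=WxAIFoundationModel(model_id="model_id"))
metric = LLMValidationMetric(llm_judge=llm_judge)

data = pd.DataFrame({
    "record_id": ["r1", "r2"],
    "question": ["What is the return policy?", "How do I reset my password?"],
    "generated_text": ["Returns are accepted within 30 days.", "Use the reset link on the sign-in page."],
})

# The metric reads configuration.prompt_field, configuration.output_fields and
# configuration.record_id_field (see get_record_level_metrics above); whether
# they are passed exactly like this at construction time is an assumption.
configuration = GenAIConfiguration(
    prompt_field="question",
    output_fields=["generated_text"],
    record_id_field="record_id",
)

result = metric.evaluate(data=data, configuration=configuration)
print(result.mean, result.total_records)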
ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py
@@ -0,0 +1,106 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADP Schedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ # --- Prompt Templates ---
+ full_response_eval_system_prompt = """You are an impartial judge evaluating the quality of an AI model's response. You will receive:
+
+ Input: The text the model was asked to process or respond to.
+ Output: The model's response text.
+ Your task is to score the model's response on a scale of 0 to 1, considering the following criteria.
+ You may also consider other relevant factors that contribute to the overall quality of the response.
+
+ Evaluation Criteria:
+ {evaluation_criteria}
+
+ Provide a score from 0 to 1 and explain your reasoning clearly and concisely. End the response with 'Final Score: <score>' (e.g., 'Final Score: 0.7').
+ """
+
+ full_response_eval_human_prompt = """The response to analyze:
+ Input: '{model_input}'
+ Output: '{model_output}'
+
+ --- Begin Evaluation ---
+ Textual Evaluation: [Your textual evaluation here]
+ Evaluation score: [Your score here]
+ """
+
+ summarization_system_prompt = """
+ You are given an evaluation text produced by a judge model. Summarize the text in a few sentences.
+ Focus on the core reasoning for the score and the final score itself.
+ Remove redundancies and make it concise while keeping the essential information.
+ Disregard the score given by the model and focus on the textual feedback.
+ """
+
+ summarization_human_prompt = """
+ Evaluation Text to Summarize:
+ {evaluation_text}
+ """
+
+ map_shortcomings_system_prompt = """You are an expert analyst reviewing evaluation feedback for AI model responses.
+ Your task is to determine which of the following common shortcomings are mentioned or implied in the evaluation text.
+ The shortcoming must be included with negative sentiment. For instance, if the shortcoming is
+ "The answer lacks factual accuracy" and the evaluation text mentions "The answer is factually accurate", then the shortcoming is NOT mentioned in the evaluation text.
+
+ Analyze the evaluation text and determine which shortcomings are present.
+ For each shortcoming, respond with a 1 if it is mentioned or implied, or 0 if it is not mentioned.
+ Your response should be a Python list of {num_shortcomings} binary values (0 or 1).
+ For example: [1,0,0,1,0,0,0] would mean shortcomings 1 and 4 are present, and the others are not.
+ Respond ONLY with the list in the format [0,1,0,...] with no additional text.
+ """
+
+ map_shortcomings_human_prompt = """
+ The shortcomings list to analyze:
+ {shortcomings_list}
+
+ Evaluation text to analyze:
+ {eval_text}
+
+ Which shortcomings (1-{num_shortcomings}) are mentioned or implied in this evaluation? Respond with a Python list of {num_shortcomings} binary values (0 or 1) in the format [0,1,0,...].
+ """
+
+ recurrent_issues_synthesis_system_prompt = """You are an expert analyst tasked with identifying common themes in evaluation feedback for an AI model's answers. Below is a collection of evaluation texts assessing the quality of different answers.
+ Your goal is to identify the most significant and frequent types of shortcomings or negative feedback mentioned in these evaluations. Please provide a list of concise phrases describing these common issues. Focus on actionable feedback points that could help improve the model's responses.
+
+ Guidelines for identifying shortcomings:
+ 1. Look for patterns across multiple evaluations
+ 2. Focus on specific, actionable issues rather than general complaints
+ 3. Consider both content-related issues (accuracy, completeness) and presentation issues (clarity, structure)
+ 4. Prioritize issues that appear frequently or have significant impact
+ 5. Be specific but concise in your descriptions
+ 6. Ensure the issues are distinct and not overlapping.
+
+ Do NOT list positive feedback. Focus only on areas for improvement or reasons for lower scores.
+ Present the output ONLY as a Python list of strings. Your response MUST start with '[' and end with ']'.
+ """
+
+ recurrent_issues_synthesis_human_prompt = """--- Begin Evaluation Texts ---
+ {concatenated_evaluation_text}
+ --- End Evaluation Texts ---
+
+ Synthesized List of Common Shortcomings (Python List format ONLY):
+ """
+
+ shortcomings_clustering_system_prompt = """You are given a list of short action items that describe recurring issues found in responses generated by a language model. These items may contain duplicates or very similar entries phrased differently.
+ Your task is to analyze the list, remove duplicates and consolidate redundant items into a smaller set of distinct, clearly described issues.
+ Instructions:
+ - Group nearly identical feedback items that refer to the same concerns.
+ - If there are two issues assessing different aspects of the same topic - do not merge them.
+ - Do not merge issues with the same topic but opposite concerns (e.g: overly verbose / not verbose enough).
+ - For each group, write a single and clear issue that captures the common idea.
+ - Ensure that each issue addresses only a single concern or aspect. Do not merge distinct issues with related topics.
+ - Ensure that the final list avoids redundancy and represents the full variety of distinct concerns from the original list.
+ - Ensure no important information is lost from the original list — all key concerns must be preserved.
+ - Explain your reasoning for each consolidation decision (e.g., which items were grouped together and why).
+ - Finish your response by outputting 'Final list:' and then a properly formatted Python list of strings (with each element in double quotes) containing the consolidated issues.
+ """
+
+ shortcomings_clustering_human_prompt = """Now process the following list:
+ {recurring_issues_list}
+ """
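These module-level templates are plain Python format strings; their placeholders ({evaluation_criteria}, {model_input}, {model_output}, {num_shortcomings}, and so on) are filled by the implementation in llm_validation_impl.py, whose content is not shown in this section. Below is a small illustrative sketch of rendering one of the templates and parsing the 'Final Score:' line back out of a judge reply; the regular expression is an assumption for illustration, not the package's actual parser.

# Illustrative sketch: rendering the templates with str.format and recovering the
# score. The parsing regex is an assumption; the real parsing lives in
# llm_validation_impl.py, which is not shown in this diff section.
import re

from ibm_watsonx_gov.metrics.llm_validation.llm_validation_prompts import (
    full_response_eval_human_prompt, full_response_eval_system_prompt)

system_message = full_response_eval_system_prompt.format(
    evaluation_criteria="- Is the answer factually accurate?\n- Is the answer complete?")
human_message = full_response_eval_human_prompt.format(
    model_input="What is the capital of France?",
    model_output="The capital of France is Paris.")

# The judge is instructed to end its reply with 'Final Score: <score>'.
reply = "The answer is correct and concise. Final Score: 1.0"
match = re.search(r"Final Score:\s*([01](?:\.\d+)?)", reply)
score = float(match.group(1)) if match else None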
ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py
@@ -0,0 +1,298 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADP Schedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ import re
+ from typing import Annotated, Literal, Optional, Self
+
+ import pandas as pd
+ from pydantic import Field, field_validator, model_validator
+
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
+ from ibm_watsonx_gov.entities.criteria import CriteriaCatalog, Option
+ from ibm_watsonx_gov.entities.enums import (MetricGroup, MetricType,
+                                             MetricValueType, TaskType)
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+
+ try:
+     from ibm_watsonx_gov.providers.eval_assist_provider import (
+         VARIABLES_PATTERN, EvalAssistProvider)
+ except:
+     pass
+
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
+ from ibm_watsonx_gov.utils.constants import CUSTOM_TYPE
+
+
+ class LLMAsJudgeMetric(GenAIMetric):
+     """
+     Defines the LLMAsJudge metric class.
+
+     The LLMAsJudge metric evaluates the model input and output text against the provided criteria or grader prompt using a judge LLM.
+
+     Examples:
+         1. Create LLMAsJudge metric with a user defined grader prompt.
+             .. code-block:: python
+
+                 # Define LLM Judge using watsonx.ai
+                 # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
+                 llm_judge = LLMJudge(model=WxAIFoundationModel(
+                     model_id="llama-3-3-70b-instruct",
+                     project_id="<PROJECT_ID>"))
+                 prompt_template = "You are presented with a response generated subject to a context.\\nContext: \\n {context} \\n Response: {response} \\n. Is the response faithful according to context?\\nChoose an option:\\n- 'Yes' if The response is faithful according to context.\\n- 'No' if The response is not faithful according to context."
+                 options = ["Yes", "No"]
+                 # Optionally the numeric mapping for the string option can be specified as below
+                 # options = {"Yes": 1, "No": 0}
+                 metric = LLMAsJudgeMetric(llm_judge=llm_judge,
+                                           prompt_template=prompt_template,
+                                           options=options)
+                 evaluator = MetricsEvaluator()
+                 evaluation_result = evaluator.evaluate(data=data,
+                                                        metrics=[metric])
+
+         2. Create an LLMAsJudge metric using the predefined criteria provided in the IBM watsonx.governance SDK's criteria catalog.
+             .. code-block:: python
+
+                 # Define LLM Judge using watsonx.ai
+                 # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
+                 llm_judge = LLMJudge(model=WxAIFoundationModel(
+                     model_id="llama-3-3-70b-instruct",
+                     project_id="<PROJECT_ID>"))
+
+                 # Display the catalog
+                 CriteriaCatalog.display_criteria_catalog(CriteriaCatalog.get_criteria())
+
+                 # Initialize the LLMAsJudgeMetric with any of the available criteria.
+                 metric = LLMAsJudgeMetric(name="conciseness",
+                                           output_field="generated_text",
+                                           llm_judge=llm_judge)
+                 evaluator = MetricsEvaluator()
+                 evaluation_result = evaluator.evaluate(data=data,
+                                                        metrics=[metric])
+
+         3. Create LLMAsJudge metric with user defined criteria and default options. It is recommended to provide the options along with their descriptions, as shown in the next example, for better accuracy.
+             .. code-block:: python
+
+                 # Define LLM Judge using watsonx.ai
+                 # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
+                 llm_judge = LLMJudge(model=WxAIFoundationModel(
+                     model_id="llama-3-3-70b-instruct",
+                     project_id="<PROJECT_ID>"))
+                 criteria_description = "Is the {generated_text} faithful according to {context}?"
+                 # When using the criteria description, it is required to specify the output field if it is other than generated_text.
+                 metric = LLMAsJudgeMetric(name="factuality",
+                                           llm_judge=llm_judge,
+                                           criteria_description=criteria_description,
+                                           # output_field="generated_text"
+                                           )
+                 evaluator = MetricsEvaluator()
+                 evaluation_result = evaluator.evaluate(data=data,
+                                                        metrics=[metric])
+
+         4. Create LLMAsJudge metric with user defined criteria and options.
+             .. code-block:: python
+
+                 # Define LLM Judge using watsonx.ai
+                 # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
+                 llm_judge = LLMJudge(model=WxAIFoundationModel(
+                     model_id="llama-3-3-70b-instruct",
+                     project_id="<PROJECT_ID>"))
+                 criteria_description = "Is the {response} faithful according to {context}?"
+                 options = [Option(name="Yes",
+                                   description="The {response} is faithful according to {context}.",
+                                   value=1.0),
+                            Option(name="No",
+                                   description="The {response} is not faithful according to {context}.",
+                                   value=0.0)]
+                 # When using the criteria description, it is required to specify the output field if it is other than generated_text.
+                 metric = LLMAsJudgeMetric(name="factuality",
+                                           llm_judge=llm_judge,
+                                           criteria_description=criteria_description,
+                                           options=options,
+                                           output_field="response")
+                 evaluator = MetricsEvaluator()
+                 evaluation_result = evaluator.evaluate(data=data,
+                                                        metrics=[metric])
+     """
+     name: Annotated[str,
+                     Field(title="Name",
+                           description="The llm as judge metric name. The name should be in lower snake case format.")]
+     display_name: Annotated[Optional[str],
+                             Field(title="Display Name",
+                                   description="The llm as judge metric display name. If not specified, it is derived from the name.",
+                                   default=None)]
+     type_: Annotated[CUSTOM_TYPE, Field(title="Metric type",
+                                         description="The type of the metric.",
+                                         serialization_alias="type",
+                                         default=MetricType.CUSTOM.value,
+                                         frozen=True,
+                                         examples=[MetricType.CUSTOM.value])]
+     value_type: Annotated[str, Field(title="Metric value type",
+                                      description="The type of the metric value. Indicates whether the metric value is numeric or categorical. The default value is categorical.",
+                                      serialization_alias="type", default=MetricValueType.CATEGORICAL.value,
+                                      examples=MetricValueType.values())]
+     llm_judge: Annotated[LLMJudge,
+                          Field(title="LLM Judge",
+                                description="The LLM judge to be used for evaluation.")]
+     criteria_description: Annotated[Optional[str],
+                                     Field(title="Criteria Description",
+                                           description="The description of the evaluation criteria used to compute the metric.",
+                                           examples=[
+                                               "Is the {response} concise and to the point?"],
+                                           default=None)]
+     prompt_template: Annotated[Optional[str],
+                                Field(title="Prompt Template",
+                                      description="The grader prompt template used to compute the metric.",
+                                      default=None,
+                                      examples=["You are an expert grader. Your job is to evaluate how factually grounded an AI-generated answer is based on a given context. \n ## Grading Scale: \n Rate the answer either Yes or No:"])]
+     options: Annotated[list[Option] | list[dict] | list[str] | dict,
+                        Field(title="Options",
+                              description="The list of options of the judge response.",
+                              default=[Option(name="Yes",
+                                              value=1.0),
+                                       Option(name="No",
+                                              value=0.0)],
+                              examples=[["Yes", "No"], [{"name": "Yes", "value": 1}, {"name": "No", "value": 0}], [{"name": "Yes", "value": 1, "description": ""}, {"name": "No", "value": 0, "description": ""}]]),
+                        ]
+     output_field: Annotated[Optional[str], Field(title="Output Field",
+                                                  description="The model generated output field in the data. This is required when providing the criteria description. Default value is 'generated_text'.",
+                                                  default="generated_text",
+                                                  examples=["output"])]
+     group: Annotated[str,
+                      Field(title="Group",
+                            description="The metric group. The default group name is custom.",
+                            default=MetricGroup.CUSTOM.value)]
+     thresholds: Annotated[list[MetricThreshold],
+                           Field(title="Thresholds",
+                                 description="The metric thresholds.",
+                                 default=[MetricThreshold(type="lower_limit", value=0.7)])]
+     tasks: Annotated[list[TaskType],
+                      Field(title="Tasks",
+                            description="The list of supported tasks.",
+                            default=[])]
+     method: Annotated[Literal["llm_as_judge"],
+                       Field(title="Method",
+                             description="The method used to compute the metric.",
+                             default="llm_as_judge", frozen=True)]
+
+     @field_validator("options", mode="before")
+     def parse_options(cls, value):
+         if isinstance(value, list):
+             if isinstance(value[0], str):
+                 return [Option(name=v) for v in value]
+         elif isinstance(value, dict):
+             return [Option(name=k, value=v) for k, v in value.items()]
+
+         return value
+
+     @model_validator(mode="after")
+     def validate(self) -> Self:
+
+         # Set criteria description and options based on the criteria name.
+         if not self.criteria_description and not self.prompt_template:
+             try:
+                 criteria_obj = CriteriaCatalog.get_criteria([self.name])
+             except Exception:
+                 raise ValueError(
+                     "The provided criteria name is unavailable in the catalog. Choose a criteria from the catalog or provide criteria_description or prompt_template to proceed.")
+             self.criteria_description = criteria_obj[0].description
+             self.options = criteria_obj[0].options
+
+         if self.criteria_description and not self.output_field:
+             raise ValueError(
+                 "The `output_field` value is invalid. Please provide valid value for `output_field` attribute.")
+
+         if self.value_type == MetricValueType.NUMERIC.value:
+             for o in self.options:
+                 if o.value is None:
+                     raise ValueError(
+                         f"The option is invalid. The metric value type is numeric, but the criteria option '{o.name}' does not have a valid value. Please provide a valid option.")
+
+         if not bool(re.fullmatch(r'[a-z][a-z0-9]*(?:_[a-z0-9]+)*', self.name)):
+             raise ValueError(
+                 "The metric name should be in lower snake case format.")
+
+         if not self.display_name:
+             words = self.name.split('_')
+             self.display_name = ' '.join(word.capitalize() for word in words)
+
+         return self
+
+     def evaluate(self,
+                  data: pd.DataFrame,
+                  configuration: GenAIConfiguration | AgenticAIConfiguration,
+                  **kwargs) -> AggregateMetricResult:
+         # If run in sync mode, block until it is done
+         return run_in_event_loop(
+             self.evaluate_async,
+             data=data,
+             configuration=configuration,
+             **kwargs,
+         )
+
+     async def evaluate_async(self, data: pd.DataFrame,
+                              configuration: GenAIConfiguration | AgenticAIConfiguration,
+                              **kwargs) -> AggregateMetricResult:
+
+         data_cols = data.columns.to_list()
+         self.__validate_fields(data_cols)
+
+         context_fields = []
+         if self.criteria_description:
+             if self.output_field not in data_cols:
+                 raise ValueError(
+                     f"The output field {self.output_field} is not present in the data.")
+
+             ctx_fields = list(self.__criteria_fields)
+             ctx_fields.remove(self.output_field)
+             context_fields = ctx_fields
+         provider = EvalAssistProvider(metric_name=self.name,
+                                       display_name=self.display_name,
+                                       value_type=self.value_type,
+                                       criteria_description=self.criteria_description,
+                                       llm_judge=self.llm_judge,
+                                       metric_group=self.group,
+                                       metric_method=self.method,
+                                       thresholds=self.thresholds,
+                                       prompt_template=self.prompt_template,
+                                       options=self.options,
+                                       prediction_field=self.output_field,
+                                       context_fields=context_fields,
+                                       record_id_field=configuration.record_id_field,
+                                       **kwargs)
+
+         return await provider.evaluate_async(data)
+
+     def __validate_fields(self, data_cols):
+         if self.criteria_description:
+             fields_from_criteria = set()
+             fields_from_options = set()
+             fields_from_criteria.update(re.findall(
+                 VARIABLES_PATTERN, self.criteria_description))
+             for option in self.options:
+                 fields_from_options.update(re.findall(
+                     VARIABLES_PATTERN, option.description))
+
+             if (not all(field in data_cols for field in fields_from_criteria)):
+                 raise ValueError(
+                     f"The fields provided in the criteria description {fields_from_criteria} are not present in the data.")
+             if (not all(field in data_cols for field in fields_from_options)):
+                 raise ValueError(
+                     f"The fields provided in the options description {fields_from_options} are not present in the data.")
+             self.__criteria_fields = fields_from_criteria | fields_from_options
+         elif self.prompt_template:
+             fields_from_prompt = set()
+             fields_from_prompt.update(re.findall(
+                 VARIABLES_PATTERN, self.prompt_template))
+
+             if (not all(field in data_cols for field in fields_from_prompt)):
+                 raise ValueError(
+                     f"The fields provided in the prompt template {fields_from_prompt} are not present in the data.")
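The __validate_fields helper above enforces that every {placeholder} referenced by the criteria description (and by the option descriptions) is present as a column of the evaluation DataFrame. Below is a small self-contained sketch of that check; VARIABLES_PATTERN is defined in eval_assist_provider, which is not shown here, so a simple {name} regex is assumed in its place.

# Illustrative sketch of the column check performed by __validate_fields.
# assumed_variables_pattern is a stand-in for eval_assist_provider.VARIABLES_PATTERN.
import re

import pandas as pd

criteria_description = "Is the {response} faithful according to {context}?"
assumed_variables_pattern = r"\{(\w+)\}"

required_fields = set(re.findall(assumed_variables_pattern, criteria_description))

data = pd.DataFrame({
    "context": ["Refunds are accepted within 30 days."],
    "response": ["You can return the item within a month."],
})

missing = required_fields - set(data.columns)
if missing:
    raise ValueError(
        f"The fields provided in the criteria description {missing} are not present in the data.")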