ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,59 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics import TextGradeLevelMetric, TextReadingEaseMetric
19
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
20
+
21
+
22
class ReadabilityDecorator(BaseMetricDecorator):
    """Decorator provider for the readability metric group."""

    def evaluate_readability(self,
                             func: Optional[Callable] = None,
                             *,
                             configuration: Optional[AgenticAIConfiguration] = None,
                             metrics: Optional[list[GenAIMetric]] = None
                             ) -> Callable:
        """
        An evaluation decorator for computing readability metric on an agentic node.

        Works both bare (``@evaluate_readability``) and parameterised
        (``@evaluate_readability(configuration=..., metrics=...)``).

        Args:
            func: The function to decorate. ``None`` when the decorator is
                used with keyword arguments; a partial awaiting the function
                is returned in that case.
            configuration: Optional configuration forwarded unchanged to
                ``compute_helper``.
            metrics: Metrics to compute. When omitted or empty, the metrics
                returned by ``MetricGroup.READABILITY.get_metrics()`` are used.

        Returns:
            The wrapt-decorated function, or a partial of this method when
            ``func`` is ``None``.

        Raises:
            Exception: Raised from the decorated call when validation or
                ``compute_helper`` fails; the original error is chained as
                the cause.
        """
        if func is None:
            # Parameterised usage: defer until the target function is supplied.
            return partial(self.evaluate_readability, configuration=configuration, metrics=metrics)

        # A None sentinel replaces the former mutable ``[]`` default; both
        # None and an empty list fall back to the group's default metrics.
        selected_metrics = metrics if metrics else MetricGroup.READABILITY.get_metrics()

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            # Validation runs on every invocation of the decorated function,
            # inside the try block so failures get the same error wrapping.
            try:
                self.validate(func=wrapped, metrics=selected_metrics,
                              valid_metric_types=(TextGradeLevelMetric, TextReadingEaseMetric,))

                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]

                original_result = self.compute_helper(func=wrapped, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=selected_metrics,
                                                      metric_inputs=[],
                                                      metric_outputs=metric_outputs,
                                                      metric_groups=[MetricGroup.READABILITY])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating readability metric on {wrapped.__name__},") from ex

        return wrapper(func)
@@ -0,0 +1,63 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from functools import partial
11
+ from typing import Callable, Optional
12
+
13
+ from wrapt import decorator
14
+
15
+ from ibm_watsonx_gov.config import AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics import (AveragePrecisionMetric,
19
+ ContextRelevanceMetric, HitRateMetric,
20
+ NDCGMetric, ReciprocalRankMetric,
21
+ RetrievalPrecisionMetric)
22
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
23
+
24
+
25
class RetrievalQualityDecorator(BaseMetricDecorator):
    """Decorator provider for the retrieval quality metric group."""

    def evaluate_retrieval_quality(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: Optional[list[GenAIMetric]] = None
                                   ) -> Callable:
        """
        An evaluation decorator for computing retrieval quality metrics on an agentic node.

        Works both bare (``@evaluate_retrieval_quality``) and parameterised
        (``@evaluate_retrieval_quality(configuration=..., metrics=...)``).

        Args:
            func: The function to decorate. ``None`` when the decorator is
                used with keyword arguments; a partial awaiting the function
                is returned in that case.
            configuration: Optional configuration forwarded unchanged to
                ``compute_helper``.
            metrics: Metrics to compute. When omitted or empty, the metrics
                returned by ``MetricGroup.RETRIEVAL_QUALITY.get_metrics()``
                are used.

        Returns:
            The wrapt-decorated function, or a partial of this method when
            ``func`` is ``None``.

        Raises:
            Exception: Raised from the decorated call when validation or
                ``compute_helper`` fails; the original error is chained as
                the cause.
        """
        if func is None:
            # Parameterised usage: defer until the target function is supplied.
            return partial(self.evaluate_retrieval_quality, configuration=configuration, metrics=metrics)

        # A None sentinel replaces the former mutable ``[]`` default; both
        # None and an empty list fall back to the group's default metrics.
        selected_metrics = metrics if metrics else MetricGroup.RETRIEVAL_QUALITY.get_metrics()

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            # Validation runs on every invocation of the decorated function,
            # inside the try block so failures get the same error wrapping.
            try:
                self.validate(func=wrapped, metrics=selected_metrics,
                              valid_metric_types=(NDCGMetric, ContextRelevanceMetric, ReciprocalRankMetric, RetrievalPrecisionMetric, AveragePrecisionMetric, HitRateMetric))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                metric_outputs = [EvaluatorFields.CONTEXT_FIELDS]

                original_result = self.compute_helper(func=wrapped, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=selected_metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs,
                                                      metric_groups=[MetricGroup.RETRIEVAL_QUALITY])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating retrieval quality metrics on {wrapped.__name__},") from ex

        return wrapper(func)
@@ -0,0 +1,58 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
19
+ from ibm_watsonx_gov.metrics import CostMetric, InputTokenCountMetric, OutputTokenCountMetric
20
+
21
+
22
class UsageDecorator(BaseMetricDecorator):
    """Exposes the decorator used to evaluate usage metrics."""

    def evaluate_usage(self,
                       func: Optional[Callable] = None,
                       *,
                       configuration: Optional[AgenticAIConfiguration] = None,
                       metrics: list[GenAIMetric] = []
                       ) -> dict:
        """
        Decorate an agent invocation so that usage metrics are computed for it.

        Usable both bare (``@evaluate_usage``) and with keyword arguments
        (``@evaluate_usage(configuration=..., metrics=...)``).

        Args:
            func: The function being decorated. ``None`` when the decorator is
                invoked with keyword arguments only.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: Metrics to compute. When empty, the metrics returned by
                ``MetricGroup.USAGE.get_metrics()`` are used.

        Returns:
            A ``partial`` of this method when ``func`` is ``None``; otherwise the
            wrapped function produced by ``wrapt.decorator``.

        Raises:
            Exception: Raised by the wrapped function when validation or metric
                computation fails; the original error is chained as the cause.
        """
        # Keyword-only invocation: defer until the decorated function is supplied.
        if func is None:
            return partial(self.evaluate_usage,
                           configuration=configuration,
                           metrics=metrics)

        # Fall back to the whole usage group when no metric is given.
        selected_metrics = metrics if metrics else MetricGroup.USAGE.get_metrics()

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            try:
                allowed_types = (CostMetric,
                                 InputTokenCountMetric,
                                 OutputTokenCountMetric)
                self.validate(func=wrapped,
                              metrics=selected_metrics,
                              valid_metric_types=allowed_types)

                usage_inputs = [
                    EvaluatorFields.MODEL_USAGE_DETAIL_FIELDS,
                    EvaluatorFields.INPUT_TOKEN_COUNT_FIELDS,
                    EvaluatorFields.OUTPUT_TOKEN_COUNT_FIELDS,
                ]

                return self.compute_helper(
                    func=wrapped,
                    args=args,
                    kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=usage_inputs,
                )
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating usage metric on {wrapped.__name__},") from ex

        return wrapper(func)
@@ -0,0 +1,74 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+
11
+ from typing import Annotated, Union
12
+
13
+ from pydantic import Field
14
+
15
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
16
+
17
+ from .answer_relevance.answer_relevance_metric import AnswerRelevanceMetric
18
+ from .answer_similarity.answer_similarity_metric import AnswerSimilarityMetric
19
+ from .average_precision.average_precision_metric import AveragePrecisionMetric
20
+ from .cost.cost_metric import CostMetric
21
+ from .duration.duration_metric import DurationMetric
22
+ from .evasiveness.evasiveness_metric import EvasivenessMetric
23
+ from .faithfulness.faithfulness_metric import FaithfulnessMetric
24
+ from .hap.hap_metric import HAPMetric
25
+ from .hap.input_hap_metric import InputHAPMetric
26
+ from .hap.output_hap_metric import OutputHAPMetric
27
+ from .harm.harm_metric import HarmMetric
28
+ from .harm_engagement.harm_engagement_metric import HarmEngagementMetric
29
+ from .hit_rate.hit_rate_metric import HitRateMetric
30
+ from .input_token_count.input_token_count_metric import InputTokenCountMetric
31
+ from .jailbreak.jailbreak_metric import JailbreakMetric
32
+ from .keyword_detection.keyword_detection_metric import KeywordDetectionMetric
33
+ from .llm_validation.llm_validation_metric import LLMValidationMetric
34
+ from .llmaj.llmaj_metric import LLMAsJudgeMetric
35
+ from .ndcg.ndcg_metric import NDCGMetric
36
+ from .output_token_count.output_token_count_metric import \
37
+ OutputTokenCountMetric
38
+ from .pii.input_pii_metric import InputPIIMetric
39
+ from .pii.output_pii_metric import OutputPIIMetric
40
+ from .pii.pii_metric import PIIMetric
41
+ from .profanity.profanity_metric import ProfanityMetric
42
+ from .prompt_safety_risk.prompt_safety_risk_metric import \
43
+ PromptSafetyRiskMetric
44
+ from .reciprocal_rank.reciprocal_rank_metric import ReciprocalRankMetric
45
+ from .regex_detection.regex_detection_metric import RegexDetectionMetric
46
+ from .retrieval_precision.retrieval_precision_metric import \
47
+ RetrievalPrecisionMetric
48
+ from .sexual_content.sexual_content_metric import SexualContentMetric
49
+ from .social_bias.social_bias_metric import SocialBiasMetric
50
+ from .status.status_metric import StatusMetric
51
+ from .text_grade_level.text_grade_level_metric import TextGradeLevelMetric
52
+ from .text_reading_ease.text_reading_ease_metric import TextReadingEaseMetric
53
+ from .tool_call_accuracy.tool_call_accuracy_metric import \
54
+ ToolCallAccuracyMetric
55
+ from .tool_call_parameter_accuracy.tool_call_parameter_accuracy_metric import \
56
+ ToolCallParameterAccuracyMetric
57
+ from .tool_call_relevance.tool_call_relevance_metric import \
58
+ ToolCallRelevanceMetric
59
+ from .tool_call_syntactic_accuracy.tool_call_syntactic_accuracy_metric import \
60
+ ToolCallSyntacticAccuracyMetric
61
+ from .topic_relevance.topic_relevance_metric import TopicRelevanceMetric
62
+ from .unethical_behavior.unethical_behavior_metric import \
63
+ UnethicalBehaviorMetric
64
+ from .unsuccessful_requests.unsuccessful_requests_metric import \
65
+ UnsuccessfulRequestsMetric
66
+ from .user_id.user_id_metric import UserIdMetric
67
+ from .violence.violence_metric import ViolenceMetric
68
+
69
+ from .context_relevance.context_relevance_metric import ContextRelevanceMetric # isort:skip
70
+
71
# Discriminated union, keyed on the `name` field, of every direct GenAIMetric
# subclass known at this point, with LLMAsJudgeMetric left out.
# `__subclasses__()` only lists classes that are already imported, so this
# statement has to stay below the metric imports.
METRICS_UNION = Annotated[
    Union[tuple(
        metric_cls
        for metric_cls in GenAIMetric.__subclasses__()
        if metric_cls is not LLMAsJudgeMetric
    )],
    Field(discriminator="name"),
]
@@ -0,0 +1,8 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
@@ -0,0 +1,63 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from functools import partial
11
+ from typing import Callable, Optional
12
+
13
+ from wrapt import decorator
14
+
15
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
16
+ AgenticAIConfiguration
17
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
18
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
19
+ from ibm_watsonx_gov.metrics.answer_relevance.answer_relevance_metric import \
20
+ AnswerRelevanceMetric
21
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
22
+
23
+
24
class AnswerRelevanceDecorator(BaseMetricDecorator):
    """Exposes the decorator used to evaluate the answer relevance metric."""

    def evaluate_answer_relevance(self,
                                  func: Optional[Callable] = None,
                                  *,
                                  configuration: Optional[AgenticAIConfiguration] = None,
                                  metrics: list[GenAIMetric] = []
                                  ) -> dict:
        """
        An evaluation decorator for computing answer relevance metric on an agentic node.

        Usable both bare (``@evaluate_answer_relevance``) and with keyword
        arguments (``@evaluate_answer_relevance(configuration=..., metrics=...)``).

        Args:
            func: The function being decorated. ``None`` when the decorator is
                invoked with keyword arguments only.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: Metrics to compute. When empty, a default
                ``AnswerRelevanceMetric()`` is used.

        Returns:
            A ``partial`` of this method when ``func`` is ``None``; otherwise the
            wrapped function produced by ``wrapt.decorator``.

        Raises:
            Exception: Raised by the wrapped function when validation or metric
                computation fails; the original error is chained as the cause.
        """
        # Keyword-only invocation: defer until the decorated function is supplied.
        if func is None:
            return partial(self.evaluate_answer_relevance, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = [AnswerRelevanceMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # The trailing comma matters: without it the parentheses are
                # plain grouping and a bare class is passed instead of the
                # tuple that the sibling decorators provide.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(AnswerRelevanceMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating answer relevance metric on {func.__name__},") from ex

        return wrapper(func)
@@ -0,0 +1,260 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from lazy_imports import LazyModule, load
14
+ from pydantic import Field, model_validator
15
+ from typing_extensions import Self
16
+
17
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
18
+ from ibm_watsonx_gov.entities.base_classes import Error
19
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
20
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
21
+ RecordMetricResult)
22
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
23
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
24
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
25
+ from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
26
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
27
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
28
+ from ibm_watsonx_gov.utils.validation_util import (validate_input,
29
+ validate_llm_as_judge,
30
+ validate_output,
31
+ validate_small_model_method,
32
+ validate_unitxt_method)
33
+
34
# Create lazy module for Unitxt imports
unitxt_provider = LazyModule(
    "from ibm_watsonx_gov.providers.unitxt_provider import UnitxtColumnMapping",
    "from ibm_watsonx_gov.providers.unitxt_provider import UnitxtProvider",
    name="lazy_unitxt_provider"
)
# NOTE(review): presumably registers/activates the lazy module so the attribute
# lookups below resolve - confirm against the lazy_imports documentation.
load(unitxt_provider)
# Module-level aliases so the rest of the file can use the plain class names.
UnitxtColumnMapping = unitxt_provider.UnitxtColumnMapping
UnitxtProvider = unitxt_provider.UnitxtProvider

logger = GovSDKLogger.get_logger(__name__)
# Metric name; also used as the default of the `name` field of the metric class.
ANSWER_RELEVANCE = "answer_relevance"
# Passed to UnitxtProvider as `metric_alias`.
UNITXT_METRIC_NAME = ANSWER_RELEVANCE
# Method names passed to the validation helpers for this metric; mirrors the
# values accepted by the `method` field.
unitxt_methods = [
    "token_recall",
    "llm_as_judge",
    "granite_guardian",
    "answer_relevance_model"
]
53
+
54
+
55
class AnswerRelevanceMetric(GenAIMetric):
    """
    Defines the Answer Relevance metric class.

    The Answer Relevance metric measures the relevance of the generated text to the given input query.
    It can be computed using the below methods:

    1. token_recall (default)
    2. llm_as_judge
    3. granite_guardian
    4. answer_relevance_model

    Examples:
        1. Create Answer Relevance metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = AnswerRelevanceMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "generated_text": "..."},
                                                    metrics=[metric])

        2. Create Answer Relevance metric with custom thresholds and method.
            .. code-block:: python

                thresholds  = [MetricThreshold(type="lower_limit", value=0.5)]
                method = "token_recall"
                metric = AnswerRelevanceMetric(
                    method=method, thresholds=thresholds)

        3. Create Answer Relevance metric with llm_as_judge method.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                                            model_id="ibm/granite-3-3-8b-instruct",
                                            project_id="<PROJECT_ID>"))
                metric = AnswerRelevanceMetric(llm_judge=llm_judge)

        4. Create Answer Relevance metric with granite_guardian method.
            .. code-block:: python

                metric = AnswerRelevanceMetric(method="granite_guardian")

        5. Create Answer Relevance metric with answer_relevance_model method. Currently available only in On-Prem version.
            .. code-block:: python

                metric = AnswerRelevanceMetric(method="answer_relevance_model")

    """
    name: Annotated[Literal["answer_relevance"],
                    Field(title="Name",
                          description="The answer relevance metric name.",
                          default=ANSWER_RELEVANCE, frozen=True)]
    display_name: Annotated[Literal["Answer Relevance"],
                            Field(title="Display Name",
                                  description="The answer relevance metric display name.",
                                  default="Answer Relevance", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[TaskType.RAG, TaskType.QA])]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    method: Annotated[Literal["token_recall", "llm_as_judge", "granite_guardian", "answer_relevance_model"],
                      Field(title="Method",
                            description="The method used to compute the metric. This field is optional and when `llm_judge` is provided, the method would be set to `llm_as_judge`.The `answer_relevance_model` method is currently available only in On-Prem version.",
                            default="token_recall")]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.ANSWER_QUALITY, frozen=True)]
    llm_judge: Annotated[LLMJudge | None,
                         Field(title="LLM Judge",
                               description="The LLM judge used to compute the metric.",
                               default=None)]

    @model_validator(mode="after")
    def set_llm_judge_default_method(self) -> Self:
        """Force ``method`` to ``llm_as_judge`` whenever an ``llm_judge`` is supplied."""
        # If llm_judge is set, set the method to llm_as_judge
        if self.llm_judge:
            self.method = "llm_as_judge"
        return self

    def evaluate(self,
                 data: pd.DataFrame,
                 configuration: GenAIConfiguration | AgenticAIConfiguration,
                 **kwargs) -> AggregateMetricResult:
        """
        Synchronous entry point: runs ``evaluate_async`` through ``run_in_event_loop``.

        Args:
            data: The records to evaluate.
            configuration: The evaluation configuration.
            **kwargs: Forwarded unchanged to ``evaluate_async``.

        Returns:
            Whatever ``evaluate_async`` returns for the same arguments.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )

    def __is_supported(self, **kwargs):
        """
        Tell whether the small-model method can be used with the given client.

        Reads ``api_client`` from ``kwargs``. A missing client is treated as
        "not supported" instead of failing with an ``AttributeError``, so the
        decision is left to ``validate_small_model_method``.
        """
        # Currently supported only in CPD and ypqa
        api_client = kwargs.get("api_client")
        if api_client is None:
            return False
        return api_client.credentials.region == "ypqa" or api_client.is_cpd

    async def evaluate_async(self, data: pd.DataFrame,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult:
        """
        Compute the answer relevance metric for the given records.

        Rows with a missing value in any input or output field are not sent to
        the provider; they get a record-level result with ``value=None`` and a
        ``BAD_REQUEST`` error, merged into the aggregate result.

        Args:
            data: The records to evaluate.
            configuration: Supplies ``input_fields``, ``output_fields``,
                ``record_id_field`` and ``llm_judge``.
            **kwargs: Extra options. ``ignore_validation_errors`` and
                ``api_client`` are read here; everything is forwarded to the
                provider.

        Returns:
            The aggregate metric result, or ``None`` when validation fails and
            ``ignore_validation_errors`` is truthy, or when ``data`` has no rows.

        Raises:
            ValueError: When validation fails and ``ignore_validation_errors``
                is not set.
        """
        data_cols = data.columns.to_list()
        try:
            validate_input(data_cols, configuration)
            validate_output(data_cols, configuration)
            validate_unitxt_method(self.name, self.method, unitxt_methods)
            validate_llm_as_judge(self.name, self.method,
                                  self.llm_judge, configuration.llm_judge)
            validate_small_model_method(
                self.name, self.method, self.__is_supported(**kwargs), unitxt_methods)
        except ValueError as ve:
            if kwargs.get("ignore_validation_errors"):
                message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
                logger.warning(message)
                # Skipped metrics yield no result at all.
                return
            raise

        # Separate the data into a dataframe with no None values and a dataframe with None values
        required_fields = configuration.input_fields + configuration.output_fields
        mask_has_none = data[required_fields].isna().any(axis=1)
        df_with_none = data[mask_has_none]
        df_without_none = data[~mask_has_none]

        # Compute the metrics only for the dataframe with no None values
        aggregated_metric_result = None
        if not df_without_none.empty:
            # token_recall gets an explicit column mapping; every other method
            # uses the default UnitxtColumnMapping.
            if self.method == "token_recall":
                column_mapping = UnitxtColumnMapping(
                    answer="prediction/answer",
                    question="task_data/question",
                )
            else:
                column_mapping = UnitxtColumnMapping()

            if self.method in ["granite_guardian", "answer_relevance_model"]:
                # These two methods go through the detectors provider.
                kwargs["detector_params"] = {
                    "method": self.method, "threshold": 0.001}
                provider = DetectorsProvider(configuration=configuration,
                                             metric_name=self.name,
                                             metric_display_name=self.display_name,
                                             metric_method=self.method,
                                             metric_group=MetricGroup.ANSWER_QUALITY,
                                             thresholds=self.thresholds,
                                             **kwargs)
            else:
                provider = UnitxtProvider(
                    configuration=configuration,
                    metric_name=self.name,
                    metric_display_name=self.display_name,
                    metric_method=self.method,
                    metric_prefix="metrics.rag.external_rag",
                    metric_alias=UNITXT_METRIC_NAME,
                    metric_group=self.group,
                    column_mapping=column_mapping,
                    llm_judge=self.llm_judge,
                    thresholds=self.thresholds,
                    **kwargs,
                )

            aggregated_metric_result = await provider.evaluate_async(data=df_without_none)

        # Update the metric result with record level metrics results for the records with missing values
        if not df_with_none.empty:
            # Create None results for records with missing values
            none_results = [
                RecordMetricResult(
                    name=self.name,
                    display_name=self.display_name,
                    method=self.method,
                    group=self.group,
                    value=None,
                    record_id=row[configuration.record_id_field],
                    thresholds=self.thresholds,
                    errors=[Error(
                        code="BAD_REQUEST", message_en="The value of required fields input or output is None.")]
                )
                for _, row in df_with_none.iterrows()
            ]

            # Merge the results
            if aggregated_metric_result:
                all_record_results = aggregated_metric_result.record_level_metrics + none_results
                aggregated_metric_result.record_level_metrics = all_record_results
                aggregated_metric_result.total_records = len(
                    all_record_results)
            else:
                # Every record had a missing value: build an aggregate that
                # carries only the per-record errors.
                aggregated_metric_result = AggregateMetricResult(
                    name=self.name,
                    display_name=self.display_name,
                    method=self.method,
                    group=self.group,
                    value=None,
                    total_records=len(none_results),
                    record_level_metrics=none_results,
                    min=None,
                    max=None,
                    mean=None,
                    thresholds=self.thresholds
                )

        return aggregated_metric_result
File without changes
@@ -0,0 +1,66 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from functools import partial
11
+ from typing import Callable, Optional
12
+
13
+ from wrapt import decorator
14
+
15
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
16
+ AgenticAIConfiguration
17
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
18
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
19
+ from ibm_watsonx_gov.metrics.answer_similarity.answer_similarity_metric import \
20
+ AnswerSimilarityMetric
21
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
22
+
23
+
24
class AnswerSimilarityDecorator(BaseMetricDecorator):
    """Exposes the decorator used to evaluate the answer similarity metric."""

    def evaluate_answer_similarity(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: list[GenAIMetric] = []
                                   ) -> dict:
        """
        Decorate an agentic node so that the answer similarity metric is computed for it.

        Usable both bare (``@evaluate_answer_similarity``) and with keyword
        arguments (``@evaluate_answer_similarity(configuration=..., metrics=...)``).

        Args:
            func: The function being decorated. ``None`` when the decorator is
                invoked with keyword arguments only.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: Metrics to compute. When empty, a default
                ``AnswerSimilarityMetric()`` is used.

        Returns:
            A ``partial`` of this method when ``func`` is ``None``; otherwise the
            wrapped function produced by ``wrapt.decorator``.

        Raises:
            Exception: Raised by the wrapped function when validation or metric
                computation fails; the original error is chained as the cause.
        """
        # Keyword-only invocation: defer until the decorated function is supplied.
        if func is None:
            return partial(self.evaluate_answer_similarity,
                           configuration=configuration,
                           metrics=metrics)

        # Fall back to a default-configured metric when none is given.
        selected_metrics = metrics if metrics else [AnswerSimilarityMetric()]

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            try:
                self.validate(func=wrapped,
                              metrics=selected_metrics,
                              valid_metric_types=(AnswerSimilarityMetric,))

                input_fields = [EvaluatorFields.INPUT_FIELDS,
                                EvaluatorFields.CONTEXT_FIELDS]
                reference_fields = [EvaluatorFields.REFERENCE_FIELDS]
                output_fields = [EvaluatorFields.OUTPUT_FIELDS]

                return self.compute_helper(
                    func=wrapped,
                    args=args,
                    kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=input_fields,
                    metric_outputs=output_fields,
                    metric_references=reference_fields,
                )
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating answer similarity metric on {wrapped.__name__},") from ex

        return wrapper(func)