ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,60 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
19
+ from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import \
20
+ ContextRelevanceMetric
21
+
22
+
23
class ContextRelevanceDecorator(BaseMetricDecorator):
    """Decorator provider for evaluating context relevance on an agentic node."""

    def evaluate_context_relevance(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: Optional[list[GenAIMetric]] = None
                                   ) -> Callable:
        """
        An evaluation decorator for computing context relevance metric on an agentic node.

        Can be applied bare (``@evaluate_context_relevance``) or with keyword
        arguments (``@evaluate_context_relevance(configuration=...)``).

        Args:
            func: The callable to decorate. When ``None`` (the decorator was
                invoked with keyword arguments only), a partial of this method
                is returned that is waiting for the callable.
            configuration: Forwarded unchanged to ``self.compute_helper``.
            metrics: Metrics to compute. When ``None`` or empty, a single
                default ``ContextRelevanceMetric()`` is used.

        Returns:
            The wrapped callable, or a partial of this method when ``func`` is
            ``None``.

        Raises:
            Exception: Raised when the decorated callable is invoked and
                validation or metric computation fails; the original error is
                chained as ``__cause__``.
        """
        if func is None:
            # Keyword-only usage: defer until the decorated callable is supplied.
            return partial(self.evaluate_context_relevance,
                           configuration=configuration,
                           metrics=metrics)

        # A None default (rather than a shared mutable list) is resolved here.
        if not metrics:
            metrics = [ContextRelevanceMetric()]

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            # wrapt passes (wrapped, instance, args, kwargs) positionally.
            try:
                self.validate(func=wrapped, metrics=metrics,
                              valid_metric_types=(ContextRelevanceMetric,))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                metric_outputs = [EvaluatorFields.CONTEXT_FIELDS]

                # NOTE(review): compute_helper is defined on the base class and
                # presumably invokes `wrapped` and records the metrics — confirm there.
                original_result = self.compute_helper(func=wrapped, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating context relevance metric on {wrapped.__name__},") from ex

        return wrapper(func)
@@ -0,0 +1,414 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from pydantic import Field, model_validator
14
+ from typing_extensions import Self
15
+
16
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
17
+ AgenticAIConfiguration
18
+ from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
19
+ from ibm_watsonx_gov.entities.base_classes import Error
20
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
21
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
22
+ RecordMetricResult)
23
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
24
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
25
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
26
+ from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
27
+ from ibm_watsonx_gov.providers.unitxt_provider import UnitxtProvider
28
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
29
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
30
+ from ibm_watsonx_gov.utils.python_utils import transform_str_to_list
31
+ from ibm_watsonx_gov.utils.validation_util import (validate_context,
32
+ validate_input,
33
+ validate_llm_as_judge,
34
+ validate_small_model_method,
35
+ validate_unitxt_method)
36
+
37
# Module level logger obtained from the SDK logger factory.
logger = GovSDKLogger.get_logger(__name__)
# Identifier of the metric, used as the default `name` of the metric and its results.
CONTEXT_RELEVANCE = "context_relevance"
# Human readable name of the metric, used as the default `display_name`.
CONTEXT_RELEVANCE_DISPLAY_NAME = "Context Relevance"
40
+
41
+
42
# Record level result of the context relevance metric.
class ContextRelevanceResult(RecordMetricResult):
    # Metric identifier, defaults to the module level constant.
    name: str = CONTEXT_RELEVANCE
    # Human readable metric name.
    display_name: str = CONTEXT_RELEVANCE_DISPLAY_NAME
    # Context relevance is reported under the retrieval quality group.
    group: MetricGroup = MetricGroup.RETRIEVAL_QUALITY
    # Holds the list of scores for the record under the single key "contexts_values".
    additional_info: dict[Literal["contexts_values"],
                          list[float]] = {"contexts_values": []}
    # Optional evidences returned by the provider for the record.
    evidences: list | None = None
49
+
50
+
51
# Method names accepted for the context relevance metric. The list is passed to
# the `validate_unitxt_method` and `validate_small_model_method` validators.
unitxt_methods = [
    "token_precision",
    "sentence_bert_bge",
    "sentence_bert_mini_lm",
    "llm_as_judge",
    "granite_guardian",
    "context_relevance_model"
]
59
+
60
+
61
class ContextRelevanceMetric(GenAIMetric):
    """
    Defines the Context Relevance metric class.

    The Context Relevance metric measures the relevance of the contexts to the given input query.
    It can be computed using the below methods:

    1. token_precision (default)
    2. sentence_bert_bge
    3. sentence_bert_mini_lm
    4. llm_as_judge
    5. granite_guardian
    6. context_relevance_model

    If there are multiple context fields, the context relevance score is computed by combining all the contexts.

    To compute the individual context relevance scores, set the `compute_per_context` flag to True. The default value is False.
    When `compute_per_context` is set to True, the metric value is taken as the maximum of the combined context relevance score and the context relevance scores for each context.

    The other retrieval quality metrics use per context scores for computation. It's recommended to set the `compute_per_context` flag to True when computing the retrieval quality metrics for better accuracy.

    Examples:
        1. Create Context Relevance metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = ContextRelevanceMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "..."},
                                                    metrics=[metric])
                # A list of contexts can also be passed as shown below
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                    metrics=[metric])

        2. Create Context Relevance metric with a custom thresholds and method.
            .. code-block:: python

                thresholds  = [MetricThreshold(type="lower_limit", value=0.5)]
                method = "sentence_bert_bge"
                metric = ContextRelevanceMetric(
                    method=method, thresholds=thresholds)

        3. Create Context Relevance metric with llm_as_judge method.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                                            model_id="ibm/granite-3-3-8b-instruct",
                                            project_id="<PROJECT_ID>"))
                metric = ContextRelevanceMetric(llm_judge=llm_judge)

        4. Create Context Relevance metric with granite_guardian method.
            .. code-block:: python

                metric = ContextRelevanceMetric(method="granite_guardian")

        5. Create Context Relevance metric with context_relevance_model method. Currently available only in On-Prem version.
            .. code-block:: python

                metric = ContextRelevanceMetric(method="context_relevance_model")
    """
    name: Annotated[Literal["context_relevance"],
                    Field(title="Name",
                          description="The context relevance metric name.",
                          default=CONTEXT_RELEVANCE, frozen=True)]
    display_name: Annotated[Literal["Context Relevance"],
                            Field(title="Display Name",
                                  description="The context relevance metric display name.",
                                  default=CONTEXT_RELEVANCE_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[TaskType.RAG])]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    method: Annotated[Literal["token_precision", "sentence_bert_bge", "sentence_bert_mini_lm", "llm_as_judge", "granite_guardian", "context_relevance_model"],
                      Field(title="Method",
                            description="The method used to compute the metric. This field is optional and when `llm_judge` is provided, the method would be set to `llm_as_judge`. The `context_relevance_model` is currently available only in On-Prem version.",
                            default="token_precision")]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.RETRIEVAL_QUALITY, frozen=True)]
    llm_judge: Annotated[LLMJudge | None,
                         Field(title="LLM Judge",
                               description="The LLM judge used to compute the metric.",
                               default=None)]
    compute_per_context: Annotated[bool,
                                   Field(title="Compute per context",
                                         description="The flag to compute the relevance score of each context. The default value is False. Setting the flag to True increases the latency and cost of metric computation.",
                                         default=False)]

    @model_validator(mode="after")
    def set_llm_judge_default_method(self) -> Self:
        """Force the method to `llm_as_judge` whenever an LLM judge is provided."""
        if self.llm_judge:
            self.method = "llm_as_judge"
        return self

    def __is_supported(self, **kwargs):
        """
        Tell whether the small model method is available for the given api client.

        Currently supported only in CPD and ypqa. A missing `api_client` is
        treated as not supported instead of failing with an AttributeError.
        """
        api_client = kwargs.get("api_client")
        if api_client is None:
            return False
        return api_client.credentials.region == "ypqa" or api_client.is_cpd

    def __validate_context_relevance_inputs(self, data: pd.DataFrame | dict, configuration: GenAIConfiguration | AgenticAIConfiguration, **kwargs):
        """
        Run the input, context, method and llm judge validators for this metric.

        The validators are imported helpers; the caller handles the `ValueError`
        they are expected to raise on invalid input.
        """
        # NOTE(review): `.columns` is read unconditionally, so a DataFrame is
        # required here despite the `dict` in the annotation - confirm callers.
        data_cols = data.columns.to_list()
        validate_input(data_cols, configuration)
        validate_context(data_cols, configuration)
        validate_unitxt_method(self.name, self.method, unitxt_methods)
        validate_llm_as_judge(self.name, self.method,
                              self.llm_judge, configuration.llm_judge)
        validate_small_model_method(
            self.name, self.method, self.__is_supported(**kwargs), unitxt_methods)

    def __create_provider(self, configuration: GenAIConfiguration | AgenticAIConfiguration, **kwargs):
        """
        Build the provider matching the configured method and remember its name.

        The `granite_guardian` and `context_relevance_model` methods use the
        detectors provider; every other method uses the unitxt provider.
        """
        if self.method in ("granite_guardian", "context_relevance_model"):
            self.__provider = "detectors"
            kwargs["detector_params"] = {
                "method": self.method, "threshold": 0.001}
            return DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_method=self.method,
                                     metric_display_name=self.display_name,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)

        self.__provider = "unitxt"
        return UnitxtProvider(configuration=configuration,
                              metric_name=self.name,
                              metric_display_name=self.display_name,
                              metric_method=self.method,
                              metric_group=self.group,
                              metric_prefix="metrics.rag.external_rag",
                              llm_judge=self.llm_judge,
                              thresholds=self.thresholds,
                              **kwargs)

    async def get_combined_context_scores(self, data: pd.DataFrame | dict, configuration: GenAIConfiguration | AgenticAIConfiguration, **kwargs):
        """
        Method to compute context relevance on the complete context.
        Returns the metric result along with a list of the scores.
        """
        provider = self.__create_provider(configuration, **kwargs)
        result = await provider.evaluate_async(data=data)
        final_res, scores_list = self.get_combined_context_score(result)

        return final_res, scores_list

    async def get_per_context_scores(self, data: pd.DataFrame | dict, configuration: GenAIConfiguration | AgenticAIConfiguration, context_fields: list, combined_context_scores: list | None, **kwargs):
        """
        Method to get metric scores on individual contexts.

        The provider is evaluated once per context field, using a copy of the
        configuration restricted to that field. The per context results are
        then merged with the combined context scores.
        """
        contexts_result: list[AggregateMetricResult] = []
        for context in context_fields:
            # Restrict a copy of the configuration to the current context only.
            context_config = configuration.model_copy()
            context_config.context_fields = [context]
            provider = self.__create_provider(context_config, **kwargs)
            res = await provider.evaluate_async(data=data)
            contexts_result.append(res)
        final_res = self.get_context_scores(
            contexts_result, combined_context_scores)
        return final_res

    def get_combined_context_score(self, contexts_result):
        """
        Convert the provider result computed on the combined context into
        record level results.

        Returns:
            A tuple of the list of `ContextRelevanceResult` objects and the list
            of single element score lists, one per record.
        """
        final_res: list[ContextRelevanceResult] = []
        combined_context_values = []
        # Each entry of `record_level_metrics` is the result of one data row.
        for record_metric in contexts_result.record_level_metrics:
            # convert None values to 0.0
            value = record_metric.value if record_metric.value is not None else 0.0
            combined_context_value = [value]
            # Keep the evidence and the record id of the row being processed,
            # so that every result describes its own record.
            evidences = [record_metric.evidences[0]] \
                if record_metric.evidences else []

            record_result = ContextRelevanceResult(
                method=self.method,
                provider=self.__provider,
                value=value,
                record_id=record_metric.record_id,
                additional_info={
                    "contexts_values": combined_context_value},
                evidences=evidences,
                thresholds=self.thresholds,
                group=MetricGroup.RETRIEVAL_QUALITY.value
            )
            final_res.append(record_result)
            combined_context_values.append(combined_context_value)
        return final_res, combined_context_values

    def get_context_scores(self, contexts_result, combined_context_scores):
        """
        Merge the per context results with the combined context scores.

        The value of each record is the maximum of its per context scores and
        its combined context score; None scores are counted as 0.0.
        """
        final_res: list[ContextRelevanceResult] = []
        # Get record level metrics from contexts_result object
        record_level_metrics_list = [
            cr.record_level_metrics for cr in contexts_result]
        # Transpose, so that each inner list holds all the context results of one data row.
        record_level_metrics_list = [
            list(x) for x in zip(*record_level_metrics_list)]
        # Extract only the context scores. This will be a 2d array. Each list represents a data row and each element a context.
        record_level_context_scores = [
            [rc.value for rc in record_level_metric]
            for record_level_metric in record_level_metrics_list
        ]
        # Iterate over the lists to get context scores and record_ids
        for context_score, combined_context_score, record_metric in zip(record_level_context_scores, combined_context_scores, record_level_metrics_list):
            values = context_score + combined_context_score
            values = [x if x is not None else 0.0 for x in values]
            # Add evidences for computing metric using `context_relevance_model` method
            evidences = [context_value.evidences[0]
                         for context_value in record_metric
                         if context_value.evidences]

            record_result = ContextRelevanceResult(
                method=self.method,
                provider=self.__provider,
                value=max(values),
                record_id=record_metric[0].record_id,
                additional_info={"contexts_values": values},
                evidences=evidences,
                thresholds=self.thresholds
            )
            final_res.append(record_result)
        return final_res

    def evaluate(self,
                 data: pd.DataFrame,
                 configuration: GenAIConfiguration | AgenticAIConfiguration,
                 **kwargs) -> AggregateMetricResult:
        """Synchronous entry point, delegates to `evaluate_async`."""
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )

    async def evaluate_async(self, data: pd.DataFrame | dict,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult | None:
        """
        Compute the context relevance metric for the given data.

        Rows whose input is missing, or whose context fields are all missing,
        are not sent to the provider. They are reported with a None value and a
        BAD_REQUEST error, and are excluded from the aggregate statistics.

        Returns:
            The aggregate metric result, or None when the validation fails and
            `ignore_validation_errors` is set in the keyword arguments.

        Raises:
            ValueError: If the validation fails and `ignore_validation_errors`
                is not set.
        """
        try:
            # validate inputs
            self.__validate_context_relevance_inputs(
                data, configuration, **kwargs)
            self.__provider = None
        except ValueError as ve:
            if kwargs.get("ignore_validation_errors"):
                message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
                logger.warning(message)
                return
            raise

        # Determine rows with None values: input is None OR all context fields are None
        input_has_none = data[configuration.input_fields].isna().any(axis=1)
        all_contexts_none = data[configuration.context_fields].isna().all(
            axis=1)
        mask_has_none = input_has_none | all_contexts_none
        df_with_none = data[mask_has_none]
        # Explicit copy, because columns are assigned on this frame below.
        df_without_none = data[~mask_has_none].copy()

        final_res = []
        if not df_without_none.empty:
            context_fields = configuration.context_fields
            # Check if we need to expand the contexts column:
            if len(configuration.context_fields) == 1:
                context = context_fields[0]
                df_without_none[context] = df_without_none[context].apply(
                    transform_str_to_list)
                # NOTE(review): the number of contexts is taken from the first
                # row only - confirm that every row has the same count.
                contexts_count = len(df_without_none[context].iloc[0])
                context_fields = [
                    f"context_{i}" for i in range(contexts_count)]
                df_without_none[context_fields] = pd.DataFrame(
                    df_without_none[context].to_list(), index=df_without_none.index)

            # compute combined context scores
            final_res, scores_list = await self.get_combined_context_scores(
                df_without_none, configuration, **kwargs)

            # compute per context score based on the toggle
            if self.compute_per_context:
                final_res = await self.get_per_context_scores(
                    df_without_none, configuration, context_fields, scores_list, **kwargs)

        # Handle records with missing values
        if not df_with_none.empty:
            none_results = []
            for _, row in df_with_none.iterrows():
                record_result = ContextRelevanceResult(
                    method=self.method,
                    value=None,
                    record_id=row[configuration.record_id_field],
                    thresholds=self.thresholds,
                    errors=[Error(
                        code="BAD_REQUEST", message_en="The value of required fields input or context is None.")]
                )
                none_results.append(record_result)

            # Merge the results
            final_res = final_res + none_results

        # Create the aggregate result
        values = [
            record.value for record in final_res if record.value is not None]
        if values:
            mean = sum(values) / len(values)
            min_val = min(values)
            max_val = max(values)
            value = mean
        else:
            mean = None
            min_val = None
            max_val = None
            value = None

        aggregate_result = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            method=self.method,
            provider=self.__provider,
            group=MetricGroup.RETRIEVAL_QUALITY,
            value=value,
            total_records=len(final_res),
            record_level_metrics=final_res,
            min=min_val,
            max=max_val,
            mean=mean,
            thresholds=self.thresholds
        )

        return aggregate_result
@@ -0,0 +1,8 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
@@ -0,0 +1,58 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
19
+ from ibm_watsonx_gov.metrics.cost.cost_metric import \
20
+ CostMetric
21
+
22
+
23
class CostDecorator(BaseMetricDecorator):
    """Provides the decorator that evaluates the cost metric."""

    def evaluate_cost(self,
                      func: Optional[Callable] = None,
                      *,
                      configuration: Optional[AgenticAIConfiguration] = None,
                      metrics: Optional[list[GenAIMetric]] = None
                      ) -> Callable:
        """
        An evaluation decorator for computing cost metric on message level.

        The decorator can be applied bare (``@evaluate_cost``) or with keyword
        arguments (``@evaluate_cost(configuration=..., metrics=...)``).

        Args:
            func: The callable being decorated. It is None when the decorator is
                used with arguments; a partial is returned in that case and is
                applied to the callable afterwards.
            configuration: The optional agentic AI configuration forwarded to the
                computation helper.
            metrics: The metrics to compute. When empty or None, a default
                ``CostMetric`` is used.

        Returns:
            The wrapped callable, or a partial of this method when ``func`` is None.

        Raises:
            Exception: If validation or metric computation fails while the wrapped
                callable is invoked. The original exception is chained as the cause.
        """
        if func is None:
            # Decorator used with arguments: defer until the callable is supplied.
            return partial(self.evaluate_cost, configuration=configuration, metrics=metrics)

        # A None default avoids sharing one mutable list between all calls.
        if not metrics:
            metrics = [CostMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(CostMetric,))

                # The cost is computed from the model usage details only.
                metric_inputs = [EvaluatorFields.MODEL_USAGE_DETAIL_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating cost metric on {func.__name__},") from ex

        return wrapper(func)
+ return wrapper(func)