ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,219 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from lazy_imports import LazyModule, load
14
+ from pydantic import Field, model_validator
15
+ from typing_extensions import Self
16
+
17
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
18
+ from ibm_watsonx_gov.entities.base_classes import Error
19
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
20
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
21
+ RecordMetricResult)
22
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
23
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
24
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
25
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
26
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
27
+ from ibm_watsonx_gov.utils.validation_util import (validate_llm_as_judge,
28
+ validate_output,
29
+ validate_reference,
30
+ validate_unitxt_method)
31
+
32
+ unitxt_provider = LazyModule(
33
+ "from ibm_watsonx_gov.providers.unitxt_provider import UnitxtProvider",
34
+ name="lazy_unitxt_provider"
35
+ )
36
+ load(unitxt_provider)
37
+ UnitxtProvider = unitxt_provider.UnitxtProvider
38
+
39
+ logger = GovSDKLogger.get_logger(__name__)
40
+ UNITXT_METRIC_NAME = "answer_correctness"
41
+
42
+ unitxt_methods = ["token_recall",
43
+ "bert_score_recall",
44
+ "sentence_bert_mini_lm",
45
+ "llm_as_judge",
46
+ ]
47
+
48
+
49
+ class AnswerSimilarityMetric(GenAIMetric):
50
+ """
51
+ Defines the Answer Similarity metric class.
52
+
53
+ The Answer Similarity metric measures the similarity between the generated text and the ground truth.
54
+ It can be computed using the below methods:
55
+
56
+ 1. token_recall (default)
57
+ 2. bert_score_recall
58
+ 3. sentence_bert_mini_lm
59
+ 4. llm_as_judge
60
+
61
+ Examples:
62
+ 1. Create Answer Similarity metric with default parameters and compute using metrics evaluator.
63
+ .. code-block:: python
64
+
65
+ metric = AnswerSimilarityMetric()
66
+ result = MetricsEvaluator().evaluate(data={"generated_text": "...", "ground_truth": "..."},
67
+ metrics=[metric])
68
+
69
+ 2. Create Answer Similarity metric with a custom threshold and method.
70
+ .. code-block:: python
71
+
72
+ threshold = MetricThreshold(type="lower_limit", value=0.5)
73
+ method = "sentence_bert_mini_lm"
74
+ metric = AnswerSimilarityMetric(method=method, threshold=threshold)
75
+
76
+ 3. Create Answer Similarity metric with llm_as_judge method.
77
+ .. code-block:: python
78
+
79
+ # Define LLM Judge using watsonx.ai
80
+ # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
81
+ llm_judge = LLMJudge(model=WxAIFoundationModel(
82
+ model_id="ibm/granite-3-3-8b-instruct",
83
+ project_id="<PROJECT_ID>"
84
+ ))
85
+ metric = AnswerSimilarityMetric(llm_judge=llm_judge)
86
+ """
87
+ name: Annotated[Literal["answer_similarity"],
88
+ Field(title="Name",
89
+ description="The answer similarity metric name.",
90
+ default="answer_similarity", frozen=True)]
91
+ display_name: Annotated[Literal["Answer Similarity"],
92
+ Field(title="Display Name",
93
+ description="The answer similarity metric display name.",
94
+ default="Answer Similarity", frozen=True)]
95
+ tasks: Annotated[list[TaskType],
96
+ Field(title="Tasks",
97
+ description="The list of supported tasks.",
98
+ default=[TaskType.RAG, TaskType.QA])]
99
+ is_reference_free: Annotated[bool,
100
+ Field(title="Is Reference free",
101
+ description="The flag to indicate whether this metric needs a reference for computation. This metric needs reference value to compute.",
102
+ default=False, frozen=True)]
103
+ thresholds: Annotated[list[MetricThreshold],
104
+ Field(title="Thresholds",
105
+ description="The metric thresholds.",
106
+ default=[MetricThreshold(type="lower_limit", value=0.7)])]
107
+ method: Annotated[Literal["token_recall", "bert_score_recall", "sentence_bert_mini_lm", "llm_as_judge"],
108
+ Field(title="Method",
109
+ description="The method used to compute the metric. This field is optional and when `llm_judge` is provided, the method would be set to `llm_as_judge`.",
110
+ default="token_recall")]
111
+ group: Annotated[MetricGroup,
112
+ Field(title="Group",
113
+ description="The metric group.",
114
+ default=MetricGroup.ANSWER_QUALITY, frozen=True)]
115
+ llm_judge: Annotated[LLMJudge | None,
116
+ Field(title="LLM Judge",
117
+ description="The LLM judge used to compute the metric.",
118
+ default=None)]
119
+
120
+ @model_validator(mode="after")
121
+ def set_llm_judge_default_method(self) -> Self:
122
+ # If llm_judge is set, set the method to llm_as_judge
123
+ if self.llm_judge:
124
+ self.method = "llm_as_judge"
125
+ return self
126
+
127
+ def evaluate(self,
128
+ data: pd.DataFrame,
129
+ configuration: GenAIConfiguration | AgenticAIConfiguration,
130
+ **kwargs) -> AggregateMetricResult:
131
+ # If ran in sync mode, block until it is done
132
+ return run_in_event_loop(
133
+ self.evaluate_async,
134
+ data=data,
135
+ configuration=configuration,
136
+ **kwargs,
137
+ )
138
+
139
+ async def evaluate_async(self, data: pd.DataFrame,
140
+ configuration: GenAIConfiguration | AgenticAIConfiguration,
141
+ **kwargs) -> AggregateMetricResult:
142
+
143
+ data_cols = data.columns.to_list()
144
+
145
+ try:
146
+ validate_output(data_cols, configuration)
147
+ validate_reference(data_cols, configuration)
148
+ validate_unitxt_method(self.name, self.method, unitxt_methods)
149
+ validate_llm_as_judge(self.name, self.method,
150
+ self.llm_judge, configuration.llm_judge)
151
+ except ValueError as ve:
152
+ if kwargs.get("ignore_validation_errors"):
153
+ message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
154
+ logger.warning(message)
155
+ return
156
+ raise ve
157
+
158
+ # Separate the data into a dataframe with no None values and a dataframe with None values
159
+ required_fields = configuration.output_fields + configuration.reference_fields
160
+ mask_has_none = data[required_fields].isna().any(axis=1)
161
+ df_with_none = data[mask_has_none]
162
+ df_without_none = data[mask_has_none == False]
163
+
164
+ # Compute the metrics only for the dataframe with no None values
165
+ aggregated_metric_result = None
166
+ if not df_without_none.empty:
167
+ provider = UnitxtProvider(configuration=configuration,
168
+ metric_name=self.name,
169
+ metric_display_name=self.display_name,
170
+ metric_method=self.method,
171
+ metric_prefix="metrics.rag.external_rag",
172
+ metric_alias=UNITXT_METRIC_NAME,
173
+ metric_group=self.group,
174
+ llm_judge=self.llm_judge,
175
+ thresholds=self.thresholds,
176
+ **kwargs)
177
+
178
+ aggregated_metric_result = await provider.evaluate_async(data=df_without_none)
179
+
180
+ # Update the metric result with record level metrics results for the records with missing values
181
+ if not df_with_none.empty:
182
+ # Create None results for records with missing values
183
+ none_results = []
184
+ for _, row in df_with_none.iterrows():
185
+ record_result = RecordMetricResult(
186
+ name=self.name,
187
+ display_name=self.display_name,
188
+ method=self.method,
189
+ group=self.group,
190
+ value=None,
191
+ record_id=row[configuration.record_id_field],
192
+ thresholds=self.thresholds,
193
+ errors=[Error(
194
+ code="BAD_REQUEST", message_en="The value of required fields output or reference is None.")]
195
+ )
196
+ none_results.append(record_result)
197
+
198
+ # Merge the results
199
+ if aggregated_metric_result:
200
+ all_record_results = aggregated_metric_result.record_level_metrics + none_results
201
+ aggregated_metric_result.record_level_metrics = all_record_results
202
+ aggregated_metric_result.total_records = len(
203
+ all_record_results)
204
+ else:
205
+ aggregated_metric_result = AggregateMetricResult(
206
+ name=self.name,
207
+ display_name=self.display_name,
208
+ method=self.method,
209
+ group=self.group,
210
+ value=None,
211
+ total_records=len(none_results),
212
+ record_level_metrics=none_results,
213
+ min=None,
214
+ max=None,
215
+ mean=None,
216
+ thresholds=self.thresholds
217
+ )
218
+
219
+ return aggregated_metric_result
File without changes
@@ -0,0 +1,62 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics.average_precision.average_precision_metric import \
19
+ AveragePrecisionMetric
20
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
21
+ from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import \
22
+ ContextRelevanceMetric
23
+
24
+
25
class AveragePrecisionDecorator(BaseMetricDecorator):
    """Decorator class for computing the average precision metric on an agentic node."""

    def evaluate_average_precision(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: Optional[list[GenAIMetric]] = None
                                   ) -> dict:
        """
        An evaluation decorator for computing average precision metric on an agentic node.

        Args:
            func (Optional[Callable]): The node being decorated. When omitted, a
                partial is returned so the decorator can be applied with keyword
                arguments, e.g. ``@evaluate_average_precision(metrics=[...])``.
            configuration (Optional[AgenticAIConfiguration]): Node specific configuration.
            metrics (Optional[list[GenAIMetric]]): Metrics to compute. Defaults to
                ``[AveragePrecisionMetric()]`` when not provided.

        Returns:
            dict: The result of the wrapped node, unchanged.

        Raises:
            Exception: If validation or metric computation fails on the node.
        """
        # Support both bare and parameterized decorator usage.
        if func is None:
            return partial(self.evaluate_average_precision, configuration=configuration, metrics=metrics)

        # NOTE: the original used a mutable default ([]) for `metrics`; a None
        # default avoids sharing one list object across calls. Behavior is the
        # same because an empty/None value falls back to the default metric.
        if not metrics:
            metrics = [AveragePrecisionMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                # Only average-precision/context-relevance metrics are valid here.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(AveragePrecisionMetric, ContextRelevanceMetric))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                metric_outputs = [EvaluatorFields.CONTEXT_FIELDS]

                # compute_helper invokes the node, computes the metrics and
                # returns the node's original result unchanged.
                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                # Fixed: the original message ended with a stray comma.
                raise Exception(
                    f"There was an error while evaluating average precision metric on {func.__name__}.") from ex

        return wrapper(func)
@@ -0,0 +1,174 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Any, Literal
11
+
12
+ import pandas as pd
13
+ from pydantic import Field, TypeAdapter, field_validator
14
+
15
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
17
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
18
+ RecordMetricResult)
19
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
20
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
21
+ from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import (
22
+ CONTEXT_RELEVANCE, ContextRelevanceMetric, ContextRelevanceResult)
23
+
24
+ AVERAGE_PRECISION = "average_precision"
25
+ AVERAGE_PRECISION_DISPLAY_NAME = "Average Precision"
26
+
27
+
28
class AveragePrecisionResult(RecordMetricResult):
    """Record-level result for the Average Precision metric.

    Fixes the metric identity fields (name, display name and group) so every
    per-record result produced by AveragePrecisionMetric is tagged consistently.
    """
    name: str = AVERAGE_PRECISION  # canonical metric identifier ("average_precision")
    display_name: str = AVERAGE_PRECISION_DISPLAY_NAME  # human-readable name
    group: MetricGroup = MetricGroup.RETRIEVAL_QUALITY  # belongs to the retrieval quality group
32
+
33
+
34
class AveragePrecisionMetric(GenAIMetric):
    """
    Defines the Average Precision metric class.

    The Average Precision metric measures the quality of how a retrieval system ranks relevant contexts.
    The Context Relevance metric is computed as a pre requisite to compute this metric.

    Examples:
        1. Create Average Precision metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = AveragePrecisionMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "..."},
                                                    metrics=[metric])
                # A list of contexts can also be passed as shown below
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                    metrics=[metric])

        2. Create Average Precision metric with a custom threshold.
            .. code-block:: python

                threshold  = MetricThreshold(type="lower_limit", value=0.5)
                metric = AveragePrecisionMetric(method=method, threshold=threshold)

        3. Create Average Precision metric with llm_as_judge method.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                                            model_id="ibm/granite-3-3-8b-instruct",
                                            project_id="<PROJECT_ID>"
                                    ))
                cr_metric = ContextRelevanceMetric(llm_judge=llm_judge)
                ap_metric = AveragePrecisionMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                    metrics=[cr_metric, ap_metric])
    """
    name: Annotated[Literal["average_precision"],
                    Field(title="Name",
                          description="The average precision metric name.",
                          default=AVERAGE_PRECISION, frozen=True)]
    display_name: Annotated[Literal["Average Precision"],
                            Field(title="Display Name",
                                  description="The average precision metric display name.",
                                  default=AVERAGE_PRECISION_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[TaskType.RAG])]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    metric_dependencies: Annotated[list[GenAIMetric],
                                   Field(title="Metric dependencies",
                                         description="The list of metric dependencies",
                                         default=[ContextRelevanceMetric()])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.RETRIEVAL_QUALITY, frozen=True)]

    @field_validator("metric_dependencies", mode="before")
    @classmethod
    def metric_dependencies_validator(cls, value: Any):
        # Coerce raw dicts/objects into ContextRelevanceMetric instances,
        # discriminated by the "name" field.
        if value:
            value = [TypeAdapter(Annotated[ContextRelevanceMetric, Field(
                discriminator="name")]).validate_python(
                m) for m in value]
        return value

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        metrics_result: list[AggregateMetricResult],
        **kwargs,
    ) -> AggregateMetricResult:
        """Compute average precision from the pre-computed context relevance results.

        Args:
            data (pd.DataFrame): The evaluation data (unused directly; scores come
                from the context relevance results).
            configuration (GenAIConfiguration | AgenticAIConfiguration): The metric configuration.
            metrics_result (list[AggregateMetricResult]): Results of previously computed
                metrics; must contain the context relevance result.

        Returns:
            AggregateMetricResult: Aggregated and record level average precision results.

        Raises:
            Exception: If the context relevance metric result is missing.
        """
        record_level_metrics = []
        scores = []

        context_relevance_result: list[ContextRelevanceResult] = next(
            (metric_result.record_level_metrics for metric_result in metrics_result if metric_result.name == CONTEXT_RELEVANCE), None)

        if context_relevance_result is None:
            raise Exception(
                f"Failed to evaluate {self.name} metric. Missing context relevance metric result")

        for relevance_result in context_relevance_result:
            score = self.__compute(
                relevance_scores=relevance_result.additional_info.get(
                    "contexts_values", []),
                threshold=self.thresholds[0].value,
            )
            scores.append(score)
            record_level_metrics.append(
                AveragePrecisionResult(
                    method="",
                    provider="",
                    record_id=relevance_result.record_id,
                    value=score,
                    thresholds=self.thresholds
                )
            )

        # Fixed: guard against an empty record set, which previously raised
        # ZeroDivisionError (mean) / ValueError (min/max).
        if scores:
            mean = sum(scores) / len(scores)
            min_score, max_score = min(scores), max(scores)
        else:
            mean = min_score = max_score = None

        aggregate_metric_score = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            method="",
            provider="",
            group=self.group,
            min=min_score,
            max=max_score,
            mean=mean,
            value=mean,
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            thresholds=self.thresholds
        )

        return aggregate_metric_score

    def __compute(self, relevance_scores: list[float], threshold: float) -> float:
        """Compute average precision over the ranked relevance scores.

        A context at rank k (1-based) is relevant when its score >= threshold.
        Average precision is the mean of precision@k over the relevant ranks,
        rounded to 1 decimal place. Returns 0 when nothing is relevant.
        """
        # Collect the 1-based ranks of the relevant contexts.
        relevancy_at_k = []
        for i, score in enumerate(relevance_scores):
            if score >= threshold:
                relevancy_at_k.append(i + 1)
        total_relevant_items = len(relevancy_at_k)
        if total_relevant_items == 0:
            return 0
        precision_sum = 0
        relevant_rank = 0
        for k in relevancy_at_k:
            relevant_rank += 1
            # precision@k = (# relevant items seen up to rank k) / k
            precision_at_k = relevant_rank / k
            precision_sum += precision_at_k
        average_precision = precision_sum / total_relevant_items
        average_precision_rounded = round(average_precision, 1)
        return average_precision_rounded
@@ -0,0 +1,193 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from functools import partial
11
+ from json import dumps
12
+ from threading import Lock
13
+ from typing import Any, Callable, Set
14
+
15
+ from ibm_watsonx_gov.clients.api_client import APIClient
16
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
17
+ AgenticAIConfiguration
18
+ from ibm_watsonx_gov.entities.agentic_app import MetricsConfiguration
19
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
20
+ from ibm_watsonx_gov.entities.evaluation_result import AgentMetricResult
21
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
22
+ from ibm_watsonx_gov.evaluators.impl.evaluate_metrics_impl import \
23
+ _evaluate_metrics_async
24
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
25
+ from ibm_watsonx_gov.utils.python_utils import get_argument_value
26
+
27
+ try:
28
+ from ibm_agent_analytics.instrumentation.utils import (
29
+ AIEventRecorder, get_current_trace_id, record_span_attributes)
30
+ except:
31
+ pass
32
+
33
+
34
class BaseMetricDecorator():
    """
    Base class for all metric decorators
    """

    def __init__(self, api_client: APIClient = None, configuration: AgenticAIConfiguration = None,
                 compute_real_time: bool = True, metric_results: list[AgentMetricResult] = None,
                 execution_counts: dict[str, dict[str, int]] = None,
                 nodes_being_run: dict[str, Set[str]] = None, lock: Lock = None):
        # Fixed: mutable defaults ([], {}) were shared across every decorator
        # instance; None defaults give each instance its own container.
        self.api_client = api_client
        self.configuration = configuration
        self.compute_real_time = compute_real_time
        self.metric_results = metric_results if metric_results is not None else []
        self.execution_counts = execution_counts if execution_counts is not None else {}
        self.nodes_being_run = nodes_being_run if nodes_being_run is not None else {}
        # Fixed: a None lock previously crashed ("with None") on the
        # real-time metrics path; fall back to a private lock.
        self.lock = lock if lock is not None else Lock()

    def validate(self, *, func: Callable, metrics: list[GenAIMetric], valid_metric_types: tuple[Any]):
        """Validate that the metrics list is non-empty and only contains allowed types.

        Raises:
            ValueError: If metrics is empty or contains an unsupported metric type.
        """
        if not metrics:
            raise ValueError(
                "The 'metrics' argument can not be empty.")

        invalid_metrics = [metric.name for metric in metrics if not isinstance(
            metric, valid_metric_types)]
        if invalid_metrics:
            raise ValueError(
                f"The evaluator '{func.__name__}' is not applicable for "
                f"computing the metrics: {', '.join(invalid_metrics)}")

    def compute_helper(self, *, func: Callable,
                       args: tuple,
                       kwargs: dict[str, Any],
                       configuration: AgenticAIConfiguration,
                       metrics: list[GenAIMetric],
                       metric_inputs: list[EvaluatorFields],
                       metric_outputs: list[EvaluatorFields],
                       metric_references: list[EvaluatorFields] = None,
                       metric_groups: list[MetricGroup] = None) -> dict:
        """
        Helper method for computing metrics.

        Does the following:
            1. Computes node latency metric, and appends the result to the :py:attr:`AgenticEvaluation.metric_results` attribute.
            2. Calls the original node.
            3. Computes the list of metrics given, and appends the result to the :py:attr:`AgenticEvaluation.metric_results` attribute.
            4. Returns the result of the original node without any changes.

        Args:
            func (Callable): The node on which the metric is to be computed
            args (tuple): The tuple of positional arguments passed to the node
            kwargs (dict[str, Any]): The dictionary of keyword arguments passed to the node
            configuration (AgenticAIConfiguration): The node specific configuration
            metrics (list[GenAIMetric]): The list of metrics to compute.
            metric_inputs (list[EvaluatorFields]): The list of inputs for the metric.
            metric_outputs (list[EvaluatorFields]): The list of outputs for the metric.
            metric_references (list[EvaluatorFields], optional): The optional list of references for the metric. Defaults to [].
            metric_groups (list[MetricGroup], optional): The optional list of metric groups. Defaults to [].

        Raises:
            ValueError: If the record id field is missing from the node inputs.

        Returns:
            dict: The result of the wrapped node.
        """
        # Fixed: mutable default arguments replaced with None sentinels.
        metric_references = metric_references if metric_references is not None else []
        metric_groups = metric_groups if metric_groups is not None else []

        get_arg_value = partial(
            get_argument_value, func=func, args=args, kwargs=kwargs)

        defaults = metric_inputs + metric_outputs + metric_references
        _configuration = AgenticAIConfiguration.create_configuration(app_config=self.configuration,
                                                                     method_config=configuration,
                                                                     defaults=defaults)
        _configuration.record_id_field = _configuration.message_id_field

        _data = {}
        # Add record id to the data; fall back to the current trace id when the
        # message id argument is absent.
        _field = getattr(_configuration, EvaluatorFields.MESSAGE_ID_FIELD.value,
                         EvaluatorFields.get_default_fields_mapping()[EvaluatorFields.MESSAGE_ID_FIELD])

        try:
            _message_id_value = get_arg_value(
                param_name=_field) or get_current_trace_id()
        except ValueError:
            _message_id_value = get_current_trace_id()

        if _message_id_value is None:
            raise ValueError(
                f"The {_field} is required for evaluation. Please add it while invoking the application.")

        _data[_field] = _message_id_value

        # Track, per message id, which nodes ran and how many times.
        if _message_id_value not in self.nodes_being_run:
            self.nodes_being_run[_message_id_value] = set()
        if _message_id_value not in self.execution_counts:
            self.execution_counts[_message_id_value] = dict()

        if func.__name__ not in self.nodes_being_run[_message_id_value]:
            self.nodes_being_run[_message_id_value].add(func.__name__)
            self.execution_counts[_message_id_value][func.__name__] = self.execution_counts[_message_id_value].get(
                func.__name__, 0) + 1

        # Call the original node.
        original_result = func(*args, **kwargs)

        metric_result = []
        if self.compute_real_time:
            # Inputs and references come from the node's arguments.
            for field in metric_inputs + metric_references:
                _field = getattr(_configuration, field.value)
                if not isinstance(_field, list):
                    _field = [_field]
                _data.update({f: get_arg_value(param_name=f) for f in _field})

            # Outputs come from the node's result dict.
            for field in metric_outputs:
                _field = getattr(_configuration, field.value)
                if not isinstance(_field, list):
                    _field = [_field]
                _data.update({f: original_result.get(f) for f in _field})

            metric_result = run_in_event_loop(
                _evaluate_metrics_async,
                configuration=_configuration,
                data=_data,
                metrics=metrics,
                metric_groups=metric_groups,
                api_client=self.api_client
            )
            metric_result = metric_result.to_dict()

            for mr in metric_result:
                node_result = {
                    "applies_to": "node",
                    "node_name": func.__name__,
                    **mr
                }
                node_result["message_id"] = node_result["record_id"]
                amr = AgentMetricResult(**node_result)

                AIEventRecorder.record_metric(name=amr.name,
                                              value=amr.value,
                                              attributes={"wxgov.result.metric": amr.model_dump_json(exclude_unset=True)})
                # NOTE(review): the span attributes are recorded once per
                # metric result here (inside the loop) — preserved as-is;
                # confirm whether recording once per node was intended.
                metrics_configuration = MetricsConfiguration(
                    configuration=_configuration, metrics=metrics)
                record_span_attributes({"wxgov.config.metrics."+str(type(self)).split(".")[2]: dumps({
                    "metrics_configuration": metrics_configuration.model_dump(mode="json"),
                    "compute_real_time": "true"
                })})

                with self.lock:
                    self.metric_results.append(amr)

        else:
            metrics_configuration = MetricsConfiguration(
                configuration=_configuration, metrics=metrics)
            # Store the configuration of metrics to compute in traces
            record_span_attributes({"wxgov.config.metrics."+str(type(self)).split(".")[2]: dumps({
                "metrics_configuration": metrics_configuration.model_dump(mode="json"),
                "compute_real_time": "false"
            })})

        return original_result
@@ -0,0 +1,8 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------