ibm_watsonx_gov-1.3.3-cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py
@@ -0,0 +1,166 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ from typing import Annotated, Literal
+
+ import pandas as pd
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
+ from ibm_watsonx_gov.entities.llm_judge import LLMJudge
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
+     ToolCallMetricProvider
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
+ from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
+ from ibm_watsonx_gov.utils.validation_util import (validate_input,
+                                                    validate_llm_as_judge,
+                                                    validate_tool_calls)
+ from pydantic import Field
+
+ logger = GovSDKLogger.get_logger(__name__)
+ TOOL_CALL_RELEVANCE = "tool_call_relevance"
+
+
+ class ToolCallRelevanceMetric(GenAIMetric):
+     """
+     ToolCallRelevanceMetric assesses whether this function call correctly implements
+     the user's immediate request as the appropriate next step in the conversation.
+     Compares against all available functions in the tool inventory to determine if
+     the selection aligns with user intent and context.
+
+     The ToolCallRelevanceMetric will be computed using llm_as_judge.
+
+     Examples:
+         1. Create ToolCallRelevanceMetric by passing the basic configuration.
+             .. code-block:: python
+
+                 config = GenAIConfiguration(tools = [get_weather,fetch_stock_price])
+                 evaluator = MetricsEvaluator(configuration=config)
+                 df = pd.read_csv("")
+                 llm_judge = LLMJudge(
+                     model=WxAIFoundationModel(
+                         model_id="meta-llama/llama-3-3-70b-instruct",
+                         project_id=os.getenv("WATSONX_PROJECT_ID"),
+                     )
+                 )
+                 metrics = [ToolCallRelevanceMetric(llm_judge=llm_judge)]
+                 result = evaluator.evaluate(data=df, metrics=metrics)
+
+         2. Create ToolCallRelevanceMetric by passing custom tool calls field in configuration.
+             .. code-block:: python
+
+                 config = GenAIConfiguration(tools = [get_weather,fetch_stock_price],
+                                             tool_calls_field="tools_used")
+                 evaluator = MetricsEvaluator(configuration=config)
+                 df = pd.read_csv("")
+                 llm_judge = LLMJudge(
+                     model=WxAIFoundationModel(
+                         model_id="meta-llama/llama-3-3-70b-instruct",
+                         project_id=os.getenv("WATSONX_PROJECT_ID"),
+                     )
+                 )
+                 metrics = [ToolCallRelevanceMetric(llm_judge=llm_judge)]
+                 result = evaluator.evaluate(data=df, metrics=metrics)
+
+         3. Create ToolCallRelevanceMetric with a custom threshold.
+             .. code-block:: python
+
+                 llm_judge = LLMJudge(
+                     model=WxAIFoundationModel(
+                         model_id="meta-llama/llama-3-3-70b-instruct",
+                         project_id=os.getenv("WATSONX_PROJECT_ID"),
+                     )
+                 )
+                 threshold = MetricThreshold(type="upper_limit", value=0.8)
+                 metric = ToolCallRelevanceMetric(llm_judge=llm_judge, threshold=threshold)
+
+     """
+
+     name: Annotated[Literal["tool_call_relevance"], Field(title="Metric Name",
+                                                           description="The name of metric.",
+                                                           default=TOOL_CALL_RELEVANCE)]
+     display_name: Annotated[Literal["Tool Call Relevance"], Field(title="Display Name",
+                                                                   description="The tool call relevance metric display name.",
+                                                                   default="Tool Call Relevance", frozen=True)]
+     tasks: Annotated[list[TaskType], Field(title="Task Type",
+                                            description="The generative task type.",
+                                            default=[TaskType.RAG])]
+     group: Annotated[MetricGroup, Field(
+         default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]
+
+     llm_judge: Annotated[LLMJudge | None, Field(
+         description="The LLM judge used to compute the metric.", default=None)]
+
+     method: Annotated[Literal["llm_as_judge"], Field(title="Computation Method",
+                                                      description="The method used to compute the metric.",
+                                                      default="llm_as_judge")]
+     thresholds: Annotated[list[MetricThreshold], Field(title="Metric threshold",
+                                                        description="Value that defines the violation limit for the metric",
+                                                        default=[MetricThreshold(
+                                                            type="lower_limit", value=0.8)]
+                                                        )]
+     metric_mapping_name: Annotated[Literal["function_selection_appropriateness"], Field(title="Metric Mapping Name",
+                                                                                         description="The mapping name of metric with llmevalkit.",
+                                                                                         default="function_selection_appropriateness")]
+
+     async def evaluate_async(self, data: pd.DataFrame,
+                              configuration: GenAIConfiguration | AgenticAIConfiguration,
+                              **kwargs) -> AggregateMetricResult:
+         """
+         Evaluate the data for ToolCallRelevanceMetric
+         Args:
+             data (pd.DataFrame | dict): Data to be evaluated
+             configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration
+             **kwargs: Additional keyword arguments
+
+         Returns:
+             AggregateMetricResult: The computed metrics
+         """
+         data_cols = data.columns.to_list()
+
+         try:
+             validate_tool_calls(data_cols, configuration)
+             validate_input(data_cols, configuration)
+             validate_llm_as_judge(self.name, self.method,
+                                   self.llm_judge, configuration.llm_judge)
+         except ValueError as ve:
+             if kwargs.get("ignore_validation_errors"):
+                 message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
+                 logger.warning(message)
+                 return
+             raise ve
+
+         tool_call_provider = ToolCallMetricProvider(
+             configuration=configuration, metric=self)
+         metric_config = {
+             "general_metrics": None,
+             "function_metrics": [self.metric_mapping_name],
+             "parameter_metrics": None,
+             "transform_enabled": False
+         }
+         metric_result = await tool_call_provider.compute_metrics(
+             data, syntactic_only=False, metric_result_mapping_name="function_selection", **metric_config)
+
+         return metric_result
+
+     def evaluate(
+         self,
+         data: pd.DataFrame | dict,
+         configuration: GenAIConfiguration | AgenticAIConfiguration,
+         **kwargs,
+     ):
+         # If ran in sync mode, block until it is done
+         return run_in_event_loop(
+             self.evaluate_async,
+             data=data,
+             configuration=configuration,
+             **kwargs,
+         )
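
The docstring examples above read an empty CSV path (pd.read_csv("")), so the expected input shape stays implicit. A minimal hypothetical end-to-end sketch follows; the column names "input_text" and "tool_calls", the tool-call record shape, and the MetricsEvaluator and WxAIFoundationModel import locations are assumptions inferred from this wheel's file list and docstrings, not confirmed by the diff.

    # Hypothetical usage sketch -- column names, record shapes, and import paths are assumptions.
    import os

    import pandas as pd

    from ibm_watsonx_gov.config import GenAIConfiguration
    from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel  # assumed module path
    from ibm_watsonx_gov.entities.llm_judge import LLMJudge
    from ibm_watsonx_gov.evaluators import MetricsEvaluator                    # assumed export
    from ibm_watsonx_gov.metrics import ToolCallRelevanceMetric                # assumed export


    def get_weather(location: str) -> str:
        """Toy tool; only its schema matters for the metric."""
        return f"Sunny in {location}"


    df = pd.DataFrame([{
        "input_text": "What's the weather in Paris?",                            # assumed input column
        "tool_calls": [{"name": "get_weather", "args": {"location": "Paris"}}],  # assumed record shape
    }])

    llm_judge = LLMJudge(
        model=WxAIFoundationModel(
            model_id="meta-llama/llama-3-3-70b-instruct",
            project_id=os.getenv("WATSONX_PROJECT_ID"),
        )
    )
    evaluator = MetricsEvaluator(configuration=GenAIConfiguration(tools=[get_weather]))
    result = evaluator.evaluate(data=df, metrics=[ToolCallRelevanceMetric(llm_judge=llm_judge)])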
ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py
@@ -0,0 +1,66 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+ from functools import partial
+ from typing import Callable, Optional
+
+ from wrapt import decorator
+
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
+     AgenticAIConfiguration
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
+ from ibm_watsonx_gov.metrics.tool_call_syntactic_accuracy.tool_call_syntactic_accuracy_metric import \
+     ToolCallSyntacticAccuracyMetric
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
+     ToolCallMetricProvider
+
+
+ class ToolCallSyntacticAccuracyDecorator(BaseMetricDecorator):
+     def evaluate_tool_call_syntactic_accuracy(self,
+                                               func: Optional[Callable] = None,
+                                               *,
+                                               configuration: Optional[AgenticAIConfiguration] = None,
+                                               metrics: list[GenAIMetric] = []
+                                               ) -> dict:
+         """
+         An evaluation decorator for computing tool_call_syntactic_accuracy metric on an agentic node.
+         """
+         if func is None:
+             return partial(self.evaluate_tool_call_syntactic_accuracy, configuration=configuration, metrics=metrics)
+
+         if not metrics:
+             metrics = [ToolCallSyntacticAccuracyMetric()]
+
+         @decorator
+         def wrapper(func, instance, args, kwargs):
+
+             try:
+                 self.validate(func=func, metrics=metrics,
+                               valid_metric_types=(ToolCallSyntacticAccuracyMetric,))
+
+                 metric_outputs = [
+                     EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]
+
+                 if isinstance(configuration.tools, list) and all(callable(item) for item in configuration.tools):
+                     configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
+                         configuration.tools)
+
+                 original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
+                                                       configuration=configuration,
+                                                       metrics=metrics,
+                                                       metric_inputs=[],
+                                                       metric_outputs=metric_outputs)
+
+                 return original_result
+             except Exception as ex:
+                 raise Exception(
+                     f"There was an error while evaluating tool call syntactic metric on {func.__name__},") from ex
+
+         return wrapper(func)
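
The decorator above is a mixin method with no usage example of its own. A hypothetical sketch of applying it to an agentic node follows; the AgenticEvaluator exposure, its no-argument construction, and the node's return shape ("tool_calls" key) are assumptions, not taken from this diff.

    # Hypothetical sketch -- AgenticEvaluator exposure and the node contract are assumptions.
    from ibm_watsonx_gov.config.agentic_ai_configuration import AgenticAIConfiguration
    from ibm_watsonx_gov.evaluators.agentic_evaluator import AgenticEvaluator  # assumed export


    def get_weather(location: str) -> str:
        return f"Sunny in {location}"


    evaluator = AgenticEvaluator()                         # assumed no-argument construction
    config = AgenticAIConfiguration(tools=[get_weather])   # callables are converted to schemas by the decorator


    @evaluator.evaluate_tool_call_syntactic_accuracy(configuration=config)
    def plan_tool_calls(state: dict) -> dict:
        # Node output shape is illustrative; the decorator reads the configured tool-calls field.
        return {"tool_calls": [{"name": "get_weather", "args": {"location": "Paris"}}]}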
ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py
@@ -0,0 +1,121 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ from typing import Annotated, Literal
+
+ import pandas as pd
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
+     ToolCallMetricProvider
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
+ from ibm_watsonx_gov.utils.validation_util import validate_tool_calls
+ from pydantic import Field
+
+ TOOL_CALLING_SYNTACTIC_ACCURACY = "tool_call_syntactic_accuracy"
+
+
+ class ToolCallSyntacticAccuracyMetric(GenAIMetric):
+     """
+     .. deprecated:: 1.2.0
+         Use :class:`ibm_watsonx_gov.metrics.ToolCallAccuracyMetric` with syntactic method instead.
+
+     ToolCallSyntacticAccuracyMetric compute the tool call syntactic correctness
+     by validating tool call against the schema of the list of available tools.
+
+     The ToolCallSyntacticAccuracy metric will be computed by performing the syntactic checks.
+
+     Examples:
+         1. Create ToolCallSyntacticAccuracy metric by passing the basic configuration.
+             .. code-block:: python
+
+                 config = GenAIConfiguration(tools = [get_weather,fetch_stock_price])
+                 evaluator = MetricsEvaluator(configuration=config)
+                 df = pd.read_csv("")
+                 metrics = [ToolCallSyntacticAccuracyMetric()]
+                 result = evaluator.evaluate(data=df, metrics=metrics)
+
+         2. Create ToolCallSyntacticAccuracy metric by passing custom tool calls field in configuration.
+             .. code-block:: python
+
+                 config = GenAIConfiguration(tools = [get_weather,fetch_stock_price],
+                                             tool_calls_field="tools_used")
+                 evaluator = MetricsEvaluator(configuration=config)
+                 df = pd.read_csv("")
+                 metrics = [ToolCallSyntacticAccuracyMetric()]
+                 result = evaluator.evaluate(data=df, metrics=metrics)
+
+         3. Create ToolCallSyntacticAccuracy metric with a custom threshold.
+             .. code-block:: python
+
+                 threshold = MetricThreshold(type="upper_limit", value=0.8)
+                 metric = ToolCallSyntacticAccuracyMetric(threshold=threshold)
+     """
+
+     name: Annotated[Literal["tool_call_syntactic_accuracy"], Field(title="Metric Name",
+                                                                    description="The name of metric.",
+                                                                    default=TOOL_CALLING_SYNTACTIC_ACCURACY)]
+     display_name: Annotated[Literal["Tool Call Syntactic Accuracy"], Field(title="Display Name",
+                                                                            description="The tool call syntactic accuracy metric display name.",
+                                                                            default="Tool Call Syntactic Accuracy", frozen=True)]
+     tasks: Annotated[list[TaskType], Field(title="Task Type",
+                                            description="The generative task type.",
+                                            default=[TaskType.RAG])]
+     group: Annotated[MetricGroup, Field(title="Group",
+                                         description="The metric group.",
+                                         default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]
+     method: Annotated[Literal["syntactic_check"], Field(title="Computation Method",
+                                                         description="The method used to compute the metric.",
+                                                         default="syntactic_check")]
+     thresholds: Annotated[list[MetricThreshold], Field(title="Metric threshold",
+                                                        description="Value that defines the violation limit for the metric",
+                                                        default=[MetricThreshold(
+                                                            type="lower_limit", value=0.7)]
+                                                        )]
+
+     async def evaluate_async(self, data: pd.DataFrame | dict,
+                              configuration: GenAIConfiguration | AgenticAIConfiguration,
+                              **kwargs) -> AggregateMetricResult:
+         """
+         Evaluate the data for ToolCallSyntacticAccuracyMetric
+         Args:
+             data (pd.DataFrame | dict): Data to be evaluated
+             configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration
+
+         Returns:
+             AggregateMetricResult: The computed metrics
+         """
+         # Validate tool calls field in data and tools in configuration
+         data_cols = data.columns.to_list()
+         validate_tool_calls(data_cols, configuration)
+
+         tool_call_provider = ToolCallMetricProvider(
+             configuration=configuration, metric=self)
+
+         # Compute the metrics
+         metric_result = await tool_call_provider.compute_metrics(data)
+
+         return metric_result
+
+     def evaluate(
+         self,
+         data: pd.DataFrame | dict,
+         configuration: GenAIConfiguration | AgenticAIConfiguration,
+         **kwargs,
+     ):
+         # If ran in sync mode, block until it is done
+         return run_in_event_loop(
+             self.evaluate_async,
+             data=data,
+             configuration=configuration,
+             **kwargs,
+         )
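
Because the class above is marked deprecated in favor of ToolCallAccuracyMetric with the syntactic method, a hypothetical migration sketch is shown below; the ToolCallAccuracyMetric import location and its method="syntactic_check" keyword are assumptions inferred from this file's own field definitions, not confirmed for the replacement class.

    # Hypothetical migration sketch -- the "method" keyword on ToolCallAccuracyMetric is an assumption.
    import pandas as pd

    from ibm_watsonx_gov.config import GenAIConfiguration
    from ibm_watsonx_gov.evaluators import MetricsEvaluator      # assumed export
    from ibm_watsonx_gov.metrics import ToolCallAccuracyMetric   # assumed export


    def get_weather(location: str) -> str:
        return f"Sunny in {location}"


    df = pd.DataFrame([{
        "input_text": "Weather in Paris?",                                        # assumed input column
        "tool_calls": [{"name": "get_weather", "args": {"location": "Paris"}}],   # assumed record shape
    }])

    metric = ToolCallAccuracyMetric(method="syntactic_check")  # assumed keyword and value
    result = MetricsEvaluator(configuration=GenAIConfiguration(tools=[get_weather])).evaluate(
        data=df, metrics=[metric])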
ibm_watsonx_gov/metrics/topic_relevance/__init__.py
@@ -0,0 +1,8 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py
@@ -0,0 +1,57 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ from functools import partial
+ from typing import Callable, Optional
+
+ from wrapt import decorator
+
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
+     AgenticAIConfiguration
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
+ from ibm_watsonx_gov.metrics.topic_relevance.topic_relevance_metric import \
+     TopicRelevanceMetric
+
+
+ class TopicRelevanceDecorator(BaseMetricDecorator):
+
+     def evaluate_topic_relevance(self,
+                                  func: Optional[Callable] = None,
+                                  *,
+                                  configuration: Optional[AgenticAIConfiguration] = None,
+                                  metrics: list[GenAIMetric],
+                                  ) -> dict:
+         """
+         An evaluation decorator for computing topic relevance metric on an agentic node.
+         """
+         if func is None:
+             return partial(self.evaluate_topic_relevance, configuration=configuration, metrics=metrics)
+
+         @decorator
+         def wrapper(func, instance, args, kwargs):
+
+             try:
+                 self.validate(func=func, metrics=metrics,
+                               valid_metric_types=(TopicRelevanceMetric))
+
+                 metric_inputs = [EvaluatorFields.INPUT_FIELDS]
+                 original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
+                                                       configuration=configuration,
+                                                       metrics=metrics,
+                                                       metric_inputs=metric_inputs,
+                                                       metric_outputs=[])
+
+                 return original_result
+             except Exception as ex:
+                 raise Exception(
+                     f"There was an error while evaluating topic relevance metric on {func.__name__},") from ex
+
+         return wrapper(func)
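
Unlike most decorators in this package, metrics has no default here, so a TopicRelevanceMetric (which requires system_prompt) must be passed explicitly. A hypothetical application to an agentic node follows; the AgenticEvaluator exposure and the node's state keys are assumptions, not taken from this diff.

    # Hypothetical sketch -- AgenticEvaluator exposure and state keys are assumptions.
    from ibm_watsonx_gov.evaluators.agentic_evaluator import AgenticEvaluator  # assumed export
    from ibm_watsonx_gov.metrics import TopicRelevanceMetric                   # assumed export

    evaluator = AgenticEvaluator()  # assumed no-argument construction
    metric = TopicRelevanceMetric(
        system_prompt="You are a banking assistant. Only answer questions about retail banking.")


    @evaluator.evaluate_topic_relevance(metrics=[metric])
    def answer_node(state: dict) -> dict:
        # The decorator evaluates the node's input fields against the topic set by the system prompt.
        return {"generated_text": f"Answering: {state['input_text']}"}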
ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py
@@ -0,0 +1,106 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ from typing import Annotated, Literal
+
+ import pandas as pd
+ from pydantic import Field
+
+ from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+ from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
+ from ibm_watsonx_gov.utils.validation_util import validate_input
+
+ TOPIC_RELEVANCE = "topic_relevance"
+
+
+ class TopicRelevanceMetric(GenAIMetric):
+     """
+     Defines the TopicRelevance metric class.
+
+     The TopicRelevance metric evaluates how closely the input content aligns with the topic specified by the system_prompt.
+
+     Note : system_prompt is mandatory
+
+     Examples:
+         1. Create TopicRelevance metric with default parameters and compute using metrics evaluator.
+             .. code-block:: python
+
+                 metric = TopicRelevanceMetric(system_prompt="...")
+                 result = MetricsEvaluator().evaluate(data={"input_text": "...", metrics=[metric])
+
+         2. Create TopicRelevance metric with a custom threshold.
+             .. code-block:: python
+
+                 threshold = MetricThreshold(type="lower_limit", value=0.5)
+                 metric = TopicRelevanceMetric(threshold=threshold, system_prompt="...")
+     """
+     name: Annotated[Literal["topic_relevance"],
+                     Field(title="Name",
+                           description="The topic relevance metric name.",
+                           default=TOPIC_RELEVANCE, frozen=True)]
+     display_name: Annotated[Literal["Topic Relevance"],
+                             Field(title="Display Name",
+                                   description="The topic relevance metric display name.",
+                                   default="Topic Relevance", frozen=True)]
+     thresholds: Annotated[list[MetricThreshold],
+                           Field(title="Thresholds",
+                                 description="The metric thresholds.",
+                                 default=[MetricThreshold(type="lower_limit", value=0.7)])]
+     tasks: Annotated[list[TaskType],
+                      Field(title="Tasks",
+                            description="The list of supported tasks.",
+                            default=TaskType.values(), frozen=True)]
+     # TODO uncomment when the metric is pushed to prod
+     # group: Annotated[MetricGroup, Field(title="Group",
+     #                                     description="The metric group.",
+     #                                     default=MetricGroup.CONTENT_SAFETY, frozen=True)]
+     system_prompt: Annotated[str, Field(title="System Prompt",
+                                         description=f"The AI model system prompt which contains instructions to define its overall behavior.")]
+
+     async def evaluate_async(
+         self,
+         data: pd.DataFrame | dict,
+         configuration: GenAIConfiguration,
+         **kwargs
+     ) -> list[AggregateMetricResult]:
+         if not self.system_prompt:
+             raise AssertionError(
+                 f"The system_prompt field is required but was missing from the input.")
+
+         validate_input(data.columns.to_list(), configuration)
+         # Set system_prompt as part of the detector parameters
+         kwargs["detector_params"] = {"system_prompt": self.system_prompt}
+         provider = DetectorsProvider(configuration=configuration,
+                                      metric_name=self.name,
+                                      metric_display_name=self.display_name,
+                                      metric_method=self.method,
+                                      metric_group=MetricGroup.CONTENT_SAFETY,
+                                      thresholds=self.thresholds,
+                                      **kwargs)
+         aggregated_metric_result = provider.evaluate(data=data)
+         return aggregated_metric_result
+
+     def evaluate(
+         self,
+         data: pd.DataFrame | dict,
+         configuration: GenAIConfiguration,
+         **kwargs,
+     ):
+         # If ran in sync mode, block until it is done
+         return run_in_event_loop(
+             self.evaluate_async,
+             data=data,
+             configuration=configuration,
+             **kwargs,
+         )
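
The first docstring example above drops the closing brace of its data dict. The intended call presumably reads as follows; the system prompt and input text are illustrative, and the MetricsEvaluator import location is an assumption.

    # Corrected form of the docstring example -- values shown are illustrative.
    from ibm_watsonx_gov.evaluators import MetricsEvaluator   # assumed export
    from ibm_watsonx_gov.metrics import TopicRelevanceMetric  # assumed export

    metric = TopicRelevanceMetric(system_prompt="Only discuss IBM Cloud billing topics.")
    result = MetricsEvaluator().evaluate(
        data={"input_text": "How do I read my latest invoice?"}, metrics=[metric])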
ibm_watsonx_gov/metrics/unethical_behavior/__init__.py
@@ -0,0 +1,8 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py
@@ -0,0 +1,61 @@
+ # ----------------------------------------------------------------------------------------------------
+ # IBM Confidential
+ # Licensed Materials - Property of IBM
+ # 5737-H76, 5900-A3Q
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+ # GSA ADPSchedule Contract with IBM Corp.
+ # ----------------------------------------------------------------------------------------------------
+
+ from functools import partial
+ from typing import Callable, Optional
+
+ from wrapt import decorator
+
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
+     AgenticAIConfiguration
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
+ from ibm_watsonx_gov.metrics.unethical_behavior.unethical_behavior_metric import \
+     UnethicalBehaviorMetric
+
+
+ class UnethicalBehaviorDecorator(BaseMetricDecorator):
+
+     def evaluate_unethical_behavior(self,
+                                     func: Optional[Callable] = None,
+                                     *,
+                                     configuration: Optional[AgenticAIConfiguration] = None,
+                                     metrics: list[GenAIMetric] = []
+                                     ) -> dict:
+         """
+         An evaluation decorator for computing unethical behavior on an agentic node via granite guardian.
+         """
+         if func is None:
+             return partial(self.evaluate_unethical_behavior, configuration=configuration, metrics=metrics)
+
+         if not metrics:
+             metrics = [UnethicalBehaviorMetric()]
+
+         @decorator
+         def wrapper(func, instance, args, kwargs):
+
+             try:
+                 self.validate(func=func, metrics=metrics,
+                               valid_metric_types=(UnethicalBehaviorMetric))
+
+                 metric_inputs = [EvaluatorFields.INPUT_FIELDS]
+
+                 original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
+                                                       configuration=configuration,
+                                                       metrics=metrics,
+                                                       metric_inputs=metric_inputs,
+                                                       metric_outputs=[])
+
+                 return original_result
+             except Exception as ex:
+                 raise Exception(
+                     f"There was an error while evaluating unethical behavior on {func.__name__},") from ex
+
+         return wrapper(func)
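
The decorator above falls back to [UnethicalBehaviorMetric()] when metrics is omitted. A hypothetical application to an agentic node is sketched below; the AgenticEvaluator exposure, its no-argument construction, and the state keys are assumptions, not taken from this diff.

    # Hypothetical sketch -- AgenticEvaluator exposure and state keys are assumptions.
    from ibm_watsonx_gov.evaluators.agentic_evaluator import AgenticEvaluator  # assumed export

    evaluator = AgenticEvaluator()  # assumed no-argument construction


    @evaluator.evaluate_unethical_behavior  # metrics omitted: defaults to [UnethicalBehaviorMetric()]
    def triage_node(state: dict) -> dict:
        # The decorator computes the metric (via granite guardian) over the node's input fields.
        return {"generated_text": f"Processing request: {state['input_text']}"}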