ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py
@@ -0,0 +1,262 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # OCO Source Materials
4
+ # 5900-A3Q, 5737-H76
5
+ # Copyright IBM Corp. 2025
6
+ # The source code for this program is not published or other-wise divested of its trade
7
+ # secrets, irrespective of what has been deposited with the U.S.Copyright Office.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from ibm_watsonx_ai.foundation_models.prompts.prompt_template import (
11
+ DetachedPromptTemplate, PromptTemplate)
12
+
13
+ from ibm_watsonx_gov.entities.container import (BaseMonitor, ProjectContainer,
14
+ SpaceContainer)
15
+ from ibm_watsonx_gov.entities.credentials import Credentials
16
+ from ibm_watsonx_gov.entities.enums import EvaluationStage
17
+ from ibm_watsonx_gov.entities.prompt_setup import PromptSetup
18
+ from ibm_watsonx_gov.prompt_evaluator.impl.prompt_evaluator_impl import \
19
+ PromptEvaluatorImpl
20
+
21
+
22
+ class PromptEvaluator:
23
+ """
24
+ PromptEvaluator is a class that sets up a prompt template and evaluates the risks associated with it.
25
+
26
+ Example
27
+ -------
28
+ .. code-block:: python
29
+
30
+ # Create the prompt evaluator
31
+ evaluator = PromptEvaluator(
32
+ credentials=Credentials(api_key="")
33
+ )
34
+
35
+ # Create the prompt setup
36
+ prompt_setup = PromptSetup(
37
+ task_type=TaskType.RAG,
38
+ question_field="question",
39
+ context_fields=["context1"],
40
+ label_column="answer",
41
+ )
42
+
43
+ # Create the prompt template
44
+ prompt_template = PromptTemplate(
45
+ name="test",
46
+ description="description",
47
+ input_variables=["question", "context1"],
48
+ input_text="Answer the below question from the given context only and do not use the knowledge outside the context. Context: {context1} Question: {question} Answer:",
49
+ model_id="ibm/granite-3-3-8b-instruct",
50
+ task_ids=[TaskType.RAG.value]
51
+ )
52
+
53
+ # Provide the development container details
54
+ development_container = ProjectContainer(
55
+ container_id="3acf420f-526a-4007-abe7-78a03435aac2",
56
+ monitors=[
57
+ GenerativeAIQualityMonitor(),
58
+ ]
59
+ )
60
+
61
+ # Evaluate the risk based on the provided dataset
62
+ evaluator.evaluate_risk(
63
+ prompt_setup=prompt_setup,
64
+ prompt_template=prompt_template,
65
+ containers=[development_container],
66
+ environments=[EvaluationStage.DEVELOPMENT],
67
+ input_file_path="./rag_dataset.csv",
68
+ )
69
+
70
+ # Show the evaluation result
71
+ evaluator.get_monitor_metrics(
72
+ monitor=BaseMonitor(monitor_name="generative_ai_quality"),
73
+ environment=EvaluationStage.DEVELOPMENT,
74
+ show_table=True,
75
+ )
76
+
77
+ evaluator.get_dataset_records(
78
+ dataset_type="gen_ai_quality_metrics",
79
+ environment=EvaluationStage.DEVELOPMENT,
80
+ show_table=True,
81
+ )
82
+
83
+ """
84
+
85
+ def __init__(self, credentials: Credentials | None = None):
86
+ """
87
+ Initializes the prompt evaluator with the provided credentials.
88
+
89
+ Args:
90
+ credentials (Credentials): The credentials required for authentication and authorization.
91
+ """
92
+ self.__evaluator = PromptEvaluatorImpl(credentials)
93
+
94
+ def e2e_prompt_evaluation(
95
+ self,
96
+ config: dict[str, any],
97
+ input_file_path: str = None,
98
+ ):
99
+ """
100
+ Method to set up and evaluate the prompt template end to end with a simplified interface.
101
+
102
+ Examples:
103
+
104
+ .. code-block:: python
105
+
106
+ # Create the prompt evaluator
107
+ evaluator = PromptEvaluator(
108
+ credentials=Credentials(api_key="")
109
+ )
110
+
111
+ # detached prompt configuration example
112
+ detached_prompt_config = {
113
+ "prompt_setup": {
114
+ "problem_type": TaskType.RAG.value,
115
+ "context_fields": ["context1"],
116
+ },
117
+ "development_project_id": "3acf420f-526a-4007-abe7-78a03435aac2",
118
+ "detached_prompt_template": {
119
+ "name": "detached prompt experiment",
120
+ "model_id": "ibm/granite-3-2-8b-instruct",
121
+ "input_text": "Answer the below question from the given context only and do not use the knowledge outside the context. Context: {context1} Question: {question} Answer:",
122
+ "input_variables": ["question", "context1"],
123
+ "detached_model_url": "https://us-south.ml.cloud.ibm.com/ml/v1/deployments/insurance_test_deployment/text/generation?version=2021-05-01",
124
+ "task_ids": [TaskType.RAG.value],
125
+ }
+ }
126
+
127
+ # prompt configuration example
128
+ prompt_config = {
129
+ "prompt_setup": {
130
+ "problem_type": TaskType.RAG.value,
131
+ "context_fields": ["context1"],
132
+ },
133
+ "development_project_id": "3acf420f-526a-4007-abe7-78a03435aac2",
134
+ "prompt_template": {
135
+ "name": "prompt experiment",
136
+ "model_id": "ibm/granite-3-2-8b-instruct",
137
+ "input_text": "Answer the below question from the given context only and do not use the knowledge outside the context. Context: {context1} Question: {question} Answer:",
138
+ "input_variables": ["question", "context1"],
139
+ "task_ids": [TaskType.RAG.value],
140
+ },
141
+ # optional use case configuration
142
+ "ai_usecase_id": "b1504848-3cf9-4ab9-9d46-d688e34a0295",
143
+ "catalog_id": "7bca9a52-7c90-4fb4-b3ef-3194e25a8452", // same as inventory_id
144
+ "approach_id": "80b3a883-015f-498a-86f3-55ba74b5374b",
145
+ "approach_version": "0.0.2",
146
+ }
147
+
148
+ # Evaluate the risk based on the provided dataset
149
+ evaluator.e2e_prompt_evaluation(
150
+ config=prompt_config,  # or detached_prompt_config
151
+ input_file_path="./rag_dataset.csv",
152
+ )
153
+
154
+ # Show the evaluation result
155
+ evaluator.get_monitor_metrics(
156
+ monitor=BaseMonitor(monitor_name="generative_ai_quality"),
157
+ environment=EvaluationStage.DEVELOPMENT,
158
+ show_table=True,
159
+ )
160
+
161
+ evaluator.get_dataset_records(
162
+ dataset_type="gen_ai_quality_metrics",
163
+ environment=EvaluationStage.DEVELOPMENT,
164
+ show_table=True,
165
+ )
166
+ Args:
167
+ config (dict[str, any]): configurations dictionary
168
+ input_file_path (str, optional): Path to the input to evaluate. This can be a local file or a link to a file. The prompt template evaluation will be skipped if this argument is not set.
169
+ """
170
+ self.__evaluator.e2e_prompt_evaluation(config, input_file_path)
171
+
172
+ def evaluate_risk(
173
+ self,
174
+ prompt_setup: PromptSetup,
175
+ containers: list[ProjectContainer | SpaceContainer],
176
+ input_file_path: str,
177
+ prompt_template: PromptTemplate | DetachedPromptTemplate = None,
178
+ prompt_template_id: str = None,
179
+ environments: list[EvaluationStage] = [EvaluationStage.DEVELOPMENT],
180
+ ):
181
+ """
182
+ Evaluate the risk of a given input file for a list of containers. Note: either prompt_template or prompt_template_id must be provided.
183
+
184
+ Args:
185
+ prompt_setup (PromptSetup): The prompt setup details to use for evaluation.
+ prompt_template (PromptTemplate | DetachedPromptTemplate, optional): The prompt template to use for evaluation.
186
+ prompt_template_id (str, optional): The prompt template id to use for evaluation.
187
+ containers (list[ProjectContainer | SpaceContainer]): The containers details.
188
+ input_file_path (str): The path to the input file to evaluate.
189
+ environments (list[EvaluationStage], optional): The list of evaluation stages to do the evaluation in. Defaults to [EvaluationStage.DEVELOPMENT].
190
+ """
191
+ self.__evaluator.evaluate_risk(
192
+ prompt_setup=prompt_setup,
193
+ prompt_template=prompt_template,
194
+ prompt_template_id=prompt_template_id,
195
+ containers=containers,
196
+ evaluation_stages=environments,
197
+ input_file_path=input_file_path,
198
+ )
199
+
200
+ def get_monitor_metrics(
201
+ self,
202
+ monitor: BaseMonitor,
203
+ environment: EvaluationStage = EvaluationStage.DEVELOPMENT,
204
+ show_table: bool = False,
205
+ ):
206
+ """
207
+ Get monitor metrics for a given monitor in a specific environment.
208
+
209
+ Args:
210
+ monitor (BaseMonitor): monitor to get the metrics for.
211
+ environment (EvaluationStage, optional): monitor environment. Defaults to EvaluationStage.DEVELOPMENT.
212
+ show_table (bool, optional): Flag to print the result table. Defaults to False.
213
+
214
+ Returns:
215
+ dict[str, any]: Monitor metrics dictionary
216
+ """
217
+ return self.__evaluator.get_monitor_metrics(
218
+ evaluation_stage=environment,
219
+ monitor=monitor,
220
+ show_table=show_table,
221
+ )
222
+
223
+ def get_dataset_records(
224
+ self,
225
+ dataset_type: str,
226
+ environment: EvaluationStage = EvaluationStage.DEVELOPMENT,
227
+ show_table: bool = False,
228
+ ) -> dict[str, any]:
229
+ """
230
+ Retrieve dataset records for a given dataset type and environment.
231
+
232
+ Args:
233
+ dataset_type (str): The type of dataset to retrieve records for.
234
+ environment (EvaluationStage, optional): The environment to retrieve records from. Defaults to EvaluationStage.DEVELOPMENT.
235
+ show_table (bool, optional): Whether to display the dataset records as a table. Defaults to False.
236
+
237
+ Returns:
238
+ dict[str, any]: A dictionary containing the dataset records.
239
+ """
240
+ return self.__evaluator.get_dataset_records(
241
+ evaluation_stage=environment,
242
+ dataset_type=dataset_type,
243
+ show_table=show_table,
244
+ )
245
+
246
+ def get_prompt_template_id(
247
+ self,
248
+ environment: EvaluationStage = EvaluationStage.DEVELOPMENT,
249
+ ) -> str:
250
+ """
251
+ Retrieves the prompt template ID based on the specified environment.
252
+
253
+ Args:
254
+ environment (EvaluationStage, optional): The environment for which to retrieve the prompt template ID.
255
+ Defaults to EvaluationStage.DEVELOPMENT.
256
+
257
+ Returns:
258
+ str: The prompt template ID corresponding to the specified environment.
259
+ """
260
+ return self.__evaluator.get_prompt_template_id(
261
+ environment=environment
262
+ )
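Taken together, the docstrings above amount to the following end-to-end flow. This is a minimal sketch assembled from the examples in this file, not a verified recipe: the API key, project ID, and dataset path are placeholders, and the import locations of TaskType and GenerativeAIQualityMonitor are assumptions (the remaining imports appear verbatim in this diff).

from ibm_watsonx_ai.foundation_models.prompts.prompt_template import PromptTemplate

from ibm_watsonx_gov.entities.container import BaseMonitor, ProjectContainer
from ibm_watsonx_gov.entities.credentials import Credentials
from ibm_watsonx_gov.entities.enums import EvaluationStage, TaskType  # TaskType location assumed
from ibm_watsonx_gov.entities.monitor import GenerativeAIQualityMonitor  # location assumed
from ibm_watsonx_gov.entities.prompt_setup import PromptSetup
from ibm_watsonx_gov.prompt_evaluator.prompt_evaluator import PromptEvaluator

# Placeholder credentials; a real API key is required in practice.
evaluator = PromptEvaluator(credentials=Credentials(api_key="<api-key>"))

prompt_setup = PromptSetup(
    task_type=TaskType.RAG,
    question_field="question",
    context_fields=["context1"],
    label_column="answer",
)

prompt_template = PromptTemplate(
    name="rag prompt",
    description="RAG prompt for the sample dataset",
    input_variables=["question", "context1"],
    input_text=("Answer the below question from the given context only and do not use "
                "the knowledge outside the context. "
                "Context: {context1} Question: {question} Answer:"),
    model_id="ibm/granite-3-3-8b-instruct",
    task_ids=[TaskType.RAG.value],
)

development_container = ProjectContainer(
    container_id="<development-project-id>",   # placeholder project GUID
    monitors=[GenerativeAIQualityMonitor()],
)

# Run the evaluation against a local dataset (placeholder path).
evaluator.evaluate_risk(
    prompt_setup=prompt_setup,
    prompt_template=prompt_template,
    containers=[development_container],
    environments=[EvaluationStage.DEVELOPMENT],
    input_file_path="./rag_dataset.csv",
)

# Inspect the generative AI quality metrics computed during the run.
evaluator.get_monitor_metrics(
    monitor=BaseMonitor(monitor_name="generative_ai_quality"),
    environment=EvaluationStage.DEVELOPMENT,
    show_table=True,
)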
ibm_watsonx_gov/providers/__init__.py
@@ -0,0 +1,8 @@
1
+ import os
2
+ import sys
3
+
4
+ # Dynamically add the path to `llmevalkit`
5
+ # points to `tool_calling_hallucination`
6
+ current_dir = os.path.dirname(__file__)
7
+ if current_dir not in sys.path:
8
+ sys.path.insert(0, current_dir)
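Assuming the eight-line snippet above is ibm_watsonx_gov/providers/__init__.py (the hunk size and ordering in the file listing suggest so), its effect can be checked with a small, hypothetical sketch: once the package is imported, its directory sits on sys.path, which is what lets the vendored llmevalkit tree resolve as a top-level import.

import os
import sys

import ibm_watsonx_gov.providers as providers  # importing runs the bootstrap above

pkg_dir = os.path.dirname(providers.__file__)
print(pkg_dir in sys.path)   # expected: True
# ...which is what allows statements such as `import llmevalkit` (the vendored
# copy under ibm_watsonx_gov/providers/llmevalkit) to resolve without installing
# llmevalkit separately.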
ibm_watsonx_gov/providers/detectors_provider.py
@@ -0,0 +1,415 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ import asyncio
10
+ import json
11
+ from typing import Optional
12
+
13
+ import aiohttp
14
+ import pandas as pd
15
+ from ibm_watson_openscale import APIClient as WOSClient
16
+
17
+ from ibm_watsonx_gov.clients.usage_client import validate_usage_client
18
+ from ibm_watsonx_gov.config import GenAIConfiguration
19
+ from ibm_watsonx_gov.entities.base_classes import Error
20
+ from ibm_watsonx_gov.entities.enums import (EvaluationProvider,
21
+ GraniteGuardianRisks, MetricGroup)
22
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
23
+ RecordMetricResult)
24
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
25
+ from ibm_watsonx_gov.providers.tool_call_metric_provider import \
26
+ ToolCallMetricProvider
27
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
28
+ from ibm_watsonx_gov.utils.python_utils import (get_authenticator_token,
29
+ transform_str_to_list)
30
+
31
+ ACCURACY_METRICS = ["topic_relevance", "tool_call_accuracy"]
32
+ RAG_EVIDENCE_METRICS = ["faithfulness",
33
+ "context_relevance"]
34
+ RAG_SMALL_MODELS = ["faithfulness_model", "context_relevance_model"]
35
+ EVIDENCES = ["hap", "pii", "keyword", "regex"]
36
+
37
+
38
+ class DetectorsProvider():
39
+ # Status codes for BAD_GATEWAY, SERVICE_UNAVAILABLE and GATEWAY_TIMEOUT
40
+ RETRY_AFTER_STATUS_CODES = [502, 503, 504]
41
+ RETRY_COUNT = 3
42
+ BACK_OFF_FACTOR = 1
43
+
44
+ def __init__(
45
+ self,
46
+ configuration: GenAIConfiguration,
47
+ metric_name: str,
48
+ metric_display_name: str,
49
+ metric_method: Optional[str] = None,
50
+ metric_group: MetricGroup = None,
51
+ thresholds: list[MetricThreshold] = [],
52
+ **kwargs,
53
+ ) -> None:
54
+ if not kwargs.get("api_client"):
55
+ raise ValueError(
56
+ f"IBM WatsonX Governance SDK must be initialized to compute {metric_name} using {metric_method}. Please initialize the client to proceed, or remove this metric from the evaluation.")
57
+ if not isinstance(kwargs.get("api_client").wos_client, WOSClient):
58
+ raise ValueError(
59
+ f"watsonx.governance service instance id is required to compute {metric_name} using {metric_method}. You can set the service instance ID using the `WXG_SERVICE_INSTANCE_ID` environment variable in the notebook and retry.")
60
+ base = self.__get_base_url(metric_name)
61
+ self.base_url = base.format(
62
+ self.get_detector_url(kwargs.get("api_client")))
63
+ self.configuration: GenAIConfiguration = configuration
64
+ self.configuration_: dict[str, any] = {}
65
+ self.metric_name = metric_name
66
+ self.metric_display_name = metric_display_name
67
+ self.metric_method = metric_method
68
+ self.metric_group = metric_group
69
+ self.service_instance_id = self.get_service_instance_id(
70
+ kwargs.get("api_client"))
71
+ self.thresholds = thresholds
72
+ self.detector_params = kwargs.get("detector_params", None)
73
+ validate_usage_client(kwargs.get("usage_client"))
74
+
75
+ def evaluate(self, data: pd.DataFrame) -> AggregateMetricResult:
76
+ return run_in_event_loop(
77
+ self.evaluate_async,
78
+ data=data
79
+ )
80
+
81
+ async def evaluate_async(self, data: pd.DataFrame) -> AggregateMetricResult:
82
+ """
83
+ Entry point method to compute the configured detectors-based metrics.
84
+ Args:
85
+ data: Input test data
86
+ """
87
+ try:
88
+ json_payloads, record_ids = self.__pre_process_data(data=data)
89
+ result = await self.__compute_metric(json_payloads)
90
+ aggregated_result = self.__post_process(result, record_ids)
91
+ return aggregated_result
92
+
93
+ except Exception as e:
94
+ raise Exception(
95
+ f"Error while computing metrics: {self.metric_name}. Reason: {str(e)}")
96
+
97
+ def __pre_process_data(self, data: pd.DataFrame):
98
+ """
99
+ Creates payload for each row in the test data.
100
+ """
101
+ # read data based on the metric.
102
+ input_content = data[self.configuration.input_fields[0]].to_list()
103
+ output_content, context_content, tool_calls_content, tools_catalog_content = None, None, None, None
104
+
105
+ if self.metric_name in ["answer_relevance", "faithfulness"]:
106
+ output_content = data[self.configuration.output_fields[0]].to_list(
107
+ )
108
+
109
+ if self.metric_name in ["context_relevance", "faithfulness"]:
110
+ if len(self.configuration.context_fields) > 1:
111
+ context_content = data[self.configuration.context_fields].values.tolist(
112
+ )
113
+ elif len(self.configuration.context_fields) == 1:
114
+ context_content = data[self.configuration.context_fields[0]].apply(
115
+ transform_str_to_list).tolist()
116
+
117
+ if self.metric_name == "tool_call_accuracy":
118
+ # Get the tools catalog i.e., a list of available tools
119
+ tools_catalog_content = self.get_tools_catalog_content()
120
+ # Get the tool calls list
121
+ tool_calls_content = self.get_tool_calls_content(data)
122
+
123
+ payloads_json = self.__get_json_payloads(
124
+ input_content, output_content, context_content, tools_catalog_content, tool_calls_content)
125
+ record_ids = data[self.configuration.record_id_field].to_list()
126
+ return payloads_json, record_ids
127
+
128
+ async def send_with_retries(self, payload, session: aiohttp.ClientSession):
129
+ """
130
+ Asynchronously calls the detections API with retries and returns the responses.
131
+ Returns an error if all retries fail or an exception is caught.
132
+ """
133
+ for attempt in range(self.RETRY_COUNT):
134
+ try:
135
+ async with session.post(
136
+ url=self.base_url,
137
+ headers=self.__get_headers(),
138
+ data=payload,
139
+ ssl=self.verify
140
+ ) as response:
141
+
142
+ response_status = response.status
143
+ response_text = await response.text()
144
+
145
+ if response_status == 200:
146
+ return json.loads(response_text)
147
+
148
+ elif response_status in self.RETRY_AFTER_STATUS_CODES and attempt < self.RETRY_COUNT - 1:
149
+ await asyncio.sleep(self.BACK_OFF_FACTOR * (2 ** attempt))
150
+ continue # retry
151
+ else:
152
+ return {
153
+ "error": Error(
154
+ code=str(response_status),
155
+ message_en=str(json.loads(response_text))
156
+ )
157
+ }
158
+
159
+ except Exception as e:
160
+ return {
161
+ "error": Error(
162
+ code="REQUEST_FAILED",
163
+ message_en=str(e)
164
+ )
165
+ }
166
+
167
+ async def __compute_metric(self, api_payloads: list):
168
+ async with aiohttp.ClientSession() as session:
169
+ tasks = [self.send_with_retries(payload, session)
170
+ for payload in api_payloads]
171
+ responses = await asyncio.gather(*tasks, return_exceptions=True)
172
+ return responses
173
+
174
+ def __get_additional_info(self, results) -> list:
175
+ info = []
176
+ if self.metric_name in RAG_EVIDENCE_METRICS:
177
+ for result in results:
178
+ info.append(result["evidence"][0])
179
+ else:
180
+ for result in results:
181
+ info.append({"text": result["text"], "score": result["score"],
182
+ "start": result["start"], "end": result["end"]})
183
+ if info:
184
+ return info
185
+ return []
186
+
187
+ def __post_process(self, results: list, record_ids: list) -> AggregateMetricResult:
188
+ """
189
+ Process the responses and aggregate the results.
190
+ """
191
+ record_level_metrics: list[RecordMetricResult] = []
192
+ values = []
193
+ errors = []
194
+ for result, record_id in zip(results, record_ids):
195
+ if self.metric_name == "keyword":
196
+ metric_name = "keyword_detection"
197
+ elif self.metric_name == "regex":
198
+ metric_name = "regex_detection"
199
+ else:
200
+ metric_name = self.metric_name
201
+ record_data = {
202
+ "name": metric_name,
203
+ "display_name": self.metric_display_name,
204
+ "method": self.metric_method,
205
+ "provider": EvaluationProvider.DETECTORS.value,
206
+ "group": self.metric_group,
207
+ "record_id": record_id,
208
+ "thresholds": self.thresholds,
209
+ }
210
+
211
+ if "error" in result:
212
+ record_level_metrics.append(RecordMetricResult(
213
+ **record_data,
214
+ value=None,
215
+ errors=[Error(code=result["error"].code,
216
+ message_en=str(result["error"].message_en))]
217
+ ))
218
+ errors.append(Error(code=result["error"].code,
219
+ message_en=str(result["error"].message_en)))
220
+ else:
221
+ value = 0
222
+ if len(result["detections"]) > 0:
223
+ # Return the highest of all detections' scores.
224
+ score_list = []
225
+ for detection in result["detections"]:
226
+ score_list.append(detection["score"])
227
+ score = max(score_list)
228
+ value = round(
229
+ 1 - score if self.metric_name in ACCURACY_METRICS else score, 4)
230
+ # provide evidences for a few metrics
231
+ evidences = None
232
+ if self.metric_name in EVIDENCES or (self.metric_name in RAG_EVIDENCE_METRICS and self.metric_method in RAG_SMALL_MODELS):
233
+ evidences = self.__get_additional_info(
234
+ result["detections"])
235
+
236
+ record_level_metrics.append(RecordMetricResult(
237
+ **record_data,
238
+ value=value,
239
+ **({"evidences": evidences} if evidences else {})
240
+ ))
241
+ values.append(value)
242
+
243
+ # creating AggregateMetricResult
244
+ if values:
245
+ mean_val = round(sum(values) / len(values), 4)
246
+ min_val = min(values)
247
+ max_val = max(values)
248
+ value = mean_val
249
+ error_info = {}
250
+ else:
251
+ mean_val = min_val = max_val = None
252
+ value = "Error"
253
+ error_info = {"errors": errors}
254
+ aggregated_result = AggregateMetricResult(
255
+ name=self.metric_name,
256
+ display_name=self.metric_display_name,
257
+ method=self.metric_method,
258
+ group=self.metric_group,
259
+ provider=EvaluationProvider.DETECTORS.value,
260
+ value=value,
261
+ total_records=len(results),
262
+ record_level_metrics=record_level_metrics,
263
+ min=min_val,
264
+ max=max_val,
265
+ mean=mean_val,
266
+ thresholds=self.thresholds,
267
+ **error_info
268
+ )
269
+
270
+ # return the aggregated result
271
+ return aggregated_result
272
+
273
+ def __get_json_payloads(self, input_contents: list, output_contents: list | None, context_contents: list | None, tools_catalog_content: list | None, tool_calls_content: list | None) -> list:
274
+ # Method to create the request payload.
275
+ json_payloads = []
276
+ metric_name = self.set_metric_name(self.metric_name)
277
+
278
+ if self.metric_name == "answer_relevance":
279
+ for (input, output) in zip(input_contents, output_contents):
280
+ payload_json = {
281
+ "detectors": {
282
+ metric_name: self.detector_params or {}
283
+ },
284
+ "prompt": input,
285
+ "generated_text": output
286
+ }
287
+ json_payloads.append(json.dumps(payload_json))
288
+
289
+ elif self.metric_name == "context_relevance":
290
+ for (input, context) in zip(input_contents, context_contents):
291
+ payload_json = {
292
+ "detectors": {
293
+ metric_name: self.detector_params or {}
294
+ },
295
+ "input": input,
296
+ "context_type": "docs",
297
+ "context": context
298
+ }
299
+ json_payloads.append(json.dumps(payload_json))
300
+
301
+ elif self.metric_name == "faithfulness":
302
+ for (output, context) in zip(output_contents, context_contents):
303
+ payload_json = {
304
+ "detectors": {
305
+ metric_name: self.detector_params or {}
306
+ },
307
+ "input": output,
308
+ "context_type": "docs",
309
+ "context": context
310
+ }
311
+ json_payloads.append(json.dumps(payload_json))
312
+
313
+ elif self.metric_name == "tool_call_accuracy":
314
+ for (input, tool_call) in zip(input_contents, tool_calls_content):
315
+ payload_json = {
316
+ "detectors": {
317
+ metric_name: self.detector_params or {}
318
+ },
319
+ "messages": [{"content": input, "role": "user"},
320
+ {"tool_calls": tool_call, "role": "assistant"}],
321
+ "tools": tools_catalog_content
322
+ }
323
+ json_payloads.append(json.dumps(payload_json))
324
+ else:
325
+ for input in input_contents:
326
+ payload_json = {
327
+ "detectors": {
328
+ metric_name: self.detector_params or {}
329
+ },
330
+ "input": input
331
+ }
332
+ json_payloads.append(json.dumps(payload_json))
333
+ return json_payloads
334
+
335
+ def __get_headers(self):
336
+ # Method to create request headers
337
+ headers = {}
338
+ headers["Content-Type"] = "application/json"
339
+ headers["Authorization"] = f"Bearer {get_authenticator_token(self.wos_client.authenticator)}"
340
+ headers["x-governance-instance-id"] = self.service_instance_id
341
+ headers["origin"] = "sdk"
342
+ return headers
343
+
344
+ def get_detector_url(self, api_client):
345
+ """
346
+ Sets the wos_client and returns the service url
347
+ """
348
+ self.wos_client = api_client.wos_client
349
+ self.verify = not api_client.credentials.disable_ssl
350
+ if api_client.credentials.version:
351
+ return api_client.credentials.url
352
+ else:
353
+ from ibm_watsonx_gov.utils.url_mapping import WOS_URL_MAPPING
354
+ urls = WOS_URL_MAPPING.get(api_client.credentials.url)
355
+ return urls.wml_url
356
+
357
+ def get_service_instance_id(self, api_client):
358
+ """
359
+ Sets the wos_client and returns the service instance id
360
+ """
361
+
362
+ self.wos_client = api_client.wos_client
363
+ return self.wos_client.service_instance_id
364
+
365
+ def set_metric_name(self, metric_name):
366
+ """
367
+ Sets the metric name to 'granite_guardian' for Granite Guardian risks
368
+ """
369
+ # Set metric name to harm for computing PSR using GG
370
+ if self.metric_name == "prompt_safety_risk" and self.metric_method == "granite_guardian":
371
+ metric_name = "harm"
372
+ metric_name = "granite_guardian" if metric_name in GraniteGuardianRisks.values() else metric_name
373
+ return metric_name
374
+
375
+ def __get_base_url(self, metric_name):
376
+ """
377
+ Returns the inference proxy end-point to be invoked based on the metric.
378
+ """
379
+ if metric_name == "answer_relevance":
380
+ return "{}/ml/v1/text/detection/generated?version=2023-10-25"
381
+ elif metric_name in ["context_relevance", "faithfulness"]:
382
+ return "{}/ml/v1/text/detection/context?version=2023-10-25"
383
+ elif metric_name == "tool_call_accuracy":
384
+ return "{}/ml/v1/text/detection/chat?version=2023-10-25"
385
+ else:
386
+ return "{}/ml/v1/text/detection?version=2023-10-25"
387
+
388
+ def get_tool_calls_content(self, data):
389
+ tool_calls_content = []
390
+ if self.configuration.tool_calls_field:
391
+ data[self.configuration.tool_calls_field] = data[self.configuration.tool_calls_field].apply(
392
+ lambda x: json.loads(x) if isinstance(x, str) else x)
393
+ for _, row in data.iterrows():
394
+ tool_calls = ToolCallMetricProvider.extract_tool_calls_from_response(
395
+ row[self.configuration.tool_calls_field])
396
+ for tc in tool_calls:
397
+ if isinstance(tc["function"]["arguments"], str):
398
+ tc["function"]["arguments"] = json.loads(
399
+ tc["function"]["arguments"])
400
+ tool_calls_content.append(tool_calls)
401
+
402
+ return tool_calls_content
403
+
404
+ def get_tools_catalog_content(self):
405
+ # Get the specification of tools used in the application
406
+ # in proper format if it is a list of Callable
407
+ if isinstance(self.configuration.tools, list) and all(callable(item) for item in self.configuration.tools):
408
+ tools_catalog_content = ToolCallMetricProvider.get_tools_list_schema(
409
+ self.configuration.tools)
410
+ # converting the model to a json object
411
+ tools_catalog_content = [
412
+ tool_spec.model_dump() for tool_spec in tools_catalog_content]
413
+ else:
414
+ tools_catalog_content = self.configuration.tools
415
+ return tools_catalog_content
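To make the flow through __pre_process_data, __compute_metric and __post_process concrete, here is a rough usage sketch. It is not a verified recipe: api_client and usage_client are assumed to be initialized elsewhere with the SDK's own clients (the constructor requires api_client.wos_client to be an ibm_watson_openscale APIClient), and the GenAIConfiguration keyword arguments simply mirror the field names consumed above; that model's defaults and validation are not shown in this diff.

import pandas as pd

from ibm_watsonx_gov.config import GenAIConfiguration
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider

# Assumed to be initialized elsewhere; their construction is not shown in this diff.
api_client = ...    # e.g. a governance API client wrapping a wos_client
usage_client = ...

# Field names mirror what __pre_process_data reads from the configuration.
configuration = GenAIConfiguration(
    input_fields=["question"],
    output_fields=["generated_text"],
    context_fields=["context1"],
    record_id_field="record_id",
)

provider = DetectorsProvider(
    configuration=configuration,
    metric_name="faithfulness",
    metric_display_name="Faithfulness",
    metric_method="faithfulness_model",
    api_client=api_client,
    usage_client=usage_client,
)

data = pd.DataFrame([{
    "record_id": "r1",
    "question": "What does the policy cover?",
    "context1": '["The policy covers water damage and fire damage."]',
    "generated_text": "It covers water and fire damage.",
}])

# evaluate() wraps evaluate_async() in an event loop and returns an
# AggregateMetricResult with one RecordMetricResult per input row.
result = provider.evaluate(data)
print(result.value, result.min, result.max)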