ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,103 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from pydantic import Field
14
+
15
+ from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
17
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
18
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
19
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
20
+ from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
21
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
22
+ from ibm_watsonx_gov.utils.validation_util import validate_input
23
+
24
# Metric name; also passed to the detector as the risk to evaluate.
SEXUAL_CONTENT = "sexual_content"


class SexualContentMetric(GenAIMetric):
    """
    Defines the Sexual Content metric class.

    The Sexual Content metric measures the risk of content explicitly related to sexual activities, anatomy, or desires. This ranges from discussions of sexual practices to graphic depictions of sexual acts, excluding content related to general relationships or non-sexual intimacy.
    It is computed using the granite guardian model.

    Examples:
        1. Create Sexual Content metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = SexualContentMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Sexual Content metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = SexualContentMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["sexual_content"],
                    Field(title="Name",
                          description="The sexual content metric name.",
                          default=SEXUAL_CONTENT, frozen=True)]
    display_name: Annotated[Literal["Sexual Content"],
                            Field(title="Display Name",
                                  description="The sexual content metric display name.",
                                  default="Sexual Content", frozen=True)]
    # Only the granite guardian method is accepted for this metric.
    method: Annotated[Literal["granite_guardian"],
                      Field(title="Method",
                            description="The method used to compute sexual content metric.",
                            default="granite_guardian")]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    # Default threshold is an upper limit of 0.5 on the score.
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.5)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Compute the sexual content metric asynchronously via the detectors provider.

        Args:
            data (pd.DataFrame): The records to evaluate. Its column names are validated
                against the configuration before the provider is called.
            configuration (GenAIConfiguration): The evaluation configuration.
            **kwargs: Extra keyword arguments forwarded to ``DetectorsProvider``.
                Any caller-supplied ``detector_params`` entry is overwritten.

        Returns:
            list[AggregateMetricResult]: The result returned by the provider.
        """
        validate_input(data.columns.to_list(), configuration)
        # The detector is told which risk to score; this replaces any
        # "detector_params" the caller may have passed in kwargs.
        kwargs["detector_params"] = {"risk_name": SEXUAL_CONTENT}
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """
        Synchronous entry point that runs ``evaluate_async`` through ``run_in_event_loop``.

        Args:
            data (pd.DataFrame | dict): The records to evaluate.
                NOTE(review): ``evaluate_async`` reads ``data.columns``, so a plain dict
                is presumably converted to a DataFrame by the caller - confirm.
            configuration (GenAIConfiguration): The evaluation configuration.
            **kwargs: Extra keyword arguments forwarded to ``evaluate_async``.

        Returns:
            Whatever ``run_in_event_loop`` returns for ``evaluate_async``.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
@@ -0,0 +1,8 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
@@ -0,0 +1,62 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from functools import partial
11
+ from typing import Callable, Optional
12
+
13
+ from wrapt import decorator
14
+
15
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
16
+ AgenticAIConfiguration
17
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
18
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
19
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
20
+ from ibm_watsonx_gov.metrics.harm.harm_metric import HarmMetric
21
+ from ibm_watsonx_gov.metrics.social_bias.social_bias_metric import \
22
+ SocialBiasMetric
23
+
24
+
25
class SocialBiasDecorator(BaseMetricDecorator):
    """Provides the decorator used to evaluate social bias on an agentic node."""

    def evaluate_social_bias(self,
                             func: Optional[Callable] = None,
                             *,
                             configuration: Optional[AgenticAIConfiguration] = None,
                             metrics: Optional[list[GenAIMetric]] = None
                             ) -> Callable:
        """
        An evaluation decorator for computing social bias on an agentic node via granite guardian.

        Can be applied bare (``@evaluate_social_bias``) or with keyword arguments
        (``@evaluate_social_bias(configuration=...)``); in the latter case ``func`` is
        ``None`` on the first call and a partial awaiting the function is returned.

        Args:
            func (Optional[Callable]): The node function to decorate.
            configuration (Optional[AgenticAIConfiguration]): The configuration passed
                to the metric computation.
            metrics (Optional[list[GenAIMetric]]): The metrics to compute. When empty or
                not provided, a default ``SocialBiasMetric`` is used.

        Returns:
            Callable: The decorated function, or a partial of this method when ``func``
            is not yet supplied.

        Raises:
            Exception: When validation or metric computation fails while the decorated
                function runs; the original error is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_social_bias, configuration=configuration, metrics=metrics)

        # None (the default) and an empty list both fall back to the default metric.
        if not metrics:
            metrics = [SocialBiasMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # A real one-element tuple; "(SocialBiasMetric)" is just the bare class.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(SocialBiasMetric,))

                # Social bias is computed on the node inputs only.
                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating social bias on {func.__name__},") from ex

        return wrapper(func)
@@ -0,0 +1,103 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from pydantic import Field
14
+
15
+ from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
17
+ from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
18
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
19
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
20
+ from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
21
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
22
+ from ibm_watsonx_gov.utils.validation_util import validate_input
23
+
24
# Metric name; also used as the risk name sent to the detectors provider.
SOCIAL_BIAS = "social_bias"


class SocialBiasMetric(GenAIMetric):
    """
    Defines the Social Bias metric class.

    The Social Bias metric measures the risk of systemic prejudice against groups based on shared identity or characteristics, often stemming from stereotypes or cultural influences. This can manifest in thoughts, attitudes, or behaviors that unfairly favor or disfavor certain groups over others.
    It is computed using the granite guardian model.

    Examples:
        1. Create Social Bias metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = SocialBiasMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."},
                                                     metrics=[metric])

        2. Create Social Bias metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = SocialBiasMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["social_bias"],
                    Field(title="Name",
                          description="The social bias metric name.",
                          default=SOCIAL_BIAS, frozen=True)]
    display_name: Annotated[Literal["Social Bias"],
                            Field(title="Display Name",
                                  description="The social bias metric display name.",
                                  default="Social Bias", frozen=True)]
    method: Annotated[Literal["granite_guardian"],
                      Field(title="Method",
                            description="The method used to compute social bias metric.",
                            default="granite_guardian")]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.5)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Compute the social bias metric through the detectors provider.

        Args:
            data: The records to evaluate. ``data.columns`` is read here, so a
                DataFrame-like object is required at this point.
                NOTE(review): a plain dict is presumably converted by the
                caller before reaching this method - confirm.
            configuration: The configuration used to validate the input
                columns and to build the detectors provider.
            **kwargs: Extra options forwarded to ``DetectorsProvider``. Any
                ``detector_params`` entry is overwritten with the social bias
                risk name.

        Returns:
            The result of ``DetectorsProvider.evaluate_async``.
        """
        validate_input(data.columns.to_list(), configuration)
        # Select the social bias risk on the detector.
        kwargs["detector_params"] = {"risk_name": SOCIAL_BIAS}
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """
        Synchronous entry point: runs ``evaluate_async`` via ``run_in_event_loop``.

        Args:
            data: The records to evaluate.
            configuration: The metric configuration.
            **kwargs: Forwarded to ``evaluate_async``.

        Returns:
            Whatever ``run_in_event_loop`` returns for ``evaluate_async``.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
File without changes
@@ -0,0 +1,113 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
14
+ from ibm_watsonx_gov.entities.enums import (CategoryClassificationType,
15
+ MessageStatus, MetricGroup,
16
+ MetricValueType, TaskType)
17
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
18
+ RecordMetricResult)
19
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
20
+ from ibm_watsonx_gov.utils.async_util import run_in_event_loop
21
+ from pydantic import Field
22
+
23
# Name under which the status metric is reported.
STATUS = "status"


class StatusMetric(GenAIMetric):
    """
    Defines the Status metric class.

    The Status metric measures the status of the message processing, which can be one of the following values:

    - successful
    - failure
    - unknown

    Examples:
        1. Create Status metric with default parameters and compute using metrics AgenticEvaluator.
            .. code-block:: python

                agent_app = AgenticApp(name="Rag agent",
                                       metrics_configuration=MetricsConfiguration(metrics=[
                                           StatusMetric()]))

                evaluator = AgenticEvaluator(agentic_app=agent_app)
                evaluator.start_run()
                result = rag_app.invoke({"input_text": "What is concept drift?", "ground_truth": "Concept drift occurs when the statistical properties of the target variable change over time, causing a machine learning model’s predictions to become less accurate."})
                evaluator.end_run()
    """
    name: Annotated[
        Literal["status"],
        Field(title="Name",
              description="The status metric name.",
              default=STATUS,
              frozen=True),
    ]
    display_name: Annotated[
        Literal["Status"],
        Field(title="Display Name",
              description="The status metric display name.",
              default="Status",
              frozen=True),
    ]
    tasks: Annotated[
        list[TaskType],
        Field(title="Tasks",
              description="The list of supported tasks.",
              default=TaskType.values(),
              frozen=True),
    ]
    group: Annotated[
        MetricGroup,
        Field(title="Group",
              description="The metric group.",
              default=MetricGroup.MESSAGE_COMPLETION,
              frozen=True),
    ]
    # Maps each classification bucket to the status labels that fall into it.
    category_classification: Annotated[
        dict[str, list[str]],
        Field(
            title="Category Classification",
            description="The category classification of the metrics values.",
            default={
                CategoryClassificationType.FAVOURABLE.value: [MessageStatus.SUCCESSFUL.value],
                CategoryClassificationType.UNFAVOURABLE.value: [MessageStatus.FAILURE.value],
                CategoryClassificationType.NEUTRAL.value: [MessageStatus.UNKNOWN.value],
            },
        ),
    ]

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Build one categorical record result per row and aggregate them.

        The label of each record is the value found under
        ``configuration.status_field``; a missing or falsy value is reported as
        the "unknown" message status. The numeric ``value`` is always ``None``.

        Args:
            data: The rows to evaluate; iterated with ``iterrows()``.
            configuration: Supplies ``status_field`` and ``record_id_field``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The output of ``AggregateMetricResult.create`` for the record results.
        """
        fallback_label = MessageStatus.UNKNOWN.value

        per_record: list[RecordMetricResult] = [
            RecordMetricResult(
                name=self.name,
                display_name=self.display_name,
                method=self.method,
                label=row.get(configuration.status_field) or fallback_label,
                value=None,
                category_classification=self.category_classification,
                group=self.group,
                record_id=row[configuration.record_id_field],
                value_type=MetricValueType.CATEGORICAL.value,
            )
            for _, row in data.iterrows()
        ]

        return AggregateMetricResult.create(per_record)

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """Synchronous wrapper: run ``evaluate_async`` to completion via ``run_in_event_loop``."""
        # Sync mode: block here until the coroutine has finished.
        call_arguments = dict(data=data, configuration=configuration, **kwargs)
        return run_in_event_loop(self.evaluate_async, **call_arguments)
@@ -0,0 +1,8 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
@@ -0,0 +1,59 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
19
+ from ibm_watsonx_gov.metrics.text_grade_level.text_grade_level_metric import \
20
+ TextGradeLevelMetric
21
+
22
+
23
class TextGradeLevelDecorator(BaseMetricDecorator):
    """Provides the decorator used to evaluate text grade level on an agentic node."""

    def evaluate_text_grade_level(self,
                                  func: Optional[Callable] = None,
                                  *,
                                  configuration: Optional[AgenticAIConfiguration] = None,
                                  metrics: list[GenAIMetric] = []
                                  ) -> dict:
        """
        An evaluation decorator for computing text grade level metric on an agentic node.

        Works both bare and with keyword arguments: when ``func`` is ``None`` a
        partial of this method is returned so it can be applied to the node
        function afterwards. When ``metrics`` is empty a default
        ``TextGradeLevelMetric()`` is used. Failures during validation or
        computation are re-raised as ``Exception`` with the original chained.
        """
        if func is None:
            # Called with keyword arguments only; defer until the function arrives.
            return partial(self.evaluate_text_grade_level,
                           configuration=configuration,
                           metrics=metrics)

        selected_metrics = metrics or [TextGradeLevelMetric()]

        @decorator
        def evaluated_node(func, instance, args, kwargs):
            try:
                self.validate(func=func,
                              metrics=selected_metrics,
                              valid_metric_types=(TextGradeLevelMetric,))

                # The metric is computed on the node's output fields only.
                return self.compute_helper(
                    func=func,
                    args=args,
                    kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[],
                    metric_outputs=[EvaluatorFields.OUTPUT_FIELDS],
                )
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating text grade level metric on {func.__name__},") from ex

        return evaluated_node(func)
@@ -0,0 +1,127 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+
10
+ from typing import Annotated, Literal
11
+
12
+ import pandas as pd
13
+ import textstat
14
+ from pydantic import Field
15
+
16
+ from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
17
+ from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
18
+ from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
19
+ RecordMetricResult)
20
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
21
+ from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
22
+ from ibm_watsonx_gov.utils.python_utils import replace_none_with_empty_string
23
+ from ibm_watsonx_gov.utils.validation_util import validate_output
24
+
25
# Identifiers shared by the metric definition and its record-level result.
TEXT_GRADE_LEVEL = "text_grade_level"
TEXT_GRADE_LEVEL_DISPLAY_NAME = "Text Grade Level"
FLESCH_KINCAID_GRADE = "flesch_kincaid_grade"
TEXTSTAT = "textstat"


class TextGradeLevelResult(RecordMetricResult):
    """Record-level result pre-filled with the text grade level identifiers."""
    name: str = TEXT_GRADE_LEVEL
    display_name: str = TEXT_GRADE_LEVEL_DISPLAY_NAME
    provider: str = TEXTSTAT
    method: str = FLESCH_KINCAID_GRADE


class TextGradeLevelMetric(GenAIMetric):
    """
    Defines the Text Grade Level metric class.

    The Text Grade Level metric measures the approximate reading US grade level of a text.
    It is computed using the flesch_kincaid_grade method.
    Its possible values typically range from 0 to 12+

    - Negative scores are rare and only occur with artificially simple texts.
    - No strict upper limit—some highly complex texts can score 30+, but these are extremely hard to read.

    Examples:
        1. Create Text Grade Level metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = TextGradeLevelMetric()
                result = MetricsEvaluator().evaluate(data={"generated_text": "..."},
                                                     metrics=[metric])

        2. Create Text Grade Level metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=6)
                metric = TextGradeLevelMetric(thresholds=[threshold])
    """
    name: Annotated[
        Literal["text_grade_level"],
        Field(title="name",
              description="The text grade level metric name.",
              default=TEXT_GRADE_LEVEL,
              frozen=True),
    ]
    display_name: Annotated[
        Literal["Text Grade Level"],
        Field(title="Display Name",
              description="The text grade level metric display name.",
              default=TEXT_GRADE_LEVEL_DISPLAY_NAME,
              frozen=True),
    ]
    method: Annotated[
        Literal["flesch_kincaid_grade"],
        Field(title="Method",
              description="The method used to compute text grade level metric.",
              default=FLESCH_KINCAID_GRADE),
    ]
    tasks: Annotated[
        list[TaskType],
        Field(title="Tasks",
              description="The list of supported tasks.",
              default=TaskType.values(),
              frozen=True),
    ]
    group: Annotated[
        MetricGroup,
        Field(title="Group",
              description="The metric group.",
              default=MetricGroup.READABILITY,
              frozen=True),
    ]
    thresholds: Annotated[
        list[MetricThreshold],
        Field(title="Thresholds",
              description="The metric thresholds.",
              default=[MetricThreshold(type="lower_limit", value=6)]),
    ]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> list[AggregateMetricResult]:
        """
        Score every record of the first output field and aggregate the scores.

        Args:
            data: The records to evaluate; must contain the first configured
                output field and the record id field.
            configuration: Supplies ``output_fields`` and ``record_id_field``
                and is used to validate the available columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            An ``AggregateMetricResult`` holding the min, max and mean of the
            scores (the mean is also the aggregate value) together with the
            record-level results.
        """
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        validate_output(data.columns.to_list(), configuration)

        texts = data[configuration.output_fields[0]].to_list()
        ids = data[configuration.record_id_field].to_list()
        # Called for its effect on `texts`; the return value is not used.
        replace_none_with_empty_string(texts)

        scores = self._compute(predictions=texts)
        per_record = [
            TextGradeLevelResult(
                record_id=record_id,
                value=score,
                thresholds=self.thresholds,
                group=MetricGroup.READABILITY.value,
            )
            for score, record_id in zip(scores, ids)
        ]

        stats = get_summaries(scores)
        return AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            provider=TEXTSTAT,
            method=self.method,
            group=self.group,
            min=stats.get("min"),
            max=stats.get("max"),
            mean=stats.get("mean"),
            value=stats.get("mean"),
            total_records=len(per_record),
            record_level_metrics=per_record,
            thresholds=self.thresholds,
        )

    def _compute(self, predictions: list) -> list:
        """Return the ``textstat.flesch_kincaid_grade`` score of each prediction, in order."""
        return list(map(textstat.flesch_kincaid_grade, predictions))
@@ -0,0 +1,8 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
@@ -0,0 +1,59 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ from functools import partial
10
+ from typing import Callable, Optional
11
+
12
+ from wrapt import decorator
13
+
14
+ from ibm_watsonx_gov.config.agentic_ai_configuration import \
15
+ AgenticAIConfiguration
16
+ from ibm_watsonx_gov.entities.enums import EvaluatorFields
17
+ from ibm_watsonx_gov.entities.metric import GenAIMetric
18
+ from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
19
+ from ibm_watsonx_gov.metrics.text_reading_ease.text_reading_ease_metric import \
20
+ TextReadingEaseMetric
21
+
22
+
23
class TextReadingEaseDecorator(BaseMetricDecorator):
    """Provides the decorator used to evaluate text reading ease on an agentic node."""

    def evaluate_text_reading_ease(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: list[GenAIMetric] = []
                                   ) -> dict:
        """
        An evaluation decorator for computing text reading ease metric on an agentic node.

        Works both bare and with keyword arguments: when ``func`` is ``None`` a
        partial of this method is returned so it can be applied to the node
        function afterwards. When ``metrics`` is empty a default
        ``TextReadingEaseMetric()`` is used. Failures during validation or
        computation are re-raised as ``Exception`` with the original chained.
        """
        if func is None:
            # Called with keyword arguments only; defer until the function arrives.
            return partial(self.evaluate_text_reading_ease,
                           configuration=configuration,
                           metrics=metrics)

        selected_metrics = metrics or [TextReadingEaseMetric()]

        @decorator
        def evaluated_node(func, instance, args, kwargs):
            try:
                self.validate(func=func,
                              metrics=selected_metrics,
                              valid_metric_types=(TextReadingEaseMetric,))

                # The metric is computed on the node's output fields only.
                return self.compute_helper(
                    func=func,
                    args=args,
                    kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[],
                    metric_outputs=[EvaluatorFields.OUTPUT_FIELDS],
                )
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating text reading ease metric on {func.__name__},") from ex

        return evaluated_node(func)