ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,411 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, List, Optional, Union
3
+ import json
4
+ import logging
5
+
6
+ from ..types import (
7
+ ParameterComparisonResult,
8
+ ToolCallComparisonResult,
9
+ ComparisonConfig,
10
+ ParameterStatus,
11
+ ToolSpecFunction,
12
+ ToolSpecParameter,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class BaseComparator(ABC):
19
+ """Abstract base class for tool call comparison strategies."""
20
+
21
def __init__(self, config: ComparisonConfig):
    """Keep a reference to the comparison configuration.

    Args:
        config: Settings object stored on ``self.config``. Other methods
            of this class read attributes such as ``normalize_types``,
            ``parameter_weights`` and ``critical_parameter_penalty``
            from it.
    """
    self.config = config
23
+
24
@abstractmethod
def compare_parameter(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
    custom_instructions: Optional[str] = None,
) -> ParameterComparisonResult:
    """Compare one parameter of the predicted call with the ground truth.

    Abstract hook: concrete comparators provide the actual strategy.
    This base implementation has no logic and returns ``None``.

    Args:
        param_name: Name of the parameter being compared.
        predicted_value: Value taken from the predicted tool call.
        ground_truth_value: Value taken from the ground-truth tool call.
        context: Optional extra data for the comparison; its contents
            are defined by the subclass.
        custom_instructions: Optional free-text guidance for the
            comparison; how it is used is up to the subclass.

    Returns:
        A ``ParameterComparisonResult`` in concrete implementations.
    """
35
+
36
@abstractmethod
def compare_function_name(
    self,
    predicted_name: str,
    ground_truth_name: str,
    context: Optional[Dict[str, Any]] = None,
) -> float:
    """Score how closely the predicted function name matches the expected one.

    Abstract hook: concrete comparators provide the actual strategy.
    This base implementation has no logic and returns ``None``.

    Args:
        predicted_name: Function name found in the predicted tool call.
        ground_truth_name: Function name found in the ground-truth call.
        context: Optional extra data for the comparison; its contents
            are defined by the subclass.

    Returns:
        A similarity score in concrete implementations.
    """
45
+
46
def _extract_tool_spec(
    self, function_name: str, tool_specs: Optional[List[Dict[str, Any]]]
) -> Optional[ToolSpecFunction]:
    """Find the specification whose function name equals ``function_name``.

    Args:
        function_name: Name to look for under ``spec["function"]["name"]``.
        tool_specs: Candidate specification dicts; may be ``None`` or empty.

    Returns:
        The first matching spec converted with
        ``ToolSpecFunction.from_openai_spec``, or ``None`` when there are
        no specs or none of them matches.
    """
    candidates = tool_specs or ()
    matching_spec = next(
        (
            candidate
            for candidate in candidates
            # A spec without a "function" entry simply never matches.
            if candidate.get("function", {}).get("name") == function_name
        ),
        None,
    )
    if matching_spec is None:
        return None
    return ToolSpecFunction.from_openai_spec(matching_spec)
58
+
59
def _resolve_parameters_with_defaults(
    self, provided_params: Dict[str, Any], tool_spec: Optional[ToolSpecFunction]
) -> Dict[str, Any]:
    """Return a copy of the supplied parameters with spec defaults filled in.

    Args:
        provided_params: Parameters explicitly present in a tool call.
            The mapping itself is never modified.
        tool_spec: Specification whose ``parameters`` carry ``name`` and
            ``default`` attributes, or a falsy value when no spec is known.

    Returns:
        A shallow copy of ``provided_params``. For every spec parameter
        that is absent from the call and has a default other than
        ``None``, the default is added.
    """
    filled = provided_params.copy()
    if not tool_spec:
        return filled

    for definition in tool_spec.parameters:
        # Explicit values always win; a ``None`` default counts as "no default".
        if definition.name in filled or definition.default is None:
            continue
        filled[definition.name] = definition.default

    return filled
71
+
72
def _determine_parameter_status(
    self,
    param_name: str,
    predicted_params: Dict[str, Any],
    ground_truth_params: Dict[str, Any],
    predicted_resolved: Dict[str, Any],
    ground_truth_resolved: Dict[str, Any],
) -> ParameterStatus:
    """Classify how a parameter appears in the predicted and expected calls.

    Args:
        param_name: Parameter being classified.
        predicted_params: Parameters explicitly given in the predicted call.
        ground_truth_params: Parameters explicitly given in the ground truth.
        predicted_resolved: Predicted parameters after defaults were applied.
        ground_truth_resolved: Ground-truth parameters after defaults were
            applied.

    Returns:
        * ``BOTH_PRESENT`` when both calls give the parameter explicitly.
        * ``GT_DEFAULT`` / ``GT_MISSING`` when only the prediction gives it,
          depending on whether the resolved ground truth contains it.
        * ``PRED_DEFAULT`` / ``PRED_MISSING`` when only the ground truth
          gives it, depending on whether the resolved prediction contains it.
        * ``BOTH_DEFAULT`` when neither gives it but both resolved mappings
          contain it, otherwise ``BOTH_MISSING``.
    """
    explicit_in_pred = param_name in predicted_params
    explicit_in_gt = param_name in ground_truth_params
    pred_after_defaults = param_name in predicted_resolved
    gt_after_defaults = param_name in ground_truth_resolved

    if explicit_in_pred and explicit_in_gt:
        return ParameterStatus.BOTH_PRESENT

    if explicit_in_pred:
        # Only the prediction supplied the parameter explicitly.
        if gt_after_defaults:
            return ParameterStatus.GT_DEFAULT
        return ParameterStatus.GT_MISSING

    if explicit_in_gt:
        # Only the ground truth supplied the parameter explicitly.
        if pred_after_defaults:
            return ParameterStatus.PRED_DEFAULT
        return ParameterStatus.PRED_MISSING

    # Neither call supplied the parameter explicitly.
    if pred_after_defaults and gt_after_defaults:
        return ParameterStatus.BOTH_DEFAULT
    return ParameterStatus.BOTH_MISSING
105
+
106
def _normalize_value(self, value: Any, expected_type: Optional[str] = None) -> Any:
    """Coerce a string value to the type expected for the parameter.

    Normalization only happens when ``self.config.normalize_types`` is
    truthy and ``value`` is a string; everything else is returned as-is.

    Args:
        value: The raw parameter value.
        expected_type: Expected type name. The handled names are
            ``"integer"``, ``"number"``, ``"boolean"``, ``"array"`` and
            ``"object"``; any other value leaves ``value`` untouched.

    Returns:
        The converted value, or the original ``value`` when the string
        cannot be interpreted as the expected type.

    Note:
        Boolean strings are matched case-insensitively after stripping
        surrounding whitespace. Strings that are neither a recognised
        true token nor a recognised false token are returned unchanged.
        Previously every unrecognised string became ``False``, which made
        unrelated garbage values compare equal to each other and to a
        genuine ``False``.
    """
    if not self.config.normalize_types or not isinstance(value, str):
        return value

    if expected_type == "boolean":
        token = value.strip().lower()
        if token in ("true", "1", "yes", "on"):
            return True
        if token in ("false", "0", "no", "off"):
            return False
        return value

    if expected_type == "integer":
        parse = int
    elif expected_type == "number":
        parse = float
    elif expected_type == "array" or expected_type == "object":
        parse = json.loads
    else:
        return value

    try:
        return parse(value)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass, so this covers
        # all three parsers; an unparseable string is kept as-is.
        return value
135
+
136
def _get_parameter_weight(self, param_name: str) -> float:
    """Look up the configured weight of a parameter.

    Args:
        param_name: Name of the parameter.

    Returns:
        The ``weight`` of the first entry in
        ``self.config.parameter_weights`` whose ``name`` equals
        ``param_name``, or ``1.0`` when no entry matches.
    """
    return next(
        (
            entry.weight
            for entry in self.config.parameter_weights
            if entry.name == param_name
        ),
        1.0,
    )
142
+
143
def _is_critical_parameter(self, param_name: str) -> bool:
    """Tell whether a parameter is flagged as critical in the configuration.

    Args:
        param_name: Name of the parameter.

    Returns:
        The ``is_critical`` flag of the first entry in
        ``self.config.parameter_weights`` whose ``name`` equals
        ``param_name``, or ``False`` when no entry matches.
    """
    flags = (
        entry.is_critical
        for entry in self.config.parameter_weights
        if entry.name == param_name
    )
    return next(flags, False)
149
+
150
def _calculate_weighted_score(
    self, param_results: List[ParameterComparisonResult]
) -> float:
    """Combine per-parameter scores into one weighted score.

    Each result's ``score`` is weighted by ``_get_parameter_weight`` and
    the weighted mean is taken. Every critical parameter (per
    ``_is_critical_parameter``) whose result is not a match adds one
    ``config.critical_parameter_penalty`` to a multiplicative penalty.

    Args:
        param_results: Per-parameter results exposing ``parameter_name``,
            ``score`` and ``is_match``.

    Returns:
        ``1.0`` when there are no results; otherwise the penalised
        weighted mean clamped to the range 0.0 to 1.0. A non-positive
        total weight yields ``0.0``.
    """
    if not param_results:
        return 1.0

    entries = list(param_results)
    weights = [self._get_parameter_weight(entry.parameter_name) for entry in entries]

    weight_total = sum(weights)
    score_total = sum(entry.score * weight for entry, weight in zip(entries, weights))
    failed_critical = sum(
        1
        for entry in entries
        if self._is_critical_parameter(entry.parameter_name) and not entry.is_match
    )

    combined = score_total / weight_total if weight_total > 0 else 0

    if failed_critical > 0:
        # Each failed critical parameter contributes one full penalty unit.
        combined *= 1 - self.config.critical_parameter_penalty * failed_critical

    # The penalty factor can go negative, so clamp the final value.
    return max(0.0, min(1.0, combined))
181
+
182
def compare_tool_calls(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
) -> ToolCallComparisonResult:
    """Compare a predicted tool call against its ground truth.

    The function names are scored with ``self.compare_function_name``, the
    arguments of both calls are parsed and resolved against the tool
    specification, every selected parameter is scored with
    ``self.compare_parameter``, and the two scores are blended using
    ``self.config.weight_function_name`` and
    ``self.config.weight_parameters``.

    Args:
        predicted_call: Tool call to evaluate. Read as
            ``{"function": {"name": ..., "arguments": ...}}`` where
            ``arguments`` is a mapping or a JSON string.
        ground_truth_call: Reference tool call with the same layout.
        conversation_history: Optional conversation, forwarded untouched
            in the per-parameter context.
        tool_specs: Optional tool specifications, used to look up the
            spec of the called function.

    Returns:
        A ``ToolCallComparisonResult`` holding the name score, the
        per-parameter results, the overall score, a textual explanation
        and the lists of missing required / unexpected parameters.
    """
    # Local import: keeps this method self-contained.
    from collections.abc import Mapping

    def _as_param_mapping(raw: Any, label: str) -> Any:
        """Return call arguments as a mapping, falling back to ``{}``."""
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except json.JSONDecodeError:
                logger.warning("Failed to parse %s parameters: %s", label, raw)
                return {}
        if raw is None:
            # ``"arguments": None`` or the JSON literal ``null``: no arguments.
            return {}
        if not isinstance(raw, Mapping):
            # Valid JSON that is not an object (list, number, string) cannot
            # be compared by parameter name and would crash on ``.get`` below.
            logger.warning("Ignoring non-object %s parameters: %r", label, raw)
            return {}
        return raw

    # ``or {}`` also covers an explicit ``"function": None``.
    pred_function = predicted_call.get("function") or {}
    gt_function = ground_truth_call.get("function") or {}

    # Extract function names
    pred_name = pred_function.get("name", "")
    gt_name = gt_function.get("name", "")

    # Compare function names
    fn_score = self.compare_function_name(pred_name, gt_name)
    fn_match = fn_score >= 0.95  # High threshold for exact match

    # Prefer the spec of the ground-truth function, then the predicted one
    tool_spec = self._extract_tool_spec(
        gt_name, tool_specs
    ) or self._extract_tool_spec(pred_name, tool_specs)

    # Extract and parse parameters
    pred_params = _as_param_mapping(pred_function.get("arguments", {}), "predicted")
    gt_params = _as_param_mapping(gt_function.get("arguments", {}), "ground truth")

    # Resolve parameters with defaults
    pred_resolved = self._resolve_parameters_with_defaults(pred_params, tool_spec)
    gt_resolved = self._resolve_parameters_with_defaults(gt_params, tool_spec)

    # Determine all parameters to compare
    params_to_compare = self.config.parameters_to_compare
    if params_to_compare is None:
        if self.config.include_default_parameters:
            # All parameters that appear in either resolved call
            first, second = pred_resolved, gt_resolved
        else:
            # Only explicit parameters
            first, second = pred_params, gt_params
        # ``dict.fromkeys`` de-duplicates while keeping first-seen order, so
        # results and explanations do not depend on set/hash ordering.
        params_to_compare = list(dict.fromkeys([*first.keys(), *second.keys()]))

    # Index the spec's parameter definitions once (first definition wins).
    param_defs = {}
    if tool_spec:
        for spec_param in tool_spec.parameters:
            param_defs.setdefault(spec_param.name, spec_param)

    # Find missing required parameters and unexpected parameters
    missing_required = []
    unexpected_params = []

    if tool_spec:
        # NOTE(review): a required parameter is reported as missing only
        # when it is absent from BOTH resolved calls; confirm that a
        # parameter missing from the prediction alone should not count.
        missing_required = [
            name
            for name, definition in param_defs.items()
            if definition.required
            and name not in pred_resolved
            and name not in gt_resolved
        ]
        unexpected_params = [
            name for name in params_to_compare if name not in param_defs
        ]

    # Compare each parameter
    param_results = []
    context = {
        "conversation_history": conversation_history,
        "tool_specs": tool_specs,
        "tool_spec": tool_spec,
        "predicted_call": predicted_call,
        "ground_truth_call": ground_truth_call,
        "function_name": gt_name or pred_name,
    }

    for param_name in params_to_compare:
        pred_resolved_val = pred_resolved.get(param_name)
        gt_resolved_val = gt_resolved.get(param_name)

        # Parameter definition from the tool spec, if any
        param_def = param_defs.get(param_name)

        # Determine parameter status
        param_status = self._determine_parameter_status(
            param_name, pred_params, gt_params, pred_resolved, gt_resolved
        )

        # Enhanced context for this parameter
        param_context = context.copy()
        param_context.update(
            {
                "parameter_definition": param_def.dict() if param_def else None,
                "parameter_status": param_status,
                "predicted_resolved": pred_resolved_val,
                "ground_truth_resolved": gt_resolved_val,
            }
        )

        # NOTE(review): ``context`` is built above without a
        # "custom_instructions" key, so this always passes None.
        param_result = self.compare_parameter(
            param_name,
            pred_resolved_val,
            gt_resolved_val,
            param_context,
            custom_instructions=context.get("custom_instructions"),
        )

        # Enhance result with additional information
        param_result.predicted_resolved_value = pred_resolved_val
        param_result.ground_truth_resolved_value = gt_resolved_val
        param_result.parameter_status = param_status
        param_result.parameter_definition = param_def.dict() if param_def else None
        param_result.is_required = param_def.required if param_def else False
        param_result.default_value = param_def.default if param_def else None

        param_results.append(param_result)

    # Calculate overall score using weighted approach
    param_score = self._calculate_weighted_score(param_results)

    overall_score = (
        self.config.weight_function_name * fn_score
        + self.config.weight_parameters * param_score
    )

    # Apply penalties for missing required parameters
    if missing_required:
        penalty = len(missing_required) * self.config.missing_parameter_penalty
        overall_score *= 1 - penalty
        overall_score = max(0.0, overall_score)

    # Generate overall explanation
    overall_explanation = self._generate_overall_explanation(
        fn_match,
        fn_score,
        param_results,
        overall_score,
        missing_required,
        unexpected_params,
    )

    return ToolCallComparisonResult(
        predicted_call=predicted_call,
        ground_truth_call=ground_truth_call,
        function_name_match=fn_match,
        function_name_score=fn_score,
        parameter_results=param_results,
        overall_score=overall_score,
        overall_explanation=overall_explanation,
        strategy_used=self.config.strategy,
        missing_required_params=missing_required,
        unexpected_params=unexpected_params,
        metadata={
            "tool_spec_used": tool_spec.dict() if tool_spec else None,
            "parameters_compared": list(params_to_compare),
            "default_parameters_included": self.config.include_default_parameters,
        },
    )
353
+
354
def _generate_overall_explanation(
    self,
    fn_match: bool,
    fn_score: float,
    param_results: List[ParameterComparisonResult],
    overall_score: float,
    missing_required: List[str],
    unexpected_params: List[str],
) -> str:
    """Build a one-line, human-readable summary of a comparison.

    Args:
        fn_match: Whether the function names were considered a match.
        fn_score: Function-name similarity, shown when names differ.
        param_results: Per-parameter results; each is read for
            ``is_match``, ``parameter_name`` and ``parameter_status``
            (whose ``.value`` is printed).
        overall_score: Final score, printed with two decimals.
        missing_required: Names of missing required parameters.
        unexpected_params: Names of parameters not in the tool spec.

    Returns:
        The summary sentences joined by single spaces.
    """
    name_sentence = (
        "Function names match exactly."
        if fn_match
        else f"Function names differ (similarity: {fn_score:.2f})."
    )
    sentences = [name_sentence]

    if param_results:
        # One pass gathers both the status tally and the mismatched names.
        per_status = {}
        failed_names = []
        for item in param_results:
            status_key = item.parameter_status
            per_status[status_key] = per_status.get(status_key, 0) + 1
            if not item.is_match:
                failed_names.append(item.parameter_name)

        total = len(param_results)
        matched = total - len(failed_names)
        sentences.append(f"Parameters: {matched}/{total} matches.")

        if per_status:
            breakdown = ", ".join(
                f"{status.value}: {count}" for status, count in per_status.items()
            )
            sentences.append(f"Parameter status breakdown: {breakdown}")

        if failed_names:
            sentences.append(f"Mismatched parameters: {', '.join(failed_names)}")

    # Issues
    if missing_required:
        sentences.append(
            f"Missing required parameters: {', '.join(missing_required)}"
        )

    if unexpected_params:
        sentences.append(f"Unexpected parameters: {', '.join(unexpected_params)}")

    sentences.append(f"Overall similarity score: {overall_score:.2f}")

    return " ".join(sentences)