ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,387 @@
1
+ from typing import Any, Dict, List, Optional, Union
2
+ import asyncio
3
+ import logging
4
+
5
+ from llmevalkit.llm import LLMClient
6
+ from llmevalkit.llm.output_parser import ValidatingLLMClient
7
+ from .types import (
8
+ ComparisonStrategy,
9
+ ComparisonConfig,
10
+ ToolCallComparisonResult,
11
+ )
12
+ from .comparators.exact_match import (
13
+ ExactMatchComparator,
14
+ )
15
+ from .comparators.llm_judge import (
16
+ LLMJudgeComparator,
17
+ )
18
+ from .comparators.fuzzy_string import (
19
+ FuzzyStringComparator,
20
+ )
21
+ from .comparators.hybrid import (
22
+ HybridComparator,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class ComparisonPipeline:
29
+ """
30
+ Enhanced pipeline for comparing predicted tool calls against ground truth.
31
+
32
+ Features:
33
+ - Proper handling of missing parameters with defaults
34
+ - Integration with ValidatingLLMClient for structured outputs
35
+ - Support for tool specifications to resolve parameter defaults
36
+ - Multiple comparison strategies with intelligent fallbacks
37
+ """
38
+
39
def __init__(
    self,
    strategy: ComparisonStrategy = ComparisonStrategy.EXACT_MATCH,
    llm_client: Optional[Union[LLMClient, ValidatingLLMClient]] = None,
    config: Optional[ComparisonConfig] = None,
):
    """
    Set up the pipeline's strategy, configuration, LLM client and comparator.

    Args:
        strategy: Comparison strategy to use when no ``config`` is given.
        llm_client: Client used by LLM-backed comparators. Mandatory for
            the LLM_JUDGE and CODE_AGENT strategies, optional otherwise.
        config: Full comparison configuration. When supplied, its
            ``strategy`` takes precedence over the ``strategy`` argument.

    Raises:
        ValueError: If the selected strategy is LLM_JUDGE or CODE_AGENT and
            no ``llm_client`` was provided. ``_create_comparator`` may also
            raise for strategies it does not handle.
    """
    # A supplied config is authoritative; otherwise build a default one
    # around the requested strategy.
    if not config:
        config = ComparisonConfig(strategy=strategy)
    else:
        strategy = config.strategy
    self.strategy = strategy
    self.config = config

    llm_strategies = (
        ComparisonStrategy.LLM_JUDGE,
        ComparisonStrategy.CODE_AGENT,
    )

    if self.strategy not in llm_strategies:
        # Non-LLM strategies keep whatever client was passed (may be None).
        self.llm_client = llm_client
    elif not llm_client:
        raise ValueError(
            f"LLM client required for {self.strategy.value} strategy"
        )
    elif isinstance(llm_client, ValidatingLLMClient):
        self.llm_client = llm_client
    else:
        # Thin adapter giving a plain client the generate/generate_async
        # surface; it forwards calls unchanged and adds no validation.
        class MockValidatingClient:
            def __init__(self, base_client):
                self._base_client = base_client

            def generate(self, prompt, schema=None, **kwargs):
                return self._base_client.generate(prompt, schema, **kwargs)

            async def generate_async(self, prompt, schema=None, **kwargs):
                # Fall back to the synchronous path when the wrapped
                # client offers no async entry point.
                if not hasattr(self._base_client, "generate_async"):
                    return self.generate(prompt, schema, **kwargs)
                return await self._base_client.generate_async(
                    prompt, schema, **kwargs
                )

        self.llm_client = MockValidatingClient(llm_client)

    # Build the comparator matching the resolved strategy.
    self.comparator = self._create_comparator()
86
+
87
+ # Initialize the appropriate comparator
88
+ self.comparator = self._create_comparator()
89
+
90
def _create_comparator(self):
    """
    Build the comparator object that corresponds to ``self.strategy``.

    Returns:
        An ExactMatchComparator, FuzzyStringComparator, LLMJudgeComparator
        or HybridComparator, depending on the strategy.

    Raises:
        ValueError: If the strategy has no comparator wired up here.
    """
    chosen = self.strategy

    if chosen == ComparisonStrategy.EXACT_MATCH:
        return ExactMatchComparator(self.config)

    if chosen == ComparisonStrategy.NORMALIZED_MATCH:
        # Exact matching on a copy of the config with type normalization
        # switched on, so the pipeline's own config is left untouched.
        normalized_config = self.config.model_copy()
        normalized_config.normalize_types = True
        return ExactMatchComparator(normalized_config)

    if chosen == ComparisonStrategy.FUZZY_STRING:
        return FuzzyStringComparator(self.config)

    if chosen == ComparisonStrategy.LLM_JUDGE:
        return LLMJudgeComparator(self.config, self.llm_client)

    if chosen == ComparisonStrategy.HYBRID:
        return HybridComparator(self.config, self.llm_client)

    # TODO: SEMANTIC_SIMILARITY (SemanticSimilarityComparator) and
    # CODE_AGENT (CodeAgentComparator) are not wired up yet.
    raise ValueError(f"Strategy {self.strategy} not implemented yet")
112
+
113
def compare(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    **kwargs,
) -> ToolCallComparisonResult:
    """
    Compare a predicted tool call against ground truth.

    Args:
        predicted_call: The tool call made by the agent
        ground_truth_call: The expected/correct tool call
        conversation_history: Context from the conversation
        tool_specs: Available tool specifications (OpenAI format)
        **kwargs: Accepted for interface compatibility; this synchronous
            path does not forward them to the comparator.

    Returns:
        ToolCallComparisonResult with detailed comparison analysis. If the
        comparator (or the metadata update) raises, the exception is logged
        and a zero-score error result is returned instead of propagating.
    """
    try:
        result = self.comparator.compare_tool_calls(
            predicted_call=predicted_call,
            ground_truth_call=ground_truth_call,
            conversation_history=conversation_history,
            tool_specs=tool_specs,
        )

        # Add pipeline metadata
        result.metadata.update(
            {
                "pipeline_strategy": self.strategy.value,
                # model_dump() is the Pydantic v2 spelling of the deprecated
                # .dict(); the config is already used with the v2-only
                # model_copy() elsewhere in this class.
                "config_used": self.config.model_dump(),
                "llm_client_type": (
                    type(self.llm_client).__name__ if self.llm_client else None
                ),
            }
        )

        return result

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a result.
        logger.error("Comparison failed: %s", e)

        # Create fallback result
        return self._create_error_result(predicted_call, ground_truth_call, str(e))
160
+
161
def _create_error_result(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    error_message: str,
) -> ToolCallComparisonResult:
    """
    Build the placeholder result returned when a comparison fails.

    Args:
        predicted_call: The predicted tool call that was being compared.
        ground_truth_call: The ground-truth tool call it was compared to.
        error_message: Text describing the failure.

    Returns:
        A ToolCallComparisonResult with no name match, zero scores, no
        parameter results, and the error recorded in both the explanation
        and the metadata.
    """
    failure_fields = {
        "predicted_call": predicted_call,
        "ground_truth_call": ground_truth_call,
        # Nothing was successfully compared, so every score is zeroed.
        "function_name_match": False,
        "function_name_score": 0.0,
        "parameter_results": [],
        "overall_score": 0.0,
        "overall_explanation": f"Comparison failed: {error_message}",
        "strategy_used": self.strategy,
        "metadata": {"error": error_message},
    }
    return ToolCallComparisonResult(**failure_fields)
179
+
180
def batch_compare(
    self, comparisons: List[Dict[str, Any]], **kwargs
) -> List[ToolCallComparisonResult]:
    """Compare several predicted/ground-truth tool call pairs one after another.

    Args:
        comparisons: One dict per pair. ``"predicted"`` and ``"ground_truth"``
            are read by key; ``"conversation_history"`` and ``"tool_specs"``
            are optional and default to None.
        **kwargs: Forwarded unchanged to ``self.compare`` for every pair.

    Returns:
        One result per input, in input order. Each result carries its position
        under ``metadata["batch_index"]``. A pair that raises (for example a
        missing required key) is logged and replaced by an error result.
    """

    def run_one(position, item):
        # The whole happy path stays inside the try so that any failure,
        # including tagging the result, falls back to an error result.
        try:
            outcome = self.compare(
                predicted_call=item["predicted"],
                ground_truth_call=item["ground_truth"],
                conversation_history=item.get("conversation_history"),
                tool_specs=item.get("tool_specs"),
                **kwargs,
            )
            outcome.metadata["batch_index"] = position
            return outcome
        except Exception as exc:
            logger.error(f"Batch comparison failed for item {position}: {exc}")
            fallback = self._create_error_result(
                item.get("predicted", {}),
                item.get("ground_truth", {}),
                str(exc),
            )
            fallback.metadata["batch_index"] = position
            return fallback

    return [run_one(position, item) for position, item in enumerate(comparisons)]
216
+
217
async def compare_async(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    **kwargs,
) -> ToolCallComparisonResult:
    """Async version of comparison (for LLM-based strategies).

    Args:
        predicted_call: The tool call made by the agent.
        ground_truth_call: The expected/correct tool call.
        conversation_history: Context from the conversation.
        tool_specs: Available tool specifications.
        **kwargs: Extra comparison parameters. ``custom_instructions`` is
            filled with a single space when the caller does not supply it.

    Returns:
        The comparator's result with pipeline metadata added. If the
        comparator has no ``compare_tool_calls_async`` method, the result of
        the synchronous ``compare`` run in a worker thread is returned
        instead. If the async comparator raises, an error result is returned.
    """
    # Ensure custom_instructions is provided for compatibility.
    kwargs.setdefault("custom_instructions", " ")

    if not hasattr(self.comparator, "compare_tool_calls_async"):
        # No native async support: run the sync path off the event loop.
        return await asyncio.to_thread(
            self.compare,
            predicted_call,
            ground_truth_call,
            conversation_history,
            tool_specs,
            **kwargs,
        )

    try:
        result = await self.comparator.compare_tool_calls_async(
            predicted_call=predicted_call,
            ground_truth_call=ground_truth_call,
            conversation_history=conversation_history,
            tool_specs=tool_specs,
            **kwargs,
        )

        # Record the same pipeline metadata as the synchronous compare(),
        # including the LLM client type, plus the execution mode.
        result.metadata.update(
            {
                "pipeline_strategy": self.strategy.value,
                "config_used": self.config.dict(),
                "llm_client_type": (
                    type(self.llm_client).__name__ if self.llm_client else None
                ),
                "execution_mode": "async",
            }
        )
        return result

    except Exception as e:
        logger.error(f"Async comparison failed: {e}")
        return self._create_error_result(predicted_call, ground_truth_call, str(e))
268
+
269
async def batch_compare_async(
    self,
    comparisons: List[Dict[str, Any]],
    max_concurrent: int = 10,
    progress_callback: Optional[callable] = None,
    **kwargs,
) -> List[ToolCallComparisonResult]:
    """
    Efficient async batch processing with concurrency control.

    Args:
        comparisons: List of comparison tasks. Each dict must provide
            ``"predicted"`` and ``"ground_truth"``; ``"conversation_history"``
            and ``"tool_specs"`` are optional.
        max_concurrent: Maximum concurrent comparisons (must be >= 1).
        progress_callback: Optional callback invoked as
            ``progress_callback(index + 1, len(comparisons))`` after an item
            is compared successfully. It is not invoked for failed items, and
            an exception raised by it is logged without affecting the result.
        **kwargs: Forwarded unchanged to ``compare_async`` for every item.

    Returns:
        List of comparison results in original order, each tagged with
        ``metadata["batch_index"]``. Failed items become error results.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.
    """
    # A zero-permit semaphore would block every task forever.
    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")

    semaphore = asyncio.Semaphore(max_concurrent)
    total = len(comparisons)

    def build_error(comp_data, index, message):
        error_result = self._create_error_result(
            comp_data.get("predicted", {}),
            comp_data.get("ground_truth", {}),
            message,
        )
        error_result.metadata["batch_index"] = index
        return error_result

    async def compare_with_semaphore(comp_data, index):
        async with semaphore:
            try:
                result = await self.compare_async(
                    predicted_call=comp_data["predicted"],
                    ground_truth_call=comp_data["ground_truth"],
                    conversation_history=comp_data.get("conversation_history"),
                    tool_specs=comp_data.get("tool_specs"),
                    **kwargs,
                )
                result.metadata["batch_index"] = index
            except Exception as e:
                logger.error(f"Async batch comparison failed for item {index}: {e}")
                return build_error(comp_data, index, str(e))

            # The callback runs outside the comparison's try block so that a
            # faulty callback cannot discard a successful result.
            if progress_callback:
                try:
                    progress_callback(index + 1, total)
                except Exception as e:
                    logger.warning(f"Progress callback failed for item {index}: {e}")

            return result

    # Create tasks for all comparisons
    tasks = [
        compare_with_semaphore(comp_data, i)
        for i, comp_data in enumerate(comparisons)
    ]

    # Execute all tasks concurrently
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Handle anything gather returned as an exception object. BaseException
    # is checked so a cancelled task is not passed through as a result.
    final_results = []
    for i, result in enumerate(results):
        if isinstance(result, BaseException):
            final_results.append(build_error(comparisons[i], i, str(result)))
        else:
            final_results.append(result)

    return final_results
340
+
341
def get_comparison_summary(
    self, results: List[ToolCallComparisonResult]
) -> Dict[str, Any]:
    """Aggregate a batch of comparison results into summary statistics.

    Results are bucketed by ``overall_score``: exact (>= 0.95), semantic
    (0.8 up to 0.95), partial (0.3 up to 0.8) and none (< 0.3).

    Args:
        results: Comparison results to summarise.

    Returns:
        ``{"error": "No results provided"}`` for an empty input. Otherwise a
        dict with the bucket counts, their fractions of the total, the mean
        overall score, the mean of per-result mean parameter confidences
        (results without parameter results are excluded; 0.0 if none have
        any), the fraction of function-name matches, and the strategy value.
    """
    if not results:
        return {"error": "No results provided"}

    total = len(results)

    # Single pass over the scores; the bounds mirror the bucket definitions.
    counts = {"exact": 0, "semantic": 0, "partial": 0, "none": 0}
    for item in results:
        score = item.overall_score
        if score >= 0.95:
            counts["exact"] += 1
        elif 0.8 <= score < 0.95:
            counts["semantic"] += 1
        elif 0.3 <= score < 0.8:
            counts["partial"] += 1
        elif score < 0.3:
            counts["none"] += 1

    mean_score = sum(item.overall_score for item in results) / total

    # Mean confidence per result first, then the mean of those means.
    per_result_confidence = [
        sum(param.confidence for param in item.parameter_results)
        / len(item.parameter_results)
        for item in results
        if item.parameter_results
    ]
    if per_result_confidence:
        mean_confidence = sum(per_result_confidence) / len(per_result_confidence)
    else:
        mean_confidence = 0.0

    name_hits = sum(1 for item in results if item.function_name_match)

    return {
        "total_comparisons": total,
        "exact_matches": counts["exact"],
        "semantic_matches": counts["semantic"],
        "partial_matches": counts["partial"],
        "no_matches": counts["none"],
        "accuracy_breakdown": {
            "exact": counts["exact"] / total,
            "semantic": counts["semantic"] / total,
            "partial": counts["partial"] / total,
            "none": counts["none"] / total,
        },
        "average_overall_score": mean_score,
        "average_confidence": mean_confidence,
        "function_name_accuracy": name_hits / total,
        "strategy_used": self.strategy.value,
    }
@@ -0,0 +1,178 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Dict, List, Optional, Union, Literal
3
+ from pydantic import BaseModel, Field
4
+ from enum import Enum
5
+
6
+
7
class ComparisonStrategy(str, Enum):
    """Names of the available tool-call comparison strategies.

    Members are also ``str`` instances, so each compares equal to its raw
    string value.
    """

    EXACT_MATCH = "exact_match"
    # Handle type conversions
    NORMALIZED_MATCH = "normalized_match"
    # String similarity
    FUZZY_STRING = "fuzzy_string"
    # Embeddings-based
    SEMANTIC_SIMILARITY = "semantic_similarity"
    LLM_JUDGE = "llm_judge"
    CODE_AGENT = "code_agent"
    HYBRID = "hybrid"
15
+
16
+
17
class ParameterStatus(str, Enum):
    """Presence/default status of a parameter on the predicted and ground-truth sides.

    Members are also ``str`` instances, so each compares equal to its raw
    string value.
    """

    # Presence of the parameter on each side.
    BOTH_PRESENT = "both_present"
    PRED_MISSING = "predicted_missing"
    GT_MISSING = "ground_truth_missing"
    BOTH_MISSING = "both_missing"

    # Which side(s) use a default value.
    PRED_DEFAULT = "predicted_uses_default"
    GT_DEFAULT = "ground_truth_uses_default"
    BOTH_DEFAULT = "both_use_default"
25
+
26
+
27
class ParameterComparisonResult(BaseModel):
    """Outcome of comparing one parameter of a predicted call with the ground truth.

    ``score`` and ``confidence`` are validated to lie in [0.0, 1.0].
    """

    parameter_name: str
    predicted_value: Any
    ground_truth_value: Any

    # Actual resolved values (after applying defaults)
    predicted_resolved_value: Any
    ground_truth_resolved_value: Any

    parameter_status: ParameterStatus
    comparison_strategy: ComparisonStrategy

    # 0.0 - 1.0
    score: float = Field(ge=0.0, le=1.0)
    explanation: str
    evidence: Optional[str] = None
    is_match: bool
    confidence: float = Field(ge=0.0, le=1.0)
    error_type: Optional[str] = None

    # Tool spec information
    parameter_definition: Optional[Dict[str, Any]] = None
    is_required: bool = False
    default_value: Optional[Any] = None
46
+
47
+
48
class BulkParameterComparisonResult(BaseModel):
    """Result for bulk parameter comparison in a single LLM prompt.

    ``overall_parameter_score`` and ``confidence`` are validated to lie in
    [0.0, 1.0]; ``confidence`` defaults to 0.5.
    """

    function_name: str
    parameter_results: List[ParameterComparisonResult]
    overall_parameter_score: float = Field(ge=0.0, le=1.0)
    overall_explanation: str = ""
    comparison_strategy: ComparisonStrategy
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    # A fresh dict is created per instance.
    metadata: Dict[str, Any] = Field(default_factory=dict)
58
+
59
+
60
class ParameterWeight(BaseModel):
    """Weighting entry for a single named parameter.

    ``weight`` defaults to 1.0 and must be non-negative.
    """

    name: str
    weight: float = Field(default=1.0, ge=0.0)
    # If critical param fails, overall score heavily penalized
    is_critical: bool = False
66
+
67
+
68
class ToolCallComparisonResult(BaseModel):
    """Full result of comparing a predicted tool call with its ground truth.

    ``function_name_score`` and ``overall_score`` are validated to lie in
    [0.0, 1.0]. ``strategy_used`` may be a single strategy or a list of them.
    """

    predicted_call: Dict[str, Any]
    ground_truth_call: Dict[str, Any]

    function_name_match: bool
    function_name_score: float = Field(ge=0.0, le=1.0)

    parameter_results: List[ParameterComparisonResult]

    overall_score: float = Field(ge=0.0, le=1.0)
    overall_explanation: str
    strategy_used: Union[ComparisonStrategy, List[ComparisonStrategy]]

    # Enhanced metadata (each container is created fresh per instance)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    missing_required_params: List[str] = Field(default_factory=list)
    unexpected_params: List[str] = Field(default_factory=list)
81
+
82
+
83
class ComparisonConfig(BaseModel):
    """Settings controlling how tool calls are compared and scored.

    Only ``strategy`` is required; every other field has a default. Numeric
    fields are range-checked by their ``Field`` constraints.
    """

    strategy: ComparisonStrategy
    # If None, compare all including defaults
    parameters_to_compare: Optional[List[str]] = None
    strategy_config: Dict[str, Any] = Field(default_factory=dict)
    weight_function_name: float = Field(default=0.3, ge=0.0, le=1.0)
    weight_parameters: float = Field(default=0.7, ge=0.0, le=1.0)
    normalize_scores: bool = True

    # Enhanced parameter handling
    parameter_weights: List[ParameterWeight] = Field(default_factory=list)
    critical_parameter_penalty: float = Field(default=0.5, ge=0.0, le=1.0)
    # Whether to compare parameters with default values
    include_default_parameters: bool = True
    # Penalty for missing non-required params
    missing_parameter_penalty: float = Field(default=0.2, ge=0.0, le=1.0)

    # LLM Judge specific settings
    llm_bulk_comparison: bool = Field(
        default=False,
        description="If True, compare all parameters in one LLM prompt instead of individual prompts",
    )

    # Type normalization settings
    normalize_types: bool = True
    string_similarity_threshold: float = Field(default=0.8, ge=0.0, le=1.0)
    numeric_tolerance: float = Field(default=0.01, ge=0.0)

    # LLM settings
    llm_temperature: float = Field(default=0.1, ge=0.0, le=2.0)
    llm_max_retries: int = Field(default=3, ge=1)
    llm_timeout: float = Field(default=30.0, gt=0.0)

    # Fallback strategy
    fallback_strategy: ComparisonStrategy = ComparisonStrategy.EXACT_MATCH
121
+
122
+
123
class FunctionCallInput(BaseModel):
    """Input structure for LLM judge evaluation.

    Holds the expected and actual calls as dicts, plus optional free-text
    context.
    """

    expected: Dict[str, Any]
    actual: Dict[str, Any]
    context: Optional[str] = None
129
+
130
+
131
class ToolSpecParameter(BaseModel):
    """Represents a parameter definition from tool specification.

    Only ``name`` and ``type`` are required; the remaining fields default to
    None (``required`` defaults to False).
    """

    name: str
    type: str
    description: Optional[str] = None
    required: bool = False
    default: Optional[Any] = None
    enum: Optional[List[Any]] = None
    format: Optional[str] = None
    # For object types
    properties: Optional[Dict[str, Any]] = None
142
+
143
+
144
class ToolSpecFunction(BaseModel):
    """Represents a function definition from tool specification."""

    name: str
    description: Optional[str] = None
    parameters: List[ToolSpecParameter] = Field(default_factory=list)

    @classmethod
    def from_openai_spec(cls, spec: Dict[str, Any]) -> "ToolSpecFunction":
        """Create from OpenAI function specification format.

        Args:
            spec: Either a tool entry that nests the definition under a
                ``"function"`` key, or a bare function definition carrying
                ``"name"`` / ``"description"`` / ``"parameters"`` at the top
                level.

        Returns:
            A ToolSpecFunction with one ToolSpecParameter per entry in the
            schema's ``"properties"``. A parameter is marked required when its
            name appears in the schema's ``"required"`` list, and its type
            falls back to ``"string"`` when the schema gives none. The name
            falls back to ``""`` when absent.
        """
        wrapped = spec.get("function")
        # Use the nested definition when present; otherwise treat the spec
        # itself as the function definition instead of returning an empty one.
        func_def = wrapped if isinstance(wrapped, dict) else spec

        # `or` guards cover keys that are present but explicitly null.
        params_schema = func_def.get("parameters") or {}
        properties = params_schema.get("properties") or {}
        required = set(params_schema.get("required") or [])

        parameters = [
            ToolSpecParameter(
                name=param_name,
                type=param_def.get("type", "string"),
                description=param_def.get("description"),
                required=param_name in required,
                default=param_def.get("default"),
                enum=param_def.get("enum"),
                format=param_def.get("format"),
                properties=param_def.get("properties"),
            )
            for param_name, param_def in properties.items()
        ]

        return cls(
            name=func_def.get("name", ""),
            description=func_def.get("description"),
            parameters=parameters,
        )