ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,489 @@
1
+ [
2
+ {
3
+ "name": "parameter_hallucination_check",
4
+ "task_description": "You are an expert evaluator assessing whether a **specific parameter value**-identified as `parameter_name`-in a tool call is **grounded** in the provided conversation history or tool specification, or whether it is **hallucinated**.\n\nYour task is to make a **strictly evidence-based** judgment. Evaluate this parameter alone-ignore other parameters or external knowledge. The value must:\n- Be explicitly supported by the dialogue, prior tool calls, or tool specification\n- Be appropriate for the context and user intent\n- Respect constraint rules defined in the specification\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value is grounded in the conversation or specification.\n---\n\n### Rating Scale\n\nAssign a score from 1 to 5 based on how clearly the value is grounded:\n\n**5 - Perfectly Grounded** \nExplicitly stated in conversation or matches a clearly documented and contextually appropriate default.\n\n**4 - Mostly Grounded** \nNot directly quoted but clearly follows from the conversation or is a minor variation on a documented default.\n\n**3 - Ambiguous** \nRequires non-trivial inference or transformation to relate the value to the dialogue or specification.\n\n**2 - Mostly Ungrounded** \nLoosely related, incorrectly formatted, or partially contradicts the spec or context.\n\n**1 - Completely Ungrounded** \nInvented, unrelated, or clearly in conflict with the dialogue or specification.\n\nBe conservative in your scoring: Use 1-3 for weak or unverified grounding, and 4-5 only when evidence is explicit or clearly reliable.\n\n---\n\n### Acceptable Sources for Grounding\nA value may be considered grounded if it comes from:\n- Explicit user input\n- Clearly implied user intent (minimal inference)\n- Assistant statements explicitly confirmed by the user\n- Outputs of previous tool 
calls\n- Documented default values in the tool specification\n\n---\n\n### Ungrounded Patterns\nMark a value ungrounded if:\n- It is absent from both conversation and tool spec\n- It depends on vague or missing input (e.g., \"tomorrow\" without a date format)\n- It uses the wrong type, unit, or format\n- It contradicts other parameters or tool constraints\n- It inaccurately rephrases or transforms the user's input\n\n---\n\n### Handling Defaults\nA default value may be accepted only if:\n- It is explicitly documented in the tool spec\n- It fits the context of the user's request\n- It does not contradict stated or implied intent\nOtherwise, treat the value as ungrounded.\n\n---\n\n### Parameter Relationships\nEnsure this parameter does not contradict other parameters in the same call.\nExample: If `add_day=false` and `day=\"Sunday\"` but the spec says `add_day` must be true to use `day`, the `day` value is ungrounded.\n\n---\n\n### Final Guideline\nLarge language models frequently hallucinate parameter values or apply defaults inappropriately. Your careful and conservative evaluation prevents propagation of such errors.",
5
+ "jsonschema": {
6
+ "title": "parameter_hallucination_check",
7
+ "description": "Assessment of tool call parameter hallucination, following the rubric defined above.",
8
+ "type": "object",
9
+ "additionalProperties": false,
10
+ "properties": {
11
+ "evidence": {
12
+ "type": "string",
13
+ "description": "Provide a short sentence or two with EXACT quotes from the conversation history (user messages, assistant responses, previous tool outputs) or tool specification that directly support your assessment. For grounded values, cite the specific text where this value originates. For ungrounded values, demonstrate the absence of supporting evidence or cite contradictory information."
14
+ },
15
+ "explanation": {
16
+ "type": "string",
17
+ "description": "Explain in one or two sentences why the specific parameter value is grounded or ungrounded. Reference exact sources from the conversation or tool specification that either support or contradict the parameter value. Address: 1) Where the value originated from, 2) Whether this source is sufficient for grounding, 3) Any format or type compliance issues, 4) Any contradictions with other parameters or specifications, and 5) For default values, whether they're documented and appropriate for this context."
18
+ },
19
+ "output": {
20
+ "type": "integer",
21
+ "minimum": 1,
22
+ "maximum": 5,
23
+ "threshold_low": 4,
24
+ "threshold_high": 5,
25
+ "description": "Parameter Hallucination Score (1-5):\n\n- 5: Perfectly grounded\n The parameter value is directly quoted in the conversation or exactly matches a documented default that fits the user's intent.\n Example:\n User: \"Book a hotel in Rome.\"\n → Tool call: search_hotels(city=\"Rome\")\n\n- 4: Grounded via logical reasoning\n The value is not directly quoted but clearly follows from the conversation through straightforward, logic-based inference.\n Example:\n User: \"I'd like a room for me and my wife.\"\n → Tool call: search_hotels(guests=2)\n\n- 3: Ambiguous\n The value might relate to the conversation or domain context, but grounding requires assumptions or nontrivial interpretation not clearly supported by evidence.\n Example:\n User: \"Let's go somewhere warm.\"\n → Tool call: search_hotels(city=\"Barcelona\")\n\n- 2: Likely hallucinated\n The value is loosely connected to the topic but lacks grounding in the conversation or tool specification, or only partially aligns.\n Example:\n User: \"Find hotels.\"\n → Tool call: search_hotels(city=\"Berlin\")\n (No city mentioned)\n\n- 1: Clearly hallucinated\n The parameter value is invented, irrelevant, or contradicts the dialogue or tool specification.\n Example:\n User: \"Book a hotel.\"\n → Tool call: search_hotels(city=\"Mars\")"
26
+ },
27
+ "confidence": {
28
+ "type": "number",
29
+ "minimum": 0,
30
+ "maximum": 1,
31
+ "threshold_low": 0,
32
+ "threshold_high": 1,
33
+ "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
34
+ },
35
+ "correction": {
36
+ "type": "object",
37
+ "properties": {
38
+ "reason_types": {
39
+ "type": "array",
40
+ "description": "A list of the types of issues with the parameter value, if any. Use one or more of these values: FORMAT_ERROR (wrong format or type), MISSING_INFORMATION (needs more data), PARAMETER_CONTRADICTION (conflicts with other parameters), DEFAULT_ISSUE (inappropriate default), OTHER (explain in reasons).",
41
+ "items": {
42
+ "type": "string",
43
+ "enum": [
44
+ "FORMAT_ERROR",
45
+ "MISSING_INFORMATION",
46
+ "PARAMETER_CONTRADICTION",
47
+ "DEFAULT_ISSUE",
48
+ "OTHER"
49
+ ]
50
+ }
51
+ },
52
+ "reasons": {
53
+ "type": "string",
54
+ "description": "Concise explanation of the specific issues with the parameter value."
55
+ },
56
+ "parameter": {
57
+ "type": "object",
58
+ "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.",
59
+ "additionalProperties": true
60
+ }
61
+ },
62
+ "description": "For well-grounded values: Provide an empty object {}. For ungrounded values: Provide an object with reason_types, reasons, and parameter correction.",
63
+ "required": []
64
+ }
65
+ },
66
+ "required": [
67
+ "explanation",
68
+ "evidence",
69
+ "output",
70
+ "confidence",
71
+ "correction"
72
+ ]
73
+ },
74
+ "examples": [
75
+ {
76
+ "user_kwargs": {
77
+ "conversation_context": [
78
+ {
79
+ "role": "user",
80
+ "content": "Translate 'hello' to Spanish."
81
+ }
82
+ ],
83
+ "tool_inventory": [
84
+ {
85
+ "type": "function",
86
+ "function": {
87
+ "name": "translate_text",
88
+ "description": "Translate given text",
89
+ "parameters": {
90
+ "type": "object",
91
+ "properties": {
92
+ "text": {
93
+ "type": "string",
94
+ "description": "The text to be translated"
95
+ },
96
+ "target_lang": {
97
+ "type": "string",
98
+ "description": "The target language code using ISO 639-1 (e.g., 'en' for English, 'es' for Spanish, 'fr' for French)"
99
+ }
100
+ },
101
+ "required": [
102
+ "text",
103
+ "target_lang"
104
+ ]
105
+ }
106
+ }
107
+ }
108
+ ],
109
+ "tool_call": {
110
+ "id": "call_001",
111
+ "type": "function",
112
+ "function": {
113
+ "name": "translate_text",
114
+ "arguments": "{ \"text\": \"hello\", \"target_lang\": \"es\" }"
115
+ }
116
+ },
117
+ "parameter_name": "target_lang",
118
+ "parameter_value": "es"
119
+ },
120
+ "output": {
121
+ "evidence": "User said: \"Translate 'hello' to Spanish.\" The tool spec defines target_lang as a string. 'es' is the ISO 639-1 code for Spanish.",
122
+ "explanation": "The value 'es' is a grounded and appropriate transformation of the user's explicit request for Spanish. Assuming standard practice for language codes in translation APIs, the ISO code 'es' matches both the user intent and specification.",
123
+ "output": 5,
124
+ "confidence": 0.97,
125
+ "correction": {}
126
+ }
127
+ },
128
+ {
129
+ "user_kwargs": {
130
+ "conversation_context": [
131
+ {
132
+ "role": "user",
133
+ "content": "Fetch my latest tweets."
134
+ }
135
+ ],
136
+ "tool_inventory": [
137
+ {
138
+ "type": "function",
139
+ "function": {
140
+ "name": "get_tweets",
141
+ "description": "Retrieve recent tweets",
142
+ "parameters": {
143
+ "type": "object",
144
+ "properties": {
145
+ "username": {
146
+ "type": "string",
147
+ "description": "The Twitter username to fetch tweets for"
148
+ },
149
+ "count": {
150
+ "type": "integer",
151
+ "description": "The number of recent tweets to retrieve"
152
+ }
153
+ },
154
+ "required": [
155
+ "username",
156
+ "count"
157
+ ]
158
+ }
159
+ }
160
+ }
161
+ ],
162
+ "tool_call": {
163
+ "id": "call_001",
164
+ "type": "function",
165
+ "function": {
166
+ "name": "get_tweets",
167
+ "arguments": "{ \"username\": \"elonmusk\", \"count\": 20 }"
168
+ }
169
+ },
170
+ "parameter_name": "count",
171
+ "parameter_value": 20
172
+ },
173
+ "output": {
174
+ "evidence": "User said: \"Fetch my latest tweets.\" There is no mention of a tweet count. Tool spec lists 'count' as an integer parameter but provides no default.",
175
+ "explanation": "The value 20 is ungrounded. The user did not specify how many tweets to fetch, and the tool specification does not define a default value for the 'count' parameter. The agent's choice appears arbitrary.",
176
+ "output": 1,
177
+ "confidence": 0.95,
178
+ "correction": {
179
+ "reason_types": [
180
+ "MISSING_INFORMATION"
181
+ ],
182
+ "reasons": "The value was invented without support in the user message or spec. There is no evidence for selecting 20.",
183
+ "parameter": {
184
+ "need_more_information": "How many tweets would you like to retrieve? Please specify a number."
185
+ }
186
+ }
187
+ }
188
+ },
189
+ {
190
+ "user_kwargs": {
191
+ "conversation_context": [
192
+ {
193
+ "role": "user",
194
+ "content": "Can you show me just a few of my latest messages?"
195
+ }
196
+ ],
197
+ "tool_inventory": [
198
+ {
199
+ "type": "function",
200
+ "function": {
201
+ "name": "get_messages",
202
+ "description": "Retrieve user's messages",
203
+ "parameters": {
204
+ "type": "object",
205
+ "properties": {
206
+ "user_id": {
207
+ "type": "integer",
208
+ "description": "The ID of the user whose messages are being retrieved"
209
+ },
210
+ "limit": {
211
+ "type": "integer",
212
+ "description": "The maximum number of messages to retrieve"
213
+ }
214
+ },
215
+ "required": [
216
+ "user_id",
217
+ "limit"
218
+ ]
219
+ }
220
+ }
221
+ }
222
+ ],
223
+ "tool_call": {
224
+ "id": "call_002",
225
+ "type": "function",
226
+ "function": {
227
+ "name": "get_messages",
228
+ "arguments": "{ \"user_id\": 456, \"limit\": 3 }"
229
+ }
230
+ },
231
+ "parameter_name": "limit",
232
+ "parameter_value": 3
233
+ },
234
+ "output": {
235
+ "evidence": "User said: 'just a few of my latest messages.' The phrase 'a few' is vague and subjective, and while 3 is a reasonable interpretation, it is not explicitly stated.",
236
+ "explanation": "The value limit=3 is a plausible interpretation of the user's request for 'a few' messages. However, the term is ambiguous and does not unambiguously support the specific value of 3. The assistant made a judgment call based on conversational context, but without explicit grounding or a documented default, the score is 3.",
237
+ "output": 3,
238
+ "confidence": 0.79,
239
+ "correction": {
240
+ "reason_types": [
241
+ "MISSING_INFORMATION"
242
+ ],
243
+ "reasons": "The user's phrasing is vague and does not explicitly support the value 3. Clarification is needed to ensure the correct value is used.",
244
+ "parameter": {
245
+ "need_more_information": "You asked for 'a few' messages. Could you specify how many exactly you'd like to see?"
246
+ }
247
+ }
248
+ }
249
+ }
250
+ ]
251
+ },
252
+ {
253
+ "name": "parameter_value_format_alignment",
254
+ "task_description": "Evaluate whether the specific parameter value (named in \"parameter_name\") conforms exactly to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter value. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided to ensure it meets the exact type, format, and unit requirements. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value aligns with the specification.\n---\n\n### 1. Data Type Compliance (Output = 4-5)\n- Check that the value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. 
Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility with other parameter values in the same call\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: Value exactly matches required type, format, and units\n- **4 - Minor Deviation**: Value is fundamentally correct but with minor formatting issues\n- **3 - Moderate Deviation**: Type or format is partially incorrect; may cause issues\n- **2 - Major Deviation**: Type and format are significantly wrong; likely to fail\n- **1 - Complete Mismatch**: Value is entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and the value seems plausible, prefer a higher score (4-5)\n- If the spec is clear and the value deviates, assign a lower score (1-3)\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.",
255
+ "jsonschema": {
256
+ "title": "parameter_value_format_alignment",
257
+ "description": "Assessment of tool call parameter value format alignment, following the rubric defined above.",
258
+ "type": "object",
259
+ "additionalProperties": false,
260
+ "properties": {
261
+ "evidence": {
262
+ "type": "string",
263
+ "description": "Give a short sentence or two with quotes of the spec's type/format definition and the actual provided value that supports your judgment. Include EXACT text from the tool specification regarding this parameter's requirements."
264
+ },
265
+ "explanation": {
266
+ "type": "string",
267
+ "description": "Explain in one or two sentences precisely why the parameter value does or does not match the specification's type, format, and unit requirements. Reference specific details from the parameter specification including data type, format requirements, unit conventions, and any pattern constraints. For non-conforming values, explain exactly what is incorrect and how it deviates from requirements."
268
+ },
269
+ "output": {
270
+ "type": "integer",
271
+ "minimum": 1,
272
+ "maximum": 5,
273
+ "threshold_low": 4,
274
+ "threshold_high": 5,
275
+ "description": "An integer from 1 to 5 indicating how well the parameter value conforms to the required type, format, and unit as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: The value exactly matches the expected type, format, and unit. Example: expected date in 'YYYY-MM-DD', value is '2025-08-05'.\n\n4 - Minor Deviation: The value has small formatting issues (e.g., missing leading zeros) but is still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: The value partially matches the expected format but is likely to be rejected or misinterpreted by automatic processing. Example: expected 'YYYY-MM-DD', value is 'August 5, 2025'.\n\n2 - Major Deviation: The value significantly violates the expected type, format, or unit and is very likely to fail. Example: expected 'YYYY-MM-DD', value is '08/05/2025'.\n\n1 - Complete Mismatch: The value is entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'."
276
+ },
277
+ "confidence": {
278
+ "type": "number",
279
+ "minimum": 0,
280
+ "maximum": 1,
281
+ "threshold_low": 0,
282
+ "threshold_high": 1,
283
+ "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
284
+ },
285
+ "correction": {
286
+ "type": "object",
287
+ "properties": {
288
+ "reason_types": {
289
+ "type": "array",
290
+ "description": "A list of the types of format issues with the parameter value, if any. Use one or more of these values: TYPE_ERROR (wrong data type), FORMAT_ERROR (incorrect format), UNIT_ERROR (wrong/missing units), PATTERN_ERROR (doesn't match required pattern), CONSISTENCY_ERROR (inconsistent with related parameters), OTHER (explain in reasons).",
291
+ "items": {
292
+ "type": "string",
293
+ "enum": [
294
+ "TYPE_ERROR",
295
+ "FORMAT_ERROR",
296
+ "UNIT_ERROR",
297
+ "PATTERN_ERROR",
298
+ "CONSISTENCY_ERROR",
299
+ "OTHER"
300
+ ]
301
+ }
302
+ },
303
+ "reasons": {
304
+ "type": "string",
305
+ "description": "Concise explanation of the specific format issues with the parameter value."
306
+ },
307
+ "parameter": {
308
+ "type": "object",
309
+ "description": "An object containing the corrected parameter value in the form: { \"<parameter_name>\": <corrected_value> }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.",
310
+ "additionalProperties": true
311
+ }
312
+ },
313
+ "description": "For correctly formatted values: Provide an empty object {}. For format issues: Provide an object with reason_types, reasons, and parameter correction.",
314
+ "required": []
315
+ }
316
+ },
317
+ "required": [
318
+ "explanation",
319
+ "evidence",
320
+ "output",
321
+ "confidence",
322
+ "correction"
323
+ ]
324
+ },
325
+ "examples": [
326
+ {
327
+ "user_kwargs": {
328
+ "conversation_context": [
329
+ {
330
+ "role": "user",
331
+ "content": "Set a short countdown for 12 minutes."
332
+ }
333
+ ],
334
+ "tool_inventory": [
335
+ {
336
+ "type": "function",
337
+ "function": {
338
+ "name": "set_timer",
339
+ "description": "Set a countdown timer",
340
+ "parameters": {
341
+ "type": "object",
342
+ "properties": {
343
+ "duration": {
344
+ "type": "string",
345
+ "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')"
346
+ }
347
+ },
348
+ "required": [
349
+ "duration"
350
+ ]
351
+ }
352
+ }
353
+ }
354
+ ],
355
+ "tool_call": {
356
+ "id": "call_001",
357
+ "type": "function",
358
+ "function": {
359
+ "name": "set_timer",
360
+ "arguments": "{ \"duration\": \"12 minutes\" }"
361
+ }
362
+ },
363
+ "parameter_name": "duration",
364
+ "parameter_value": "12 minutes"
365
+ },
366
+ "output": {
367
+ "evidence": "The tool spec states that 'duration' must be a string in the format 'number + unit', with examples such as '5 minutes' and '30 seconds'. The value '12 minutes' follows this format precisely: it consists of a number followed by a space and a valid time unit.",
368
+ "explanation": "The parameter value '12 minutes' aligns perfectly with the required type, structure, and unit conventions defined in the spec. While not explicitly given as an example, it is fully consistent with the format and requires no normalization or interpretation.",
369
+ "output": 5,
370
+ "confidence": 0.99,
371
+ "correction": {}
372
+ }
373
+ },
374
+ {
375
+ "user_kwargs": {
376
+ "conversation_context": [
377
+ {
378
+ "role": "user",
379
+ "content": "Start a countdown for 5 minutes."
380
+ }
381
+ ],
382
+ "tool_inventory": [
383
+ {
384
+ "type": "function",
385
+ "function": {
386
+ "name": "set_timer",
387
+ "description": "Set a countdown timer",
388
+ "parameters": {
389
+ "type": "object",
390
+ "properties": {
391
+ "duration": {
392
+ "type": "string",
393
+ "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')"
394
+ }
395
+ },
396
+ "required": [
397
+ "duration"
398
+ ]
399
+ }
400
+ }
401
+ }
402
+ ],
403
+ "tool_call": {
404
+ "id": "call_001",
405
+ "type": "function",
406
+ "function": {
407
+ "name": "set_timer",
408
+ "arguments": "{ \"duration\": \"300000\" }"
409
+ }
410
+ },
411
+ "parameter_name": "duration",
412
+ "parameter_value": "300000"
413
+ },
414
+ "output": {
415
+ "evidence": "The tool specification defines 'duration' as a string following the 'number + unit' format (e.g., '5 minutes'). The value '300000' is a string, but it lacks a time unit and does not follow the documented pattern.",
416
+ "explanation": "The value is likely intended to represent milliseconds, but the format is both undocumented and inconsistent with the specification. It fails both unit and pattern requirements, and cannot be reliably interpreted as valid input without assumptions or external conventions.",
417
+ "output": 2,
418
+ "confidence": 0.95,
419
+ "correction": {
420
+ "reason_types": [
421
+ "FORMAT_ERROR",
422
+ "UNIT_ERROR"
423
+ ],
424
+ "reasons": "Missing explicit time unit and nonconformant format. Specification requires 'number + unit' format, which this value does not follow.",
425
+ "parameter": {
426
+ "duration": "5 minutes"
427
+ }
428
+ }
429
+ }
430
+ },
431
+ {
432
+ "user_kwargs": {
433
+ "conversation_context": [
434
+ {
435
+ "role": "user",
436
+ "content": "Set a timer for five minutes please."
437
+ }
438
+ ],
439
+ "tool_inventory": [
440
+ {
441
+ "type": "function",
442
+ "function": {
443
+ "name": "set_timer",
444
+ "description": "Set a countdown timer",
445
+ "parameters": {
446
+ "type": "object",
447
+ "properties": {
448
+ "duration": {
449
+ "type": "string",
450
+ "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')"
451
+ }
452
+ },
453
+ "required": [
454
+ "duration"
455
+ ]
456
+ }
457
+ }
458
+ }
459
+ ],
460
+ "tool_call": {
461
+ "id": "call_001",
462
+ "type": "function",
463
+ "function": {
464
+ "name": "set_timer",
465
+ "arguments": "{ \"duration\": \"5Minutes\" }"
466
+ }
467
+ },
468
+ "parameter_name": "duration",
469
+ "parameter_value": "5Minutes"
470
+ },
471
+ "output": {
472
+ "evidence": "The specification explicitly defines the duration format as 'number + unit' with examples like '5 minutes'. The given value '5Minutes' omits the required space, resulting in a concatenated string.",
473
+ "explanation": "The input conveys the intended meaning and includes all required semantic elements (number and unit), but the lack of a space between them breaks the documented pattern. This deviation is moderate — it introduces a risk of failure in strict parsers that expect a space-separated format.",
474
+ "output": 3,
475
+ "confidence": 0.89,
476
+ "correction": {
477
+ "reason_types": [
478
+ "FORMAT_ERROR"
479
+ ],
480
+ "reasons": "Value contains both components but violates the explicit formatting requirement of space separation.",
481
+ "parameter": {
482
+ "duration": "5 minutes"
483
+ }
484
+ }
485
+ }
486
+ }
487
+ ]
488
+ }
489
+ ]
@@ -0,0 +1,7 @@
1
+ """Trajectory reflection metrics."""
2
+ from .trajectory import (
3
+ get_trajectory_reflection_prompt,
4
+ TrajectoryReflectionPrompt,
5
+ )
6
+
7
+ __all__ = ["get_trajectory_reflection_prompt", "TrajectoryReflectionPrompt"]
@@ -0,0 +1,43 @@
1
+ from typing import Any, Dict, List, Union
2
+
3
+ from llmevalkit.function_calling.metrics.base import FunctionMetricsPrompt
4
+
5
+ _trajectory_system = (
6
+ "### Task Description and Role:\n\n"
7
+ "{{ task_description }}\n\n"
8
+ "Your output must conform to the following JSON schema, in the same order as the fields appear in the schema:\n"
9
+ "{{ metric_jsonschema }}"
10
+ )
11
+
12
+ _trajectory_user: str = (
13
+ "End-to-end user to agent interaction:\n"
14
+ "{{ trajectory }}\n\n"
15
+ "Tool Specification:\n"
16
+ "{{ tool_inventory }}\n\n"
17
+ "Return a JSON object as specified in the system prompt. You MUST keep the same order of fields in the JSON object as provided in the JSON schema and examples."
18
+ )
19
+
20
+
21
+ class TrajectoryReflectionPrompt(FunctionMetricsPrompt):
22
+ """Prompt builder for trajectory reflection metrics."""
23
+
24
+ system_template = _trajectory_system
25
+ user_template = _trajectory_user
26
+
27
+
28
+ def get_trajectory_reflection_prompt(
29
+ prompt: TrajectoryReflectionPrompt,
30
+ trajectory: Union[str, List[Dict[str, str]]],
31
+ tool_inventory: List[Dict[str, Any]],
32
+ ) -> List[Dict[str, str]]:
33
+ """
34
+ Build the messages for a trajectory reflection evaluation.
35
+
36
+ Returns the list of chat messages (system -> [few-shot] -> user).
37
+ """
38
+ return prompt.build_messages(
39
+ user_kwargs={
40
+ "trajectory": trajectory,
41
+ "tool_inventory": tool_inventory,
42
+ }
43
+ )