ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,188 @@
1
+ import json
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from llmevalkit.prompt.runner import (
7
+ PromptRunner,
8
+ PromptResult,
9
+ SyncGen,
10
+ AsyncGen,
11
+ Prompt,
12
+ PromptAndSchema,
13
+ )
14
+ from llmevalkit.metrics.prompt import MetricPrompt
15
+ from llmevalkit.metrics.field import NumericField
16
+ from llmevalkit.metrics.metric import Metric
17
+
18
+
19
class MetricRunResult(BaseModel):
    """
    Outcome of evaluating one metric against one LLM response.

    One instance is produced per registered metric entry by
    ``MetricRunner._process_results``. Every field is required; the
    ``Optional`` ones accept ``None`` but must still be supplied.
    """

    # Name of the metric that was evaluated (taken from ``metric.name``).
    metric_name: str
    # JSON schema generated for the metric via ``metric.to_jsonschema()``.
    jsonschema: Dict[str, Any]
    # Prompt reported back by the prompt runner for this call.
    prompt: Prompt
    # Response exactly as returned by the prompt runner, before any JSON parsing.
    raw_response: Any
    # Maps each numeric field name to the outcome of its threshold check.
    numeric_thresholds_checks: Dict[str, bool]
    # Error reported by the prompt runner, or the text of an exception
    # raised while processing the response; None when nothing failed.
    error: Optional[str]
    # Verdict and reason returned by ``metric.is_important``.
    is_important: bool
    importance_reason: Optional[str]
    # Verdict and reason returned by ``metric.is_correct``.
    is_correct: bool
    correctness_reason: Optional[str]
    # True when the result is important and not correct.
    is_issue: bool
35
+
36
+
37
class MetricRunner:
    """
    Orchestrates running multiple metrics via LLM calls.

    Each registered entry pairs a MetricPrompt with the variables used to
    render its user template. The chat messages and the JSON schema of an
    entry are built once, when the entry is added, and reused by every run.

    Attributes:
        entries: One dict per registered metric with the keys
            "metric_prompt", "user_kwargs", "messages" and "schema".
    """

    def __init__(
        self, entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None
    ) -> None:
        """
        Args:
            entries: Optional list of (MetricPrompt, user_kwargs) pairs.
                Each pair is registered through `add`, in order.
        """
        self.entries: List[Dict[str, Any]] = []
        for metric_prompt, user_kwargs in entries or []:
            self.add(metric_prompt, user_kwargs)

    def add(self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]) -> None:
        """
        Add a metric to run.

        Args:
            metric_prompt: MetricPrompt instance.
            user_kwargs: Dict of variables to render the user template.
        """
        # Messages and schema are computed eagerly so that any error
        # raised while building them surfaces here, not during a run.
        self.entries.append(
            {
                "metric_prompt": metric_prompt,
                "user_kwargs": user_kwargs,
                "messages": metric_prompt.build_messages(user_kwargs),
                "schema": metric_prompt.metric.to_jsonschema(),
            }
        )

    def remove(self, index: int) -> None:
        """
        Remove the entry at the given index.

        Raises:
            IndexError: If `index` is out of range.
        """
        self.entries.pop(index)

    def clear(self) -> None:
        """Remove all entries."""
        self.entries.clear()

    def _assemble_prompts(self) -> List[PromptAndSchema]:
        """Return one (messages, schema) pair per entry, in registration order."""
        return [(entry["messages"], entry["schema"]) for entry in self.entries]

    @staticmethod
    def _parse_response(raw: Any) -> Any:
        """Decode a string response as JSON; return any other value unchanged."""
        if isinstance(raw, str):
            return json.loads(raw)
        return raw

    def _process_results(
        self, prompt_results: List[PromptResult]
    ) -> List[MetricRunResult]:
        """
        Combine PromptResult with metric parsing, threshold checks,
        importance and correctness determinations.

        Entries and prompt results are paired by position; pairing stops at
        the shorter of the two sequences.

        Any failure while handling a response is recorded in the result's
        ``error`` field instead of being raised. In that case the importance
        and correctness fields keep their defaults (False / None), so a
        result that carries an error is never flagged as an issue.

        Args:
            prompt_results: Results returned by the prompt runner.

        Returns:
            One MetricRunResult per (entry, prompt result) pair.
        """
        results: List[MetricRunResult] = []

        for entry, pr in zip(self.entries, prompt_results):
            metric: Metric = entry["metric_prompt"].metric

            # Defaults, used as-is when the call or the processing failed.
            numeric_thresholds_checks: Dict[str, bool] = {}
            err = pr.error
            is_imp, imp_reason = False, None
            is_corr, corr_reason = False, None

            if pr.error is None:
                try:
                    data = self._parse_response(pr.response)

                    # Numeric threshold checks: a missing or non-numeric
                    # value counts as failing the check.
                    for field in metric.fields:
                        if isinstance(field, NumericField):
                            val = data.get(field.name)
                            ok = False
                            if isinstance(val, (int, float)):
                                ok = field.is_within_threshold(val)
                            numeric_thresholds_checks[field.name] = ok

                    # Evaluate into temporaries so that a failure in the
                    # second call cannot leave a half-updated verdict.
                    imp, imp_why = metric.is_important(data)
                    corr, corr_why = metric.is_correct(data)
                except Exception as e:
                    # Deliberate best-effort: one bad response must not
                    # abort processing of the remaining metrics.
                    err = str(e)
                else:
                    is_imp, imp_reason = imp, imp_why
                    is_corr, corr_reason = corr, corr_why

            results.append(
                MetricRunResult(
                    metric_name=metric.name,
                    jsonschema=entry["schema"],
                    prompt=pr.prompt,
                    raw_response=pr.response,
                    numeric_thresholds_checks=numeric_thresholds_checks,
                    error=err,
                    is_important=is_imp,
                    importance_reason=imp_reason,
                    is_correct=is_corr,
                    correctness_reason=corr_reason,
                    is_issue=is_imp and not is_corr,
                )
            )

        return results

    def run_all(
        self,
        gen_fn: SyncGen,
        prompt_param_name: str = "prompt",
        schema_param_name: Optional[str] = None,
        **kwargs: Any,
    ) -> List[MetricRunResult]:
        """
        Run all metrics using a synchronous single-prompt generator.

        Args:
            gen_fn: Synchronous generation callable handed to PromptRunner.
            prompt_param_name: Forwarded unchanged to PromptRunner.run_all.
            schema_param_name: Forwarded unchanged to PromptRunner.run_all.
            **kwargs: Extra keyword arguments forwarded to PromptRunner.run_all.

        Returns:
            One MetricRunResult per registered entry.
        """
        runner = PromptRunner(self._assemble_prompts())
        pr_results = runner.run_all(
            gen_fn,
            prompt_param_name=prompt_param_name,
            schema_param_name=schema_param_name,
            **kwargs,
        )
        return self._process_results(pr_results)

    async def run_async(
        self,
        async_fn: AsyncGen,
        max_parallel: int = 10,
        prompt_param_name: str = "prompt",
        schema_param_name: Optional[str] = None,
        **kwargs: Any,
    ) -> List[MetricRunResult]:
        """
        Run all metrics using asynchronous single-prompt generation.

        Args:
            async_fn: Asynchronous generation callable handed to PromptRunner.
            max_parallel: Forwarded unchanged to PromptRunner.run_async.
            prompt_param_name: Forwarded unchanged to PromptRunner.run_async.
            schema_param_name: Forwarded unchanged to PromptRunner.run_async.
            **kwargs: Extra keyword arguments forwarded to PromptRunner.run_async.

        Returns:
            One MetricRunResult per registered entry.
        """
        runner = PromptRunner(self._assemble_prompts())
        pr_results = await runner.run_async(
            async_fn,
            max_parallel=max_parallel,
            prompt_param_name=prompt_param_name,
            schema_param_name=schema_param_name,
            **kwargs,
        )
        return self._process_results(pr_results)
@@ -0,0 +1,403 @@
1
+ import json
2
+ from typing import Any, Dict, List, Optional, Tuple, Type
3
+
4
+ import jsonschema
5
+ from jinja2 import BaseLoader, Environment, Template
6
+ from llmevalkit.metrics.field import (CorrectionField, EvidenceField,
7
+ ExplanationField, NumericField)
8
+ from llmevalkit.metrics.metric import Metric
9
+ from llmevalkit.metrics.utils import (remove_threshold_fields,
10
+ validate_template_context)
11
+ from pydantic import BaseModel, ValidationError, create_model
12
+
13
# Module-level Jinja2 environment used to compile the system/user template
# strings of every MetricPrompt. Templates are built from plain strings via
# `from_string`, and autoescaping is switched off so rendered values are
# inserted verbatim.
_jinja_env = Environment(loader=BaseLoader(), autoescape=False)
15
+
16
+
17
class MetricPrompt:
    """
    Combines a Metric with system and user prompt templates, plus optional few-shot examples.

    Attributes:
        metric: Metric instance describing the schema to validate outputs.
        system_kwargs_defaults: Default context for the system template.
        examples: List of (user_kwargs, output_dict) pairs.
    """

    def __init__(
        self,
        metric: Metric,
        system_template: str,
        user_template: str,
        *,
        system_kwargs_defaults: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            metric: Metric whose JSONSchema will be used for validation.
            system_template: Template string for the system message; may reference keys.
            user_template: Template string for the user message; may reference user_kwargs keys.
            system_kwargs_defaults: Optional default context for system template.

        Raises:
            TypeError: If any argument has the wrong type.
        """
        if not isinstance(system_template, str):
            raise TypeError("system_template must be a string")
        if not isinstance(user_template, str):
            raise TypeError("user_template must be a string")
        if not system_kwargs_defaults:
            system_kwargs_defaults = {}  # Default to empty dict if None
        if not isinstance(system_kwargs_defaults, dict):
            raise TypeError("system_kwargs_defaults must be a dict")
        if not isinstance(metric, Metric):
            raise TypeError("metric must be an instance of Metric")

        # Raw template text is kept next to the compiled templates: it is what
        # the getters return and what validate_template_context() parses.
        self._system_template_str: str = system_template
        self._user_template_str: str = user_template

        # Compile Jinja2 templates
        self._system_tmpl: Template = _jinja_env.from_string(system_template)
        self._user_tmpl: Template = _jinja_env.from_string(user_template)

        # Store a private copy of the defaults so that later changes to the
        # caller's dict do not leak into this prompt.
        self.system_kwargs_defaults: Dict[str, Any] = system_kwargs_defaults.copy()

        # (user_kwargs, output) pairs used for few-shot prompting
        self.examples: List[Tuple[Dict[str, Any], Dict[str, Any]]] = []

        # The metric supplies the schema that example outputs are validated against
        self.metric = metric

    # --- Getters and Setters ---

    def get_system_template(self) -> str:
        """Return the current system template source text."""
        # A compiled jinja2 Template does not expose its source text, so the
        # stored string is returned instead.
        return self._system_template_str

    def set_system_template(self, template_str: str) -> None:
        """Replace the system template (compiled form and stored source)."""
        self._system_tmpl = _jinja_env.from_string(template_str)
        # Keep the stored source in sync; build_messages() validates against it.
        self._system_template_str = template_str

    def get_user_template(self) -> str:
        """Return the current user template source text."""
        return self._user_template_str

    def set_user_template(self, template_str: str) -> None:
        """
        Setting a new user template clears existing examples.
        """
        self._user_tmpl = _jinja_env.from_string(template_str)
        # Keep the stored source in sync; build_messages() validates against it.
        self._user_template_str = template_str
        self.examples.clear()

    def get_system_kwargs_defaults(self) -> Dict[str, Any]:
        """Return a shallow copy of the default system template context."""
        return dict(self.system_kwargs_defaults)

    def set_system_kwargs_defaults(self, defaults: Dict[str, Any]) -> None:
        """
        Replace the default system template context.

        A shallow copy is stored, mirroring ``__init__``.

        Raises:
            TypeError: If ``defaults`` is not a dict (``None``/empty means ``{}``).
        """
        if not defaults:
            defaults = {}
        if not isinstance(defaults, dict):
            raise TypeError("system_kwargs_defaults must be a dict")
        self.system_kwargs_defaults = defaults.copy()

    # --- Example Management ---

    def add_example(self, user_kwargs: Dict[str, Any], output: Dict[str, Any]) -> None:
        """
        Add a few-shot example.

        Validates that `output` adheres to this.metric's JSONSchema.

        Args:
            user_kwargs: Variables for rendering the user_template.
            output: Dict matching the metric's schema.

        Raises:
            jsonschema.ValidationError: If `output` violates the JSONSchema.
            ValueError: If `output` fails the Pydantic type/enum validation.
        """
        schema = self.metric.to_jsonschema()
        # 1) JSONSchema structural validation
        jsonschema.validate(instance=output, schema=schema)
        # 2) Pydantic type/enum validation
        Model: Type[BaseModel] = self._build_response_model()
        try:
            Model.model_validate(output)
        except ValidationError as e:
            raise ValueError(f"Example output failed validation: {e}") from e
        self.examples.append((user_kwargs, output))

    # --- Prompt Building ---

    def _render_user_message(self, kwargs: Dict[str, Any], template_label: str) -> str:
        """JSON-encode each value, validate the context, and render the user template."""
        encoded = {key: json.dumps(value) for key, value in kwargs.items()}
        validate_template_context(
            _jinja_env, self._user_template_str, encoded, template_label
        )
        return self._user_tmpl.render(**encoded)

    def build_messages(
        self,
        user_kwargs: Dict[str, Any],
        system_kwargs: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, str]]:
        """
        Build the full chat messages sequence:

        1. System message rendered with:
           - metric_jsonschema
           - plus any system_kwargs (overrides defaults)
        2. For each example:
           - User message from user_template with example user_kwargs
           - Assistant message: JSON dump of example output
        3. Final user message with provided user_kwargs

        Args:
            user_kwargs: Variables for the final user prompt.
            system_kwargs: Optional overrides for system template context.

        Returns:
            List of {"role": "...", "content": "..."} dicts.

        Raises:
            ValueError: Propagated from validate_template_context when a
                template variable is missing or empty.
        """
        msgs: List[Dict[str, str]] = []

        # Work on a copy so the stored defaults are never modified by a call.
        ctx = self.system_kwargs_defaults.copy()
        ctx["metric_jsonschema"] = json.dumps(
            remove_threshold_fields(self.metric.to_jsonschema())
        )
        if system_kwargs:
            ctx.update(system_kwargs)

        # Validate and render system message
        validate_template_context(
            _jinja_env, self._system_template_str, ctx, "system_template"
        )
        msgs.append({"role": "system", "content": self._system_tmpl.render(**ctx)})

        # Few-shot examples: one user turn followed by the expected assistant turn
        for ex_user_kwargs, ex_output in self.examples:
            user_text = self._render_user_message(
                ex_user_kwargs, "user_template (example)"
            )
            msgs.append({"role": "user", "content": user_text})
            msgs.append(
                {"role": "assistant", "content": json.dumps(ex_output, indent=None)}
            )

        # Final user message
        final_user = self._render_user_message(user_kwargs, "user_template (final)")
        msgs.append({"role": "user", "content": final_user})
        return msgs

    def _build_response_model(self) -> Type[BaseModel]:
        """
        Dynamically construct a Pydantic model matching metric.to_jsonschema().
        Used to enforce types beyond JSONSchema.
        """
        from typing import Literal

        schema = self.metric.to_jsonschema()
        required = schema.get("required", [])

        # JSONSchema primitive type -> Python type; anything else maps to Any
        json_to_python: Dict[str, Any] = {
            "integer": int,
            "number": float,
            "string": str,
            "boolean": bool,
            "object": dict,
        }

        fields: Dict[str, Tuple[Any, Any]] = {}
        for name, subs in schema.get("properties", {}).items():
            # `...` marks a required field for create_model; optional fields default to None
            default = ... if name in required else None

            jtype = subs.get("type")
            # "type" may be a list in JSONSchema; only plain strings are looked up
            py_type = json_to_python.get(jtype, Any) if isinstance(jtype, str) else Any

            # handle enums
            if "enum" in subs:
                py_type = Literal[tuple(subs["enum"])]

            # handle additional properties: a truthy value allows an open mapping
            if subs.get("additionalProperties", False):
                py_type = Dict[str, Any]

            fields[name] = (py_type, default)

        return create_model(schema.get("title", "ResponseModel"), **fields)
308
+
309
+
310
+ # --- Example Subclass: RelevancePrompt ---
311
+
312
+
313
class RelevanceMetric(Metric):
    """
    Metric that scores how relevant a response is to its context.

    Declares five fields: an explanation, supporting evidence, a numeric
    ``output`` score, a numeric ``confidence`` and a ``correction`` object.
    """

    def __init__(self) -> None:
        def unit_interval_field(name: str, description: str) -> NumericField:
            # Fresh dicts are created per field so no state is shared between them.
            return NumericField(
                name=name,
                json_type="number",
                description=description,
                jsonschema_extra={"minimum": 0.0, "maximum": 1.0},
                extra_params={"threshold_low": 0.0, "threshold_high": 1.0},
            )

        explanation = ExplanationField(
            name="explanation",
            json_type="string",
            description="Why the response is or is not relevant, step by step.",
        )
        evidence = EvidenceField(
            name="evidence",
            json_type="string",
            description="Portion of context or response that supports your relevance rating.",
        )
        score = unit_interval_field(
            "output",
            "Relevance score from 0.0 (not relevant) to 1.0 (fully relevant).",
        )
        confidence = unit_interval_field(
            "confidence",
            "Confidence in your relevance judgment (0.0-1.0).",
        )
        correction = CorrectionField(
            name="correction",
            json_type="object",
            description="If relevance is low, suggest how to improve relevance.",
        )

        super().__init__(
            name="Relevance",
            description="Rate how relevant the response is to the given context on a 0-1 scale.",
            fields=[explanation, evidence, score, confidence, correction],
        )
355
+
356
+
357
class RelevancePrompt(MetricPrompt):
    """
    MetricPrompt preconfigured for RelevanceMetric.

    Ships default system/user templates and registers two few-shot examples
    (one fully relevant, one irrelevant) on construction.
    """

    def __init__(self) -> None:
        system_lines = [
            "You are an expert judge that assesses response relevance. "
            "Here is the JSONSchema for your response:",
            "{{ metric_jsonschema }}",
        ]
        user_lines = [
            "Context: {{ context }}",
            "Response: {{ response }}",
            "Provide your evaluation as JSON as specified in the system prompt.",
        ]
        super().__init__(
            RelevanceMetric(), "\n".join(system_lines), "\n".join(user_lines)
        )

        # Default few-shot examples as (user_kwargs, expected_output) pairs
        default_examples = [
            (
                {
                    "context": "The sky is blue.",
                    "response": "The sky appears azure due to Rayleigh scattering.",
                },
                {
                    "evidence": "The sky appears azure due to Rayleigh scattering.",
                    "explanation": "The response directly addresses sky color by naming scattering physics.",
                    "output": 1.0,
                    "confidence": 0.9,
                    "correction": {},
                },
            ),
            (
                {
                    "context": "What is the capital of France?",
                    "response": "The moon orbits Earth every 27 days.",
                },
                {
                    "evidence": "The moon orbits Earth every 27 days.",
                    "explanation": "The response is about lunar orbit, unrelated to capitals.",
                    "output": 0.0,
                    "confidence": 0.8,
                    "correction": {"suggestion": "The capital of France is Paris."},
                },
            ),
        ]
        for example_inputs, example_output in default_examples:
            self.add_example(example_inputs, example_output)
@@ -0,0 +1,46 @@
1
+ from jinja2 import meta, Environment
2
+ from typing import Dict, Any
3
+
4
+
5
def remove_threshold_fields(schema: dict) -> dict:
    """
    Recursively removes 'threshold_low' and 'threshold_high' fields from a JSON schema.

    Dicts are modified in place (the same top-level object is returned);
    lists are rebuilt. Recursion follows dicts and lists at any depth,
    including lists nested directly inside other lists. Any other value is
    returned unchanged.

    Args:
        schema: The JSON schema (or schema fragment) to clean.

    Returns:
        The cleaned schema.
    """
    if isinstance(schema, dict):
        # Remove the threshold fields if present
        schema.pop("threshold_low", None)
        schema.pop("threshold_high", None)
        # Only existing keys are reassigned, so iterating while assigning is safe
        for key, value in schema.items():
            if isinstance(value, (dict, list)):
                schema[key] = remove_threshold_fields(value)
        return schema
    if isinstance(schema, list):
        return [
            remove_threshold_fields(item) if isinstance(item, (dict, list)) else item
            for item in schema
        ]
    return schema
23
+
24
+
25
def validate_template_context(
    env: Environment,
    template_str: str,
    context: Dict[str, Any],
    template_name: str = "",
) -> None:
    """
    Check that `context` supplies every variable referenced by a template.

    The template is parsed with `env` and its undeclared variables are
    collected. A variable is reported when it is absent from `context` or its
    value is `None` or an empty list, dict or tuple.

    Args:
        env: Jinja2 environment used to parse the template.
        template_str: Template source text.
        context: Rendering context to validate.
        template_name: Label used in the error message ('unnamed' if empty).

    Raises:
        ValueError: If any non-optional variable is missing or empty.
    """
    parsed = env.parse(template_str)
    required_vars = meta.find_undeclared_variables(parsed)

    # Allow custom_instructions, tool_specification, and custom_schema to be optional since they're in conditional blocks
    optional_vars = {"custom_instructions", "tool_specification", "custom_schema"}

    # Sorted so the error message is stable across runs (set order is not)
    missing_or_empty = sorted(
        var
        for var in required_vars
        if var not in optional_vars
        and (var not in context or context[var] in (None, [], {}, ()))
    )
    if missing_or_empty:
        raise ValueError(
            f"Missing or empty variables in template '{template_name or 'unnamed'}': {missing_or_empty}"
        )