ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,255 @@
1
+ from __future__ import annotations
2
+ from abc import ABC
3
+ from typing import Any, Dict, List, Literal, Optional, Type, TypeVar
4
+
5
+ from pydantic import BaseModel, Field as PydanticField, PrivateAttr, model_validator
6
+
7
+ JSONType = Literal["integer", "number", "string", "boolean", "object", "array"]
8
+ TField = TypeVar("TField", bound="BaseField")
9
+ BaseFieldRegistry: List[Type[BaseField]] = []
10
+
11
+
12
class BaseField(BaseModel, ABC):
    """
    Abstract representation of a single metric field.

    Concrete subclasses register themselves in ``BaseFieldRegistry`` when they
    are defined (see ``__init_subclass__``) and advertise which JSONSchema
    snippets they understand through ``can_handle``.

    Attributes:
        name: Identifier of the field (used as JSON key).
        json_type: JSON Schema type of the field.
        description: Human-friendly description of the field's purpose.
        jsonschema_extra: Additional JSONSchema keywords (e.g., enum, pattern).
        extra_params: Non-JSONSchema attributes (e.g., thresholds).
    """

    name: str
    json_type: JSONType
    description: str = PydanticField(
        "No description provided. Please specify what this field represents.",
        description="A clear description of this field's meaning.",
    )
    jsonschema_extra: Dict[str, Any] = PydanticField(
        default_factory=dict,
        description="Additional JSONSchema constraints for this field.",
    )
    extra_params: Dict[str, Any] = PydanticField(
        default_factory=dict,
        description="Extra parameters not included in the JSONSchema (e.g., thresholds).",
    )

    def __init_subclass__(cls, **kwargs):
        """Register every subclass not flagged with a truthy ``__abstract__``."""
        super().__init_subclass__(**kwargs)
        if not getattr(cls, "__abstract__", False):
            # Front insertion: the most recently defined subclass is consulted
            # first by ``from_jsonschema``.
            BaseFieldRegistry.insert(0, cls)

    @classmethod
    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
        """Override in subclasses to signal compatibility with a JSONSchema snippet."""
        return False

    @classmethod
    def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> BaseField:
        """
        Instantiate the appropriate Field subclass from a JSONSchema property.

        The first registered subclass (other than ``GenericField``) whose
        ``can_handle`` returns True is used; ``GenericField`` is the fallback.

        Args:
            name: Property name; becomes the field's ``name``.
            schema: JSONSchema snippet describing the property.

        Returns:
            A field instance whose ``json_type`` is ``schema["type"]``
            (default ``"string"``), whose ``description`` is
            ``schema["description"]`` (default ``""``) and whose
            ``jsonschema_extra`` holds every other schema key.
        """
        chosen: Type[BaseField] = GenericField
        for candidate in BaseFieldRegistry:
            # GenericField accepts every schema and, being defined last, sits
            # at the front of the registry. Consulting it inside the loop would
            # shadow every specialised field, so it is only used as fallback.
            if candidate is GenericField:
                continue
            if candidate.can_handle(name, schema):
                chosen = candidate
                break

        return chosen(
            name=name,
            json_type=schema.get("type", "string"),
            description=schema.get("description", ""),
            jsonschema_extra={
                k: v for k, v in schema.items() if k not in ("type", "description")
            },
            extra_params={},
        )

    def to_jsonschema(self) -> Dict[str, Any]:
        """Return the JSONSchema dict: type, description, then ``jsonschema_extra`` keys."""
        return {
            "type": self.json_type,
            "description": self.description,
            # Spread last, so a "type"/"description" key inside
            # jsonschema_extra overrides the two entries above.
            **self.jsonschema_extra,
        }

    # --- Getters and Setters ---

    def get_name(self) -> str:
        """Return the field name."""
        return self.name

    def set_name(self, name: str) -> None:
        """Replace the field name."""
        self.name = name

    def get_description(self) -> str:
        """Return the field description."""
        return self.description

    def set_description(self, description: str) -> None:
        """Replace the field description."""
        self.description = description

    def get_jsonschema_extra(self) -> Dict[str, Any]:
        """Return a shallow copy of the extra JSONSchema keywords."""
        return dict(self.jsonschema_extra)

    def set_jsonschema_extra(self, extra: Dict[str, Any]) -> None:
        """Replace the extra JSONSchema keywords (the dict is stored, not copied)."""
        self.jsonschema_extra = extra

    def get_extra_param(self, key: str) -> Any:
        """Return ``extra_params[key]``, or None when the key is absent."""
        return self.extra_params.get(key)

    def set_extra_param(self, key: str, value: Any) -> None:
        """Set a single entry of ``extra_params`` in place."""
        self.extra_params[key] = value
111
+
112
+
113
class NumericField(BaseField):
    """
    Numeric field (integer or number) with optional thresholds.

    When the ``jsonschema_extra`` dict supplied at construction contains
    ``threshold_low`` and/or ``threshold_high``, those values are copied onto
    the attributes of the same name:
    - threshold_low: minimal acceptable value (for validation)
    - threshold_high: maximal acceptable value
    """

    threshold_low: Optional[float] = PydanticField(
        None, description="Lower bound for correctness checks (not in JSONSchema)."
    )
    threshold_high: Optional[float] = PydanticField(
        None, description="Upper bound for correctness checks (not in JSONSchema)."
    )

    __abstract__ = False

    @model_validator(mode="before")
    @classmethod
    def extract_thresholds(cls, values: Any) -> Any:
        """
        Promote threshold keys found in ``jsonschema_extra`` to model fields.

        A threshold present in ``jsonschema_extra`` takes precedence over one
        passed directly. Input that is not a dict, or whose
        ``jsonschema_extra`` is not a dict, is returned untouched so that
        pydantic's regular validation reports the problem.
        """
        if not isinstance(values, dict):
            return values
        extra = values.get("jsonschema_extra")
        if not isinstance(extra, dict):
            return values
        promoted = {
            key: extra[key]
            for key in ("threshold_low", "threshold_high")
            if key in extra
        }
        if not promoted:
            return values
        # Build a new mapping rather than mutating the caller's input.
        return {**values, **promoted}

    @classmethod
    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
        """Accept schemas whose ``type`` is ``"integer"`` or ``"number"``."""
        return schema.get("type") in ("integer", "number")

    @classmethod
    def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> NumericField:
        """
        Create a NumericField from a JSONSchema property.

        Unlike the base implementation this does not consult the registry.
        It always builds an instance of ``cls``, defaulting ``json_type`` to
        ``"number"`` and ``description`` to ``""``.
        """
        return cls(
            name=name,
            json_type=schema.get("type", "number"),
            description=schema.get("description", ""),
            jsonschema_extra={
                k: v for k, v in schema.items() if k not in ("type", "description")
            },
            extra_params={},
        )

    def to_jsonschema(self) -> Dict[str, Any]:
        """Return the JSONSchema dict produced by the parent class, unchanged."""
        return super().to_jsonschema()

    def is_within_threshold(self, value: float) -> bool:
        """
        Check ``value`` against the configured thresholds.

        Both bounds are inclusive; a bound that is None is not enforced.
        """
        if self.threshold_low is not None and value < self.threshold_low:
            return False
        if self.threshold_high is not None and value > self.threshold_high:
            return False
        return True
167
+
168
+
169
class EnumField(BaseField):
    """
    Field restricted to a fixed set of allowed values.

    The allowed values are expected under ``jsonschema_extra["enum"]`` as a
    list.
    """

    __abstract__ = False

    @classmethod
    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
        """Accept any schema snippet that carries an ``enum`` key."""
        declares_enum = "enum" in schema
        return declares_enum
180
+
181
+
182
class ExplanationField(BaseField):
    """
    Free-text field holding the reasoning behind a metric's value.
    """

    __abstract__ = False

    def __init__(self, **data: Any):
        """Build the field, filling in a stock description when none is passed."""
        if "description" not in data:
            data["description"] = (
                "A detailed, step-by-step explanation of the reasoning behind the metric's value."
            )
        super().__init__(**data)

    @classmethod
    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
        """Accept string-typed properties named ``explanation`` (case-insensitive)."""
        if name.lower() != "explanation":
            return False
        return schema.get("type") == "string"
199
+
200
+
201
class EvidenceField(BaseField):
    """
    Field holding the quote or reference that backs the metric's evaluation.
    """

    __abstract__ = False

    def __init__(self, **data: Any):
        """Build the field, filling in a stock description when none is passed."""
        if "description" not in data:
            data["description"] = (
                "The exact quote or reference from the input or context that justifies the metric's value."
            )
        super().__init__(**data)

    @classmethod
    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
        """Accept string-typed properties named ``evidence`` (case-insensitive)."""
        if name.lower() != "evidence":
            return False
        return schema.get("type") == "string"
218
+
219
+
220
class CorrectionField(BaseField):
    """
    Field holding a structured (JSON) suggestion for fixing or improving the output.
    """

    __abstract__ = False

    def __init__(self, **data: Any):
        """Build the field, filling in a stock description when none is passed."""
        if "description" not in data:
            data["description"] = (
                "A JSON-formatted suggestion for how to correct or improve the output if needed."
            )
        super().__init__(**data)

    @classmethod
    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
        """Accept object-typed properties named ``correction`` (case-insensitive)."""
        if name.lower() != "correction":
            return False
        return schema.get("type") == "object"
237
+
238
+
239
class GenericField(BaseField):
    """
    Catch-all field type for properties no other field class claims.
    """

    __abstract__ = False

    def __init__(self, **data: Any):
        """Build the field, deriving a description from name and type when none is passed."""
        # The fallback text is computed up front, whether or not it ends up used.
        fallback = (
            f"A generic field named '{data.get('name')}' of type {data.get('json_type')}."
        )
        if "description" not in data:
            data["description"] = fallback
        super().__init__(**data)

    @classmethod
    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
        """Accept every property unconditionally."""
        return True
@@ -0,0 +1,332 @@
1
+ from __future__ import annotations
2
+ import json
3
+ from typing import Any, Dict, List, Optional, Set, Tuple, Type, TypeVar
4
+
5
+ from llmevalkit.metrics.field import (
6
+ BaseField,
7
+ ExplanationField,
8
+ EvidenceField,
9
+ CorrectionField,
10
+ NumericField,
11
+ )
12
+
13
+ TMetric = TypeVar("TMetric", bound="Metric")
14
+
15
+
16
class Metric:
    """
    Abstract representation of an evaluation metric composed of multiple fields.

    A metric keeps an ordered list of fields together with the set of field
    names that must be present in a result, and can be converted to and from a
    JSONSchema object description.
    """

    def __init__(
        self,
        name: str,
        description: str,
        fields: Optional[List[BaseField]] = None,
        required: Optional[List[str]] = None,
        additional_properties: bool = True,
    ) -> None:
        """
        Args:
            name: Unique metric identifier.
            description: Full description of what this metric measures.
            fields: Field instances composing this metric. The sequence is
                copied, so later add_field()/remove_field() calls never touch
                the caller's object.
            required: List of field names that must appear in results.
                Defaults to all provided fields.
            additional_properties: Value emitted as "additionalProperties" in
                the generated JSONSchema.

        Raises:
            ValueError: If `required` names a field that is not in `fields`.
        """
        self.name = name
        self.description = description
        # Defensive copy: storing the caller's list by reference would let
        # add_field() append to it behind the caller's back, and a tuple
        # argument would make add_field() fail on `.append`.
        self.fields: List[BaseField] = list(fields) if fields else []
        self.additional_properties = additional_properties

        known = {f.name for f in self.fields}
        # Determine required fields (every field unless told otherwise).
        if required is not None:
            self.required_fields: Set[str] = set(required)
        else:
            self.required_fields = set(known)

        # Validate required_fields
        missing = self.required_fields - known
        if missing:
            raise ValueError(
                f"Required fields {missing} not among metric fields {known}"
            )

    def _find_field(self, name: str) -> Optional[BaseField]:
        """Return the first field called `name`, or None when there is none."""
        return next((f for f in self.fields if f.name == name), None)

    def to_jsonschema(self) -> Dict[str, Any]:
        """
        Build a JSONSchema representation of this metric.

        Returns:
            A dict with keys:
                - title: self.name
                - description: self.description
                - type: "object"
                - properties: mapping field.name → field.to_jsonschema()
                - required: sorted list of required field names
                - additionalProperties: self.additional_properties
        """
        props: Dict[str, Any] = {f.name: f.to_jsonschema() for f in self.fields}
        return {
            "title": self.name,
            "description": self.description,
            "type": "object",
            "properties": props,
            "required": sorted(self.required_fields),
            "additionalProperties": self.additional_properties,
        }

    def add_field(self, field: BaseField, required: bool = True) -> None:
        """
        Add a new field to this metric.

        Args:
            field: BaseField instance.
            required: Whether this field must appear in results.

        Raises:
            ValueError: If a field with the same name is already present.
        """
        if any(f.name == field.name for f in self.fields):
            raise ValueError(f"Field '{field.name}' already defined")
        self.fields.append(field)
        if required:
            self.required_fields.add(field.name)

    def remove_field(self, name: str) -> None:
        """
        Remove a field by name. Unknown names are ignored.

        Args:
            name: Name of field to remove.
        """
        self.fields = [f for f in self.fields if f.name != name]
        self.required_fields.discard(name)

    @classmethod
    def from_jsonschema(cls: Type[TMetric], schema: Dict[str, Any]) -> TMetric:
        """
        Reconstruct a Metric from a JSONSchema dict.

        Args:
            schema: dict with 'title', 'description', 'properties', 'required'
                and optionally 'additionalProperties'. Missing keys fall back
                to "", "", {}, [] and True respectively.

        Returns:
            Instance of `cls` with fields populated.
        """
        name: str = schema.get("title", "")
        description: str = schema.get("description", "")
        props: Dict[str, Any] = schema.get("properties", {})
        required: List[str] = schema.get("required", [])
        additional_props: bool = schema.get("additionalProperties", True)
        fields: List[BaseField] = []
        for fname, fschema in props.items():
            # If type is number or integer, use NumericField
            if fschema.get("type") in ("number", "integer"):
                field = NumericField.from_jsonschema(fname, fschema)
            else:
                field = BaseField.from_jsonschema(fname, fschema)
            fields.append(field)
        return cls(
            name=name,
            description=description,
            fields=fields,
            required=required,
            additional_properties=additional_props,
        )

    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        A result is 'important' if its confidence lies within the defined confidence thresholds.

        Args:
            result: Parsed metric result with at least 'confidence'. A missing
                'confidence' key is treated as 0.0.

        Returns:
            (important: bool, reason: Optional[str]); `reason` is None when
            the result is important and explains the failure otherwise.
        """
        try:
            conf = float(result.get("confidence", 0.0))
        except (TypeError, ValueError):
            return False, "Invalid confidence value"
        # locate the confidence field
        conf_field = self._find_field("confidence")
        if isinstance(conf_field, NumericField):
            ok = conf_field.is_within_threshold(conf)
            reason = (
                None
                if ok
                else f"Confidence {conf} outside [{conf_field.threshold_low},{conf_field.threshold_high}]"
            )
            return ok, reason
        return False, "Confidence field not defined"

    def is_correct(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Check whether the output of an important result lies within thresholds.

        A result that is not 'important' (see is_important) is treated as
        correct: it returns True with a reason prefixed by "Not important: ".
        Only important results have their 'output' value checked.

        Args:
            result: Parsed metric result with 'output' and 'confidence'. A
                missing 'output' key is treated as 0.0.

        Returns:
            (correct: bool, reason: Optional[str])
        """
        important, imp_reason = self.is_important(result)
        if not important:
            return True, f"Not important: {imp_reason}"
        # check output
        try:
            val = float(result.get("output", 0.0))
        except (TypeError, ValueError):
            return False, "Invalid output value"
        out_field = self._find_field("output")
        if isinstance(out_field, NumericField):
            ok = out_field.is_within_threshold(val)
            reason = (
                None
                if ok
                else f"Output {val} outside [{out_field.threshold_low},{out_field.threshold_high}]"
            )
            return ok, reason
        return False, "Output field not defined"

    def parse_response(self, response: str) -> Dict[str, Any]:
        """
        Parse a raw response string into a structured dict.

        Args:
            response: Raw response string.

        Returns:
            Parsed response as a dict.

        Raises:
            ValueError: If `response` is not valid JSON.
        """
        # Default implementation: assume JSON string
        try:
            return json.loads(response)
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse response: {e}") from e
203
+
204
+
205
class StandardMetric(Metric):
    """
    A standard metric with common fields:
        - explanation: string, detailed reasoning.
        - evidence: string, supporting quote or reference.
        - output: numeric value within specified range.
        - confidence: numeric confidence within specified range.
        - correction: object, structured suggestion for improvement.

    The convenience methods `is_important` and `is_correct` are inherited
    from `Metric`; this class previously carried verbatim copies of them.
    """

    def __init__(
        self,
        name: str,
        description: str,
        *,
        output_range: Tuple[float, float] = (0.0, 1.0),
        confidence_range: Tuple[float, float] = (0.0, 1.0),
    ) -> None:
        """
        Args:
            name: Metric identifier.
            description: Explanation of what the metric measures.
            output_range: (min, max) allowed for the 'output' field.
            confidence_range: (min, max) for the 'confidence' field.

        Fields created (all of them are required, since no explicit
        `required` list is passed to the base constructor):
            - explanation: string, reasoning behind the output value.
            - evidence: string, quote or reference supporting the output value.
            - output: "number" if either bound of output_range is a float,
              otherwise "integer"; minimum/maximum and low/high thresholds
              are both set to output_range.
            - confidence: "number"; minimum/maximum and low/high thresholds
              are both set to confidence_range.
            - correction: object, structured suggestion for improvement.
        """
        # Prepare fields
        min_out, max_out = output_range
        min_conf, max_conf = confidence_range

        explanation = ExplanationField(
            name="explanation",
            json_type="string",
            description="A detailed, step-by-step explanation of the reasoning behind the output value.",
        )
        evidence = EvidenceField(
            name="evidence",
            json_type="string",
            description="The exact quote or reference that supports the output value.",
        )
        output = NumericField(
            name="output",
            json_type=(
                "number"
                if isinstance(min_out, float) or isinstance(max_out, float)
                else "integer"
            ),
            description=f"Primary numeric score for this metric (range {min_out} to {max_out}).",
            jsonschema_extra={"minimum": min_out, "maximum": max_out},
            extra_params={"threshold_low": min_out, "threshold_high": max_out},
        )
        confidence = NumericField(
            name="confidence",
            json_type="number",
            description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
            jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
            extra_params={"threshold_low": min_conf, "threshold_high": max_conf},
        )
        correction = CorrectionField(
            name="correction",
            json_type="object",
            description="Structured suggestion for how to correct or improve the output if needed.",
        )

        fields = [explanation, evidence, output, confidence, correction]
        super().__init__(name=name, description=description, fields=fields)