ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py
@@ -0,0 +1,816 @@
+ import re
+ import math
+ import asyncio
+ import json
+ from typing import (
+     Any,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     Union,
+ )
+
+ from llmevalkit.llm import LLMClient
+ from llmevalkit.metrics import MetricRunner, MetricRunResult
+ from llmevalkit.function_calling import load_prompts_from_list, PromptKind
+ from llmevalkit.function_calling.pipeline.adapters import (
+     BaseAdapter,
+     OpenAIAdapter,
+ )
+ from llmevalkit.function_calling.pipeline.types import (
+     ToolSpec,
+     ToolCall,
+     TransformResult,
+     SemanticCategoryResult,
+     SemanticResult,
+ )
+ from llmevalkit.function_calling.pipeline.transformation_prompts import (
+     GENERATE_CODE_SYSTEM,
+     GENERATE_CODE_USER,
+     GENERATE_CODE_SCHEMA,
+     build_multi_extract_units_schema,
+     MULTI_EXTRACT_UNITS_SYSTEM,
+     MULTI_EXTRACT_UNITS_USER,
+ )
+
+ from llmevalkit.function_calling import (
+     GeneralMetricsPrompt,
+     FunctionSelectionPrompt,
+     ParameterMetricsPrompt,
+     TrajectoryReflectionPrompt,
+ )
+
+
+ class SemanticChecker:
+     """
+     Orchestrates semantic metrics (and optional unit-transforms)
+     for a single function call.
+
+     Args:
+         general_metrics: JSON-schema dicts for general metrics.
+         function_metrics: JSON-schema dicts for function-selection metrics.
+         parameter_metrics: JSON-schema dicts for parameter-level metrics.
+         metrics_client: an llmevalkit LLMClient for metric evaluation.
+         codegen_client: an llmevalkit LLMClient for transformation codegen.
+         transform_enabled: whether to run unit-conversion checks.
+     """
+
+     def __init__(
+         self,
+         metrics_client: LLMClient,
+         *,
+         general_metrics: Optional[List[Dict[str, Any]]] = None,
+         function_metrics: Optional[List[Dict[str, Any]]] = None,
+         parameter_metrics: Optional[List[Dict[str, Any]]] = None,
+         trajectory_metrics: Optional[List[Dict[str, Any]]] = None,
+         codegen_client: Optional[LLMClient] = None,
+         transform_enabled: Optional[bool] = False,
+     ) -> None:
+         # Validate clients
+         # if not isinstance(metrics_client, LLMClient):
+         #     raise TypeError("metrics_client must be an llmevalkit LLMClient")
+         self.metrics_client = metrics_client
+
+         self.transform_enabled = transform_enabled
+         self.codegen_client = codegen_client
+         # if not codegen_client or not isinstance(codegen_client, LLMClient):
+         #     self.codegen_client = metrics_client
+
+         self.general_prompts = []
+         if general_metrics is not None:
+             self.general_prompts = load_prompts_from_list(
+                 general_metrics, PromptKind.GENERAL
+             )
+
+         self.function_prompts = []
+         if function_metrics is not None:
+             self.function_prompts = load_prompts_from_list(
+                 function_metrics, PromptKind.FUNCTION_SELECTION
+             )
+
+         self.parameter_prompts = []
+         if parameter_metrics is not None:
+             self.parameter_prompts = load_prompts_from_list(
+                 parameter_metrics, PromptKind.PARAMETER
+             )
+
+         self.trajectory_prompts = []
+         if trajectory_metrics is not None:
+             self.trajectory_prompts = load_prompts_from_list(
+                 trajectory_metrics, PromptKind.TRAJECTORY
+             )
+
+     def _make_adapter(self, apis_specs, tool_call):
+         first = apis_specs[0]
+         if isinstance(first, ToolSpec):
+             return OpenAIAdapter(apis_specs, tool_call)
+         raise TypeError("Unsupported spec type")
+
+     def _collect_params(self, adapter: BaseAdapter) -> Dict[str, Any]:
+         """
+         Return a mapping of every parameter name in the spec inventory
+         to its value from the call (or defaulted if missing).
+         """
+         call_args = adapter.get_parameters()
+         merged: Dict[str, Any] = {}
+         # Find the function in the inventory
+         function_parameters = (
+             adapter.get_tool_spec(adapter.get_function_name())
+             .get("parameters", {})
+             .get("properties", {})
+         )
+
+         for pname, pschema in function_parameters.items():
+             if pname in call_args:
+                 merged[pname] = call_args[pname]
+             elif "default" in pschema:
+                 merged[pname] = pschema["default"]
+             else:
+                 merged[pname] = (
+                     f"Default value from parameter description (if defined): '{pschema.get('description', 'No description provided')}'"
+                     f" Otherwise, by the default value of type: {pschema.get('type', 'object')}"
+                 )
+         return merged
+
+     def extract_all_units_sync(
+         self,
+         context: Union[str, List[Dict[str, str]]],
+         adapter: BaseAdapter,
+         params: List[str],
+         retries: int = 1,
+     ) -> Dict[str, Dict[str, Any]]:
+         """
+         Synchronously extract user_value/user_units_or_format/spec_units_or_format for every parameter in `params`
+         by issuing a single LLM call.
+         Returns a dict mapping each parameter name to its classification object.
+         """
+         # Build the combined JSON Schema requiring one object per parameter
+         multi_schema = build_multi_extract_units_schema(params)
+         schema_str = json.dumps(multi_schema, indent=2)
+
+         # Build the "full_spec" JSON Schema snippet for all parameters
+         full_spec_json = json.dumps(
+             adapter.get_tool_spec(adapter.get_function_name()).model_dump(),
+             indent=2,
+         )
+
+         # Format system and user prompts
+         system_prompt = MULTI_EXTRACT_UNITS_SYSTEM.format(schema=schema_str)
+         user_prompt = MULTI_EXTRACT_UNITS_USER.format(
+             context=context,
+             full_spec=full_spec_json,
+             parameter_names=", ".join(params),
+         )
+
+         # Single synchronous LLM call
+         try:
+             response: Dict[str, Any] = self.metrics_client.generate(
+                 prompt=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 schema=multi_schema,
+                 retries=retries,
+             )
+         except Exception:
+             response = {
+                 pname: {
+                     "user_value": None,
+                     "user_units_or_format": None,
+                     "spec_units_or_format": None,
+                 }
+                 for pname in params
+             }
+
+         return response
+
+     def run_sync(
+         self,
+         apis_specs: List[ToolSpec],
+         tool_call: ToolCall,
+         context: Union[str, List[Dict[str, str]]],
+         retries: int = 1,
+         transform_enabled: Optional[bool] = None,
+     ) -> SemanticResult:
+         """
+         Synchronous semantic-only evaluation.
+
+         Returns a SemanticResult:
+             {
+               "general": {metric_name: result, …} or None
+               "function_selection": {…} or None
+               "parameter": {param_name: {metric_name: result}, …} or None
+               "transform": {param_name: TransformResult, …} or None
+             }
+         """
+         # 1) Normalize via adapter
+         adapter = self._make_adapter(apis_specs, tool_call)
+         tools_inventory_summary = adapter.get_tools_inventory_summary()
+         call_dict = adapter.get_call_dict()
+         fn_name = adapter.get_function_name()
+         cur_tool_spec = adapter.get_tool_spec(fn_name)
+         params = self._collect_params(adapter)
+
+         if transform_enabled is not None:
+             old_transform_enabled = self.transform_enabled
+             self.transform_enabled = transform_enabled
+
+         # 2) GENERAL METRICS
+         general_results: Optional[SemanticCategoryResult]
+         entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = []
+         for prompt in self.general_prompts:
+             entries.append(
+                 (
+                     prompt,
+                     {
+                         "conversation_context": context,
+                         "tool_inventory": cur_tool_spec,
+                         "tool_call": call_dict,
+                     },
+                 )
+             )
+         if entries:
+             try:
+                 runner = MetricRunner(entries)
+                 sync_results = runner.run_all(
+                     self.metrics_client.generate,
+                     prompt_param_name="prompt",
+                     schema_param_name="schema",
+                     retries=retries,
+                 )
+                 general_results = SemanticCategoryResult.from_results(sync_results)
+             except Exception as e:
+                 general_results = {"error": str(e)}
+         else:
+             general_results = None
+
+         # 3) FUNCTION-SELECTION METRICS
+         function_results: Optional[SemanticCategoryResult]
+         func_entries: List[Tuple[FunctionSelectionPrompt, Dict[str, Any]]] = []
+         for prompt in self.function_prompts:
+             func_entries.append(
+                 (
+                     prompt,
+                     {
+                         "conversation_context": context,
+                         "tools_inventory": tools_inventory_summary,
+                         "proposed_tool_call": call_dict,
+                         "selected_function": fn_name,
+                     },
+                 )
+             )
+         if func_entries:
+             try:
+                 runner = MetricRunner(func_entries)
+                 sync_results = runner.run_all(
+                     self.metrics_client.generate,
+                     prompt_param_name="prompt",
+                     schema_param_name="schema",
+                     retries=retries,
+                 )
+                 function_results = SemanticCategoryResult.from_results(sync_results)
+             except Exception as e:
+                 function_results = {"error": str(e)}
+         else:
+             function_results = None
+
+         # 4) PARAMETER-LEVEL METRICS
+         parameter_results: Optional[Dict[str, SemanticCategoryResult]] = {}
+         for pname, pval in params.items():
+             # Each parameter has its own prompts
+             try:
+                 param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+                 for prompt in self.parameter_prompts:
+                     param_entries.append(
+                         (
+                             prompt,
+                             {
+                                 "conversation_context": context,
+                                 "tool_inventory": cur_tool_spec,
+                                 "tool_call": call_dict,
+                                 "parameter_name": pname,
+                                 "parameter_value": pval,
+                             },
+                         )
+                     )
+                 runner = MetricRunner(param_entries)
+                 sync_results = runner.run_all(
+                     self.metrics_client.generate,
+                     prompt_param_name="prompt",
+                     schema_param_name="schema",
+                     retries=retries,
+                 )
+                 parameter_results[pname] = SemanticCategoryResult.from_results(
+                     sync_results
+                 )
+             except Exception as e:
+                 parameter_results[pname] = {"error": str(e)}
+
+         if not parameter_results:
+             parameter_results = None
+
+         # Base SemanticResult without transforms
+         result = SemanticResult(
+             general=general_results,
+             function_selection=function_results,
+             parameter=parameter_results,
+         )
+
+         # 5) OPTIONAL TRANSFORMS
+         params = adapter.get_parameters()
+         if self.transform_enabled and params:
+             if transform_enabled is not None:
+                 self.transform_enabled = old_transform_enabled
+
+             transform_out: Dict[str, TransformResult] = {}
+
+             # 5a) Extract units for all parameters in one synchronous call
+             units_map = self.extract_all_units_sync(
+                 context=context,
+                 adapter=adapter,
+                 params=list(params.keys()),
+                 retries=retries,
+             )
+
+             # 5b) Generate code & execute for each parameter needing conversion
+             for pname, units in units_map.items():
+                 user_units = units.get("user_units_or_format") or ""
+                 spec_units = units.get("spec_units_or_format") or ""
+                 user_value = units.get("user_value")
+                 transformation_summary = units.get("transformation_summary", "")
+                 gen_code = ""
+
+                 # Only generate code if user_units differs from spec_units and user_value is present
+                 if (
+                     user_units
+                     and user_value is not None
+                     and spec_units
+                     and (user_units != spec_units)
+                 ):
+                     try:
+                         prompt = GENERATE_CODE_USER.format(
+                             old_value=user_value,
+                             old_units=user_units,
+                             transformed_value=str(params[pname]),
+                             transformed_units=spec_units,
+                             transformed_type=type(params[pname]).__name__,
+                             transformation_summary=transformation_summary,
+                         )
+                         gen_code = self.codegen_client.generate(
+                             prompt=[
+                                 {"role": "system", "content": GENERATE_CODE_SYSTEM},
+                                 {"role": "user", "content": prompt},
+                             ],
+                             schema=GENERATE_CODE_SCHEMA,
+                             retries=retries,
+                         ).get("generated_code", "")
+                     except Exception:
+                         gen_code = ""
+
+                 # 5c) Execute & validate
+                 tr = self._execute_code_and_validate(
+                     code=gen_code,
+                     user_val=str(user_value or ""),
+                     api_val=str(params[pname]),
+                     units=units,
+                 )
+                 transform_out[pname] = tr
+
+             if transform_out:
+                 result.transform = transform_out
+             else:
+                 result.transform = None
+
+         return result
+
+     async def extract_all_units(
+         self,
+         context: Union[str, List[Dict[str, str]]],
+         adapter: BaseAdapter,
+         params: List[str],
+         retries: int = 1,
+     ) -> Dict[str, Dict[str, Any]]:
+         """
+         Call the LLM once to extract user_value/user_units_or_format/spec_units_or_format
+         for every parameter in `params`. Returns a dict:
+             { parameter_name: {"user_value": ..., "user_units_or_format": ..., "spec_units_or_format": ...}, ... }
+         """
+         # 1) Build the JSON Schema that requires one object per parameter
+         multi_schema = build_multi_extract_units_schema(params)
+         schema_str = json.dumps(multi_schema, indent=2)
+
+         # 2) Build the "full_spec" JSON Schema snippet for all parameters
+         full_spec_json = json.dumps(
+             adapter.get_tool_spec(adapter.get_function_name()),
+             indent=2,
+         )
+
+         # 3) Fill in system and user prompts
+         system_prompt = MULTI_EXTRACT_UNITS_SYSTEM.format(schema=schema_str)
+
+         user_prompt = MULTI_EXTRACT_UNITS_USER.format(
+             context=context,
+             full_spec=full_spec_json,
+             parameter_names=", ".join(params),
+         )
+
+         # 4) Fire a single async LLM call
+         try:
+             response: Dict[str, Any] = await self.metrics_client.generate_async(
+                 prompt=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 schema=multi_schema,
+                 retries=retries,
+             )
+         except Exception:
+             # If the LLM fails, default to no-information for each parameter
+             response = {
+                 pname: {
+                     "user_value": None,
+                     "user_units_or_format": None,
+                     "spec_units_or_format": None,
+                 }
+                 for pname in params
+             }
+
+         return response
+
+     async def run_async(
+         self,
+         apis_specs: List[ToolSpec],
+         tool_call: ToolCall,
+         context: Union[str, List[Dict[str, str]]],
+         retries: int = 1,
+         max_parallel: int = 10,
+         transform_enabled: Optional[bool] = None,
+     ) -> SemanticResult:
+         """
+         Asynchronous semantic-only evaluation with concurrency.
+         Returns a SemanticResult with:
+           - general: results of general metrics
+           - function_selection: results of function-selection metrics
+           - parameter: results of parameter-level metrics
+           - transform: (optional) unit-conversion transforms if enabled
+         """
+         adapter = self._make_adapter(apis_specs, tool_call)
+         tools_inventory_summary = adapter.get_tools_inventory_summary()
+         call_dict = adapter.get_call_dict()
+         fn_name = adapter.get_function_name()
+         cur_tool_spec = adapter.get_tool_spec(fn_name)
+         params = self._collect_params(adapter)
+
+         # Handle optional override of transform_enabled
+         if transform_enabled is not None:
+             old_transform_enabled = self.transform_enabled
+             self.transform_enabled = transform_enabled
+
+         # 2) GENERAL METRICS
+         general_results: SemanticCategoryResult = {}
+         general_entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = []
+         general_async_results: List[MetricRunResult] = []
+
+         for prompt in self.general_prompts:
+             general_entries.append(
+                 (
+                     prompt,
+                     {
+                         "conversation_context": context,
+                         "tool_inventory": cur_tool_spec,
+                         "tool_call": call_dict,
+                     },
+                 )
+             )
+
+         # 3) FUNCTION-SELECTION METRICS
+         function_results: SemanticCategoryResult = {}
+         func_entries: List[Tuple[FunctionSelectionPrompt, Dict[str, Any]]] = []
+         function_async_results: List[MetricRunResult] = []
+
+         for prompt in self.function_prompts:
+             func_entries.append(
+                 (
+                     prompt,
+                     {
+                         "conversation_context": context,
+                         "tools_inventory": tools_inventory_summary,
+                         "proposed_tool_call": call_dict,
+                         "selected_function": fn_name,
+                     },
+                 )
+             )
+
+         # 4) PARAMETER-LEVEL METRICS
+         parameter_results: Dict[str, SemanticCategoryResult] = {}
+         parameter_async_results: Dict[str, List[MetricRunResult]] = {}
+         param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+
+         for pname, pval in params.items():
+             for prompt in self.parameter_prompts:
+                 param_entries.append(
+                     (
+                         prompt,
+                         {
+                             "conversation_context": context,
+                             "tool_inventory": cur_tool_spec,
+                             "tool_call": call_dict,
+                             "parameter_name": pname,
+                             "parameter_value": pval,
+                         },
+                     )
+                 )
+
+         # Run all metric prompts in parallel (up to max_parallel)
+         try:
+             all_entries = general_entries + func_entries + param_entries
+             runner = MetricRunner(all_entries)
+             async_results = await runner.run_async(
+                 self.metrics_client.generate_async,
+                 prompt_param_name="prompt",
+                 schema_param_name="schema",
+                 retries=retries,
+                 max_parallel=max_parallel,
+             )
+
+             # Split the results back into categories
+             for entry, result in zip(all_entries, async_results):
+                 prompt_obj, ctx_dict = entry
+                 if isinstance(prompt_obj, GeneralMetricsPrompt) and isinstance(
+                     result, MetricRunResult
+                 ):
+                     general_async_results.append(result)
+                 elif isinstance(prompt_obj, FunctionSelectionPrompt) and isinstance(
+                     result, MetricRunResult
+                 ):
+                     function_async_results.append(result)
+                 elif isinstance(prompt_obj, ParameterMetricsPrompt) and isinstance(
+                     result, MetricRunResult
+                 ):
+                     pname = ctx_dict["parameter_name"]
+                     parameter_async_results.setdefault(pname, []).append(result)
+
+             # Aggregate general results
+             if general_async_results:
+                 general_results = SemanticCategoryResult.from_results(
+                     general_async_results
+                 )
+             else:
+                 general_results = None
+
+             # Aggregate function-selection results
+             if function_async_results:
+                 function_results = SemanticCategoryResult.from_results(
+                     function_async_results
+                 )
+             else:
+                 function_results = None
+
+             # Aggregate parameter-level results
+             if parameter_async_results:
+                 for pname, results in parameter_async_results.items():
+                     if results:
+                         parameter_results[pname] = SemanticCategoryResult.from_results(
+                             results
+                         )
+                     else:
+                         parameter_results[pname] = None
+             else:
+                 parameter_results = None
+
+         except Exception as e:
+             # In case any metric-run fails, record the error
+             general_results = {"error": str(e)}
+             function_results = {"error": str(e)}
+             parameter_results = {"error": str(e)}
+
+         # Construct the base SemanticResult
+         result = SemanticResult(
+             general=general_results,
+             function_selection=function_results,
+             parameter=parameter_results,
+         )
+
+         # -------------------------------------------------------------------
+         # 5) Optional TRANSFORMS: Unit extraction & code generation
+         # -------------------------------------------------------------------
+         params = adapter.get_parameters()
+         if self.transform_enabled and params:
+             # Restore transform_enabled if overridden
+             if transform_enabled is not None:
+                 self.transform_enabled = old_transform_enabled
+
+             # 5.1) Extract units for ALL parameters in one LLM call
+             units_map = await self.extract_all_units(
+                 context=context,
+                 adapter=adapter,
+                 params=list(params.keys()),
+                 retries=retries,
+             )
+
+             # 5.2) Generate conversion code for parameters that need it
+             code_tasks: Dict[str, asyncio.Task] = {}
+             for pname, units in units_map.items():
+                 user_units = units.get("user_units_or_format") or ""
+                 spec_units = units.get("spec_units_or_format") or ""
+                 user_value = units.get("user_value")
+                 transformation_summary = units.get("transformation_summary", "")
+                 if (
+                     user_units
+                     and user_value
+                     and spec_units
+                     and (user_units != spec_units)
+                 ):
+                     # Generate code only if units differ and value is present
+                     prompt = GENERATE_CODE_USER.format(
+                         old_value=user_value,
+                         old_units=user_units,
+                         transformed_value=str(params[pname]),
+                         transformed_units=spec_units,
+                         transformed_type=type(params[pname]).__name__,
+                         transformation_summary=transformation_summary,
+                     )
+                     code_tasks[pname] = asyncio.create_task(
+                         self.codegen_client.generate_async(
+                             prompt=[
+                                 {"role": "system", "content": GENERATE_CODE_SYSTEM},
+                                 {"role": "user", "content": prompt},
+                             ],
+                             schema=GENERATE_CODE_SCHEMA,
+                             retries=retries,
+                         )
+                     )
+
+             # 5.3) Await up to max_parallel code-generation tasks
+             semaphore = asyncio.Semaphore(max_parallel)
+
+             async def run_with_semaphore(task: asyncio.Task):
+                 async with semaphore:
+                     return await task
+
+             wrapped_code_tasks = [
+                 asyncio.create_task(run_with_semaphore(t)) for t in code_tasks.values()
+             ]
+             try:
+                 code_responses = await asyncio.gather(*wrapped_code_tasks)
+             except Exception:
+                 # If code generation fails, set all to None
+                 code_responses = [None] * len(wrapped_code_tasks)
+
+             # 5.4) Map code responses back to parameter names
+             code_map: Dict[str, Dict[str, Any]] = {}
+             for pname, response in zip(code_tasks.keys(), code_responses):
+                 if response is not None:
+                     code_map[pname] = response
+
+             # 5.5) Execute generated code and validate conversions
+             transform_map: Dict[str, TransformResult] = {}
+             for pname, code_resp in code_map.items():
+                 gen_code = code_resp.get("generated_code", "")
+                 units_info = units_map[pname]
+                 if not gen_code:
+                     transform_map[pname] = TransformResult(
+                         units=units_info,
+                         generated_code="",
+                         execution_success=False,
+                         correct=True,
+                         execution_output=None,
+                         correction=None,
+                         error="No code generated",
+                     )
+                     continue
+
+                 tr = self._execute_code_and_validate(
+                     code=gen_code,
+                     user_val=str(units_info.get("user_value") or ""),
+                     api_val=str(params[pname]),
+                     units=units_info,
+                 )
+                 transform_map[pname] = tr
+
+             if transform_map:
+                 result.transform = transform_map
+             else:
+                 result.transform = None
+
+         return result
+
+     def _execute_code_and_validate(
+         self,
+         code: str,
+         user_val: str,
+         api_val: str,
+         units: Dict[str, Any],
+     ) -> TransformResult:
+         """
+         Strip code fences, install imports, exec code, compare, return TransformResult.
+         """
+         clean = re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()
+
+         # install imports
+         for mod in set(
+             re.findall(r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE)
+         ):
+             try:
+                 __import__(mod)
+             except ImportError as e:
+                 return TransformResult(
+                     units=units,
+                     generated_code=clean,
+                     execution_success=False,
+                     correct=True,
+                     execution_output=None,
+                     correction=None,
+                     error=f"Error: {e}. Could not import module '{mod}'. Please install the package and try again,"
+                     " or run the generated code manually:\n"
+                     f"transformation_code({user_val}) == convert_example_str_transformed_to_transformed_type({api_val})",
+                 )
+
+         ns: Dict[str, Any] = {}
+         try:
+             exec(clean, ns)
+             fn_t = ns.get("transformation_code")
+             fn_c = ns.get("convert_example_str_transformed_to_transformed_type")
+             if not callable(fn_t) or not callable(fn_c):
+                 raise ValueError("Generated code missing required functions")
+
+             out_t = fn_t(user_val)
+             out_c = fn_c(api_val)
+             if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
+                 success = math.isclose(out_t, out_c, abs_tol=1e-3)
+             else:
+                 success = str(out_t) == str(out_c)
+
+             correction = None
+             if not success:
+                 correction = (
+                     f"The transformation code validation found an issue with the units transformation "
+                     f"of the parameter.\n"
+                     f"The user request value is '{user_val}' with units '{units.get('user_units_or_format')}' and "
+                     f"the API call value is '{api_val}' with units '{units.get('spec_units_or_format')}'.\n"
+                     f"Expected transformation is '{out_t}' based on the code.\n"
+                 )
+
+             correct = correction is None
+
+             return TransformResult(
+                 units=units,
+                 generated_code=clean,
+                 execution_success=True,
+                 correct=correct,
+                 execution_output={"transformed": out_t, "converted": out_c},
+                 correction=correction,
+                 error=None,
+             )
+         except Exception as e:
+             return TransformResult(
+                 units=units,
+                 generated_code=clean,
+                 execution_success=False,
+                 correct=True,
+                 execution_output=None,
+                 correction=None,
+                 error=str(e),
+             )
+
+     async def run_trajectory_async(
+         self,
+         trajectory: Union[str, List[Dict[str, str]]],
+         tool_inventory: List[Dict[str, Any]],
+         retries: int = 1,
+         max_parallel: int = 10,
+     ) -> Optional[SemanticCategoryResult]:
+         """
+         Asynchronous trajectory evaluation.
+         """
+         trajectory_results: Optional[SemanticCategoryResult]
+         entries: List[Tuple[TrajectoryReflectionPrompt, Dict[str, Any]]] = []
+         for prompt in self.trajectory_prompts:
+             entries.append(
+                 (
+                     prompt,
+                     {
+                         "trajectory": trajectory,
+                         "tool_inventory": tool_inventory,
+                     },
+                 )
+             )
+
+         if not entries:
+             return None
+
+         try:
+             runner = MetricRunner(entries)
+             async_results = await runner.run_async(
+                 self.metrics_client.generate_async,
+                 prompt_param_name="prompt",
+                 schema_param_name="schema",
+                 retries=retries,
+                 max_parallel=max_parallel,
+             )
+             trajectory_results = SemanticCategoryResult.from_results(async_results)
+         except Exception as e:
+             trajectory_results = {"error": str(e)}
+
+         return trajectory_results
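
For orientation, a minimal usage sketch of the SemanticChecker class added above. It relies only on the constructor and run_sync signatures visible in this diff; how the LLMClient is built, the metric JSON file names, and the ToolSpec/ToolCall constructor arguments are placeholders and assumptions, not an API documented by this package.

    import json

    from llmevalkit.function_calling.pipeline.types import ToolCall, ToolSpec

    # Placeholder: build an llmevalkit LLMClient however your provider requires
    # (see ibm_watsonx_gov/providers/llmevalkit/llm/providers in the file list above).
    metrics_client = ...

    # Metric definitions are JSON-schema dicts, e.g. loaded from the bundled
    # general_metrics.json listed above (path assumed for illustration).
    with open("general_metrics.json") as f:
        general_metrics = json.load(f)

    checker = SemanticChecker(
        metrics_client,
        general_metrics=general_metrics,
        transform_enabled=False,
    )

    result = checker.run_sync(
        apis_specs=[ToolSpec(...)],   # specs of the tools available to the agent (fields assumed)
        tool_call=ToolCall(...),      # the tool call the agent actually proposed (fields assumed)
        context="user: how warm is it in Paris today, in Fahrenheit?",
    )
    print(result.general, result.function_selection, result.parameter)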