ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,186 @@
1
+ import copy
2
+ import re
3
+
4
# Lookup table: accepted spelling of a type name -> canonical JSON Schema
# type name. Built from (canonical, spellings) groups so that every alias of
# one canonical type sits on a single line.
_JSON_TYPE_ALIASES = {
    spelling: canonical
    for canonical, spellings in (
        ("string", ("str", "string")),
        ("integer", ("int", "integer")),
        ("number", ("float", "double", "number")),
        ("boolean", ("bool", "boolean")),
        ("object", ("dict", "object")),
        ("array", ("list", "array")),
        ("null", ("null",)),
    )
    for spelling in spellings
}

# The canonical type names themselves (the distinct values of the table).
_ALLOWED_JSON_TYPES = {*_JSON_TYPE_ALIASES.values()}

# Matches names made only of ASCII letters, digits and underscores,
# between 1 and 64 characters long.
_NAME_RE = re.compile(r"^[A-Za-z0-9_]{1,64}$")
24
+
25
def _sanitize_name(name: str) -> str:
    """Turn an arbitrary value into a name of letters, digits and underscores.

    The value is stringified, every character outside ``[A-Za-z0-9_]`` is
    replaced by an underscore, an empty result becomes ``"tool"``, a result
    starting with a digit is prefixed with ``"fn_"``, and the outcome is
    truncated to 64 characters.
    """
    cleaned = re.sub(r"[^A-Za-z0-9_]", "_", str(name)) or "tool"
    prefix = "fn_" if cleaned[0].isdigit() else ""
    # Truncate last so the prefix counts towards the 64-character budget.
    return (prefix + cleaned)[:64]
32
+
33
def _normalize_type(t):
    """Return the canonical JSON Schema spelling of a ``type`` value.

    * A string is looked up in ``_JSON_TYPE_ALIASES``; anything that does not
      resolve to a member of ``_ALLOWED_JSON_TYPES`` becomes ``"string"``.
    * A list is mapped entry by entry. Unknown and non-string entries are
      dropped, duplicates are removed, and the order of first occurrence is
      kept. An empty result becomes ``["string"]``.
    * Any other value becomes ``"string"``.

    The list branch de-duplicates with an ordered scan rather than a set so
    that the output order is the same on every run, and it skips non-string
    entries up front so that unhashable entries (dicts, nested lists) are
    ignored instead of raising ``TypeError`` during the alias lookup.
    """
    if isinstance(t, str):
        canonical = _JSON_TYPE_ALIASES.get(t, t)
        return canonical if canonical in _ALLOWED_JSON_TYPES else "string"

    if isinstance(t, list):
        normalized = []
        for entry in t:
            # Only strings can name a type; everything else is discarded.
            if not isinstance(entry, str):
                continue
            canonical = _JSON_TYPE_ALIASES.get(entry, entry)
            if canonical in _ALLOWED_JSON_TYPES and canonical not in normalized:
                normalized.append(canonical)
        return normalized or ["string"]

    return "string"
40
+
41
def _clean_description(desc):
    """Stringify a description and tidy the whitespace around its newlines.

    ``None`` is passed through unchanged. Any other value is converted with
    ``str()``, stripped at both ends, and then whitespace runs that end in a
    newline, or that follow a newline, are reduced to a single newline.
    Whitespace inside a line is left alone.
    """
    if desc is None:
        return None

    text = str(desc).strip()
    # First drop whitespace before each newline, then whitespace after it.
    for pattern in (r"\s+\n", r"\n\s+"):
        text = re.sub(pattern, "\n", text)
    return text
49
+
50
def _fix_schema(schema, notes, path="parameters"):
    """Return a repaired copy of a JSON Schema-like node, recursing into children.

    The input dict itself is not modified: a shallow copy is taken and the
    nested ``properties`` / ``items`` values are replaced by repaired copies.

    Repairs applied:

    * a non-dict node is replaced by ``{"type": "object"}``;
    * ``type`` is rewritten through ``_normalize_type``;
    * object nodes always end up with a dict ``properties`` whose values are
      repaired recursively (a non-dict property value ``v`` is treated as the
      shorthand ``{"type": v}``), and ``required`` is pruned to string keys
      that exist in ``properties`` or removed when it is not a list;
    * array nodes get their ``items`` repaired, or set to ``{}`` when missing
      or not a dict;
    * ``description`` is passed through ``_clean_description``.

    A node is treated as an object (or array) when its normalized type is that
    name or a list of type names containing it, e.g. ``["object", "null"]``,
    so nested schemas under union types are repaired as well.

    Args:
        schema: The schema node to repair.
        notes: List that receives one human-readable message per repair;
            it is appended to in place.
        path: Dotted location of ``schema``, used as a prefix in the messages.

    Returns:
        The repaired node as a new dict.
    """
    if not isinstance(schema, dict):
        notes.append(f"{path}: non-dict schema replaced with empty object")
        return {"type": "object"}

    out = dict(schema)

    if "type" in out:
        out["type"] = _normalize_type(out["type"])

    # View the type as a list so that single and union types share one test.
    node_type = out.get("type")
    type_names = node_type if isinstance(node_type, list) else [node_type]

    if "object" in type_names:
        props = out.get("properties", {})
        if not isinstance(props, dict):
            notes.append(f"{path}.properties: not a dict, replaced with empty dict")
            props = {}
        out["properties"] = {
            key: _fix_schema(
                value if isinstance(value, dict) else {"type": value},
                notes,
                f"{path}.properties.{key}",
            )
            for key, value in props.items()
        }

        # required: only keep keys that exist in properties and are strings
        if "required" in out:
            req = out["required"]
            if isinstance(req, list):
                req_clean = [
                    r for r in req
                    if isinstance(r, str) and r in out["properties"]
                ]
                if req_clean != req:
                    notes.append(f"{path}.required: pruned invalid entries")
                out["required"] = req_clean
            else:
                notes.append(f"{path}.required: not a list, removed")
                out.pop("required", None)
        # additionalProperties is left untouched when present.

    if "array" in type_names:
        items = out.get("items")
        if not isinstance(items, dict):
            notes.append(f"{path}.items: missing or invalid, set to permissive object")
            out["items"] = {}
        else:
            out["items"] = _fix_schema(items, notes, f"{path}.items")

    if "description" in out:
        cleaned = _clean_description(out.get("description"))
        if cleaned != out.get("description"):
            notes.append(f"{path}.description: normalized whitespace")
        out["description"] = cleaned

    return out
105
+
106
def to_valid_openai_tool(spec: dict):
    """Convert a possibly-invalid tool specification into an OpenAI tool spec.

    ``spec`` may be the full envelope ``{"type": "function", "function": {...}}``
    or just the inner function block. The input is deep-copied first, so the
    caller's dict is never modified.

    Args:
        spec: The tool specification to repair.

    Returns:
        A ``(converted_dict, notes)`` tuple. ``converted_dict`` has the shape
        ``{"type": "function", "function": {"name", "description",
        "parameters"}}`` and ``notes`` is a list of messages describing the
        repairs that were made.

    Raises:
        TypeError: If ``spec`` is not a dict.
    """
    notes = []
    if not isinstance(spec, dict):
        raise TypeError("spec must be a dict")

    spec = copy.deepcopy(spec)

    # Unwrap the envelope when present; otherwise assume the caller passed
    # just the function block.
    if spec.get("type") == "function" and isinstance(spec.get("function"), dict):
        fn = spec["function"]
    else:
        fn = spec

    name = fn.get("name")
    if not name:
        notes.append("function.name missing, set to 'tool'")
        name = "tool"
    new_name = _sanitize_name(name)
    if new_name != name:
        notes.append(f"function.name sanitized to '{new_name}'")
        name = new_name

    # The type check must run on the raw value: _clean_description always
    # returns a str for non-None input, so checking afterwards can never
    # report the coercion.
    raw_description = fn.get("description")
    if raw_description is None:
        description = ""
    else:
        if not isinstance(raw_description, str):
            notes.append("function.description not a string, coerced")
        description = _clean_description(raw_description)

    # Parameters
    raw_params = fn.get("parameters")
    if not isinstance(raw_params, dict):
        if raw_params is not None:
            notes.append("function.parameters not a dict, replaced with empty object schema")
        raw_params = {}
    # The top-level parameters schema is forced to be an object schema.
    if raw_params.get("type") != "object":
        raw_params["type"] = "object"
    raw_params.setdefault("properties", {})
    parameters = _fix_schema(raw_params, notes, "parameters")

    # Final envelope
    out = {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": parameters,
        },
    }

    return out, notes
162
+
163
+ # --------- Example usage ---------
164
if __name__ == "__main__":
    # Deliberately sloppy spec: an illegal character in the name, Python-style
    # type names, trailing newlines in descriptions, and a "required" entry
    # that has no matching property.
    sample_properties = {
        "sentence": {"type": "str", "description": "...\n"},
        "policy_file": {"type": "str", "description": "filepath."},
        "current_policy_file": {"type": "str", "description": "optional", "default": ""},
    }
    messy = {
        "type": "function",
        "function": {
            "name": "GenerateKyvernoTool!",
            "description": "The tool to generate a Kyverno policy.\n",
            "parameters": {
                "type": "object",
                "properties": sample_properties,
                "required": ["sentence", "policy_file", "nonexistent_param"],
            },
        },
    }

    # Show the repaired spec followed by one line per repair note.
    converted, remarks = to_valid_openai_tool(messy)
    print(converted)
    print("Notes:")
    for remark in remarks:
        print("-", remark)
@@ -0,0 +1,411 @@
1
+ # Function-Calling Reflection Pipeline
2
+
3
+ This directory implements a full **pre-call reflection** workflow for conversational agents making API (function) calls. It leverages:
4
+
5
+ - **Static schema checks** - Ensure that calls conform exactly to the API schema and naming rules.
6
+ - **Semantic LLM-driven metrics** - Evaluate the deeper meaning, context alignment, and correctness of calls beyond syntax.
7
+ - **Optional unit-conversion transforms** via code generation
8
+
9
+ All LLM and metric logic lives inside this package—no external frameworks are required.
10
+
11
+ ---
12
+
13
+ ## Table of Contents
14
+
15
+ 1. [Syntactic Checks](#syntactic-checks)
16
+ 2. [Semantic Metrics](#semantic-metrics)
17
+ 3. [Quickstart](#quickstart)
18
+ 4. [Directory Structure](#directory-structure)
19
+ 5. [ReflectionPipeline API](#reflectionpipeline-api)
20
+ - `static_only`
21
+ - `semantic_sync` / `semantic_async`
22
+ - `run_sync` / `run_async`
23
+ 6. [Example Usage](#example-usage)
24
+ 7. [Custom Metrics](#custom-metrics)
25
+ 8. [Transform-Enabled Mode](#transform-enabled-mode)
26
+ 9. [Error Handling & Logging](#error-handling--logging)
27
+
28
+ ---
29
+
30
+
31
+ ## Syntactic Checks
32
+
33
+ These catch straightforward, schema-level errors against your API specification:
34
+
35
+ * **NonExistentFunction**
36
+
37
+ *Description:* The function name does not appear in the API spec.
38
+
39
+ *Mistake Example:* Calling `get_customer_profile` when only `get_user_profile` is defined.
40
+
41
+ * **NonExistentParameter**
42
+
43
+ *Description:* One or more parameters are not defined for the chosen function.
44
+
45
+ *Mistake Example:* Using `user` in `get_user_profile(user=42)` when the function expects `user_id`.
46
+
47
+ * **IncorrectParameterType**
48
+
49
+ *Description:* Provided parameter values do not match the expected types.
50
+
51
+ *Mistake Example:* Passing `"true"` (string) to a boolean parameter `is_active`, instead of `true`.
52
+
53
+ * **MissingRequiredParameter**
54
+
55
+ *Description:* A required parameter is omitted.
56
+
57
+ *Mistake Example:* Calling `list_events(start_date="2025-05-01")` without the required `end_date`.
58
+
59
+ * **AllowedValuesViolation**
60
+
61
+ *Description:* A parameter value falls outside its allowed enumeration.
62
+
63
+ *Mistake Example:* Passing `"urgent"` to `priority` when only `"low"`, `"medium"`, or `"high"` are allowed.
64
+
65
+ * **JsonSchemaValidation**
66
+
67
+ *Description:* The API call does not conform to the provided JSON Schema
68
+
69
+ Note that we flag type errors under **IncorrectParameterType**; all other validations (those that are neither type nor enum checks) are reported under **JsonSchemaValidation**.
70
+
71
+ *Examples of Checked Constraints:*
72
+ * Numeric constraints: minimum, maximum, exclusiveMinimum, exclusiveMaximum, multipleOf
73
+ * String constraints: minLength, maxLength, pattern, format (e.g., email, date, URI)
74
+ * Array constraints: items, minItems, maxItems, uniqueItems, contains
75
+
76
+ * **EmptyApiSpec**
77
+
78
+ *Description:* There are no API specifications provided or they are invalid
79
+
80
+ * **InvalidApiSpec**
81
+
82
+ *Description:* The API specifications provided are not valid Tool or ToolSpec instances
83
+
84
+ * **InvalidToolCall**
85
+
86
+ *Description:* The provided ToolCall is not a valid instance of ToolCall
87
+
88
+ ---
89
+
90
+ ## Semantic Metrics
91
+
92
+ Each semantic metric outputs a JSON object with fields customized in the JSONL definition files:
93
+
94
+ * **explanation**: Detailed reasoning behind the judgment.
95
+ * **evidence**: Exact conversation or spec excerpts supporting the assessment.
96
+ * **output**: Numeric rating on a 1-5 scale (5=best, 1=worst).
97
+ * **confidence**: Judge's confidence in the assessment (0.0-1.0).
98
+ * **correction**: Structured object containing issue types, explanations, and suggested fixes.
99
+ * **actionable_recommendation**: Specific developer guidance when issues are detected.
100
+
101
+ You can add, remove, or modify metrics by editing the JSONL definitions.
102
+
103
+ ### 2.1 Function Selection Metric
104
+
105
+ Assesses whether this function call correctly implements the user's immediate request as the appropriate next step in the conversation. Compares against all available functions in the tool inventory to determine if the selection aligns with user intent and context.
106
+
107
+ **Rating Scale:**
108
+ - 5: Perfect match for user request
109
+ - 4: Good match with minor misalignment
110
+ - 3: Adequate match (threshold for acceptability)
111
+ - 2: Poor match for user request
112
+ - 1: Completely irrelevant function
113
+
114
+ *Mistake Example:* User: "What time is it in Tokyo?" Call: `translate_text(text="Hello", target_language="en")` instead of `get_time(timezone="Tokyo")`.
115
+
116
+ ### 2.2 Agentic Metric
117
+
118
+ Evaluates whether a tool call satisfies prerequisite constraints and relationships defined in conversation history and tool inventory. Checks for explicit prerequisites, tool sequencing requirements, redundancy, parameter completeness, and parameter value relationships.
119
+
120
+ **Rating Scale:**
121
+ - 5: All agentic constraints satisfied
122
+ - 4: Minor insignificant issues that don't block execution
123
+ - 3: Significant issues requiring additional information
124
+ - 2: Major issues preventing proper execution
125
+ - 1: Completely inappropriate given context
126
+
127
+ *Mistake Example:* User: "Translate 'Hola' to English." Call: `translate_text(text="Hola", target="en")` when the tool description explicitly requires a prior call to `detect_language(text="Hola")`.
128
+
129
+ ### 2.3 Grounding Metrics
130
+
131
+ #### 2.3.1 General Parameter Value Grounding
132
+
133
+ Assesses whether ALL parameter values in a function call are directly supported by conversation history or API specifications. Identifies hallucinated values, missing information, format errors, and contradictory values.
134
+
135
+ **Rating Scale:**
136
+ - 5: All parameter values correctly grounded and formatted
137
+ - 4: Some values may need more information but not hallucinated
138
+ - 3: Some values hallucinated or have format errors
139
+ - 2: Multiple values incorrect or contradictory
140
+ - 1: All values incorrect or missing
141
+
142
+ *Mistake Example:* User: "Fetch my profile." Call: `get_user_profile(user_id=42)` when no user ID was mentioned in conversation or available from context.
143
+
144
+ #### 2.3.2 Individual Parameter Hallucination Check
145
+
146
+ Evaluates whether a SPECIFIC parameter value is grounded in evidence or hallucinated. Checks sources, format compliance, value relationships, and default handling.
147
+
148
+ **Rating Scale:**
149
+ - 5: Perfectly grounded in conversation or documented defaults
150
+ - 4: Mostly grounded with minimal inference
151
+ - 3: Ambiguously grounded requiring substantial inference
152
+ - 2: Mostly ungrounded with tenuous connection
153
+ - 1: Completely hallucinated with no basis
154
+
155
+ *Mistake Example:* User: "Fetch my latest tweets." Call: `get_tweets(username="elonmusk", count=20)` when count was not specified by user and has no documented default.
156
+
157
+ #### 2.3.3 Value Format Alignment
158
+
159
+ Checks if a specific parameter value exactly conforms to required type, format, and unit conventions in the API specification.
160
+
161
+ **Rating Scale:**
162
+ - 5: Perfect alignment with specified type, format, units
163
+ - 4: Minor deviation unlikely to affect function
164
+ - 3: Moderate deviation that might affect function
165
+ - 2: Major deviation likely to cause function failure
166
+ - 1: Complete mismatch certain to cause failure
167
+
168
+ *Mistake Example:* User: "Start a countdown for 5 minutes." Call: `set_timer(duration="300000")` instead of `set_timer(duration="5 minutes")`.
169
+
170
+ ## Use Cases
171
+
172
+ For different use cases, we suggest running different sets of metrics, as follows:
173
+
174
+ ### Fast Track Single-Turn
175
+
176
+ Execute (1) function selection (2) global parameter value grounding
177
+
178
+ ### Slow Track Single-Turn
179
+
180
+ Execute (1) function selection (2) per-parameter hallucination check (3) per-parameter value format check
181
+
182
+ ### Fast Track Agentic
183
+
184
+ Execute (1) function selection (2) global agentic metric (3) global parameter value grounding
185
+
186
+ ### Slow Track Agentic
187
+
188
+ Execute (1) function selection (2) global agentic metric (3) per-parameter hallucination check (4) per-parameter value format check
189
+
190
+ ---
191
+
192
+ **Customization:** Modify metrics, thresholds, and fields by editing your JSONL configuration files.
193
+
194
+ ---
195
+
196
+ ## Quickstart
197
+
198
+ ```bash
199
+ pip install llmevalkit[litellm] # or your preferred extras
200
+ ```
201
+
202
+ ```python
203
+ from llmevalkit.llm.registry import get_llm
204
+ from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
205
+
206
+ # 1) Pick your LLM provider and initialize clients
207
+ MetricsClient = get_llm("litellm.watsonx.output_val")
208
+ CodegenClient = get_llm("litellm.watsonx.output_val")
209
+ metrics_client = MetricsClient(model_name="meta-llama/llama-3-3-70b-instruct")
210
+ codegen_client = CodegenClient(model_name="meta-llama/llama-3-3-70b-instruct")
211
+
212
+ # 2) Create pipeline (loads bundled metrics JSONL by default)
213
+ pipeline = ReflectionPipeline(
214
+ metrics_client=metrics_client,
215
+ codegen_client=codegen_client,
216
+ transform_enabled=False
217
+ )
218
+
219
+ # 3) Define your API specs (OpenAI-style function definitions)
220
+ apis_specs = [
221
+ { "type":"function", "function": { ... } },
222
+ ...
223
+ ]
224
+
225
+ # 4) Provide a tool_call and context
226
+ call = {
227
+ "id":"1","type":"function",
228
+ "function":{"name":"get_weather","arguments":{"location":"Berlin"}}
229
+ }
230
+ context = "User: What's the weather in Berlin?"
231
+
232
+ # 5) Run end-to-end reflection
233
+ result = pipeline.run_sync(
234
+ conversation=context,
235
+ inventory=apis_specs,
236
+ call=call,
237
+ continue_on_static=False,
238
+ retries=2
239
+ )
240
+ print(result.model_dump_json(indent=2))
241
+ ```
242
+
243
+ ---
244
+
245
+ ## Directory Structure
246
+
247
+ ```
248
+ src/llmevalkit/function_calling/
249
+ ├── __init__.py
250
+ ├── metrics/ <- MetricPrompt templates & JSONL definitions
251
+ │ ├── base.py
252
+ │ ├── loader.py
253
+ │ ├── function_call/
254
+ │ │ ├── general.py
255
+ │ │ └── general_metrics.jsonl
256
+ │ ├── function_selection/
257
+ │ │ ├── function_selection.py
258
+ │ │ └── function_selection_metrics.jsonl
259
+ │ └── parameter/
260
+ │ ├── parameter.py
261
+ │ └── parameter_metrics.jsonl
262
+ ├── pipeline/
263
+ │ ├── adapters.py <- API-spec / call normalization
264
+ │ ├── pipeline.py <- High-level ReflectionPipeline
265
+ │ ├── semantic_checker.py <- Core LLM metrics orchestration
266
+ │ ├── static_checker.py <- JSONSchema-based validation
267
+ │ ├── transformation_prompts.py <- Unit-conversion prompts
268
+ │ └── types.py <- Pydantic models for inputs & outputs
269
+ └── examples/
270
+ └── function_calling/
271
+ └── pipeline.py <- Complete runnable example
272
+ ```
273
+
274
+ ---
275
+
276
+ ## ReflectionPipeline API
277
+
278
+ ### Initialization
279
+
280
+ ```python
281
+ ReflectionPipeline(
282
+ metrics_client: LLMClient,
283
+ codegen_client: LLMClient,
284
+ transform_enabled: bool = False,
285
+ general_metrics: Optional[Path] = None,
286
+ function_metrics: Optional[Path] = None,
287
+ parameter_metrics: Optional[Path] = None,
288
+ transform_examples: Optional[Dict[str,str]] = None,
289
+ )
290
+ ```
291
+
292
+ - **`metrics_client`**: llmevalkit LLM client for semantic metrics (e.g. output-validating OpenAI or LiteLLM).
293
+ - **`codegen_client`**: llmevalkit LLM client for code generation (required if `transform_enabled=True`).
294
+ - **`*_metrics`**: override paths to your own JSONL metric definitions (otherwise uses the bundled `metrics/.../*.jsonl` files).
295
+ - **`transform_enabled`**: whether to run unit-conversion checks.
296
+
297
+ ### `static_only(conversation, inventory, call) → StaticResult`
298
+
299
+ - Runs pure JSON-schema validation on `call` against `inventory` specs.
300
+ - Checks required parameters, types, enums, etc.
301
+
302
+ ### `semantic_sync(conversation, inventory, call, retries=1) → SemanticResult`
303
+
304
+ - Runs LLM-driven metric evaluations **synchronously**.
305
+ - Returns per-category semantic results.
306
+
307
+ ### `semantic_async(conversation, inventory, call, retries=1, max_parallel=10) → SemanticResult`
308
+
309
+ - Same as above, but issues LLM calls in parallel.
310
+
311
+ ### `run_sync(conversation, inventory, call, continue_on_static=False, retries=1) → PipelineResult`
312
+
313
+ - Full pipeline:
314
+ 1. Static checks
315
+ 2. Semantic metrics (if static passes or `continue_on_static=True`)
316
+ 3. Aggregates final `PipelineResult` with `static`, `semantic`, and `overall_valid`.
317
+
318
+ ### `run_async(...)`
319
+
320
+ - Asynchronous equivalent of `run_sync`.
321
+
322
+ ---
323
+
324
+ ## Example Usage
325
+
326
+ See `examples/function_calling/pipeline/example.py` for a complete, runnable demo:
327
+
328
+ ```bash
329
+ python examples/function_calling/pipeline/example.py
330
+ ```
331
+
332
+ It will:
333
+
334
+ 1. Define three sample functions (`get_weather`, `create_event`, `translate_text`).
335
+ 2. Initialize Watsonx clients.
336
+ 3. Run sync reflection for valid and invalid calls.
337
+ 4. Print nicely formatted JSON results.
338
+
339
+ ---
340
+
341
+ ## Custom Metrics
342
+
343
+ By default we ship three JSONL files under `metrics/...`:
344
+
345
+ - **General**: overall call quality
346
+ - **Function-Selection**: was the right function chosen?
347
+ - **Parameter**: correctness of each parameter value
348
+
349
+ Each line in a `.jsonl` file is a JSON object:
350
+
351
+ ```jsonc
352
+ // general_metrics.jsonl
353
+ {"name":"Clarity", "description":"Rate clarity of the intent","schema":{...},
354
+ "thresholds":{"output":[0,1],"confidence":[0,1]},
355
+ "examples":[
356
+ {"user_kwargs":{...}, "output":{...}}
357
+ ]}
358
+ ```
359
+
360
+ To add your own:
361
+
362
+ 1. Create a new `.jsonl` file in any folder.
363
+ 2. Pass its path into the pipeline constructor:
364
+
365
+ ```python
366
+ pipeline = ReflectionPipeline(
367
+ metrics_client=...,
368
+ codegen_client=...,
369
+ general_metrics="path/to/my_general.json",
370
+ function_metrics="path/to/my_func.json",
371
+ parameter_metrics="path/to/my_param.json",
372
+ )
373
+ ```
374
+
375
+ 3. Follow the same JSONL format:
376
+ - `schema`: valid JSON-Schema object
377
+ - `thresholds`: dict of numeric field thresholds
378
+ - `examples`: few-shot examples validating against that schema
379
+
380
+ ---
381
+
382
+ ## Transform-Enabled Mode
383
+
384
+ If you want automated unit conversions:
385
+
386
+ ```python
387
+ pipeline = ReflectionPipeline(
388
+ metrics_client=metrics_client,
389
+ codegen_client=codegen_client,
390
+ transform_enabled=True,
391
+ transform_examples=my_transform_examples_dict,
392
+ )
393
+ ```
394
+
395
+ - Uses two additional LLM prompts (in `transformation_prompts.py`):
396
+ 1. **Extract units** from context
397
+ 2. **Generate transformation code**
398
+
399
+ - Finally executes the generated code in-process and reports a `TransformResult` per parameter.
400
+
401
+ ---
402
+
403
+ ## Error Handling & Logging
404
+
405
+ - Each stage wraps exceptions with clear, contextual messages.
406
+ - The LLM clients emit optional hooks (`hooks=[...]`) for tracing or metrics.
407
+ - In semantic phases, malformed or missing fields result in per-metric errors rather than crashing the entire pipeline.
408
+
409
+ ---
410
+
411
+ Enjoy robust, end-to-end reflection on your function calls—static and semantic—powered entirely by `llmevalkit`!
@@ -0,0 +1,27 @@
1
+ from .metrics import (
2
+ GeneralMetricsPrompt,
3
+ FunctionSelectionPrompt,
4
+ ParameterMetricsPrompt,
5
+ TrajectoryReflectionPrompt,
6
+ get_general_metrics_prompt,
7
+ get_parameter_metrics_prompt,
8
+ get_trajectory_reflection_prompt,
9
+ load_prompts_from_jsonl,
10
+ load_prompts_from_list,
11
+ load_prompts_from_metrics,
12
+ PromptKind,
13
+ )
14
+
15
+ __all__ = [
16
+ "GeneralMetricsPrompt",
17
+ "FunctionSelectionPrompt",
18
+ "ParameterMetricsPrompt",
19
+ "TrajectoryReflectionPrompt",
20
+ "get_general_metrics_prompt",
21
+ "get_parameter_metrics_prompt",
22
+ "get_trajectory_reflection_prompt",
23
+ "load_prompts_from_jsonl",
24
+ "load_prompts_from_list",
25
+ "load_prompts_from_metrics",
26
+ "PromptKind",
27
+ ]