ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,516 @@
1
+ import ast
2
+ import asyncio
3
+ import json
4
+ import uuid
5
+ from typing import List, Dict, Any
6
+
7
+ from llmevalkit.function_calling.consts import (
8
+ METRIC_AGENTIC_CONSTRAINTS_SATISFACTION,
9
+ METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
10
+ METRIC_GENERAL_HALLUCINATION_CHECK,
11
+ METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT,
12
+ METRIC_PARAMETER_HALLUCINATION_CHECK,
13
+ METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT,
14
+ )
15
+ from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
16
+ from llmevalkit.function_calling.pipeline.types import ToolCall, ToolSpec
17
+ from llmevalkit.llm.base import get_llm
18
+
19
+
20
def convert_tool_calls_to_openai_format(
    tool_calls: List[Dict[str, Any]],
) -> List[ToolCall]:
    """
    Convert a list of tool calls into OpenAI-compatible ``ToolCall`` objects.

    Args:
        tool_calls (List[Dict[str, Any]]): Input tool calls in format:
            [{"name": "...", "arguments": {...}}]

    Returns:
        List[ToolCall]: Tool calls in OpenAI format, each carrying a freshly
        generated unique id and JSON-encoded arguments.
    """
    return [
        ToolCall(
            **{
                # Unique id so each call can be referenced individually downstream.
                "id": f"call_{uuid.uuid4().hex[:8]}",
                "type": "function",
                "function": {
                    "name": call["name"],
                    # OpenAI format carries arguments as a JSON string, not a dict;
                    # a missing "arguments" key becomes an empty object.
                    "arguments": json.dumps(call.get("arguments", {})),
                },
            }
        )
        for call in tool_calls
    ]
48
+
49
+
50
class ConversationFormatError(ValueError):
    """Signal that a conversation transcript is not laid out the way this extractor expects."""
52
+
53
+
54
def extract_tool_specs_from_conversation(
    conversation: List[Dict[str, Any]], remove_turn: bool = False
) -> List[Dict[str, Any]]:
    """
    Extract tool specifications from the system turn in the conversation history.

    The function looks for a system message whose content contains a list of tool
    specs (as a stringified list of dicts), parses it, and returns it as a list
    of dictionaries.

    Args:
        conversation: List of conversation turns.
        remove_turn: If True, remove the tool specification turn from the
            conversation (mutates ``conversation`` in place).

    Returns:
        List of tool specifications as dictionaries.

    Raises:
        ConversationFormatError: If no valid tool specifications are found in
            the conversation.
    """
    for i, turn in enumerate(conversation):
        if turn.get("role") != "system":
            continue
        content = turn.get("content", "")
        # Heuristic: the specs appear as the first "[" ... last "]" span of the
        # system message and mention the word "function" somewhere inside; this
        # skips markdown/code-block noise that lacks those markers.
        start = content.find("[")
        end = content.rfind("]")
        if start == -1 or end == -1 or "function" not in content[start:end]:
            continue
        possible_list = content[start : end + 1]
        try:
            # ast.literal_eval is safe (no code execution) and tolerates the
            # single-quoted, Python-literal style this content usually has.
            tool_specs = ast.literal_eval(possible_list)
        except (ValueError, SyntaxError, TypeError):
            # Not a parseable literal; keep scanning later turns.
            continue
        if isinstance(tool_specs, list):
            # If requested, drop the turn containing the tool specs.
            if remove_turn:
                conversation.pop(i)
            return tool_specs

    # No system turn yielded a parseable list of specs.
    raise ConversationFormatError("No valid tool specifications found in conversation.")
97
+
98
+
99
def _parse_tool_call_items(
    text: str, require_arguments: bool
) -> List[Dict[str, Any]]:
    """Parse *text* as a list of tool-call dicts.

    Tries ``json.loads`` first (the content is advertised as JSON), then falls
    back to ``ast.literal_eval`` for Python-literal style (single quotes).
    Returns only the dict items that carry a "name" key (and an "arguments"
    key too when ``require_arguments`` is True); returns [] when unparseable.
    """
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError):
            # json.JSONDecodeError is a ValueError subclass; try the next parser.
            continue
        if not isinstance(parsed, list):
            return []
        return [
            item
            for item in parsed
            if isinstance(item, dict)
            and "name" in item
            and (not require_arguments or "arguments" in item)
        ]
    return []


def extract_tool_calls_to_reflect(
    conversation: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Extract tool calls to reflect by identifying patterns in the conversation.

    The function looks for two patterns:
    1. An assistant message with content 'Act: (Please return only a JSON string)'
       followed by a message containing the JSON tool calls
    2. An assistant message containing "Act:" followed by a tool call in JSON
       format within the same message

    Args:
        conversation: List of conversation turns

    Returns:
        A flat list of tool call dicts:
        [{"name": "GenerateKyvernoTool", "arguments": {...}}, ...]

    Raises:
        ConversationFormatError: On validation or parsing issues
    """
    if not conversation:
        raise ConversationFormatError("Conversation is empty.")

    tool_calls: List[Dict[str, Any]] = []

    # Pattern 1: the tool calls live in the message that FOLLOWS the explicit
    # "Act: (Please return only a JSON string)" marker.
    marker = "Act: (Please return only a JSON string)"
    for current_turn, next_turn in zip(conversation, conversation[1:]):
        if (
            current_turn.get("role") == "assistant"
            and isinstance(current_turn.get("content"), str)
            and marker in current_turn["content"]
        ):
            follow_up = next_turn.get("content", "")
            if isinstance(follow_up, str):
                # Pattern 1 only requires a "name" key; "arguments" may be absent.
                tool_calls.extend(
                    _parse_tool_call_items(follow_up, require_arguments=False)
                )

    # Pattern 2: "Act:" and the JSON array appear inside the SAME assistant
    # message; only consulted when pattern 1 produced nothing.
    if not tool_calls:
        for turn in conversation:
            if turn.get("role") != "assistant" or not isinstance(
                turn.get("content"), str
            ):
                continue
            content = turn.get("content", "")
            act_start = content.find("Act:")
            if act_start == -1:
                continue
            # Take the outermost [ ... ] span after the marker.
            json_start = content.find("[", act_start)
            json_end = content.rfind("]", act_start)
            if json_start != -1 and json_end != -1 and json_start < json_end:
                json_str = content[json_start : json_end + 1]
                tool_calls.extend(
                    _parse_tool_call_items(json_str, require_arguments=True)
                )

    if not tool_calls:
        raise ConversationFormatError(
            "No valid tool calls found to reflect in the conversation."
        )

    return tool_calls
203
+
204
+
205
async def reflect_ciso_agent_conversation(
    conversation: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Run the reflection pipeline over every tool call found in a CISO agent
    conversation and return the per-call reflection outputs.
    """
    # Pull the tool specifications out of the conversation; the turn that
    # carried them is removed so it does not pollute the reflected context.
    specs = extract_tool_specs_from_conversation(conversation, remove_turn=True)
    inventory = [ToolSpec(**spec) for spec in specs if isinstance(spec, dict)]

    # Gather the raw tool calls and normalise them to the OpenAI format.
    calls = convert_tool_calls_to_openai_format(
        extract_tool_calls_to_reflect(conversation)
    )

    # Build the reflection pipeline backed by the metrics LLM client.
    client_cls = get_llm("litellm.rits.output_val")
    pipeline = ReflectionPipeline(
        metrics_client=client_cls(
            model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
        ),
        general_metrics=[
            METRIC_GENERAL_HALLUCINATION_CHECK,
            METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT,
        ],
        function_metrics=[
            METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
            METRIC_AGENTIC_CONSTRAINTS_SATISFACTION,
        ],
        # Parameter-level metrics are intentionally disabled for now.
        parameter_metrics=[],
        # Runtime pipeline enables the evaluation mode that produces
        # actionable recommendations for agent development.
        runtime_pipeline=True,
    )

    # Reflect each tool call against the conversation and tool inventory,
    # awaiting the pipeline sequentially (one call at a time).
    return [
        await pipeline.run_async(
            conversation=conversation,
            inventory=inventory,
            call=call,
            continue_on_static=True,
        )
        for call in calls
    ]
258
+
259
+
260
def create_reflection_result_summary(
    reflection_output: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Condense one reflection output into a summary dict with
    ``syntactic_errors``, ``semantic_errors``, and ``corrections``.
    """
    call_info = reflection_output["inputs"]["tool_call"]["function"]
    summary: Dict[str, Any] = {
        "syntactic_errors": {},
        "semantic_errors": {},
        "corrections": {},
        "overall_valid": reflection_output.get("overall_valid"),
        "original_tool_call": {
            "name": call_info["name"],
            "arguments": call_info["parsed_arguments"],
        },
    }

    # 1. Syntactic errors come from the static metrics.
    for name, data in reflection_output.get("static", {}).get("metrics", {}).items():
        if not data.get("valid", True):
            summary["syntactic_errors"][name] = data.get(
                "explanation", "No explanation provided"
            )

    # 2. Semantic errors - function_selection first, then general metrics.
    semantic = reflection_output.get("semantic", {})

    fn_selection = semantic.get("function_selection", {})
    if fn_selection:  # Only process if function_selection exists
        for name, data in fn_selection.get("metrics", {}).items():
            if not data.get("is_issue", False):
                continue
            response = data.get("raw_response", {})
            summary["semantic_errors"][name] = response.get(
                "explanation", "No explanation provided"
            )

            # Corrections depend on which specific metric flagged the issue.
            correction = response.get("correction", {})
            if name == "function_selection_appropriateness" and (
                "corrected_function" in correction
                or "corrected_function_name" in correction
            ):
                # For function selection errors only the tool name is needed.
                candidate = correction.get("corrected_function") or correction.get(
                    "corrected_function_name"
                )
                if candidate == "no_function":
                    pass  # explicit "no valid function" marker: nothing to add
                elif isinstance(candidate, dict) and "name" in candidate:
                    summary["corrections"]["corrected_tool_name"] = candidate["name"]
                elif isinstance(candidate, str):
                    try:
                        parsed = json.loads(candidate)
                        if isinstance(parsed, dict) and "name" in parsed:
                            summary["corrections"]["corrected_tool_name"] = parsed[
                                "name"
                            ]
                    except json.JSONDecodeError:
                        # Not JSON - treat the raw string as the tool name.
                        summary["corrections"]["corrected_tool_name"] = candidate
            elif (
                name == "agentic_constraints_satisfaction"
                and "prerequisite_tool_calls" in correction
            ):
                prereqs = correction["prerequisite_tool_calls"]
                if prereqs:  # Only add if not empty
                    summary["corrections"]["prerequisite_tool_calls"] = prereqs

    general = semantic.get("general", {})
    if general:  # Only process if general exists
        for name, data in general.get("metrics", {}).items():
            if not data.get("is_issue", False):
                continue
            response = data.get("raw_response", {})
            summary["semantic_errors"][name] = response.get(
                "explanation", "No explanation provided"
            )

            correction = response.get("correction", {})
            if (
                name in ("general_hallucination_check", "value_format_alignment")
                and "tool_call" in correction
            ):
                # General errors carry a complete corrected tool call.
                corrected = correction["tool_call"]
                if isinstance(corrected, str):
                    try:
                        corrected = json.loads(corrected)
                    except json.JSONDecodeError:
                        pass  # keep as string if parsing fails
                if corrected:
                    summary["corrections"]["corrected_tool_call"] = corrected

    return summary
366
+
367
+
368
if __name__ == "__main__":
    # Example usage: run the reflection pipeline over a hard-coded CISO-agent
    # conversation transcript (system prompts, tool specs, tool calls, and
    # observations) and print the resulting reflection outputs.
    conversation_history = [
        {
            "role": "system",
            "content": "<|start_header_id|>system<|end_header_id|>\nYou are a helpful assistant with access to the following function calls. Your task is to produce a sequence of function calls necessary to generate response to the user utterance. Use the following function calls as required. Do not make up an original tool not listed below. All function arguments must be defined under `arguments` attribute.\n<|eot_id|>\n",
        },
        {
            "role": "system",
            "content": '<|start_header_id|>user<|end_header_id|>\nWhat profession does Nicholas Ray and Elia Kazan have in common?\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\nTho: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\nAct: \n[{"name": "Search", "arguments": {"topic": "Nicholas Ray"}}]\nObs: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 - June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\nTho: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.\nAct: \n[{"name": "Search", "arguments": {"topic": "Elia Kazan"}}]\nObs: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\nTho: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\nAct: \n[{"name": "Finish"}]\n<|eot_id|>\n',
        },
        {
            "role": "user",
            "content": '<|start_header_id|>system<|end_header_id|>\nThe above dialog is an example flow to solve the original question.\nTho (thought) comes first to describe your thought in a natural language text, then Act (action) in JSON to execute functions, and Obs (observation) is a feedback of the action and then the next thought is a reasoning of the next action.\nEach "Tho" must be a reasoning description to come up with the next action. This must not be an action string because it may cause confusion of the action history.\nEach "Act" must be a list object like the following\n [{"name": "<FUNC_NAME>", "arguments": {"<KEY>": "<VALUE>"}}]\nWhen you got an answer, you must return an action [{"name": "Finish"}]\nWhen you see errors and you can\'t fix them, previous personas before you might be cause. To get back to them, call `Error` function with some error report like `[{"name": "Error", "arguments": {"error": "<ERROR DETAILS>"}}]`\nThe function names in the dialog are just examples, you must use functions you have.\nYou must simplify your answer as much as possible.\nLong explanation is not needed.\nEven when you need to do multiple actions, generate just the very next 1 action.\nWhen you are seeing the same error again and again, please forget the details and remind your original goal.\n<|eot_id|>\n',
        },
        {
            "role": "user",
            "content": '<|start_header_id|>system<|end_header_id|>\nYou are requested to achieve the given `Goal`. When an `Original Goal` is given, it contains requests to other people and tools you can use are not always sufficient to achieve this original goal. In such case, you can quit our work. Then next person will continue the task.\nYour thought is not directly transfered to actions, so you must transfer them to actions especially about errors.\nThe action you thought must be a SINGLE LINE JSON and do not include any explanation for it in Act: step.\nWhen you perform policy evaluation and the result was false, which means the compliance requirement is not satisfied, just report them and do not remediate it.\nWhen an action has an argument `sentence`, it is an input text for LLM service to generate something (e.g. Playbook). You have to specify details here as much as possible.\nWhen you want to use newline in the sentence arguemnts, you have to escape it like {"sentence": "this is the first line.\\\\nthis is the second line."} and escape all quoation in the arguments like {"sentence": "escape quotes like \\"this\\" when you use quotes"}\nSpecify `script_file` as "script.sh" when you want to use RunKubectl to get `collected_data.json`.\nWhen the goal says `Minimize something`, it means no instances are allowed.\nWhen the goal says `role` for Kubernetes, it means both RoleBinding and ClusterRoleBinding should be in scope.\nWhen the goal mentions some extra conditions like `exclude xxxx namespace` and `apply this policy only to resources labeled as yyyy=zzzz`, then these conditions must be included in `sentence` of GenerateKyvernoTool\nAfter deploying a Kyverno policy, do not validate it by deploying other resources. Just finish.\nAfter generating a Playbook, you have to run it with RunPlaybook.\n When it is required to get some existing Kyverno policies, you can do `kubectl get clusterpolicies --kubeconfig /path/to/kubeconfig`.\nWhen getting names of exisintg kyverno clusterpolicies, you should not specify "-o" option. Just to do kubectl get without name and -o\n\n<|eot_id|>\n',
        },
        {
            "role": "system",
            "content": "<|start_header_id|>system<|end_header_id|>\nYour Goal:\n```\nGenerate a Kyverno policy to check if the cluster-admin role is only used where required.\nSave the generated policy to policy.yaml in the workdir.\nDeploy the generated policy to the Kubernetes cluster with kubeconfig /tmp/agent/20250122154450/kubeconfig.yaml.\n\n```\n\nOriginal Goal (NOTE: This is NOT your goal; Do not do everything here):\n```\nI would like to check if the following condition is satisfiled, given a Kubernetes cluster with `kubeconfig.yaml`\n Ensure that the cluster-admin role is only used where required.\n\nTo check the condition, do the following steps.\n- deploy a Kyverno policy to the cluster\n- chcek if the policy is correctly deployed.\n\n\nIf deploying the policy failed and if you can fix the issue, you will do it and try deploying again.\nOnce you get a final answer, you can quit the work.\n\nThe cluster's kubeconfig is at `/tmp/agent/20250122154450/kubeconfig.yaml`.\nYou can use `/tmp/agent/20250122154450/` as your workdir.\n\n```\n\n\nSteps Already Done and Results:\n```yaml\n- input:\n goal: |\n Generate a Kyverno policy to check if the cluster-admin role is only used where required.\n Save the generated policy to policy.yaml in the workdir.\n Deploy the generated policy to the Kubernetes cluster with kubeconfig /tmp/agent/20250122154450/kubeconfig.yaml.\n kubeconfig: /tmp/agent/20250122154450/kubeconfig.yaml\n workdir: /tmp/agent/20250122154450/\n node: kyverno\n output:\n deployed_resource:\n kind: ClusterPolicy\n name: restrict-cluster-admin\n path_to_generated_kyverno_policy: /tmp/agent/20250122154450/policy.yaml\n- input:\n goal: |\n Get the deployed Kyverno policy from the cluster with kubeconfig /tmp/agent/20250122154450/kubeconfig.yaml.\n Save the result to deployed_policy.json.\n kubeconfig: /tmp/agent/20250122154450/kubeconfig.yaml\n workdir: /tmp/agent/20250122154450/\n node: kubernetes\n output:\n error: error message\n\n```\n\n<|eot_id|>\n",
        },
        {
            "role": "system",
            "content": "<|start_header_id|>system<|end_header_id|>\n[{'type': 'function', 'function': {'name': 'GenerateKyvernoTool', 'description': 'The tool to generate a Kyverno policy. This tool returns the generated Kyverno policy. This can be used for updating existing Kyverno policy.\\n', 'parameters': {'type': 'object', 'properties': {'sentence': {'type': 'str', 'description': 'A comprehensive description to request Kyverno policy generation.\\nThis must be containing any level of details about the Kyverno policy to be generated.\\nIf you got any errors especially about syntax when you invoked this function previously, mention it here for improving the generation result this time.\\nFor example, when you got an error like `.spec.rules[0].match.any[0].selector: field not declared in schema`, add the following to the sentence: `previous trial failed because .spec.rules[0].match.any[0].selector is not available field for Kyverno policy`\\n'}, 'policy_file': {'type': 'str', 'description': 'filepath for the Kyverno policy to be saved.'}, 'current_policy_file': {'type': 'str', 'description': 'filepath of the current Kyverno policy to be updated. Only needed when updating an existing policy', 'default': ''}}, 'required': ['sentence', 'policy_file']}}}, {'type': 'function', 'function': {'name': 'RunKubectlTool', 'description': 'The tool to execute a kubectl command.\\nThis tool returns the following:\\n - return_code: if 0, the command was successful, otherwise, failure.\\n - stdout: standard output of the command (only when `return_output` is True)\\n - stderr: standard error of the command (only when error occurred)\\n - script_file: saved script path if applicable\\n\\nFor example, to execute `kubectl get pod -n default --kubeconfig kubeconfig.yaml`,\\nTool Input should be the following:\\n{\"args\": \"get pod -n default --kubeconfig kubeconfig.yaml\", \"output_file\": \"\", \"return_output\": \"True\", \"script_file\": \"\"}\\n\\nHint:\\n- If you need to get all pods in all namespaces, you can do it by `kubectl get pods --all-namespaces --kubeconfig <kubeconfig_path> -o json`\\n', 'parameters': {'type': 'object', 'properties': {'args': {'type': 'str', 'description': 'command arguments after `kubectl`. `--kubeconfig` should be specified here. Multiple commands with `;` or `&&` is not allowed. Using pipe `|` for jq are not allowed too. Just save the entire JSON if you want.'}, 'output_file': {'type': 'str', 'description': 'The filepath to save the result. If empty string, not save anything', 'default': ''}, 'return_output': {'type': 'str', 'description': 'A boolean string. Set this to \"True\" if you want to get the command output', 'default': False}, 'script_file': {'type': 'str', 'description': 'A filepath. If provided, save the kubectl command as a script at the specified file.', 'default': ''}}, 'required': ['args']}}}]\n<|eot_id|>\n",
        },
        {
            "role": "assistant",
            "content": "<|start_header_id|>assistant<|end_header_id|>Tho: ",
        },
        {
            "role": "assistant",
            "content": " To achieve the given goal, I need to generate a Kyverno policy to check if the cluster-admin role is only used where required, save it to policy.yaml, and then deploy it to the Kubernetes cluster. First, I will generate the Kyverno policy using the GenerateKyvernoTool function.\nAct:",
        },
        {
            "role": "assistant",
            "content": "<|start_header_id|>assistant<|end_header_id|>Act: (Please return only a JSON string)\n",
        },
        {
            "role": "assistant",
            "content": '[{"name": "GenerateKyvernoTool", "arguments": {"sentence": "Generate a Kyverno policy to check if the cluster-admin role is only used where required. The policy should validate ClusterRoleBindings and check if the role is bound to a user or a service account.", "policy_file": "policy.yaml"}}]',
        },
        {
            "role": "user",
            "content": "[{'name': 'GenerateKyvernoTool', 'arguments': {'sentence': 'Generate a Kyverno policy to check if the cluster-admin role is only used where required. The policy should validate ClusterRoleBindings and check if the role is bound to a user or a service account.', 'policy_file': 'policy.yaml'}}]",
        },
        {
            "role": "assistant",
            "content": "<|start_header_id|>assistant<|end_header_id|>Obs: <|eot_id|>",
        },
        {
            "role": "user",
            "content": 'The generated policy is below:\n```yaml\n\napiVersion: kyverno.io/v1\nkind: ClusterPolicy\nmetadata:\n name: restrict-cluster-admin-role\nspec:\n rules:\n - name: validate-cluster-admin-binding\n match:\n any:\n - resources:\n kinds:\n - ClusterRoleBinding\n validate:\n message: "ClusterRoleBinding for cluster-admin role should be reviewed."\n pattern:\n roleRef:\n name: "cluster-admin"\n kind: "ClusterRole"\n subjects:\n - kind: "ServiceAccount"\n\n```\n\nThis policy file has been saved at /tmp/agent/20250122154450/policy.yaml.\n',
        },
    ]

    # Call the main reflection function
    reflections = asyncio.run(reflect_ciso_agent_conversation(conversation_history))

    print(reflections)
@@ -0,0 +1,111 @@
1
+ import asyncio
2
+ import json
3
+ import re
4
+ from typing import Dict, List, Any
5
+
6
+ from llmevalkit.ciso_agent.main import create_reflection_result_summary, reflect_ciso_agent_conversation
7
+
8
# Matches a chat-template header block such as
# "<|start_header_id|>assistant<|end_header_id|>" (plus a trailing newline, if any).
_HEADER_RE = re.compile(r"<\|start_header_id\|>.*?<\|end_header_id\|>\n?", re.DOTALL)
# Control tokens that may appear inline in logged content and must be stripped.
_CONTROL_TOKENS = [
    "<|eot_id|>",
    "<|python_end|>",
    "<|eom|>",
]
14
+
15
+ def _clean_content(text: str) -> str:
16
+ if not text:
17
+ return ""
18
+ # Keep only the part before the first end of turn marker, if present
19
+ parts = text.split("<|eot_id|>", 1)
20
+ text = parts[0]
21
+ # Remove any residual header blocks like <|start_header_id|>role<|end_header_id|>
22
+ text = _HEADER_RE.sub("", text)
23
+ # Remove stray control tokens that sometimes appear inline
24
+ for tok in _CONTROL_TOKENS:
25
+ text = text.replace(tok, "")
26
+ return text.strip()
27
+
28
def convert_langtrace_to_openai_messages(
    data: Dict[str, Any],
    key_prefix: str = "gen_ai.prompt",
    merge_consecutive: bool = True,
) -> List[Dict[str, str]]:
    """
    Convert a flat langtrace-style dict with keys such as
    ``gen_ai.prompt.0.role`` / ``gen_ai.prompt.0.content`` into an OpenAI
    chat-completions messages list::

        [{"role": "system", "content": "..."}, ...]

    Args:
        data: source dictionary of flattened prompt attributes
        key_prefix: base prefix under which the prompt entries are stored
        merge_consecutive: if True, adjacent messages sharing a role are
            folded into a single message

    Returns:
        List of ``{"role", "content"}`` dicts suitable for OpenAI chat
        completions.
    """
    dotted_prefix = f"{key_prefix}."
    role_suffix = ".role"

    # Gather every numeric index N for which a ".role" entry exists.
    indices: List[int] = []
    for key in data:
        if not (key.startswith(dotted_prefix) and key.endswith(role_suffix)):
            continue
        try:
            indices.append(int(key[len(dotted_prefix):-len(role_suffix)]))
        except ValueError:
            continue  # non-numeric index, ignore

    messages: List[Dict[str, str]] = []
    for index in sorted(indices):
        role = data.get(f"{key_prefix}.{index}.role")
        if not role:
            # Entries without a usable role are skipped entirely.
            continue
        cleaned = _clean_content(data.get(f"{key_prefix}.{index}.content", "") or "")

        # Some traces split a single assistant turn across several numbered
        # entries; optionally fold those back into one message.
        if merge_consecutive and messages and messages[-1]["role"] == role:
            previous = messages[-1]["content"]
            separator = "\n" if previous and cleaned else ""
            messages[-1]["content"] = (previous + separator + cleaned).strip()
        else:
            messages.append({"role": role, "content": cleaned})

    return messages
82
+
83
+ # Example:
84
+ # msgs = convert_langtrace_to_openai_messages(your_dict)
85
+ # print(msgs)
86
if __name__ == "__main__":
    # Load a langtrace log captured from a CISO-agent run.
    # NOTE(review): hard-coded developer-local path — adjust before reuse.
    log_path = "/Users/korenlazar/workspace/LLMEvalKit/tests/ciso_agent/agent_analytics_with_manually_modified_wrong_tool_call.log"
    with open(log_path, "r") as f:
        log_json = json.load(f)

    # Rebuild the OpenAI-style conversation from the flattened trace attributes.
    log_attributes = log_json.get("attributes", {})
    conversation_history = convert_langtrace_to_openai_messages(log_attributes, merge_consecutive=False)

    # Run the reflection pipeline over the reconstructed conversation.
    reflections = asyncio.run(reflect_ciso_agent_conversation(conversation_history))

    # Persist the raw reflection outputs, one JSON object per line (JSONL).
    with open("reflections.json", "w") as f:
        for reflection_output in reflections:
            json.dump(reflection_output.model_dump(), f)
            f.write("\n")

    # with open("reflections.json", "r") as f:
    #     reflections = [json.loads(line) for line in f]

    # Normalise to plain dicts (supports both pydantic objects and pre-dumped dicts).
    reflection_jsons = [reflection.model_dump() if hasattr(reflection, "model_dump") else reflection for reflection in reflections]

    # Persist the condensed per-call summaries, one JSON object per line (JSONL).
    with open("reflections_summary.json", "w") as f:
        for reflection_output in reflection_jsons:
            json.dump(create_reflection_result_summary(reflection_output), f)
            f.write("\n")

    print(reflections)