ibm-watsonx-gov 1.3.3 (cp313-cp313-macosx_11_0_arm64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py
@@ -0,0 +1,516 @@

import ast
import asyncio
import json
import uuid
from typing import List, Dict, Any

from llmevalkit.function_calling.consts import (
    METRIC_AGENTIC_CONSTRAINTS_SATISFACTION,
    METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
    METRIC_GENERAL_HALLUCINATION_CHECK,
    METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT,
    METRIC_PARAMETER_HALLUCINATION_CHECK,
    METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT,
)
from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
from llmevalkit.function_calling.pipeline.types import ToolCall, ToolSpec
from llmevalkit.llm.base import get_llm

def convert_tool_calls_to_openai_format(
    tool_calls: List[Dict[str, Any]],
) -> List[ToolCall]:
    """
    Convert a list of tool calls into OpenAI-compatible tool call format.

    Args:
        tool_calls (List[Dict[str, Any]]): Input tool calls in format:
            [{"name": "...", "arguments": {...}}]

    Returns:
        List[ToolCall]: Tool calls in OpenAI format.
    """
    openai_tool_calls = []
    for call in tool_calls:
        openai_tool_calls.append(
            ToolCall(
                **{
                    "id": f"call_{uuid.uuid4().hex[:8]}",  # unique id
                    "type": "function",
                    "function": {
                        "name": call["name"],
                        "arguments": json.dumps(call.get("arguments", {})),
                    },
                }
            )
        )
    return openai_tool_calls

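# Example for convert_tool_calls_to_openai_format (hypothetical input and output):
#   convert_tool_calls_to_openai_format(
#       [{"name": "Search", "arguments": {"topic": "Elia Kazan"}}]
#   )
#   -> one ToolCall with a random id such as "call_1a2b3c4d", type "function",
#      function.name "Search", and function.arguments '{"topic": "Elia Kazan"}'.
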
class ConversationFormatError(ValueError):
    """Raised when the conversation format is invalid for this extractor."""

def extract_tool_specs_from_conversation(
    conversation: List[Dict[str, Any]], remove_turn: bool = False
) -> List[Dict[str, Any]]:
    """
    Extract tool specifications from the system turn in the conversation history.

    The function looks for a system message whose content contains a list of tool
    specs (as a stringified list of dicts), parses it, and returns it as a list
    of dictionaries.

    Args:
        conversation: List of conversation turns.
        remove_turn: If True, remove the tool specification turn from the conversation.

    Returns:
        List of tool specifications as dictionaries.

    Raises:
        ConversationFormatError: If no valid tool specifications are found in the conversation.
    """
    for i, turn in enumerate(conversation):
        if turn.get("role") == "system":
            content = turn.get("content", "")
            # Heuristic: look for a string that looks like a list of dicts (starts with [ and contains 'function')
            # and is not markdown or code block formatted.
            # Try to find the first occurrence of a list starting with [ and ending with ]
            start = content.find("[")
            end = content.rfind("]")
            if start != -1 and end != -1 and "function" in content[start:end]:
                possible_list = content[start : end + 1]
                try:
                    # Use ast.literal_eval for safety (content is usually single quotes)
                    tool_specs = ast.literal_eval(possible_list)
                    if isinstance(tool_specs, list):
                        # If requested, remove the turn containing tool specs from the conversation
                        if remove_turn:
                            conversation.pop(i)
                        return tool_specs
                except Exception:
                    pass

    # If we got here, we didn't find any valid tool specifications
    raise ConversationFormatError("No valid tool specifications found in conversation.")

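# Example for extract_tool_specs_from_conversation: a system turn whose content
# embeds a stringified spec list such as
#   "[{'type': 'function', 'function': {'name': 'GenerateKyvernoTool', ...}}]"
# (see the demo conversation in __main__ below) is located via the "[" / "]"
# heuristic and parsed with ast.literal_eval into a list of dicts.
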
def extract_tool_calls_to_reflect(
    conversation: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Extract tool calls to reflect by identifying patterns in the conversation.

    The function looks for two patterns:
    1. An assistant message with content 'Act: (Please return only a JSON string)'
       followed by a user message containing the JSON tool calls
    2. An assistant message containing "Act:" followed by a tool call in JSON format
       within the same message

    Args:
        conversation: List of conversation turns

    Returns:
        A flat list of tool call dicts: [{"name": "GenerateKyvernoTool", "arguments": {...}}, ...]

    Raises:
        ConversationFormatError: On validation or parsing issues
    """
    if not conversation:
        raise ConversationFormatError("Conversation is empty.")

    tool_calls = []

    # Pattern 1: Look for the pattern where the tool call is in the message following "Act:" message
    for i in range(len(conversation) - 1):  # Stop at the second-to-last message
        current_turn = conversation[i]
        next_turn = conversation[i + 1]

        # Look for assistant message with the specific marker
        if (
            current_turn.get("role") == "assistant"
            and isinstance(current_turn.get("content"), str)
            and "Act: (Please return only a JSON string)" in current_turn.get("content")
        ):
            # The next message should contain the JSON for the tool call
            content = next_turn.get("content", "")
            if isinstance(content, str):
                # Try to parse the content directly first
                try:
                    parsed = ast.literal_eval(content)
                    if isinstance(parsed, list):
                        for item in parsed:
                            if isinstance(item, dict) and "name" in item:
                                tool_calls.append(item)
                except (SyntaxError, ValueError):
                    # Continue to next pattern if this fails
                    pass

    # Pattern 2: Look for "Act:" followed by JSON tool call in the same message
    if not tool_calls:
        for turn in conversation:
            if turn.get("role") == "assistant" and isinstance(turn.get("content"), str):
                content = turn.get("content", "")
                # Look for the pattern "Act: \n[{...}]"
                if "Act:" in content:
                    act_start = content.find("Act:")
                    if act_start != -1:
                        # Find the start of the JSON array after "Act:"
                        json_start = content.find("[", act_start)
                        json_end = content.rfind("]", act_start)

                        if (
                            json_start != -1
                            and json_end != -1
                            and json_start < json_end
                        ):
                            # Extract the JSON array
                            json_str = content[json_start : json_end + 1]
                            try:
                                # Try to parse the content
                                parsed = json.loads(json_str)
                                if isinstance(parsed, list):
                                    for item in parsed:
                                        if (
                                            isinstance(item, dict)
                                            and "name" in item
                                            and "arguments" in item
                                        ):
                                            tool_calls.append(item)
                            except json.JSONDecodeError:
                                # Try with ast.literal_eval as a fallback
                                try:
                                    parsed = ast.literal_eval(json_str)
                                    if isinstance(parsed, list):
                                        for item in parsed:
                                            if (
                                                isinstance(item, dict)
                                                and "name" in item
                                                and "arguments" in item
                                            ):
                                                tool_calls.append(item)
                                except (SyntaxError, ValueError):
                                    # Continue to next message if this fails
                                    pass

    if not tool_calls:
        raise ConversationFormatError(
            "No valid tool calls found to reflect in the conversation."
        )

    return tool_calls

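# Example for extract_tool_calls_to_reflect (Pattern 2, data drawn from the demo
# conversation below): an assistant turn such as
#   'Act: \n[{"name": "Search", "arguments": {"topic": "Nicholas Ray"}}]'
# yields [{"name": "Search", "arguments": {"topic": "Nicholas Ray"}}].
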
async def reflect_ciso_agent_conversation(
    conversation: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Reflect the CISO agent conversation by extracting relevant information from the conversation history.
    """
    # Extract tool specifications and remove that turn from conversation
    tool_specs = extract_tool_specs_from_conversation(conversation, remove_turn=True)

    tool_specs_objs = [
        ToolSpec(**spec) for spec in tool_specs if isinstance(spec, dict)
    ]

    # Extract tool calls from the conversation
    tool_calls = extract_tool_calls_to_reflect(conversation)
    converted_tool_calls = convert_tool_calls_to_openai_format(tool_calls)

    # Define ReflectionPipeline object
    MetricsClientCls = get_llm("litellm.rits.output_val")
    metrics_client = MetricsClientCls(
        model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
    )
    reflection_pipeline = ReflectionPipeline(
        metrics_client=metrics_client,
        general_metrics=[
            METRIC_GENERAL_HALLUCINATION_CHECK,
            METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT,
        ],
        function_metrics=[
            METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
            METRIC_AGENTIC_CONSTRAINTS_SATISFACTION,
        ],
        parameter_metrics=[
            # METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT,
            # METRIC_PARAMETER_HALLUCINATION_CHECK,
        ],
        # Enable the runtime evaluation pipeline, i.e. with actionable
        # recommendations for agent development.
        runtime_pipeline=True,
    )

    # Reflect the tool calls in the conversation
    reflection_outputs = []
    for tool_call_to_reflect in converted_tool_calls:
        reflection_outputs.append(
            await reflection_pipeline.run_async(
                conversation=conversation,
                inventory=tool_specs_objs,
                call=tool_call_to_reflect,
                continue_on_static=True,
            )
        )

    return reflection_outputs

def create_reflection_result_summary(
    reflection_output: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Generate the final result with syntactic_errors, semantic_errors, and corrections.
    """
    original_tool_call = reflection_output["inputs"]["tool_call"]["function"]
    result = {
        "syntactic_errors": {},
        "semantic_errors": {},
        "corrections": {},
        "overall_valid": reflection_output.get("overall_valid"),
        "original_tool_call": {
            "name": original_tool_call["name"],
            "arguments": original_tool_call["parsed_arguments"],
        },
    }

    # 1. Process syntactic errors from static metrics
    static = reflection_output.get("static", {})
    static_metrics = static.get("metrics", {})
    for metric_name, metric_data in static_metrics.items():
        if not metric_data.get("valid", True):
            explanation = metric_data.get("explanation", "No explanation provided")
            result["syntactic_errors"][metric_name] = explanation

    # 2. Process semantic errors - prioritize function_selection over general
    semantic = reflection_output.get("semantic", {})

    # Check function_selection metrics first
    function_selection = semantic.get("function_selection", {})
    if function_selection:  # Only process if function_selection exists
        function_selection_metrics = function_selection.get("metrics", {})

        for metric_name, metric_data in function_selection_metrics.items():
            if metric_data.get("is_issue", False):
                raw_response = metric_data.get("raw_response", {})
                explanation = raw_response.get("explanation", "No explanation provided")
                result["semantic_errors"][metric_name] = explanation

                # Add corrections from function_selection based on specific error types
                correction = raw_response.get("correction", {})
                if metric_name == "function_selection_appropriateness" and (
                    "corrected_function" in correction
                    or "corrected_function_name" in correction
                ):
                    # For function selection errors, we only need the tool name
                    corrected_func = correction.get(
                        "corrected_function"
                    ) or correction.get("corrected_function_name")
                    if corrected_func == "no_function":
                        pass
                    elif isinstance(corrected_func, dict) and "name" in corrected_func:
                        result["corrections"]["corrected_tool_name"] = corrected_func[
                            "name"
                        ]
                    elif isinstance(corrected_func, str):
                        try:
                            parsed = json.loads(corrected_func)
                            if isinstance(parsed, dict) and "name" in parsed:
                                result["corrections"]["corrected_tool_name"] = parsed[
                                    "name"
                                ]
                        except json.JSONDecodeError:
                            result["corrections"][
                                "corrected_tool_name"
                            ] = corrected_func
                elif (
                    metric_name == "agentic_constraints_satisfaction"
                    and "prerequisite_tool_calls" in correction
                ):
                    if correction["prerequisite_tool_calls"]:  # Only add if not empty
                        result["corrections"]["prerequisite_tool_calls"] = correction[
                            "prerequisite_tool_calls"
                        ]

    # Process general metrics (currently evaluated regardless of whether
    # function_selection reported errors)
    general = semantic.get("general", {})
    if general:  # Only process if general exists
        general_metrics = general.get("metrics", {})

        for metric_name, metric_data in general_metrics.items():
            if metric_data.get("is_issue", False):
                raw_response = metric_data.get("raw_response", {})
                explanation = raw_response.get("explanation", "No explanation provided")
                result["semantic_errors"][metric_name] = explanation

                # Add corrections from general metrics based on specific error types
                correction = raw_response.get("correction", {})
                if (
                    metric_name
                    in ["general_hallucination_check", "value_format_alignment"]
                    and "tool_call" in correction
                ):
                    # For general errors, we provide the complete corrected tool call
                    corrected_call = correction["tool_call"]
                    if isinstance(corrected_call, str):
                        try:
                            corrected_call = json.loads(corrected_call)
                        except json.JSONDecodeError:
                            pass  # Keep as string if parsing fails
                    if corrected_call:
                        result["corrections"]["corrected_tool_call"] = corrected_call

    return result

if __name__ == "__main__":
    # Example usage
    # tool_call = [
    #     {
    #         "name": "GenerateKyvernoTool",
    #         "arguments": {
    #             "sentence": "Generate a Kyverno policy to check if the cluster-admin role is only used where required.",
    #             "policy_file": "/tmp/agent/20250122154450/policy.yaml",
    #         },
    #     }
    # ]

    # tool_calls_converted = convert_tool_calls_to_openai_format(tool_call)
    # for tool_call in tool_calls_converted:
    #     print(tool_call.model_dump_json(indent=2))

    # tool_specs = [
    #     {
    #         "type": "function",
    #         "function": {
    #             "name": "GenerateKyvernoTool",
    #             "description": "The tool to generate a Kyverno policy. This tool returns the generated Kyverno policy. This can be used for updating existing Kyverno policy.\n",
    #             "parameters": {
    #                 "type": "object",
    #                 "properties": {
    #                     "sentence": {
    #                         "type": "str",
    #                         "description": "A comprehensive description to request Kyverno policy generation.\nThis must be containing any level of details about the Kyverno policy to be generated.\nIf you got any errors especially about syntax when you invoked this function previously, mention it here for improving the generation result this time.\nFor example, when you got an error like `.spec.rules[0].match.any[0].selector: field not declared in schema`, add the following to the sentence: `previous trial failed because .spec.rules[0].match.any[0].selector is not available field for Kyverno policy`\n",
    #                     },
    #                     "policy_file": {
    #                         "type": "str",
    #                         "description": "filepath for the Kyverno policy to be saved.",
    #                     },
    #                     "current_policy_file": {
    #                         "type": "str",
    #                         "description": "filepath of the current Kyverno policy to be updated. Only needed when updating an existing policy",
    #                         "default": "",
    #                     },
    #                 },
    #                 "required": ["sentence", "policy_file"],
    #             },
    #         },
    #     },
    #     {
    #         "type": "function",
    #         "function": {
    #             "name": "RunKubectlTool",
    #             "description": 'The tool to execute a kubectl command.\nThis tool returns the following:\n - return_code: if 0, the command was successful, otherwise, failure.\n - stdout: standard output of the command (only when `return_output` is True)\n - stderr: standard error of the command (only when error occurred)\n - script_file: saved script path if applicable\n\nFor example, to execute `kubectl get pod -n default --kubeconfig kubeconfig.yaml`,\nTool Input should be the following:\n{"args": "get pod -n default --kubeconfig kubeconfig.yaml", "output_file": "", "return_output": "True", "script_file": ""}\n\nHint:\n- If you need to get all pods in all namespaces, you can do it by `kubectl get pods --all-namespaces --kubeconfig <kubeconfig_path> -o json`\n',
    #             "parameters": {
    #                 "type": "object",
    #                 "properties": {
    #                     "args": {
    #                         "type": "str",
    #                         "description": "command arguments after `kubectl`. `--kubeconfig` should be specified here. Multiple commands with `;` or `&&` is not allowed. Using pipe `|` for jq are not allowed too. Just save the entire JSON if you want.",
    #                     },
    #                     "output_file": {
    #                         "type": "str",
    #                         "description": "The filepath to save the result. If empty string, not save anything",
    #                         "default": "",
    #                     },
    #                     "return_output": {
    #                         "type": "str",
    #                         "description": 'A boolean string. Set this to "True" if you want to get the command output',
    #                         "default": "False",
    #                     },
    #                     "script_file": {
    #                         "type": "str",
    #                         "description": "A filepath. If provided, save the kubectl command as a script at the specified file.",
    #                         "default": "",
    #                     },
    #                 },
    #                 "required": ["args"],
    #             },
    #         },
    #     },
    # ]

    conversation_history = [
        {
            "role": "system",
            "content": "<|start_header_id|>system<|end_header_id|>\nYou are a helpful assistant with access to the following function calls. Your task is to produce a sequence of function calls necessary to generate response to the user utterance. Use the following function calls as required. Do not make up an original tool not listed below. All function arguments must be defined under `arguments` attribute.\n<|eot_id|>\n",
        },
        {
            "role": "system",
            "content": '<|start_header_id|>user<|end_header_id|>\nWhat profession does Nicholas Ray and Elia Kazan have in common?\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\nTho: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\nAct: \n[{"name": "Search", "arguments": {"topic": "Nicholas Ray"}}]\nObs: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 - June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\nTho: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.\nAct: \n[{"name": "Search", "arguments": {"topic": "Elia Kazan"}}]\nObs: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\nTho: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\nAct: \n[{"name": "Finish"}]\n<|eot_id|>\n',
        },
        {
            "role": "user",
            "content": '<|start_header_id|>system<|end_header_id|>\nThe above dialog is an example flow to solve the original question.\nTho (thought) comes first to describe your thought in a natural language text, then Act (action) in JSON to execute functions, and Obs (observation) is a feedback of the action and then the next thought is a reasoning of the next action.\nEach "Tho" must be a reasoning description to come up with the next action. This must not be an action string because it may cause confusion of the action history.\nEach "Act" must be a list object like the following\n [{"name": "<FUNC_NAME>", "arguments": {"<KEY>": "<VALUE>"}}]\nWhen you got an answer, you must return an action [{"name": "Finish"}]\nWhen you see errors and you can\'t fix them, previous personas before you might be cause. To get back to them, call `Error` function with some error report like `[{"name": "Error", "arguments": {"error": "<ERROR DETAILS>"}}]`\nThe function names in the dialog are just examples, you must use functions you have.\nYou must simplify your answer as much as possible.\nLong explanation is not needed.\nEven when you need to do multiple actions, generate just the very next 1 action.\nWhen you are seeing the same error again and again, please forget the details and remind your original goal.\n<|eot_id|>\n',
        },
        {
            "role": "user",
            "content": '<|start_header_id|>system<|end_header_id|>\nYou are requested to achieve the given `Goal`. When an `Original Goal` is given, it contains requests to other people and tools you can use are not always sufficient to achieve this original goal. In such case, you can quit our work. Then next person will continue the task.\nYour thought is not directly transfered to actions, so you must transfer them to actions especially about errors.\nThe action you thought must be a SINGLE LINE JSON and do not include any explanation for it in Act: step.\nWhen you perform policy evaluation and the result was false, which means the compliance requirement is not satisfied, just report them and do not remediate it.\nWhen an action has an argument `sentence`, it is an input text for LLM service to generate something (e.g. Playbook). You have to specify details here as much as possible.\nWhen you want to use newline in the sentence arguemnts, you have to escape it like {"sentence": "this is the first line.\\\\nthis is the second line."} and escape all quoation in the arguments like {"sentence": "escape quotes like \\"this\\" when you use quotes"}\nSpecify `script_file` as "script.sh" when you want to use RunKubectl to get `collected_data.json`.\nWhen the goal says `Minimize something`, it means no instances are allowed.\nWhen the goal says `role` for Kubernetes, it means both RoleBinding and ClusterRoleBinding should be in scope.\nWhen the goal mentions some extra conditions like `exclude xxxx namespace` and `apply this policy only to resources labeled as yyyy=zzzz`, then these conditions must be included in `sentence` of GenerateKyvernoTool\nAfter deploying a Kyverno policy, do not validate it by deploying other resources. Just finish.\nAfter generating a Playbook, you have to run it with RunPlaybook.\n When it is required to get some existing Kyverno policies, you can do `kubectl get clusterpolicies --kubeconfig /path/to/kubeconfig`.\nWhen getting names of exisintg kyverno clusterpolicies, you should not specify "-o" option. Just to do kubectl get without name and -o\n\n<|eot_id|>\n',
        },
        {
            "role": "system",
            "content": "<|start_header_id|>system<|end_header_id|>\nYour Goal:\n```\nGenerate a Kyverno policy to check if the cluster-admin role is only used where required.\nSave the generated policy to policy.yaml in the workdir.\nDeploy the generated policy to the Kubernetes cluster with kubeconfig /tmp/agent/20250122154450/kubeconfig.yaml.\n\n```\n\nOriginal Goal (NOTE: This is NOT your goal; Do not do everything here):\n```\nI would like to check if the following condition is satisfiled, given a Kubernetes cluster with `kubeconfig.yaml`\n Ensure that the cluster-admin role is only used where required.\n\nTo check the condition, do the following steps.\n- deploy a Kyverno policy to the cluster\n- chcek if the policy is correctly deployed.\n\n\nIf deploying the policy failed and if you can fix the issue, you will do it and try deploying again.\nOnce you get a final answer, you can quit the work.\n\nThe cluster's kubeconfig is at `/tmp/agent/20250122154450/kubeconfig.yaml`.\nYou can use `/tmp/agent/20250122154450/` as your workdir.\n\n```\n\n\nSteps Already Done and Results:\n```yaml\n- input:\n    goal: |\n      Generate a Kyverno policy to check if the cluster-admin role is only used where required.\n      Save the generated policy to policy.yaml in the workdir.\n      Deploy the generated policy to the Kubernetes cluster with kubeconfig /tmp/agent/20250122154450/kubeconfig.yaml.\n    kubeconfig: /tmp/agent/20250122154450/kubeconfig.yaml\n    workdir: /tmp/agent/20250122154450/\n    node: kyverno\n  output:\n    deployed_resource:\n      kind: ClusterPolicy\n      name: restrict-cluster-admin\n    path_to_generated_kyverno_policy: /tmp/agent/20250122154450/policy.yaml\n- input:\n    goal: |\n      Get the deployed Kyverno policy from the cluster with kubeconfig /tmp/agent/20250122154450/kubeconfig.yaml.\n      Save the result to deployed_policy.json.\n    kubeconfig: /tmp/agent/20250122154450/kubeconfig.yaml\n    workdir: /tmp/agent/20250122154450/\n    node: kubernetes\n  output:\n    error: error message\n\n```\n\n<|eot_id|>\n",
        },
        {
            "role": "system",
            "content": "<|start_header_id|>system<|end_header_id|>\n[{'type': 'function', 'function': {'name': 'GenerateKyvernoTool', 'description': 'The tool to generate a Kyverno policy. This tool returns the generated Kyverno policy. This can be used for updating existing Kyverno policy.\\n', 'parameters': {'type': 'object', 'properties': {'sentence': {'type': 'str', 'description': 'A comprehensive description to request Kyverno policy generation.\\nThis must be containing any level of details about the Kyverno policy to be generated.\\nIf you got any errors especially about syntax when you invoked this function previously, mention it here for improving the generation result this time.\\nFor example, when you got an error like `.spec.rules[0].match.any[0].selector: field not declared in schema`, add the following to the sentence: `previous trial failed because .spec.rules[0].match.any[0].selector is not available field for Kyverno policy`\\n'}, 'policy_file': {'type': 'str', 'description': 'filepath for the Kyverno policy to be saved.'}, 'current_policy_file': {'type': 'str', 'description': 'filepath of the current Kyverno policy to be updated. Only needed when updating an existing policy', 'default': ''}}, 'required': ['sentence', 'policy_file']}}}, {'type': 'function', 'function': {'name': 'RunKubectlTool', 'description': 'The tool to execute a kubectl command.\\nThis tool returns the following:\\n - return_code: if 0, the command was successful, otherwise, failure.\\n - stdout: standard output of the command (only when `return_output` is True)\\n - stderr: standard error of the command (only when error occurred)\\n - script_file: saved script path if applicable\\n\\nFor example, to execute `kubectl get pod -n default --kubeconfig kubeconfig.yaml`,\\nTool Input should be the following:\\n{\"args\": \"get pod -n default --kubeconfig kubeconfig.yaml\", \"output_file\": \"\", \"return_output\": \"True\", \"script_file\": \"\"}\\n\\nHint:\\n- If you need to get all pods in all namespaces, you can do it by `kubectl get pods --all-namespaces --kubeconfig <kubeconfig_path> -o json`\\n', 'parameters': {'type': 'object', 'properties': {'args': {'type': 'str', 'description': 'command arguments after `kubectl`. `--kubeconfig` should be specified here. Multiple commands with `;` or `&&` is not allowed. Using pipe `|` for jq are not allowed too. Just save the entire JSON if you want.'}, 'output_file': {'type': 'str', 'description': 'The filepath to save the result. If empty string, not save anything', 'default': ''}, 'return_output': {'type': 'str', 'description': 'A boolean string. Set this to \"True\" if you want to get the command output', 'default': False}, 'script_file': {'type': 'str', 'description': 'A filepath. If provided, save the kubectl command as a script at the specified file.', 'default': ''}}, 'required': ['args']}}}]\n<|eot_id|>\n",
        },
        {
            "role": "assistant",
            "content": "<|start_header_id|>assistant<|end_header_id|>Tho: ",
        },
        {
            "role": "assistant",
            "content": " To achieve the given goal, I need to generate a Kyverno policy to check if the cluster-admin role is only used where required, save it to policy.yaml, and then deploy it to the Kubernetes cluster. First, I will generate the Kyverno policy using the GenerateKyvernoTool function.\nAct:",
        },
        {
            "role": "assistant",
            "content": "<|start_header_id|>assistant<|end_header_id|>Act: (Please return only a JSON string)\n",
        },
        {
            "role": "assistant",
            "content": '[{"name": "GenerateKyvernoTool", "arguments": {"sentence": "Generate a Kyverno policy to check if the cluster-admin role is only used where required. The policy should validate ClusterRoleBindings and check if the role is bound to a user or a service account.", "policy_file": "policy.yaml"}}]',
        },
        {
            "role": "user",
            "content": "[{'name': 'GenerateKyvernoTool', 'arguments': {'sentence': 'Generate a Kyverno policy to check if the cluster-admin role is only used where required. The policy should validate ClusterRoleBindings and check if the role is bound to a user or a service account.', 'policy_file': 'policy.yaml'}}]",
        },
        {
            "role": "assistant",
            "content": "<|start_header_id|>assistant<|end_header_id|>Obs: <|eot_id|>",
        },
        {
            "role": "user",
            "content": 'The generated policy is below:\n```yaml\n\napiVersion: kyverno.io/v1\nkind: ClusterPolicy\nmetadata:\n  name: restrict-cluster-admin-role\nspec:\n  rules:\n  - name: validate-cluster-admin-binding\n    match:\n      any:\n      - resources:\n          kinds:\n          - ClusterRoleBinding\n    validate:\n      message: "ClusterRoleBinding for cluster-admin role should be reviewed."\n      pattern:\n        roleRef:\n          name: "cluster-admin"\n          kind: "ClusterRole"\n        subjects:\n        - kind: "ServiceAccount"\n\n```\n\nThis policy file has been saved at /tmp/agent/20250122154450/policy.yaml.\n',
        },
    ]

    # Extract tool calls to reflect
    # raw_tool_calls = extract_tool_calls_to_reflect(conversation_history)
    #
    # print("Extracted tool calls:")
    # print(json.dumps(raw_tool_calls, indent=2))

    # Convert them to OpenAI tool_call objects
    # openai_tool_calls = convert_tool_calls_to_openai_format(raw_tool_calls)
    #
    # print("Converted OpenAI tool calls:")
    # for call in openai_tool_calls:
    #     print(f"{call.model_dump_json(indent=2)}")

    # Call the main reflection function
    reflections = asyncio.run(reflect_ciso_agent_conversation(conversation_history))

    print(reflections)

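For orientation, here is a minimal sketch of the summary dict that create_reflection_result_summary returns, assuming one failing function-selection metric; all field values below are hypothetical:

    # Hypothetical output of create_reflection_result_summary(reflection_output)
    {
        "syntactic_errors": {},
        "semantic_errors": {
            "function_selection_appropriateness": "The selected tool does not match the stated goal.",
        },
        "corrections": {"corrected_tool_name": "RunKubectlTool"},
        "overall_valid": False,
        "original_tool_call": {
            "name": "GenerateKyvernoTool",
            "arguments": {"sentence": "Generate a Kyverno policy ...", "policy_file": "policy.yaml"},
        },
    }
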
ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py
@@ -0,0 +1,111 @@

import asyncio
import json
import re
from typing import Dict, List, Any

from llmevalkit.ciso_agent.main import (
    create_reflection_result_summary,
    reflect_ciso_agent_conversation,
)

_HEADER_RE = re.compile(r"<\|start_header_id\|>.*?<\|end_header_id\|>\n?", re.DOTALL)
_CONTROL_TOKENS = [
    "<|eot_id|>",
    "<|python_end|>",
    "<|eom|>",
]


def _clean_content(text: str) -> str:
    if not text:
        return ""
    # Keep only the part before the first end of turn marker, if present
    parts = text.split("<|eot_id|>", 1)
    text = parts[0]
    # Remove any residual header blocks like <|start_header_id|>role<|end_header_id|>
    text = _HEADER_RE.sub("", text)
    # Remove stray control tokens that sometimes appear inline
    for tok in _CONTROL_TOKENS:
        text = text.replace(tok, "")
    return text.strip()


def convert_langtrace_to_openai_messages(
    data: Dict[str, Any],
    key_prefix: str = "gen_ai.prompt",
    merge_consecutive: bool = True,
) -> List[Dict[str, str]]:
    """
    Convert a flat dict that contains keys like:
        gen_ai.prompt.0.role, gen_ai.prompt.0.content, ...
    into an OpenAI messages list:
        [{"role": "system", "content": "..."}, ...]

    Args:
        data: source dictionary
        key_prefix: base prefix for prompt entries
        merge_consecutive: if True, merge adjacent messages that share the same role

    Returns:
        List of messages suitable for OpenAI chat completions
    """
    # Collect all indices N for which a role exists
    indices = []
    prefix_dot = f"{key_prefix}."
    role_suffix = ".role"
    for k in data.keys():
        if k.startswith(prefix_dot) and k.endswith(role_suffix):
            try:
                n = int(k[len(prefix_dot):-len(role_suffix)])
                indices.append(n)
            except ValueError:
                continue
    indices.sort()

    # Build raw messages
    messages: List[Dict[str, str]] = []
    for i in indices:
        role = data.get(f"{key_prefix}.{i}.role")
        # Skip entries without a valid role
        if not role:
            continue
        content = data.get(f"{key_prefix}.{i}.content", "") or ""
        content = _clean_content(content)

        # Some traces split a single assistant turn across multiple numbered entries
        if merge_consecutive and messages and messages[-1]["role"] == role:
            # Append with a single newline separator
            joined = (
                messages[-1]["content"]
                + ("\n" if messages[-1]["content"] and content else "")
                + content
            ).strip()
            messages[-1]["content"] = joined
        else:
            messages.append({"role": role, "content": content})

    # Optional final cleanup, remove empty messages if any
    # messages = [m for m in messages if m.get("content", "") != "" or m.get("role") == "system"]

    return messages


# Example:
# msgs = convert_langtrace_to_openai_messages(your_dict)
# print(msgs)
if __name__ == "__main__":
    log_path = "/Users/korenlazar/workspace/LLMEvalKit/tests/ciso_agent/agent_analytics_with_manually_modified_wrong_tool_call.log"
    with open(log_path, "r") as f:
        log_json = json.load(f)

    log_attributes = log_json.get("attributes", {})
    conversation_history = convert_langtrace_to_openai_messages(
        log_attributes, merge_consecutive=False
    )

    reflections = asyncio.run(reflect_ciso_agent_conversation(conversation_history))

    with open("reflections.json", "w") as f:
        for reflection_output in reflections:
            json.dump(reflection_output.model_dump(), f)
            f.write("\n")

    # with open("reflections.json", "r") as f:
    #     reflections = [json.loads(line) for line in f]

    reflection_jsons = [
        reflection.model_dump() if hasattr(reflection, "model_dump") else reflection
        for reflection in reflections
    ]

    with open("reflections_summary.json", "w") as f:
        for reflection_output in reflection_jsons:
            json.dump(create_reflection_result_summary(reflection_output), f)
            f.write("\n")

    print(reflections)
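
As a quick check of the flat-attribute conversion above, a minimal sketch; the attribute keys follow the gen_ai.prompt.N.{role,content} layout described in the docstring, and the values are hypothetical:

    attrs = {
        "gen_ai.prompt.0.role": "system",
        "gen_ai.prompt.0.content": "<|start_header_id|>system<|end_header_id|>\nYou are a helpful assistant.<|eot_id|>",
        "gen_ai.prompt.1.role": "assistant",
        "gen_ai.prompt.1.content": "Tho: I should call a tool.<|eot_id|>",
    }
    msgs = convert_langtrace_to_openai_messages(attrs)
    # -> [{"role": "system", "content": "You are a helpful assistant."},
    #     {"role": "assistant", "content": "Tho: I should call a tool."}]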