ibm_watsonx_gov-1.3.3-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
ibm_watsonx_gov/evaluators/agentic_evaluator.py
@@ -0,0 +1,2725 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from threading import Lock
|
|
14
|
+
from typing import Annotated, Callable, List, Optional, Set
|
|
15
|
+
from uuid import uuid4
|
|
16
|
+
|
|
17
|
+
from pydantic import Field, PrivateAttr
|
|
18
|
+
|
|
19
|
+
from ibm_watsonx_gov.ai_experiments.ai_experiments_client import \
|
|
20
|
+
AIExperimentsClient
|
|
21
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration
|
|
22
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
23
|
+
TracingConfiguration
|
|
24
|
+
from ibm_watsonx_gov.entities import ai_experiment as ai_experiment_entity
|
|
25
|
+
from ibm_watsonx_gov.entities.agentic_app import AgenticApp, Node
|
|
26
|
+
from ibm_watsonx_gov.entities.agentic_evaluation_result import \
|
|
27
|
+
AgenticEvaluationResult
|
|
28
|
+
from ibm_watsonx_gov.entities.ai_evaluation import AIEvaluationAsset
|
|
29
|
+
from ibm_watsonx_gov.entities.ai_experiment import (AIExperiment,
|
|
30
|
+
AIExperimentRun,
|
|
31
|
+
AIExperimentRunRequest)
|
|
32
|
+
from ibm_watsonx_gov.entities.evaluation_result import AgentMetricResult
|
|
33
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
34
|
+
from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
|
|
35
|
+
from ibm_watsonx_gov.metric_groups.answer_quality.answer_quality_decorator import \
|
|
36
|
+
AnswerQualityDecorator
|
|
37
|
+
from ibm_watsonx_gov.metric_groups.content_safety.content_safety_decorator import \
|
|
38
|
+
ContentSafetyDecorator
|
|
39
|
+
from ibm_watsonx_gov.metric_groups.readability.readability_decorator import \
|
|
40
|
+
ReadabilityDecorator
|
|
41
|
+
from ibm_watsonx_gov.metric_groups.retrieval_quality.retrieval_quality_decorator import \
|
|
42
|
+
RetrievalQualityDecorator
|
|
43
|
+
from ibm_watsonx_gov.metrics.answer_relevance.answer_relevance_decorator import \
|
|
44
|
+
AnswerRelevanceDecorator
|
|
45
|
+
from ibm_watsonx_gov.metrics.answer_similarity.answer_similarity_decorator import \
|
|
46
|
+
AnswerSimilarityDecorator
|
|
47
|
+
from ibm_watsonx_gov.metrics.average_precision.average_precision_decorator import \
|
|
48
|
+
AveragePrecisionDecorator
|
|
49
|
+
from ibm_watsonx_gov.metrics.context_relevance.context_relevance_decorator import \
|
|
50
|
+
ContextRelevanceDecorator
|
|
51
|
+
from ibm_watsonx_gov.metrics.evasiveness.evasiveness_decorator import \
|
|
52
|
+
EvasivenessDecorator
|
|
53
|
+
from ibm_watsonx_gov.metrics.faithfulness.faithfulness_decorator import \
|
|
54
|
+
FaithfulnessDecorator
|
|
55
|
+
from ibm_watsonx_gov.metrics.hap.hap_decorator import HAPDecorator
|
|
56
|
+
from ibm_watsonx_gov.metrics.harm.harm_decorator import HarmDecorator
|
|
57
|
+
from ibm_watsonx_gov.metrics.harm_engagement.harm_engagement_decorator import \
|
|
58
|
+
HarmEngagementDecorator
|
|
59
|
+
from ibm_watsonx_gov.metrics.hit_rate.hit_rate_decorator import \
|
|
60
|
+
HitRateDecorator
|
|
61
|
+
from ibm_watsonx_gov.metrics.jailbreak.jailbreak_decorator import \
|
|
62
|
+
JailbreakDecorator
|
|
63
|
+
from ibm_watsonx_gov.metrics.keyword_detection.keyword_detection_decorator import \
|
|
64
|
+
KeywordDetectionDecorator
|
|
65
|
+
from ibm_watsonx_gov.metrics.ndcg.ndcg_decorator import NDCGDecorator
|
|
66
|
+
from ibm_watsonx_gov.metrics.pii.pii_decorator import PIIDecorator
|
|
67
|
+
from ibm_watsonx_gov.metrics.profanity.profanity_decorator import \
|
|
68
|
+
ProfanityDecorator
|
|
69
|
+
from ibm_watsonx_gov.metrics.prompt_safety_risk.prompt_safety_risk_decorator import \
|
|
70
|
+
PromptSafetyRiskDecorator
|
|
71
|
+
from ibm_watsonx_gov.metrics.reciprocal_rank.reciprocal_rank_decorator import \
|
|
72
|
+
ReciprocalRankDecorator
|
|
73
|
+
from ibm_watsonx_gov.metrics.regex_detection.regex_detection_decorator import \
|
|
74
|
+
RegexDetectionDecorator
|
|
75
|
+
from ibm_watsonx_gov.metrics.retrieval_precision.retrieval_precision_decorator import \
|
|
76
|
+
RetrievalPrecisionDecorator
|
|
77
|
+
from ibm_watsonx_gov.metrics.sexual_content.sexual_content_decorator import \
|
|
78
|
+
SexualContentDecorator
|
|
79
|
+
from ibm_watsonx_gov.metrics.social_bias.social_bias_decorator import \
|
|
80
|
+
SocialBiasDecorator
|
|
81
|
+
from ibm_watsonx_gov.metrics.text_grade_level.text_grade_level_decorator import \
|
|
82
|
+
TextGradeLevelDecorator
|
|
83
|
+
from ibm_watsonx_gov.metrics.text_reading_ease.text_reading_ease_decorator import \
|
|
84
|
+
TextReadingEaseDecorator
|
|
85
|
+
from ibm_watsonx_gov.metrics.tool_call_accuracy.tool_call_accuracy_decorator import \
|
|
86
|
+
ToolCallAccuracyDecorator
|
|
87
|
+
from ibm_watsonx_gov.metrics.tool_call_parameter_accuracy.tool_call_parameter_accuracy_decorator import \
|
|
88
|
+
ToolCallParameterAccuracyDecorator
|
|
89
|
+
from ibm_watsonx_gov.metrics.tool_call_relevance.tool_call_relevance_decorator import \
|
|
90
|
+
ToolCallRelevanceDecorator
|
|
91
|
+
from ibm_watsonx_gov.metrics.tool_call_syntactic_accuracy.tool_call_syntactic_accuracy_decorator import \
|
|
92
|
+
ToolCallSyntacticAccuracyDecorator
|
|
93
|
+
from ibm_watsonx_gov.metrics.topic_relevance.topic_relevance_decorator import \
|
|
94
|
+
TopicRelevanceDecorator
|
|
95
|
+
from ibm_watsonx_gov.metrics.unethical_behavior.unethical_behavior_decorator import \
|
|
96
|
+
UnethicalBehaviorDecorator
|
|
97
|
+
from ibm_watsonx_gov.metrics.unsuccessful_requests.unsuccessful_requests_decorator import \
|
|
98
|
+
UnsuccessfulRequestsDecorator
|
|
99
|
+
from ibm_watsonx_gov.metrics.violence.violence_decorator import \
|
|
100
|
+
ViolenceDecorator
|
|
101
|
+
from ibm_watsonx_gov.traces.span_util import get_attributes
|
|
102
|
+
from ibm_watsonx_gov.traces.trace_utils import TraceUtils
|
|
103
|
+
from ibm_watsonx_gov.utils.aggregation_util import \
|
|
104
|
+
get_agentic_evaluation_result
|
|
105
|
+
from ibm_watsonx_gov.utils.async_util import (gather_with_concurrency,
|
|
106
|
+
run_in_event_loop)
|
|
107
|
+
from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
|
|
108
|
+
from ibm_watsonx_gov.utils.python_utils import add_if_unique
|
|
109
|
+
from ibm_watsonx_gov.utils.singleton_meta import SingletonMeta
|
|
110
|
+
|
|
111
|
+
try:
|
|
112
|
+
from ibm_watsonx_gov.traces.span_exporter import WxGovSpanExporter
|
|
113
|
+
except Exception:
|
|
114
|
+
pass
|
|
115
|
+
|
|
116
|
+
logger = GovSDKLogger.get_logger(__name__)
|
|
117
|
+
PROCESS_TRACES = True
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
try:
|
|
121
|
+
from ibm_agent_analytics.instrumentation import agent_analytics_sdk
|
|
122
|
+
from ibm_agent_analytics.instrumentation.configs import OTLPCollectorConfig
|
|
123
|
+
from ibm_agent_analytics.instrumentation.utils import get_current_trace_id
|
|
124
|
+
except ImportError as e:
|
|
125
|
+
logger.warning(str(e))
|
|
126
|
+
PROCESS_TRACES = False
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
update_lock = Lock()
|
|
130
|
+
TRACE_LOG_FILE_NAME = os.getenv(
|
|
131
|
+
"TRACE_LOG_FILE_NAME", f"experiment_traces_{str(uuid4())}")
|
|
132
|
+
TRACE_LOG_FILE_PATH = os.getenv("TRACE_LOG_FILE_PATH", "./wxgov_traces")
|
|
133
|
+
|
|
134
|
+
AI_SERVICE_QUALITY = "ai_service_quality"
|
|
135
|
+
CUSTOM_METRICS = "custom_metrics"
|
|
136
|
+
MAX_CONCURRENCY = 10
|
|
137
|
+
AGENTIC_RESULT_COMPONENTS = ["conversation", "message", "node"]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class AgenticEvaluator(BaseEvaluator, metaclass=SingletonMeta):
|
|
141
|
+
"""
|
|
142
|
+
The class to evaluate agentic application.
|
|
143
|
+
|
|
144
|
+
Examples:
|
|
145
|
+
1. Evaluate Agent with default parameters. This will compute only the performance(latency, duration) and usage(cost, input_token_count, output_token_count) metrics.
|
|
146
|
+
.. code-block:: python
|
|
147
|
+
|
|
148
|
+
agentic_evaluator = AgenticEvaluator()
|
|
149
|
+
agentic_evaluator.start_run()
|
|
150
|
+
# Invoke the agentic application
|
|
151
|
+
agentic_evaluator.end_run()
|
|
152
|
+
result = agentic_evaluator.get_result()
|
|
153
|
+
|
|
154
|
+
2. Evaluate Agent by specifying the agent or message level metrics and the node level metrics which will be computed post graph invocation when end_run() is called.
|
|
155
|
+
.. code-block:: python
|
|
156
|
+
|
|
157
|
+
# Below example provides the node configuration to compute the ContextRelevanceMetric and all the Retrieval Quality group metrics.
|
|
158
|
+
nodes = [Node(name="Retrieval Node",
|
|
159
|
+
metrics_configurations=[MetricsConfiguration(metrics=[ContextRelevanceMetric()],
|
|
160
|
+
metric_groups=[MetricGroup.RETRIEVAL_QUALITY])])]
|
|
161
|
+
# Please refer to MetricsConfiguration class for advanced usage where the fields details can be specified, in case the graph state has the attributes with non default names.
|
|
162
|
+
|
|
163
|
+
# Below example provides the agent configuration to compute the AnswerRelevanceMetric and all the Content Safety group metrics on agent or message level.
|
|
164
|
+
agentic_app = AgenticApp(name="Agentic App",
|
|
165
|
+
metrics_configuration=MetricsConfiguration(metrics=[AnswerRelevanceMetric()],
|
|
166
|
+
metric_groups=[MetricGroup.CONTENT_SAFETY]),
|
|
167
|
+
nodes=nodes)
|
|
168
|
+
|
|
169
|
+
agentic_evaluator = AgenticEvaluator(agentic_app=agentic_app)
|
|
170
|
+
agentic_evaluator.start_run()
|
|
171
|
+
# Invoke the agentic application
|
|
172
|
+
agentic_evaluator.end_run()
|
|
173
|
+
result = agentic_evaluator.get_result()
|
|
174
|
+
|
|
175
|
+
3. Evaluate Agent by specifying the agent or message level metrics and use decorator to compute node level metrics which will be computed during graph invocation.
|
|
176
|
+
.. code-block:: python
|
|
177
|
+
|
|
178
|
+
# Below example provides the agent configuration to compute the AnswerRelevanceMetric and all the Content Safety group metrics on agent or message level.
|
|
179
|
+
# Agent or message level metrics will be computed post graph invocation when end_run() is called.
|
|
180
|
+
agentic_app = AgenticApp(name="Agentic App",
|
|
181
|
+
metrics_configuration=MetricsConfiguration(metrics=[AnswerRelevanceMetric()],
|
|
182
|
+
metric_groups=[MetricGroup.CONTENT_SAFETY]))
|
|
183
|
+
|
|
184
|
+
agentic_evaluator = AgenticEvaluator(agentic_app=agentic_app)
|
|
185
|
+
|
|
186
|
+
# Add decorator when defining the node functions
|
|
187
|
+
@evaluator.evaluate_retrieval_quality(configuration=AgenticAIConfiguration(**{"input_fields": ["input_text"], "context_fields": ["local_context"]}))
|
|
188
|
+
@evaluator.evaluate_content_safety() # Here the default AgenticAIConfiguration is used
|
|
189
|
+
def local_search_node(state: GraphState, config: RunnableConfig) -> dict:
|
|
190
|
+
# Retrieve data from vector db
|
|
191
|
+
# ...
|
|
192
|
+
return {"local_context": []}
|
|
193
|
+
|
|
194
|
+
agentic_evaluator.start_run()
|
|
195
|
+
# Invoke the agentic application
|
|
196
|
+
agentic_evaluator.end_run()
|
|
197
|
+
result = agentic_evaluator.get_result()
|
|
198
|
+
|
|
199
|
+
4. Evaluate agent with experiment tracking
|
|
200
|
+
.. code-block:: python
|
|
201
|
+
|
|
202
|
+
tracing_config = TracingConfiguration(project_id=project_id)
|
|
203
|
+
agentic_evaluator = AgenticEvaluator(tracing_configuration=tracing_config)
|
|
204
|
+
|
|
205
|
+
agentic_evaluator.track_experiment(name="my_experiment")
|
|
206
|
+
agentic_evaluator.start_run(AIExperimentRunRequest(name="run1"))
|
|
207
|
+
# Invoke the agentic application
|
|
208
|
+
agentic_evaluator.end_run()
|
|
209
|
+
result = agentic_evaluator.get_result()
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
"""
|
|
213
|
+
agentic_app: Annotated[Optional[AgenticApp],
|
|
214
|
+
Field(title="Agentic application configuration details",
|
|
215
|
+
description="The agentic application configuration details.",
|
|
216
|
+
default=None)]
|
|
217
|
+
tracing_configuration: Annotated[Optional[TracingConfiguration],
|
|
218
|
+
Field(title="Tracing Configuration",
|
|
219
|
+
description="The tracing configuration details.",
|
|
220
|
+
default=None)]
|
|
221
|
+
ai_experiment_client: Annotated[Optional[AIExperimentsClient],
|
|
222
|
+
Field(title="AI experiments client",
|
|
223
|
+
description="The AI experiment client object.",
|
|
224
|
+
default=None)]
|
|
225
|
+
max_concurrency: Annotated[int,
|
|
226
|
+
Field(title="Max Concurrency",
|
|
227
|
+
description="The maximum concurrency to use for evaluating metrics.",
|
|
228
|
+
default=MAX_CONCURRENCY)]
|
|
229
|
+
__latest_experiment_name: Annotated[Optional[str], PrivateAttr(
|
|
230
|
+
default=None)]
|
|
231
|
+
__latest_experiment_id: Annotated[Optional[str], PrivateAttr(
|
|
232
|
+
default=None)]
|
|
233
|
+
__experiment_results: Annotated[dict,
|
|
234
|
+
PrivateAttr(default={})]
|
|
235
|
+
__run_results: Annotated[dict[str, AgenticEvaluationResult],
|
|
236
|
+
PrivateAttr(default={})]
|
|
237
|
+
__online_metric_results: Annotated[list[AgentMetricResult],
|
|
238
|
+
PrivateAttr(default=[])]
|
|
239
|
+
"""__metric_results holds the results of all the evaluations done for a particular evaluation instance."""
|
|
240
|
+
__execution_counts: Annotated[dict[str, dict[str, int]],
|
|
241
|
+
PrivateAttr(default={})]
|
|
242
|
+
"""__execution_counts holds the execution count for a particular node, for a given record_id."""
|
|
243
|
+
__nodes_being_run: Annotated[dict[str, Set[str]],
|
|
244
|
+
PrivateAttr(default={})]
|
|
245
|
+
"""__nodes_being_run holds the name of the current nodes being run for a given record_id. Multiple decorators can be applied on a single node using chaining. We don't want to hold multiple copies of same node here."""
|
|
246
|
+
__latest_run_name: Annotated[str, PrivateAttr(default=None)]
|
|
247
|
+
__nodes: Annotated[list[Node], PrivateAttr(default=[])]
|
|
248
|
+
__experiment_run_details: Annotated[AIExperimentRun, PrivateAttr(
|
|
249
|
+
default=None)]
|
|
250
|
+
__custom_metrics: Annotated[List[dict], PrivateAttr(default=None)]
|
|
251
|
+
|
|
252
|
+
def __init__(self, /, **data):
|
|
253
|
+
"""
|
|
254
|
+
Initialize the AgenticEvaluator object and start the tracing framework.
|
|
255
|
+
"""
|
|
256
|
+
super().__init__(**data)
|
|
257
|
+
# Initialize the agent analytics sdk
|
|
258
|
+
if PROCESS_TRACES:
|
|
259
|
+
tracing_params = self.__get_tracing_params(
|
|
260
|
+
data.get("tracing_configuration"))
|
|
261
|
+
|
|
262
|
+
agent_analytics_sdk.initialize_logging(
|
|
263
|
+
tracer_type=agent_analytics_sdk.SUPPORTED_TRACER_TYPES.CUSTOM,
|
|
264
|
+
custom_exporter=WxGovSpanExporter(
|
|
265
|
+
tracing_params.get("enable_local_traces"),
|
|
266
|
+
tracing_params.get("enable_server_traces"),
|
|
267
|
+
file_name=TRACE_LOG_FILE_NAME,
|
|
268
|
+
storage_path=TRACE_LOG_FILE_PATH,
|
|
269
|
+
# manually passing endpoint and timeout
|
|
270
|
+
endpoint=tracing_params.get("endpoint"),
|
|
271
|
+
timeout=tracing_params.get("timeout"),
|
|
272
|
+
headers=tracing_params.get("headers"),
|
|
273
|
+
),
|
|
274
|
+
new_trace_on_workflow=True,
|
|
275
|
+
resource_attributes={
|
|
276
|
+
"wxgov.config.agentic_app": self.agentic_app.model_dump_json(exclude_none=True) if self.agentic_app else "",
|
|
277
|
+
**tracing_params.get("resource_attributes")
|
|
278
|
+
},
|
|
279
|
+
# Check: does this config has any effect on CUSTOM exporters
|
|
280
|
+
config=OTLPCollectorConfig(
|
|
281
|
+
**tracing_params.get("otlp_config_dict")) if tracing_params.get("otlp_config_dict") else None
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
self.__latest_experiment_name = "experiment_1"
|
|
285
|
+
|
|
286
|
+
def __get_tracing_params(self, tracing_config):
|
|
287
|
+
tracing_params = {
|
|
288
|
+
"enable_local_traces": True,
|
|
289
|
+
"enable_server_traces": False,
|
|
290
|
+
"endpoint": None,
|
|
291
|
+
"timeout": None,
|
|
292
|
+
"headers": None,
|
|
293
|
+
"resource_attributes": {},
|
|
294
|
+
"otlp_config_dict": {}
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
if tracing_config:
|
|
298
|
+
resource_attributes = tracing_config.resource_attributes
|
|
299
|
+
if tracing_config.project_id:
|
|
300
|
+
resource_attributes["wx-project-id"] = tracing_config.project_id
|
|
301
|
+
elif tracing_config.space_id:
|
|
302
|
+
resource_attributes["wx-space-id"] = tracing_config.space_id
|
|
303
|
+
tracing_params["resource_attributes"] = resource_attributes
|
|
304
|
+
otlp_collector_config = tracing_config.otlp_collector_config
|
|
305
|
+
|
|
306
|
+
if otlp_collector_config:
|
|
307
|
+
tracing_params["endpoint"] = otlp_collector_config.endpoint
|
|
308
|
+
tracing_params["timeout"] = otlp_collector_config.timeout
|
|
309
|
+
tracing_params["headers"] = otlp_collector_config.headers
|
|
310
|
+
tracing_params["otlp_config_dict"] = {k: v for k, v in otlp_collector_config.dict().items()
|
|
311
|
+
if k != "headers"}
|
|
312
|
+
tracing_params["enable_server_traces"] = True
|
|
313
|
+
tracing_params["enable_local_traces"] = tracing_config.log_traces_to_file
|
|
314
|
+
|
|
315
|
+
return tracing_params
|
|
316
|
+
|
|
317
|
+
def track_experiment(self, name: str = "experiment_1", description: str = None, use_existing: bool = True) -> str:
|
|
318
|
+
"""
|
|
319
|
+
Start tracking an experiment for the metrics evaluation.
|
|
320
|
+
The experiment will be created if it doesn't exist.
|
|
321
|
+
If an existing experiment with the same name is found, it will be reused based on the flag use_existing.
|
|
322
|
+
|
|
323
|
+
Args:
|
|
324
|
+
project_id (string): The project id to store the experiment.
|
|
325
|
+
name (string): The name of the experiment.
|
|
326
|
+
description (str): The description of the experiment.
|
|
327
|
+
use_existing (bool): The flag to specify if the experiment should be reused if an existing experiment with the given name is found.
|
|
328
|
+
|
|
329
|
+
Returns:
|
|
330
|
+
The ID of AI experiment asset
|
|
331
|
+
"""
|
|
332
|
+
self.__latest_experiment_name = name
|
|
333
|
+
# Checking if the ai_experiment_name already exists with given name if use_existing is enabled.
|
|
334
|
+
# If it does reuse it, otherwise creating a new ai_experiment
|
|
335
|
+
# Set the experiment_name and experiment_id
|
|
336
|
+
self.ai_experiment_client = AIExperimentsClient(
|
|
337
|
+
api_client=self.api_client,
|
|
338
|
+
project_id=self.tracing_configuration.project_id
|
|
339
|
+
)
|
|
340
|
+
ai_experiment = None
|
|
341
|
+
if use_existing:
|
|
342
|
+
ai_experiment = self.ai_experiment_client.search(name)
|
|
343
|
+
|
|
344
|
+
# If no AI experiment exists with specified name or use_existing is False, create new AI experiment
|
|
345
|
+
if not ai_experiment:
|
|
346
|
+
ai_experiment_details = AIExperiment(
|
|
347
|
+
name=name,
|
|
348
|
+
description=description or "AI experiment for Agent governance"
|
|
349
|
+
)
|
|
350
|
+
ai_experiment = self.ai_experiment_client.create(
|
|
351
|
+
ai_experiment_details)
|
|
352
|
+
|
|
353
|
+
ai_experiment_id = ai_experiment.asset_id
|
|
354
|
+
|
|
355
|
+
# Experiment id will be set when the experiment is tracked and not set when the experiment is not tracked
|
|
356
|
+
self.__latest_experiment_id = ai_experiment_id
|
|
357
|
+
self.__run_results = {}
|
|
358
|
+
return ai_experiment_id
|
|
359
|
+
|
|
360
|
+
def start_run(self, run_request: AIExperimentRunRequest = AIExperimentRunRequest(name="run_1")) -> AIExperimentRun:
|
|
361
|
+
"""
|
|
362
|
+
Start a run to track the metrics computation within an experiment.
|
|
363
|
+
This method is required to be called before any metrics computation.
|
|
364
|
+
|
|
365
|
+
Args:
|
|
366
|
+
run_request (AIExperimentRunRequest): The run_request instance containing name, source_name, source_url, custom_tags
|
|
367
|
+
|
|
368
|
+
Returns:
|
|
369
|
+
The details of experiment run like id, name, description etc.
|
|
370
|
+
"""
|
|
371
|
+
name = run_request.name
|
|
372
|
+
self.__latest_run_name = name
|
|
373
|
+
self.__experiment_results[self.__latest_experiment_name] = self.__run_results
|
|
374
|
+
self.__start_time = time.time()
|
|
375
|
+
# Having experiment id indicates user is tracking experiments
|
|
376
|
+
if self.__latest_experiment_id:
|
|
377
|
+
# Create run object, having experiment id indicates user is tracking experiments
|
|
378
|
+
self.__experiment_run_details = AIExperimentRun(
|
|
379
|
+
run_id=str(uuid4()),
|
|
380
|
+
run_name=name,
|
|
381
|
+
source_name=run_request.source_name,
|
|
382
|
+
source_url=run_request.source_url,
|
|
383
|
+
custom_tags=run_request.custom_tags,
|
|
384
|
+
agent_method_name=run_request.agent_method_name,
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
return self.__experiment_run_details
|
|
388
|
+
|
|
389
|
+
def log_custom_metrics(self, custom_metrics):
|
|
390
|
+
"""
|
|
391
|
+
Collect the custom metrics provided by user and append with metrics of current run.
|
|
392
|
+
|
|
393
|
+
Args:
|
|
394
|
+
custom_metrics (List[Dict]): custom metrics
|
|
395
|
+
"""
|
|
396
|
+
required_fields = ["name", "value"]
|
|
397
|
+
is_valid = True
|
|
398
|
+
for metric in custom_metrics:
|
|
399
|
+
# Check required fields
|
|
400
|
+
for key in required_fields:
|
|
401
|
+
if key not in metric or metric[key] in [None, ""]:
|
|
402
|
+
is_valid = False
|
|
403
|
+
|
|
404
|
+
# Conditional check: applies_to == "node" => node_name must exist and be non-empty
|
|
405
|
+
if metric.get("applies_to") == "node":
|
|
406
|
+
if "node_name" not in metric or metric["node_name"] in [None, ""]:
|
|
407
|
+
is_valid = False
|
|
408
|
+
|
|
409
|
+
if not is_valid:
|
|
410
|
+
message = "Invalid metrics formats. Required fields are 'name' and 'value'."
|
|
411
|
+
logger.error(message)
|
|
412
|
+
raise Exception(message)
|
|
413
|
+
|
|
414
|
+
self.__custom_metrics = custom_metrics
|
|
415
|
+
|
|
416
|
+
def end_run(self, track_notebook: Optional[bool] = False):
|
|
417
|
+
"""
|
|
418
|
+
End a run to collect and compute the metrics within the current run.
|
|
419
|
+
|
|
420
|
+
Args:
|
|
421
|
+
track_notebook (bool): flag to specify storing the notebook with current run
|
|
422
|
+
|
|
423
|
+
"""
|
|
424
|
+
eval_result = self.__compute_metrics_from_traces()
|
|
425
|
+
self.__run_results[self.__latest_run_name] = eval_result
|
|
426
|
+
# Having experiment id indicates user is tracking experiments and its needed to submit the run details
|
|
427
|
+
if self.__latest_experiment_id:
|
|
428
|
+
self.__store_run_results(track_notebook)
|
|
429
|
+
|
|
430
|
+
self.__reset_results()
|
|
431
|
+
|
|
432
|
+
def compare_ai_experiments(self,
|
|
433
|
+
ai_experiments: List[AIExperiment] = None,
|
|
434
|
+
ai_evaluation_details: AIEvaluationAsset = None
|
|
435
|
+
) -> str:
|
|
436
|
+
"""
|
|
437
|
+
Creates an AI Evaluation asset to compare AI experiment runs.
|
|
438
|
+
|
|
439
|
+
Args:
|
|
440
|
+
ai_experiments (List[AIExperiment], optional):
|
|
441
|
+
List of AI experiments to be compared. If all runs for an experiment need to be compared, then specify the runs value as empty list for the experiment.
|
|
442
|
+
ai_evaluation_details (AIEvaluationAsset, optional):
|
|
443
|
+
An instance of AIEvaluationAsset having details (name, description and metrics configuration)
|
|
444
|
+
Returns:
|
|
445
|
+
An instance of AIEvaluationAsset.
|
|
446
|
+
|
|
447
|
+
Examples:
|
|
448
|
+
1. Create AI evaluation with list of experiment IDs
|
|
449
|
+
|
|
450
|
+
.. code-block:: python
|
|
451
|
+
|
|
452
|
+
# Initialize the API client with credentials
|
|
453
|
+
api_client = APIClient(credentials=Credentials(api_key="", url="wos_url"))
|
|
454
|
+
|
|
455
|
+
# Create the instance of Agentic evaluator
|
|
456
|
+
evaluator = AgenticEvaluator(api_client=api_client, tracing_configuration=TracingConfiguration(project_id=project_id))
|
|
457
|
+
|
|
458
|
+
# [Optional] Define evaluation configuration
|
|
459
|
+
evaluation_config = EvaluationConfig(
|
|
460
|
+
monitors={
|
|
461
|
+
"agentic_ai_quality": {
|
|
462
|
+
"parameters": {
|
|
463
|
+
"metrics_configuration": {}
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
# Create the evaluation asset
|
|
470
|
+
ai_evaluation_details = AIEvaluationAsset(
|
|
471
|
+
name="AI Evaluation for agent",
|
|
472
|
+
evaluation_configuration=evaluation_config
|
|
473
|
+
)
|
|
474
|
+
|
|
475
|
+
# Compare two or more AI experiments using the evaluation asset
|
|
476
|
+
ai_experiment1 = AIExperiment(
|
|
477
|
+
asset_id = ai_experiment_id_1,
|
|
478
|
+
runs = [<Run1 details>, <Run2 details>] # Run details are returned by the start_run method
|
|
479
|
+
)
|
|
480
|
+
ai_experiment2 = AIExperiment(
|
|
481
|
+
asset_id = ai_experiment_id_2,
|
|
482
|
+
runs = [] # Empty list means all runs for this experiment will be compared
|
|
483
|
+
)
|
|
484
|
+
ai_evaluation_asset_href = evaluator.compare_ai_experiments(
|
|
485
|
+
ai_experiments = [ai_experiment_1, ai_experiment_2],
|
|
486
|
+
ai_evaluation_details=ai_evaluation_asset
|
|
487
|
+
)
|
|
488
|
+
"""
|
|
489
|
+
# If experiment runs to be compared are not provided, using all runs from the latest tracked experiment
|
|
490
|
+
if not ai_experiments:
|
|
491
|
+
ai_experiments = [AIExperiment(
|
|
492
|
+
asset_id=self.__latest_experiment_id, runs=[])]
|
|
493
|
+
|
|
494
|
+
# Construct experiment_runs map
|
|
495
|
+
ai_experiment_runs = {exp.asset_id: exp.runs for exp in ai_experiments}
|
|
496
|
+
|
|
497
|
+
ai_evaluation_asset = self.ai_experiment_client.create_ai_evaluation_asset(
|
|
498
|
+
ai_experiment_runs=ai_experiment_runs,
|
|
499
|
+
ai_evaluation_details=ai_evaluation_details
|
|
500
|
+
)
|
|
501
|
+
ai_evaluation_asset_href = self.ai_experiment_client.get_ai_evaluation_asset_href(
|
|
502
|
+
ai_evaluation_asset)
|
|
503
|
+
|
|
504
|
+
return ai_evaluation_asset_href
|
|
505
|
+
|
|
506
|
+
def __compute_metrics_from_traces(self):
|
|
507
|
+
"""
|
|
508
|
+
Computes the metrics using the traces collected in the log file.
|
|
509
|
+
"""
|
|
510
|
+
if not PROCESS_TRACES:
|
|
511
|
+
return
|
|
512
|
+
|
|
513
|
+
trace_log_file_path = Path(
|
|
514
|
+
f"{TRACE_LOG_FILE_PATH}/{TRACE_LOG_FILE_NAME}.log")
|
|
515
|
+
spans = []
|
|
516
|
+
for span in TraceUtils.stream_trace_data(trace_log_file_path):
|
|
517
|
+
spans.append(span)
|
|
518
|
+
|
|
519
|
+
metrics_result = []
|
|
520
|
+
coros = []
|
|
521
|
+
span_trees = TraceUtils.build_span_trees(
|
|
522
|
+
spans=spans, agentic_app=self.agentic_app)
|
|
523
|
+
for span_tree in span_trees:
|
|
524
|
+
# Process only the spans that are associated with the agent application
|
|
525
|
+
attrs = get_attributes(span_tree.span.attributes, [
|
|
526
|
+
"traceloop.span.kind"])
|
|
527
|
+
if not attrs.get("traceloop.span.kind") == "workflow":
|
|
528
|
+
continue
|
|
529
|
+
# Append coroutine for metric computation
|
|
530
|
+
coros.append(
|
|
531
|
+
TraceUtils.compute_metrics_from_trace_async(
|
|
532
|
+
span_tree=span_tree,
|
|
533
|
+
api_client=self.api_client,
|
|
534
|
+
max_concurrency=self.max_concurrency,
|
|
535
|
+
)
|
|
536
|
+
)
|
|
537
|
+
# Run all coroutines in parallel with concurrency control
|
|
538
|
+
results = run_in_event_loop(
|
|
539
|
+
gather_with_concurrency,
|
|
540
|
+
coros=coros,
|
|
541
|
+
max_concurrency=self.max_concurrency)
|
|
542
|
+
|
|
543
|
+
# Process results
|
|
544
|
+
for mr, ns, _ in results:
|
|
545
|
+
metrics_result.extend(mr)
|
|
546
|
+
for n in ns:
|
|
547
|
+
add_if_unique(n, self.__nodes, ["name", "func_name"], [
|
|
548
|
+
"foundation_models"])
|
|
549
|
+
|
|
550
|
+
return get_agentic_evaluation_result(
|
|
551
|
+
metrics_result=metrics_result, nodes=self.__nodes)
|
|
552
|
+
|
|
553
|
+
def __store_run_results(self, track_notebook: Optional[bool] = False):
|
|
554
|
+
|
|
555
|
+
aggregated_results = self.get_result().get_aggregated_metrics_results()
|
|
556
|
+
# Fetching the nodes details to update in experiment run
|
|
557
|
+
nodes = []
|
|
558
|
+
for node in self.get_nodes():
|
|
559
|
+
nodes.append(ai_experiment_entity.Node(
|
|
560
|
+
id=node.func_name, name=node.name, foundation_models=set(node.foundation_models)))
|
|
561
|
+
self.__experiment_run_details.nodes = nodes
|
|
562
|
+
# Duration of run in seconds
|
|
563
|
+
self.__experiment_run_details.duration = int(
|
|
564
|
+
time.time() - self.__start_time)
|
|
565
|
+
|
|
566
|
+
# Storing the run result as attachment and update the run info in AI experiment
|
|
567
|
+
# Todo - keeping the List[AggregateAgentMetricResult] - is that compatible? should store full AgenticEvaluationResult?
|
|
568
|
+
evaluation_result = {
|
|
569
|
+
AI_SERVICE_QUALITY: aggregated_results
|
|
570
|
+
}
|
|
571
|
+
# Adding custom metrics, if exist
|
|
572
|
+
if self.__custom_metrics:
|
|
573
|
+
evaluation_result[CUSTOM_METRICS] = self.__custom_metrics
|
|
574
|
+
|
|
575
|
+
self.ai_experiment_client.update(
|
|
576
|
+
self.__latest_experiment_id,
|
|
577
|
+
self.__experiment_run_details,
|
|
578
|
+
evaluation_result,
|
|
579
|
+
track_notebook,
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
def get_nodes(self) -> list[Node]:
|
|
583
|
+
"""
|
|
584
|
+
Get the list of nodes used in the agentic application
|
|
585
|
+
|
|
586
|
+
Return:
|
|
587
|
+
nodes (list[Node]): The list of nodes used in the agentic application
|
|
588
|
+
"""
|
|
589
|
+
return self.__nodes
|
|
590
|
+
|
|
591
|
+
def get_result(self, run_name: Optional[str] = None) -> AgenticEvaluationResult:
|
|
592
|
+
"""
|
|
593
|
+
Get the AgenticEvaluationResult for the run. By default the result for the latest run is returned.
|
|
594
|
+
Specify the run name to get the result for a specific run.
|
|
595
|
+
Args:
|
|
596
|
+
run_name (string): The evaluation run name
|
|
597
|
+
Return:
|
|
598
|
+
agentic_evaluation_result (AgenticEvaluationResult): The AgenticEvaluationResult object for the run.
|
|
599
|
+
"""
|
|
600
|
+
if run_name:
|
|
601
|
+
result = self.__run_results.get(run_name)
|
|
602
|
+
else:
|
|
603
|
+
result = self.__run_results.get(self.__latest_run_name)
|
|
604
|
+
|
|
605
|
+
return result
|
|
606
|
+
|
|
607
|
+
def get_metric_result(self, metric_name: str, node_name: str) -> AgentMetricResult:
|
|
608
|
+
"""
|
|
609
|
+
Get the AgentMetricResult for the given metric and node name.
|
|
610
|
+
This is used to get the result of the metric computed during agent execution.
|
|
611
|
+
|
|
612
|
+
Args:
|
|
613
|
+
metric_name (string): The metric name
|
|
614
|
+
node_name (string): The node name
|
|
615
|
+
Return:
|
|
616
|
+
agent_metric_result (AgentMetricResult): The AgentMetricResult object for the metric.
|
|
617
|
+
"""
|
|
618
|
+
for metric in self.__online_metric_results:
|
|
619
|
+
if metric.applies_to == "node" and metric.name == metric_name \
|
|
620
|
+
and metric.node_name == node_name and metric.message_id == get_current_trace_id():
|
|
621
|
+
return metric
|
|
622
|
+
|
|
623
|
+
return None
|
|
624
|
+
|
|
625
|
+
def __reset_results(self):
|
|
626
|
+
self.__online_metric_results.clear()
|
|
627
|
+
self.__execution_counts.clear()
|
|
628
|
+
self.__nodes_being_run.clear()
|
|
629
|
+
trace_log_file_path = Path(
|
|
630
|
+
f"{TRACE_LOG_FILE_PATH}/{TRACE_LOG_FILE_NAME}.log")
|
|
631
|
+
if os.path.exists(trace_log_file_path):
|
|
632
|
+
os.remove(trace_log_file_path)
|
|
633
|
+
|
|
634
|
+
def evaluate_context_relevance(self,
|
|
635
|
+
func: Optional[Callable] = None,
|
|
636
|
+
*,
|
|
637
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
638
|
+
metrics: list[GenAIMetric] = [],
|
|
639
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
640
|
+
"""
|
|
641
|
+
An evaluation decorator for computing context relevance metric on an agentic node.
|
|
642
|
+
|
|
643
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.ContextRelevanceMetric`
|
|
644
|
+
|
|
645
|
+
Args:
|
|
646
|
+
func (Optional[Callable], optional): The node on which the metric is to be computed.
|
|
647
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
648
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ContextRelevanceMetric() ].
|
|
649
|
+
compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
|
|
650
|
+
|
|
651
|
+
Raises:
|
|
652
|
+
Exception: If there is any error while evaluation.
|
|
653
|
+
|
|
654
|
+
Returns:
|
|
655
|
+
dict: The result of the wrapped node.
|
|
656
|
+
|
|
657
|
+
Examples:
|
|
658
|
+
1. Basic usage
|
|
659
|
+
.. code-block:: python
|
|
660
|
+
|
|
661
|
+
evaluator = AgenticEvaluator()
|
|
662
|
+
@evaluator.evaluate_context_relevance
|
|
663
|
+
def agentic_node(*args, **kwargs):
|
|
664
|
+
pass
|
|
665
|
+
|
|
666
|
+
2. Usage with different thresholds and methods
|
|
667
|
+
.. code-block:: python
|
|
668
|
+
|
|
669
|
+
metric_1 = ContextRelevanceMetric(
|
|
670
|
+
method="sentence_bert_bge", thresholds=MetricThreshold(type="lower_limit", value=0.5))
|
|
671
|
+
metric_2 = ContextRelevanceMetric(
|
|
672
|
+
method="sentence_bert_mini_lm", thresholds=MetricThreshold(type="lower_limit", value=0.6))
|
|
673
|
+
metric_3 = ContextRelevanceMetric(
|
|
674
|
+
method="granite_guardian", thresholds=MetricThreshold(type="lower_limit", value=0.6))
|
|
675
|
+
evaluator = AgenticEvaluator()
|
|
676
|
+
@evaluator.evaluate_context_relevance(metrics=[metric_1, metric_2, metric_3])
|
|
677
|
+
def agentic_node(*args, **kwargs):
|
|
678
|
+
pass
|
|
679
|
+
"""
|
|
680
|
+
return ContextRelevanceDecorator(api_client=self.api_client,
|
|
681
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
682
|
+
metric_results=self.__online_metric_results,
|
|
683
|
+
execution_counts=self.__execution_counts,
|
|
684
|
+
nodes_being_run=self.__nodes_being_run,
|
|
685
|
+
lock=update_lock,
|
|
686
|
+
compute_real_time=compute_real_time).evaluate_context_relevance(func, configuration=configuration, metrics=metrics)
|
|
687
|
+
|
|
688
|
+
def evaluate_average_precision(self,
|
|
689
|
+
func: Optional[Callable] = None,
|
|
690
|
+
*,
|
|
691
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
692
|
+
metrics: list[GenAIMetric] = [],
|
|
693
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
694
|
+
"""
|
|
695
|
+
An evaluation decorator for computing average precision metric on an agentic tool.
|
|
696
|
+
This metric uses context relevance values for computation; the context relevance metric is computed as a prerequisite.
|
|
697
|
+
|
|
698
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.AveragePrecisionMetric`
|
|
699
|
+
|
|
700
|
+
Args:
|
|
701
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
702
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
703
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ AveragePrecisionMetric() ].
|
|
704
|
+
|
|
705
|
+
Raises:
|
|
706
|
+
Exception: If there is any error while evaluation.
|
|
707
|
+
|
|
708
|
+
Returns:
|
|
709
|
+
dict: The result of the wrapped tool.
|
|
710
|
+
|
|
711
|
+
Example:
|
|
712
|
+
1. Basic usage
|
|
713
|
+
.. code-block:: python
|
|
714
|
+
|
|
715
|
+
evaluator = AgenticEvaluator()
|
|
716
|
+
@evaluator.evaluate_average_precision
|
|
717
|
+
def agentic_tool(*args, **kwargs):
|
|
718
|
+
pass
|
|
719
|
+
|
|
720
|
+
2. Usage with different thresholds and methods
|
|
721
|
+
.. code-block:: python
|
|
722
|
+
|
|
723
|
+
metric_1 = AveragePrecisionMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
724
|
+
metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
|
|
725
|
+
|
|
726
|
+
evaluator = AgenticEvaluator()
|
|
727
|
+
@evaluator.evaluate_average_precision(metrics=[metric_1, metric_2])
|
|
728
|
+
def agentic_tool(*args, **kwargs):
|
|
729
|
+
pass
|
|
730
|
+
"""
|
|
731
|
+
return AveragePrecisionDecorator(api_client=self.api_client,
|
|
732
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
733
|
+
metric_results=self.__online_metric_results,
|
|
734
|
+
execution_counts=self.__execution_counts,
|
|
735
|
+
nodes_being_run=self.__nodes_being_run,
|
|
736
|
+
lock=update_lock,
|
|
737
|
+
compute_real_time=compute_real_time).evaluate_average_precision(func, configuration=configuration, metrics=metrics)
|
|
738
|
+
|
|
739
|
+
def evaluate_ndcg(self,
|
|
740
|
+
func: Optional[Callable] = None,
|
|
741
|
+
*,
|
|
742
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
743
|
+
metrics: list[GenAIMetric] = [],
|
|
744
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
745
|
+
"""
|
|
746
|
+
An evaluation decorator for computing the NDCG metric on an agentic tool.
|
|
747
|
+
This metric uses context relevance values for computation; the context relevance metric is computed as a prerequisite.
|
|
748
|
+
|
|
749
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.NDCGMetric`
|
|
750
|
+
|
|
751
|
+
Args:
|
|
752
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
753
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
754
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ NDCGMetric() ].
|
|
755
|
+
|
|
756
|
+
Raises:
|
|
757
|
+
Exception: If there is any error while evaluation.
|
|
758
|
+
|
|
759
|
+
Returns:
|
|
760
|
+
dict: The result of the wrapped tool.
|
|
761
|
+
|
|
762
|
+
Example:
|
|
763
|
+
1. Basic usage
|
|
764
|
+
.. code-block:: python
|
|
765
|
+
|
|
766
|
+
evaluator = AgenticEvaluator()
|
|
767
|
+
@evaluator.evaluate_ndcg
|
|
768
|
+
def agentic_tool(*args, **kwargs):
|
|
769
|
+
pass
|
|
770
|
+
|
|
771
|
+
2. Usage with different thresholds and methods
|
|
772
|
+
.. code-block:: python
|
|
773
|
+
|
|
774
|
+
metric_1 = NDCGMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
775
|
+
metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
|
|
776
|
+
|
|
777
|
+
evaluator = AgenticEvaluator()
|
|
778
|
+
@evaluator.evaluate_ndcg(metrics=[metric_1, metric_2])
|
|
779
|
+
def agentic_tool(*args, **kwargs):
|
|
780
|
+
pass
|
|
781
|
+
"""
|
|
782
|
+
return NDCGDecorator(api_client=self.api_client,
|
|
783
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
784
|
+
metric_results=self.__online_metric_results,
|
|
785
|
+
execution_counts=self.__execution_counts,
|
|
786
|
+
nodes_being_run=self.__nodes_being_run,
|
|
787
|
+
lock=update_lock,
|
|
788
|
+
compute_real_time=compute_real_time).evaluate_ndcg(func, configuration=configuration, metrics=metrics)
|
|
789
|
+
|
|
790
|
+
def evaluate_reciprocal_rank(self,
|
|
791
|
+
func: Optional[Callable] = None,
|
|
792
|
+
*,
|
|
793
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
794
|
+
metrics: list[GenAIMetric] = [],
|
|
795
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
796
|
+
"""
|
|
797
|
+
An evaluation decorator for computing the reciprocal rank metric on an agentic tool.
|
|
798
|
+
This metric uses context relevance values for computation; the context relevance metric is computed as a prerequisite.
|
|
799
|
+
|
|
800
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.ReciprocalRankMetric`
|
|
801
|
+
|
|
802
|
+
Args:
|
|
803
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
804
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
805
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ReciprocalRankMetric() ].
|
|
806
|
+
|
|
807
|
+
Raises:
|
|
808
|
+
Exception: If there is any error while evaluation.
|
|
809
|
+
|
|
810
|
+
Returns:
|
|
811
|
+
dict: The result of the wrapped tool.
|
|
812
|
+
|
|
813
|
+
Example:
|
|
814
|
+
1. Basic usage
|
|
815
|
+
.. code-block:: python
|
|
816
|
+
|
|
817
|
+
evaluator = AgenticEvaluator()
|
|
818
|
+
@evaluator.evaluate_reciprocal_rank
|
|
819
|
+
def agentic_tool(*args, **kwargs):
|
|
820
|
+
pass
|
|
821
|
+
|
|
822
|
+
2. Usage with different thresholds and methods
|
|
823
|
+
.. code-block:: python
|
|
824
|
+
|
|
825
|
+
metric_1 = ReciprocalRankMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
826
|
+
metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
|
|
827
|
+
|
|
828
|
+
evaluator = AgenticEvaluator()
|
|
829
|
+
@evaluator.evaluate_reciprocal_rank(metrics=[metric_1, metric_2])
|
|
830
|
+
def agentic_tool(*args, **kwargs):
|
|
831
|
+
pass
|
|
832
|
+
"""
|
|
833
|
+
return ReciprocalRankDecorator(api_client=self.api_client,
|
|
834
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
835
|
+
metric_results=self.__online_metric_results,
|
|
836
|
+
execution_counts=self.__execution_counts,
|
|
837
|
+
nodes_being_run=self.__nodes_being_run,
|
|
838
|
+
lock=update_lock,
|
|
839
|
+
compute_real_time=compute_real_time).evaluate_reciprocal_rank(func, configuration=configuration, metrics=metrics)
|
|
840
|
+
|
|
841
|
+
def evaluate_retrieval_precision(self,
|
|
842
|
+
func: Optional[Callable] = None,
|
|
843
|
+
*,
|
|
844
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
845
|
+
metrics: list[GenAIMetric] = [],
|
|
846
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
847
|
+
"""
|
|
848
|
+
An evaluation decorator for computing retrieval precision metric on an agentic tool.
|
|
849
|
+
This metric uses context relevance values for computation; the context relevance metric is computed as a prerequisite.
|
|
850
|
+
|
|
851
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.RetrievalPrecisionMetric`
|
|
852
|
+
|
|
853
|
+
Args:
|
|
854
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
855
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
856
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ RetrievalPrecisionMetric() ].
|
|
857
|
+
|
|
858
|
+
Raises:
|
|
859
|
+
Exception: If there is any error while evaluation.
|
|
860
|
+
|
|
861
|
+
Returns:
|
|
862
|
+
dict: The result of the wrapped tool.
|
|
863
|
+
|
|
864
|
+
Example:
|
|
865
|
+
1. Basic usage
|
|
866
|
+
.. code-block:: python
|
|
867
|
+
|
|
868
|
+
evaluator = AgenticEvaluator()
|
|
869
|
+
@evaluator.evaluate_retrieval_precision
|
|
870
|
+
def agentic_tool(*args, **kwargs):
|
|
871
|
+
pass
|
|
872
|
+
|
|
873
|
+
2. Usage with different thresholds and methods
|
|
874
|
+
.. code-block:: python
|
|
875
|
+
|
|
876
|
+
metric_1 = RetrievalPrecisionMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
877
|
+
metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
|
|
878
|
+
|
|
879
|
+
evaluator = AgenticEvaluator()
|
|
880
|
+
@evaluator.evaluate_retrieval_precision(metrics=[metric_1, metric_2])
|
|
881
|
+
def agentic_tool(*args, **kwargs):
|
|
882
|
+
pass
|
|
883
|
+
"""
|
|
884
|
+
return RetrievalPrecisionDecorator(api_client=self.api_client,
|
|
885
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
886
|
+
metric_results=self.__online_metric_results,
|
|
887
|
+
execution_counts=self.__execution_counts,
|
|
888
|
+
nodes_being_run=self.__nodes_being_run,
|
|
889
|
+
lock=update_lock,
|
|
890
|
+
compute_real_time=compute_real_time).evaluate_retrieval_precision(func, configuration=configuration, metrics=metrics)
|
|
891
|
+
|
|
892
|
+
def evaluate_hit_rate(self,
|
|
893
|
+
func: Optional[Callable] = None,
|
|
894
|
+
*,
|
|
895
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
896
|
+
metrics: list[GenAIMetric] = [],
|
|
897
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
898
|
+
"""
|
|
899
|
+
An evaluation decorator for computing hit rate metric on an agentic tool.
|
|
900
|
+
This metric uses context relevance values for computation; the context relevance metric is computed as a prerequisite.
|
|
901
|
+
|
|
902
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.HitRateMetric`
|
|
903
|
+
|
|
904
|
+
Args:
|
|
905
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
906
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
907
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ HitRateMetric() ].
|
|
908
|
+
|
|
909
|
+
Raises:
|
|
910
|
+
Exception: If there is any error while evaluation.
|
|
911
|
+
|
|
912
|
+
Returns:
|
|
913
|
+
dict: The result of the wrapped tool.
|
|
914
|
+
|
|
915
|
+
Example:
|
|
916
|
+
1. Basic usage
|
|
917
|
+
.. code-block:: python
|
|
918
|
+
|
|
919
|
+
evaluator = AgenticEvaluator()
|
|
920
|
+
@evaluator.evaluate_hit_rate
|
|
921
|
+
def agentic_tool(*args, **kwargs):
|
|
922
|
+
pass
|
|
923
|
+
|
|
924
|
+
2. Usage with different thresholds and methods
|
|
925
|
+
.. code-block:: python
|
|
926
|
+
|
|
927
|
+
metric_1 = HitRateMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
928
|
+
metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
|
|
929
|
+
|
|
930
|
+
evaluator = AgenticEvaluator()
|
|
931
|
+
@evaluator.evaluate_hit_rate(metrics=[metric_1, metric_2])
|
|
932
|
+
def agentic_tool(*args, **kwargs):
|
|
933
|
+
pass
|
|
934
|
+
"""
|
|
935
|
+
return HitRateDecorator(api_client=self.api_client,
|
|
936
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
937
|
+
metric_results=self.__online_metric_results,
|
|
938
|
+
execution_counts=self.__execution_counts,
|
|
939
|
+
nodes_being_run=self.__nodes_being_run,
|
|
940
|
+
lock=update_lock,
|
|
941
|
+
compute_real_time=compute_real_time).evaluate_hit_rate(func, configuration=configuration, metrics=metrics)
|
|
942
|
+
|
|
943
|
+
def evaluate_answer_similarity(self,
|
|
944
|
+
func: Optional[Callable] = None,
|
|
945
|
+
*,
|
|
946
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
947
|
+
metrics: list[GenAIMetric] = [],
|
|
948
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
949
|
+
"""
|
|
950
|
+
An evaluation decorator for computing answer similarity metric on an agentic node.
|
|
951
|
+
|
|
952
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.AnswerSimilarityMetric`
|
|
953
|
+
|
|
954
|
+
Args:
|
|
955
|
+
func (Optional[Callable], optional): The node on which the metric is to be computed.
|
|
956
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
957
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ AnswerSimilarityMetric() ].
|
|
958
|
+
compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
|
|
959
|
+
|
|
960
|
+
Raises:
|
|
961
|
+
Exception: If there is any error while evaluation.
|
|
962
|
+
|
|
963
|
+
Returns:
|
|
964
|
+
dict: The result of the wrapped node.
|
|
965
|
+
|
|
966
|
+
Examples:
|
|
967
|
+
1. Basic usage
|
|
968
|
+
.. code-block:: python
|
|
969
|
+
|
|
970
|
+
evaluator = AgenticEvaluator()
|
|
971
|
+
@evaluator.evaluate_answer_similarity
|
|
972
|
+
def agentic_node(*args, **kwargs):
|
|
973
|
+
pass
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
2. Usage with different thresholds and methods
|
|
977
|
+
.. code-block:: python
|
|
978
|
+
|
|
979
|
+
metric_1 = AnswerSimilarityMetric(
|
|
980
|
+
method="token_k_precision", threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
981
|
+
metric_2 = AnswerSimilarityMetric(
|
|
982
|
+
method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
|
|
983
|
+
|
|
984
|
+
evaluator = AgenticEvaluator()
|
|
985
|
+
@evaluator.evaluate_answer_similarity(metrics=[metric_1, metric_2])
|
|
986
|
+
def agentic_node(*args, **kwargs):
|
|
987
|
+
pass
|
|
988
|
+
"""
|
|
989
|
+
|
|
990
|
+
return AnswerSimilarityDecorator(api_client=self.api_client,
|
|
991
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
992
|
+
metric_results=self.__online_metric_results,
|
|
993
|
+
execution_counts=self.__execution_counts,
|
|
994
|
+
nodes_being_run=self.__nodes_being_run,
|
|
995
|
+
lock=update_lock,
|
|
996
|
+
compute_real_time=compute_real_time).evaluate_answer_similarity(func, configuration=configuration, metrics=metrics)
|
|
997
|
+
|
|
998
|
+
def evaluate_faithfulness(self,
|
|
999
|
+
func: Optional[Callable] = None,
|
|
1000
|
+
*,
|
|
1001
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1002
|
+
metrics: list[GenAIMetric] = [],
|
|
1003
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1004
|
+
"""
|
|
1005
|
+
An evaluation decorator for computing faithfulness metric on an agentic node.
|
|
1006
|
+
|
|
1007
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.FaithfulnessMetric`
|
|
1008
|
+
|
|
1009
|
+
Args:
|
|
1010
|
+
func (Optional[Callable], optional): The node on which the metric is to be computed.
|
|
1011
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1012
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ FaithfulnessMetric() ].
|
|
1013
|
+
compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
|
|
1014
|
+
|
|
1015
|
+
Raises:
|
|
1016
|
+
Exception: If there is any error while evaluation.
|
|
1017
|
+
|
|
1018
|
+
Returns:
|
|
1019
|
+
dict: The result of the wrapped node.
|
|
1020
|
+
|
|
1021
|
+
Examples:
|
|
1022
|
+
1. Basic usage
|
|
1023
|
+
.. code-block:: python
|
|
1024
|
+
|
|
1025
|
+
evaluator = AgenticEvaluator()
|
|
1026
|
+
@evaluator.evaluate_faithfulness
|
|
1027
|
+
def agentic_node(*args, **kwargs):
|
|
1028
|
+
pass
|
|
1029
|
+
|
|
1030
|
+
2. Usage with different thresholds and methods
|
|
1031
|
+
.. code-block:: python
|
|
1032
|
+
|
|
1033
|
+
metric_1 = FaithfulnessMetric(method="token_k_precision", threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
1034
|
+
metric_2 = FaithfulnessMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
|
|
1035
|
+
|
|
1036
|
+
evaluator = AgenticEvaluator()
|
|
1037
|
+
@evaluator.evaluate_faithfulness(metrics=[metric_1, metric_2])
|
|
1038
|
+
def agentic_node(*args, **kwargs):
|
|
1039
|
+
pass
|
|
1040
|
+
"""
|
|
1041
|
+
|
|
1042
|
+
return FaithfulnessDecorator(api_client=self.api_client,
|
|
1043
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1044
|
+
metric_results=self.__online_metric_results,
|
|
1045
|
+
execution_counts=self.__execution_counts,
|
|
1046
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1047
|
+
lock=update_lock,
|
|
1048
|
+
compute_real_time=compute_real_time).evaluate_faithfulness(func, configuration=configuration, metrics=metrics)
|
|
1049
|
+
|
|
1050
|
+
def evaluate_unsuccessful_requests(self,
|
|
1051
|
+
func: Optional[Callable] = None,
|
|
1052
|
+
*,
|
|
1053
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1054
|
+
metrics: list[GenAIMetric] = [],
|
|
1055
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1056
|
+
"""
|
|
1057
|
+
An evaluation decorator for computing unsuccessful requests metric on an agentic tool.
|
|
1058
|
+
|
|
1059
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.UnsuccessfulRequestsMetric`
|
|
1060
|
+
|
|
1061
|
+
Args:
|
|
1062
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1063
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1064
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ UnsuccessfulRequestsMetric() ].
|
|
1065
|
+
|
|
1066
|
+
Raises:
|
|
1067
|
+
Exception: If there is any error while evaluation.
|
|
1068
|
+
|
|
1069
|
+
Returns:
|
|
1070
|
+
dict: The result of the wrapped tool.
|
|
1071
|
+
|
|
1072
|
+
Example:
|
|
1073
|
+
1. Basic usage
|
|
1074
|
+
.. code-block:: python
|
|
1075
|
+
|
|
1076
|
+
evaluator = AgenticEvaluator()
|
|
1077
|
+
@evaluator.evaluate_unsuccessful_requests
|
|
1078
|
+
def agentic_tool(*args, **kwargs):
|
|
1079
|
+
pass
|
|
1080
|
+
|
|
1081
|
+
2. Usage with different thresholds and methods
|
|
1082
|
+
.. code-block:: python
|
|
1083
|
+
|
|
1084
|
+
metric_1 = UnsuccessfulRequestsMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
|
|
1085
|
+
|
|
1086
|
+
evaluator = AgenticEvaluator()
|
|
1087
|
+
@evaluator.evaluate_unsuccessful_requests(metrics=[metric_1])
|
|
1088
|
+
def agentic_tool(*args, **kwargs):
|
|
1089
|
+
pass
|
|
1090
|
+
"""
|
|
1091
|
+
|
|
1092
|
+
return UnsuccessfulRequestsDecorator(api_client=self.api_client,
|
|
1093
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1094
|
+
metric_results=self.__online_metric_results,
|
|
1095
|
+
execution_counts=self.__execution_counts,
|
|
1096
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1097
|
+
lock=update_lock,
|
|
1098
|
+
compute_real_time=compute_real_time).evaluate_unsuccessful_requests(func, configuration=configuration, metrics=metrics)
|
|
1099
|
+
|
|
1100
|
+
def evaluate_answer_relevance(self,
|
|
1101
|
+
func: Optional[Callable] = None,
|
|
1102
|
+
*,
|
|
1103
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1104
|
+
metrics: list[GenAIMetric] = [],
|
|
1105
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1106
|
+
"""
|
|
1107
|
+
An evaluation decorator for computing answer relevance metric on an agentic tool.
|
|
1108
|
+
|
|
1109
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.AnswerRelevanceMetric`
|
|
1110
|
+
|
|
1111
|
+
Args:
|
|
1112
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1113
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1114
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ AnswerRelevanceMetric() ].
|
|
1115
|
+
|
|
1116
|
+
Raises:
|
|
1117
|
+
Exception: If there is any error while evaluation.
|
|
1118
|
+
|
|
1119
|
+
Returns:
|
|
1120
|
+
dict: The result of the wrapped tool.
|
|
1121
|
+
|
|
1122
|
+
Example:
|
|
1123
|
+
1. Basic usage
|
|
1124
|
+
.. code-block:: python
|
|
1125
|
+
|
|
1126
|
+
evaluator = AgenticEvaluator()
|
|
1127
|
+
@evaluator.evaluate_answer_relevance
|
|
1128
|
+
def agentic_tool(*args, **kwargs):
|
|
1129
|
+
pass
|
|
1130
|
+
|
|
1131
|
+
2. Usage with different thresholds and methods
|
|
1132
|
+
.. code-block:: python
|
|
1133
|
+
|
|
1134
|
+
metric_1 = AnswerRelevanceMetric(method="token_recall", thresholds=[MetricThreshold(type="lower_limit", value=0.5)])
|
|
1135
|
+
metric_2 = AnswerRelevanceMetric(method="granite_guardian", thresholds=[MetricThreshold(type="lower_limit", value=0.5)])
|
|
1136
|
+
|
|
1137
|
+
evaluator = AgenticEvaluator()
|
|
1138
|
+
@evaluator.evaluate_answer_relevance(metrics=[metric_1, metric_2])
|
|
1139
|
+
def agentic_tool(*args, **kwargs):
|
|
1140
|
+
pass
|
|
1141
|
+
"""
|
|
1142
|
+
|
|
1143
|
+
return AnswerRelevanceDecorator(api_client=self.api_client,
|
|
1144
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1145
|
+
metric_results=self.__online_metric_results,
|
|
1146
|
+
execution_counts=self.__execution_counts,
|
|
1147
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1148
|
+
lock=update_lock,
|
|
1149
|
+
compute_real_time=compute_real_time).evaluate_answer_relevance(func, configuration=configuration, metrics=metrics)
|
|
1150
|
+
|
|
1151
|
+
def evaluate_general_quality_with_llm(self,
|
|
1152
|
+
func: Optional[Callable] = None,
|
|
1153
|
+
*,
|
|
1154
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1155
|
+
metrics: list[GenAIMetric] = [],
|
|
1156
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1157
|
+
"""
|
|
1158
|
+
An evaluation decorator for computing the LLM validation metric on an agentic node.
|
|
1159
|
+
|
|
1160
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.LLMValidationMetric`
|
|
1161
|
+
|
|
1162
|
+
Args:
|
|
1163
|
+
func (Optional[Callable], optional): The node on which the metric is to be computed.
|
|
1164
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1165
|
+
metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
|
|
1166
|
+
compute_real_time (Optional[bool], optional): The flag to indicate whether the metric should be computed along with the node execution or not.
|
|
1167
|
+
When compute_real_time is set to False, the evaluate_metrics method should be invoked on the AgenticEvaluator to compute the metric.
|
|
1168
|
+
|
|
1169
|
+
Raises:
|
|
1170
|
+
Exception: If there is any error while evaluation.
|
|
1171
|
+
|
|
1172
|
+
Returns:
|
|
1173
|
+
dict: The result of the wrapped node.
|
|
1174
|
+
|
|
1175
|
+
Examples:
|
|
1176
|
+
1. Basic usage
|
|
1177
|
+
.. code-block:: python
|
|
1178
|
+
|
|
1179
|
+
evaluator = AgenticEvaluator()
|
|
1180
|
+
@evaluator.evaluate_general_quality_with_llm
|
|
1181
|
+
def agentic_node(*args, **kwargs):
|
|
1182
|
+
pass
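
2. Usage with deferred computation
A minimal, illustrative sketch; ``my_llm_metrics`` is a placeholder for a list of LLM-based validation metrics built elsewhere, and the deferred metrics are computed later by invoking evaluate_metrics on the evaluator, as noted above.
.. code-block:: python

    evaluator = AgenticEvaluator()
    @evaluator.evaluate_general_quality_with_llm(metrics=my_llm_metrics, compute_real_time=False)
    def agentic_node(*args, **kwargs):
        pass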
|
|
1183
|
+
"""
|
|
1184
|
+
return LLMValidationDecorator(api_client=self.api_client,
|
|
1185
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1186
|
+
metric_results=self.__online_metric_results,
|
|
1187
|
+
execution_counts=self.__execution_counts,
|
|
1188
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1189
|
+
lock=update_lock,
|
|
1190
|
+
compute_real_time=compute_real_time).evaluate_general_quality_with_llm(func,
|
|
1191
|
+
configuration=configuration,
|
|
1192
|
+
metrics=metrics)
|
|
1193
|
+
|
|
1194
|
+
def evaluate_tool_call_parameter_accuracy(self,
|
|
1195
|
+
func: Optional[Callable] = None,
|
|
1196
|
+
*,
|
|
1197
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1198
|
+
metrics: list[GenAIMetric] = [],
|
|
1199
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1200
|
+
"""
|
|
1201
|
+
An evaluation decorator for computing tool_call_parameter_accuracy metric on an agentic tool.
|
|
1202
|
+
|
|
1203
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallParameterAccuracyMetric`
|
|
1204
|
+
|
|
1205
|
+
Args:
|
|
1206
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1207
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1208
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallParameterAccuracyMetric() ].
|
|
1209
|
+
|
|
1210
|
+
Raises:
|
|
1211
|
+
Exception: If there is any error while evaluation.
|
|
1212
|
+
|
|
1213
|
+
Returns:
|
|
1214
|
+
dict: The result of the wrapped tool.
|
|
1215
|
+
|
|
1216
|
+
Example:
|
|
1217
|
+
1. Basic usage
|
|
1218
|
+
.. code-block:: python
|
|
1219
|
+
|
|
1220
|
+
evaluator = AgenticEvaluator()
|
|
1221
|
+
tool_calls_metric_config={
|
|
1222
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1223
|
+
}
|
|
1224
|
+
llm_judge = LLMJudge(
|
|
1225
|
+
model=WxAIFoundationModel(
|
|
1226
|
+
model_id="meta-llama/llama-3-3-70b-instruct",
|
|
1227
|
+
project_id=os.getenv("WATSONX_PROJECT_ID"),
|
|
1228
|
+
)
|
|
1229
|
+
)
|
|
1230
|
+
metric_1 = ToolCallParameterAccuracyMetric(llm_judge=llm_judge)
|
|
1231
|
+
@evaluator.evaluate_tool_call_parameter_accuracy(configuration=AgenticAIConfiguration(**tool_calls_metric_config), metrics=[metric_1])
|
|
1232
|
+
def agentic_tool(*args, **kwargs):
|
|
1233
|
+
pass
|
|
1234
|
+
|
|
1235
|
+
2. Usage with custom tool calls field
|
|
1236
|
+
.. code-block:: python
|
|
1237
|
+
|
|
1238
|
+
evaluator = AgenticEvaluator()
|
|
1239
|
+
tool_calls_metric_config={
|
|
1240
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1241
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1242
|
+
}
|
|
1243
|
+
llm_judge = LLMJudge(
|
|
1244
|
+
model=WxAIFoundationModel(
|
|
1245
|
+
model_id="meta-llama/llama-3-3-70b-instruct",
|
|
1246
|
+
project_id=os.getenv("WATSONX_PROJECT_ID"),
|
|
1247
|
+
)
|
|
1248
|
+
)
|
|
1249
|
+
metric_1 = ToolCallParameterAccuracyMetric(llm_judge=llm_judge)
|
|
1250
|
+
@evaluator.evaluate_tool_call_parameter_accuracy(configuration=AgenticAIConfiguration(**tool_calls_metric_config), metrics=[metric_1])
|
|
1251
|
+
def agentic_tool(*args, **kwargs):
|
|
1252
|
+
pass
|
|
1253
|
+
|
|
1254
|
+
3. Usage with different thresholds
|
|
1255
|
+
.. code-block:: python
|
|
1256
|
+
|
|
1257
|
+
llm_judge = LLMJudge(
|
|
1258
|
+
model=WxAIFoundationModel(
|
|
1259
|
+
model_id="meta-llama/llama-3-3-70b-instruct",
|
|
1260
|
+
project_id=os.getenv("WATSONX_PROJECT_ID"),
|
|
1261
|
+
)
|
|
1262
|
+
)
|
|
1263
|
+
metric_1 = ToolCallParameterAccuracyMetric(llm_judge=llm_judge, threshold=MetricThreshold(type="upper_limit", value=0.7))
|
|
1264
|
+
evaluator = AgenticEvaluator()
|
|
1265
|
+
tool_calls_metric_config={
|
|
1266
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1267
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1268
|
+
}
|
|
1269
|
+
@evaluator.evaluate_tool_call_parameter_accuracy(configuration=AgenticAIConfiguration(**tool_calls_metric_config),metrics=[metric_1])
|
|
1270
|
+
def agentic_tool(*args, **kwargs):
|
|
1271
|
+
pass
|
|
1272
|
+
"""
|
|
1273
|
+
|
|
1274
|
+
return ToolCallParameterAccuracyDecorator(api_client=self.api_client,
|
|
1275
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1276
|
+
metric_results=self.__online_metric_results,
|
|
1277
|
+
execution_counts=self.__execution_counts,
|
|
1278
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1279
|
+
lock=update_lock,
|
|
1280
|
+
compute_real_time=compute_real_time).evaluate_tool_call_parameter_accuracy(func, configuration=configuration, metrics=metrics)
|
|
1281
|
+
|
|
1282
|
+
def evaluate_tool_call_relevance(self,
|
|
1283
|
+
func: Optional[Callable] = None,
|
|
1284
|
+
*,
|
|
1285
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1286
|
+
metrics: list[GenAIMetric] = [],
|
|
1287
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1288
|
+
"""
|
|
1289
|
+
An evaluation decorator for computing tool_call_relevance metric on an agent tool.
|
|
1290
|
+
|
|
1291
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallRelevanceMetric`
|
|
1292
|
+
|
|
1293
|
+
Args:
|
|
1294
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1295
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1296
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallRelevanceMetric() ].
|
|
1297
|
+
|
|
1298
|
+
Raises:
|
|
1299
|
+
Exception: If there is any error while evaluation.
|
|
1300
|
+
|
|
1301
|
+
Returns:
|
|
1302
|
+
dict: The result of the wrapped tool.
|
|
1303
|
+
|
|
1304
|
+
Example:
|
|
1305
|
+
1. Basic usage
|
|
1306
|
+
.. code-block:: python
|
|
1307
|
+
|
|
1308
|
+
evaluator = AgenticEvaluator()
|
|
1309
|
+
tool_call_relevance_config={
|
|
1310
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1311
|
+
}
|
|
1312
|
+
llm_judge = LLMJudge(
|
|
1313
|
+
model=WxAIFoundationModel(
|
|
1314
|
+
model_id="meta-llama/llama-3-3-70b-instruct",
|
|
1315
|
+
project_id=os.getenv("WATSONX_PROJECT_ID"),
|
|
1316
|
+
)
|
|
1317
|
+
)
|
|
1318
|
+
metric_1 = ToolCallRelevanceMetric(llm_judge=llm_judge)
|
|
1319
|
+
@evaluator.evaluate_tool_call_relevance(configuration=AgenticAIConfiguration(**tool_call_relevance_config), metrics=[metric_1])
|
|
1320
|
+
def agentic_tool(*args, **kwargs):
|
|
1321
|
+
pass
|
|
1322
|
+
|
|
1323
|
+
2. Usage with custom tool calls field
|
|
1324
|
+
.. code-block:: python
|
|
1325
|
+
|
|
1326
|
+
evaluator = AgenticEvaluator()
|
|
1327
|
+
tool_call_relevance_config={
|
|
1328
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1329
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1330
|
+
}
|
|
1331
|
+
llm_judge = LLMJudge(
|
|
1332
|
+
model=WxAIFoundationModel(
|
|
1333
|
+
model_id="meta-llama/llama-3-3-70b-instruct",
|
|
1334
|
+
project_id=os.getenv("WATSONX_PROJECT_ID"),
|
|
1335
|
+
)
|
|
1336
|
+
)
|
|
1337
|
+
metric_1 = ToolCallRelevanceMetric(llm_judge=llm_judge)
|
|
1338
|
+
@evaluator.evaluate_tool_call_relevance(configuration=AgenticAIConfiguration(**tool_call_relevance_config), metrics=[metric_1])
|
|
1339
|
+
def agentic_tool(*args, **kwargs):
|
|
1340
|
+
pass
|
|
1341
|
+
|
|
1342
|
+
3. Usage with different thresholds
|
|
1343
|
+
.. code-block:: python
|
|
1344
|
+
|
|
1345
|
+
llm_judge = LLMJudge(
|
|
1346
|
+
model=WxAIFoundationModel(
|
|
1347
|
+
model_id="meta-llama/llama-3-3-70b-instruct",
|
|
1348
|
+
project_id=os.getenv("WATSONX_PROJECT_ID"),
|
|
1349
|
+
)
|
|
1350
|
+
)
|
|
1351
|
+
metric_1 = ToolCallRelevanceMetric(llm_judge=llm_judge, threshold=MetricThreshold(type="upper_limit", value=0.7))
|
|
1352
|
+
evaluator = AgenticEvaluator()
|
|
1353
|
+
tool_call_relevance_config={
|
|
1354
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1355
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1356
|
+
}
|
|
1357
|
+
@evaluator.evaluate_tool_call_relevance(configuration=AgenticAIConfiguration(**tool_call_relevance_config),metrics=[metric_1])
|
|
1358
|
+
def agentic_tool(*args, **kwargs):
|
|
1359
|
+
pass
|
|
1360
|
+
"""
|
|
1361
|
+
|
|
1362
|
+
return ToolCallRelevanceDecorator(api_client=self.api_client,
|
|
1363
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1364
|
+
metric_results=self.__online_metric_results,
|
|
1365
|
+
execution_counts=self.__execution_counts,
|
|
1366
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1367
|
+
lock=update_lock,
|
|
1368
|
+
compute_real_time=compute_real_time).evaluate_tool_call_relevance(func, configuration=configuration, metrics=metrics)
|
|
1369
|
+
|
|
1370
|
+
def evaluate_tool_call_syntactic_accuracy(self,
|
|
1371
|
+
func: Optional[Callable] = None,
|
|
1372
|
+
*,
|
|
1373
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1374
|
+
metrics: list[GenAIMetric] = [],
|
|
1375
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1376
|
+
"""
|
|
1377
|
+
An evaluation decorator for computing tool_call_syntactic_accuracy metric on an agent tool.
|
|
1378
|
+
|
|
1379
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallSyntacticAccuracyMetric`
|
|
1380
|
+
|
|
1381
|
+
Args:
|
|
1382
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1383
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1384
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallSyntacticAccuracyMetric() ].
|
|
1385
|
+
|
|
1386
|
+
Raises:
|
|
1387
|
+
Exception: If there is any error while evaluation.
|
|
1388
|
+
|
|
1389
|
+
Returns:
|
|
1390
|
+
dict: The result of the wrapped tool.
|
|
1391
|
+
|
|
1392
|
+
Example:
|
|
1393
|
+
1. Basic usage
|
|
1394
|
+
.. code-block:: python
|
|
1395
|
+
|
|
1396
|
+
evaluator = AgenticEvaluator()
|
|
1397
|
+
tool_call_syntactic_metric_config={
|
|
1398
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1399
|
+
}
|
|
1400
|
+
@evaluator.evaluate_tool_call_syntactic_accuracy(configuration=AgenticAIConfiguration(**tool_call_syntactic_metric_config))
|
|
1401
|
+
def agentic_tool(*args, **kwargs):
|
|
1402
|
+
pass
|
|
1403
|
+
|
|
1404
|
+
2. Usage with custom tool calls field
|
|
1405
|
+
.. code-block:: python
|
|
1406
|
+
|
|
1407
|
+
evaluator = AgenticEvaluator()
|
|
1408
|
+
tool_call_syntactic_metric_config={
|
|
1409
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1410
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1411
|
+
}
|
|
1412
|
+
@evaluator.evaluate_tool_call_syntactic_accuracy(configuration=AgenticAIConfiguration(**tool_call_syntactic_metric_config))
|
|
1413
|
+
def agentic_tool(*args, **kwargs):
|
|
1414
|
+
pass
|
|
1415
|
+
|
|
1416
|
+
3. Usage with different thresholds
|
|
1417
|
+
.. code-block:: python
|
|
1418
|
+
|
|
1419
|
+
metric_1 = ToolCallSyntacticAccuracyMetric(threshold=MetricThreshold(type="upper_limit", value=0.7))
|
|
1420
|
+
evaluator = AgenticEvaluator()
|
|
1421
|
+
tool_call_syntactic_metric_config={
|
|
1422
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1423
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1424
|
+
}
|
|
1425
|
+
@evaluator.evaluate_tool_call_syntactic_accuracy(configuration=AgenticAIConfiguration(**tool_call_syntactic_metric_config),metrics=[metric_1])
|
|
1426
|
+
def agentic_tool(*args, **kwargs):
|
|
1427
|
+
pass
|
|
1428
|
+
"""
|
|
1429
|
+
return ToolCallSyntacticAccuracyDecorator(api_client=self.api_client,
|
|
1430
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1431
|
+
metric_results=self.__online_metric_results,
|
|
1432
|
+
execution_counts=self.__execution_counts,
|
|
1433
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1434
|
+
lock=update_lock,
|
|
1435
|
+
compute_real_time=compute_real_time).evaluate_tool_call_syntactic_accuracy(func, configuration=configuration, metrics=metrics)
|
|
1436
|
+
|
|
1437
|
+
def evaluate_tool_call_accuracy(self,
|
|
1438
|
+
func: Optional[Callable] = None,
|
|
1439
|
+
*,
|
|
1440
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1441
|
+
metrics: list[GenAIMetric] = [],
|
|
1442
|
+
compute_real_time: Optional[bool] = True) -> dict:
|
|
1443
|
+
"""
|
|
1444
|
+
An evaluation decorator for computing tool_call_accuracy metric on an agent tool.
|
|
1445
|
+
|
|
1446
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.ToolCallAccuracyMetric`
|
|
1447
|
+
|
|
1448
|
+
Args:
|
|
1449
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1450
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1451
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ToolCallAccuracyMetric() ].
|
|
1452
|
+
|
|
1453
|
+
Raises:
|
|
1454
|
+
Exception: If there is any error while evaluation.
|
|
1455
|
+
|
|
1456
|
+
Returns:
|
|
1457
|
+
dict: The result of the wrapped tool.
|
|
1458
|
+
|
|
1459
|
+
Example:
|
|
1460
|
+
1. Basic usage
|
|
1461
|
+
.. code-block:: python
|
|
1462
|
+
|
|
1463
|
+
evaluator = AgenticEvaluator()
|
|
1464
|
+
tool_call_metric_config={
|
|
1465
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1466
|
+
}
|
|
1467
|
+
@evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config))
|
|
1468
|
+
def agentic_tool(*args, **kwargs):
|
|
1469
|
+
pass
|
|
1470
|
+
|
|
1471
|
+
2. Usage with custom tool calls field
|
|
1472
|
+
.. code-block:: python
|
|
1473
|
+
|
|
1474
|
+
evaluator = AgenticEvaluator()
|
|
1475
|
+
tool_call_metric_config={
|
|
1476
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1477
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1478
|
+
}
|
|
1479
|
+
@evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config))
|
|
1480
|
+
def agentic_tool(*args, **kwargs):
|
|
1481
|
+
pass
|
|
1482
|
+
|
|
1483
|
+
3. Usage with different thresholds
|
|
1484
|
+
.. code-block:: python
|
|
1485
|
+
|
|
1486
|
+
metric_1 = ToolCallAccuracyMetric(threshold=MetricThreshold(type="upper_limit", value=0.7))
|
|
1487
|
+
metric_2 = ToolCallAccuracyMetric(threshold=MetricThreshold(type="upper_limit", value=0.9))
|
|
1488
|
+
evaluator = AgenticEvaluator()
|
|
1489
|
+
tool_call_metric_config={
|
|
1490
|
+
"tools":[get_weather, fetch_stock_price], # List of tools available to the agent
|
|
1491
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1492
|
+
}
|
|
1493
|
+
@evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config),metrics=[metric_1, metric_2])
|
|
1494
|
+
def agentic_tool(*args, **kwargs):
|
|
1495
|
+
pass
|
|
1496
|
+
|
|
1497
|
+
4. Usage with a list of dictionary items as tools
|
|
1498
|
+
.. code-block:: python
|
|
1499
|
+
available_tools = [{
    "type": "function",
    "function": {
        "name": "f1_name",
        "description": "f1_description.",
        "parameters": {
            "parameter1": {
                "description": "parameter_description",
                "type": "parameter_type",
                "default": "default_value"
            }
        }
    }
}]
|
|
1500
|
+
tool_call_metric_config={
|
|
1501
|
+
"tools":available_tools, # List of tools available to the agent
|
|
1502
|
+
"tool_calls_field": "tool_calls" # Graph state field to store the Agent's response/tool calls
|
|
1503
|
+
}
|
|
1504
|
+
metric = ToolCallAccuracyMetric()
|
|
1505
|
+
evaluator = AgenticEvaluator()
|
|
1506
|
+
@evaluator.evaluate_tool_call_accuracy(configuration=AgenticAIConfiguration(**tool_call_metric_config),metrics=[metric])
|
|
1507
|
+
def agentic_tool(*args, **kwargs):
|
|
1508
|
+
pass
|
|
1509
|
+
"""
|
|
1510
|
+
return ToolCallAccuracyDecorator(api_client=self.api_client,
|
|
1511
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1512
|
+
metric_results=self.__online_metric_results,
|
|
1513
|
+
execution_counts=self.__execution_counts,
|
|
1514
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1515
|
+
lock=update_lock,
|
|
1516
|
+
compute_real_time=compute_real_time).evaluate_tool_call_accuracy(func, configuration=configuration, metrics=metrics)
|
|
1517
|
+
|
|
1518
|
+
def evaluate_prompt_safety_risk(self,
|
|
1519
|
+
func: Optional[Callable] = None,
|
|
1520
|
+
*,
|
|
1521
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1522
|
+
metrics: list[GenAIMetric],
|
|
1523
|
+
compute_real_time: Optional[bool] = True,
|
|
1524
|
+
) -> dict:
|
|
1525
|
+
"""
|
|
1526
|
+
An evaluation decorator for computing prompt safety risk metric on an agentic tool.
|
|
1527
|
+
|
|
1528
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.PromptSafetyRiskMetric`
|
|
1529
|
+
|
|
1530
|
+
Args:
|
|
1531
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1532
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1533
|
+
metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
|
|
1534
|
+
|
|
1535
|
+
Raises:
|
|
1536
|
+
Exception: If there is any error while evaluation.
|
|
1537
|
+
|
|
1538
|
+
Returns:
|
|
1539
|
+
dict: The result of the wrapped tool.
|
|
1540
|
+
|
|
1541
|
+
Example:
|
|
1542
|
+
1. Create evaluate_prompt_safety_risk decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
|
|
1543
|
+
.. code-block:: python
|
|
1544
|
+
|
|
1545
|
+
evaluator = AgenticEvaluator()
|
|
1546
|
+
@evaluator.evaluate_prompt_safety_risk(metrics=[PromptSafetyRiskMetric(system_prompt="...")])
|
|
1547
|
+
def agentic_tool(*args, **kwargs):
|
|
1548
|
+
pass
|
|
1549
|
+
|
|
1550
|
+
2. Create evaluate_prompt_safety_risk decorator with thresholds and configuration
|
|
1551
|
+
.. code-block:: python
|
|
1552
|
+
|
|
1553
|
+
metric = PromptSafetyRiskMetric(system_prompt="...", thresholds=MetricThreshold(type="lower_limit", value=0.7))
|
|
1554
|
+
config = {"input_fields": ["input"]}
|
|
1555
|
+
configuration = AgenticAIConfiguration(**config)
|
|
1556
|
+
evaluator = AgenticEvaluator()
|
|
1557
|
+
@evaluator.evaluate_prompt_safety_risk(metrics=[metric], configuration=configuration)
|
|
1558
|
+
def agentic_tool(*args, **kwargs):
|
|
1559
|
+
pass
|
|
1560
|
+
"""
|
|
1561
|
+
return PromptSafetyRiskDecorator(api_client=self.api_client,
|
|
1562
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1563
|
+
metric_results=self.__online_metric_results,
|
|
1564
|
+
execution_counts=self.__execution_counts,
|
|
1565
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1566
|
+
lock=update_lock,
|
|
1567
|
+
compute_real_time=compute_real_time).evaluate_prompt_safety_risk(func, configuration=configuration, metrics=metrics)
|
|
1568
|
+
|
|
1569
|
+
def evaluate_hap(self,
|
|
1570
|
+
func: Optional[Callable] = None,
|
|
1571
|
+
*,
|
|
1572
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1573
|
+
metrics: list[GenAIMetric] = [],
|
|
1574
|
+
compute_real_time: Optional[bool] = True,
|
|
1575
|
+
) -> dict:
|
|
1576
|
+
"""
|
|
1577
|
+
An evaluation decorator for computing HAP metric on an agentic tool.
|
|
1578
|
+
|
|
1579
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.HAPMetric`
|
|
1580
|
+
|
|
1581
|
+
Args:
|
|
1582
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1583
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1584
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [HAPMetric()].
|
|
1585
|
+
|
|
1586
|
+
Raises:
|
|
1587
|
+
Exception: If there is any error while evaluation.
|
|
1588
|
+
|
|
1589
|
+
Returns:
|
|
1590
|
+
dict: The result of the wrapped tool.
|
|
1591
|
+
|
|
1592
|
+
Example:
|
|
1593
|
+
1. Create evaluate_hap decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
|
|
1594
|
+
.. code-block:: python
|
|
1595
|
+
|
|
1596
|
+
evaluator = AgenticEvaluator()
|
|
1597
|
+
@evaluator.evaluate_hap
|
|
1598
|
+
def agentic_tool(*args, **kwargs):
|
|
1599
|
+
pass
|
|
1600
|
+
|
|
1601
|
+
2. Create evaluate_hap decorator with thresholds and configuration
|
|
1602
|
+
.. code-block:: python
|
|
1603
|
+
|
|
1604
|
+
metric = HAPMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
|
|
1605
|
+
config = {"input_fields": ["input"]}
|
|
1606
|
+
configuration = AgenticAIConfiguration(**config)
|
|
1607
|
+
evaluator = AgenticEvaluator()
|
|
1608
|
+
@evaluator.evaluate_hap(metrics=[metric], configuration=configuration)
|
|
1609
|
+
def agentic_tool(*args, **kwargs):
|
|
1610
|
+
pass
|
|
1611
|
+
"""
|
|
1612
|
+
return HAPDecorator(api_client=self.api_client,
|
|
1613
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1614
|
+
metric_results=self.__online_metric_results,
|
|
1615
|
+
execution_counts=self.__execution_counts,
|
|
1616
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1617
|
+
lock=update_lock,
|
|
1618
|
+
compute_real_time=compute_real_time).evaluate_hap(func, configuration=configuration, metrics=metrics)
|
|
1619
|
+
|
|
1620
|
+
def evaluate_pii(self,
|
|
1621
|
+
func: Optional[Callable] = None,
|
|
1622
|
+
*,
|
|
1623
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1624
|
+
metrics: list[GenAIMetric] = [],
|
|
1625
|
+
compute_real_time: Optional[bool] = True,
|
|
1626
|
+
) -> dict:
|
|
1627
|
+
"""
|
|
1628
|
+
An evaluation decorator for computing PII metric on an agentic tool.
|
|
1629
|
+
|
|
1630
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.PIIMetric`
|
|
1631
|
+
|
|
1632
|
+
Args:
|
|
1633
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1634
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1635
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [PIIMetric()].
|
|
1636
|
+
|
|
1637
|
+
Raises:
|
|
1638
|
+
Exception: If there is any error while evaluation.
|
|
1639
|
+
|
|
1640
|
+
Returns:
|
|
1641
|
+
dict: The result of the wrapped tool.
|
|
1642
|
+
|
|
1643
|
+
Example:
|
|
1644
|
+
1. Create evaluate_pii decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
|
|
1645
|
+
.. code-block:: python
|
|
1646
|
+
|
|
1647
|
+
evaluator = AgenticEvaluator()
|
|
1648
|
+
@evaluator.evaluate_pii
|
|
1649
|
+
def agentic_tool(*args, **kwargs):
|
|
1650
|
+
pass
|
|
1651
|
+
|
|
1652
|
+
2. Create evaluate_pii decorator with thresholds and configuration
|
|
1653
|
+
.. code-block:: python
|
|
1654
|
+
|
|
1655
|
+
metric = PIIMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
|
|
1656
|
+
config = {"input_fields": ["input"]}
|
|
1657
|
+
configuration = AgenticAIConfiguration(**config)
|
|
1658
|
+
evaluator = AgenticEvaluator()
|
|
1659
|
+
@evaluator.evaluate_pii(metrics=[metric], configuration=configuration)
|
|
1660
|
+
def agentic_tool(*args, **kwargs):
|
|
1661
|
+
pass
|
|
1662
|
+
"""
|
|
1663
|
+
return PIIDecorator(api_client=self.api_client,
|
|
1664
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1665
|
+
metric_results=self.__online_metric_results,
|
|
1666
|
+
execution_counts=self.__execution_counts,
|
|
1667
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1668
|
+
lock=update_lock,
|
|
1669
|
+
compute_real_time=compute_real_time).evaluate_pii(func, configuration=configuration, metrics=metrics)
|
|
1670
|
+
|
|
1671
|
+
def evaluate_harm(self,
|
|
1672
|
+
func: Optional[Callable] = None,
|
|
1673
|
+
*,
|
|
1674
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1675
|
+
metrics: list[GenAIMetric] = [],
|
|
1676
|
+
compute_real_time: Optional[bool] = True,
|
|
1677
|
+
) -> dict:
|
|
1678
|
+
"""
|
|
1679
|
+
An evaluation decorator for computing harm risk on an agentic tool via granite guardian.
|
|
1680
|
+
|
|
1681
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.HarmMetric`
|
|
1682
|
+
|
|
1683
|
+
Args:
|
|
1684
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1685
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1686
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ HarmMetric() ]
|
|
1687
|
+
|
|
1688
|
+
Raises:
|
|
1689
|
+
Exception: If there is any error while evaluation.
|
|
1690
|
+
|
|
1691
|
+
Returns:
|
|
1692
|
+
dict: The result of the wrapped tool.
|
|
1693
|
+
|
|
1694
|
+
Example:
|
|
1695
|
+
1. Create evaluate_harm decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
|
|
1696
|
+
.. code-block:: python
|
|
1697
|
+
|
|
1698
|
+
evaluator = AgenticEvaluator()
|
|
1699
|
+
@evaluator.evaluate_harm
|
|
1700
|
+
def agentic_tool(*args, **kwargs):
|
|
1701
|
+
pass
|
|
1702
|
+
|
|
1703
|
+
2. Create evaluate_harm decorator with thresholds and configuration
|
|
1704
|
+
.. code-block:: python
|
|
1705
|
+
|
|
1706
|
+
metric = HarmMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
|
|
1707
|
+
config = {"input_fields": ["input"]}
|
|
1708
|
+
configuration = AgenticAIConfiguration(**config)
|
|
1709
|
+
evaluator = AgenticEvaluator()
|
|
1710
|
+
@evaluator.evaluate_harm(metrics=[metric], configuration=configuration)
|
|
1711
|
+
def agentic_tool(*args, **kwargs):
|
|
1712
|
+
pass
|
|
1713
|
+
"""
|
|
1714
|
+
return HarmDecorator(api_client=self.api_client,
|
|
1715
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1716
|
+
metric_results=self.__online_metric_results,
|
|
1717
|
+
execution_counts=self.__execution_counts,
|
|
1718
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1719
|
+
lock=update_lock,
|
|
1720
|
+
compute_real_time=compute_real_time).evaluate_harm(func, configuration=configuration, metrics=metrics)
|
|
1721
|
+
|
|
1722
|
+
def evaluate_social_bias(self,
|
|
1723
|
+
func: Optional[Callable] = None,
|
|
1724
|
+
*,
|
|
1725
|
+
configuration: Optional[AgenticAIConfiguration] = None,
|
|
1726
|
+
metrics: list[GenAIMetric] = [],
|
|
1727
|
+
compute_real_time: Optional[bool] = True,
|
|
1728
|
+
) -> dict:
|
|
1729
|
+
"""
|
|
1730
|
+
An evaluation decorator for computing social bias on an agentic tool via granite guardian.
|
|
1731
|
+
|
|
1732
|
+
For more details, see :class:`ibm_watsonx_gov.metrics.SocialBiasMetric`
|
|
1733
|
+
|
|
1734
|
+
Args:
|
|
1735
|
+
func (Optional[Callable], optional): The tool on which the metric is to be computed.
|
|
1736
|
+
configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
|
|
1737
|
+
metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ SocialBiasMetric() ]
|
|
1738
|
+
|
|
1739
|
+
Raises:
|
|
1740
|
+
Exception: If there is any error while evaluation.
|
|
1741
|
+
|
|
1742
|
+
Returns:
|
|
1743
|
+
dict: The result of the wrapped tool.
|
|
1744
|
+
|
|
1745
|
+
Example:
|
|
1746
|
+
1. Create evaluate_social_bias decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
|
|
1747
|
+
.. code-block:: python
|
|
1748
|
+
|
|
1749
|
+
evaluator = AgenticEvaluator()
|
|
1750
|
+
@evaluator.evaluate_social_bias
|
|
1751
|
+
def agentic_tool(*args, **kwargs):
|
|
1752
|
+
pass
|
|
1753
|
+
|
|
1754
|
+
2. Create evaluate_social_bias decorator with thresholds and configuration
|
|
1755
|
+
.. code-block:: python
|
|
1756
|
+
|
|
1757
|
+
metric = SocialBiasMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
|
|
1758
|
+
config = {"input_fields": ["input"]}
|
|
1759
|
+
configuration = AgenticAIConfiguration(**config)
|
|
1760
|
+
evaluator = AgenticEvaluator()
|
|
1761
|
+
@evaluator.evaluate_social_bias(metrics=[metric], configuration=configuration)
|
|
1762
|
+
def agentic_tool(*args, **kwargs):
|
|
1763
|
+
pass
|
|
1764
|
+
"""
|
|
1765
|
+
return SocialBiasDecorator(api_client=self.api_client,
|
|
1766
|
+
configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
|
|
1767
|
+
metric_results=self.__online_metric_results,
|
|
1768
|
+
execution_counts=self.__execution_counts,
|
|
1769
|
+
nodes_being_run=self.__nodes_being_run,
|
|
1770
|
+
lock=update_lock,
|
|
1771
|
+
compute_real_time=compute_real_time).evaluate_social_bias(func, configuration=configuration, metrics=metrics)
|
|
1772
|
+
|
|
1773
|
+
+    def evaluate_profanity(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing profanity on an agentic tool via granite guardian.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.ProfanityMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ProfanityMetric() ]
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_profanity decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_profanity
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_profanity decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = ProfanityMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_profanity(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return ProfanityDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_profanity(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_sexual_content(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing sexual content on an agentic tool via granite guardian.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.SexualContentMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ SexualContentMetric() ]
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_sexual_content decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_sexual_content
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_sexual_content decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = SexualContentMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_sexual_content(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return SexualContentDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_sexual_content(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_unethical_behavior(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing unethical behavior on an agentic tool via granite guardian.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.UnethicalBehaviorMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ UnethicalBehaviorMetric() ]
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_unethical_behavior decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_unethical_behavior
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_unethical_behavior decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = UnethicalBehaviorMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_unethical_behavior(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+
+        return UnethicalBehaviorDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_unethical_behavior(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_violence(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing violence on an agentic tool via granite guardian.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.ViolenceMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ ViolenceMetric() ]
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_violence decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_violence
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_violence decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = ViolenceMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_violence(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return ViolenceDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_violence(func, configuration=configuration, metrics=metrics)
+
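The granite guardian decorators above all share one shape: an optional metrics list, an optional AgenticAIConfiguration, and a wrapped tool whose graph state supplies the "input_text" field. The following is a minimal usage sketch, not part of the package source: the import paths are assumptions taken from the docstring references rather than verified exports, and the shape of the state dict is likewise an assumption.

    # Hypothetical sketch; import paths are assumed from the docstring
    # references (e.g. ibm_watsonx_gov.metrics.ViolenceMetric).
    from ibm_watsonx_gov.evaluators import AgenticEvaluator
    from ibm_watsonx_gov.metrics import ViolenceMetric
    from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold

    evaluator = AgenticEvaluator()
    metric = ViolenceMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))

    @evaluator.evaluate_violence(metrics=[metric])
    def answer_node(state: dict) -> dict:
        # The default configuration reads "input_text" from the graph state;
        # the tool body itself is application specific.
        return {"generated_text": "Reply to: " + state["input_text"]}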
+    def evaluate_harm_engagement(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing harm engagement on an agentic tool via granite guardian.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.HarmEngagementMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ HarmEngagementMetric() ]
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_harm_engagement decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_harm_engagement
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_harm_engagement decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = HarmEngagementMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_harm_engagement(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return HarmEngagementDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_harm_engagement(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_evasiveness(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing evasiveness on an agentic tool via granite guardian.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.EvasivenessMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ EvasivenessMetric() ]
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_evasiveness decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_evasiveness
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_evasiveness decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = EvasivenessMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_evasiveness(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return EvasivenessDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_evasiveness(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_jailbreak(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing jailbreak on an agentic tool via granite guardian.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.JailbreakMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [ JailbreakMetric() ]
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_jailbreak decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_jailbreak
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_jailbreak decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = JailbreakMetric(thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_jailbreak(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return JailbreakDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_jailbreak(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_topic_relevance(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing topic relevance on an agentic tool via off-topic detector.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.TopicRelevanceMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_topic_relevance decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    metric = TopicRelevanceMetric(system_prompt="...")
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_topic_relevance(metrics=[metric])
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_topic_relevance decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = TopicRelevanceMetric(system_prompt="...", thresholds=MetricThreshold(type="lower_limit", value=0.7))
+                    evaluator = AgenticEvaluator()
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    @evaluator.evaluate_topic_relevance(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return TopicRelevanceDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_topic_relevance(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_answer_quality(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True
+            ) -> dict:
+        """
+        An evaluation decorator for computing answer quality metrics on an agentic tool.
+        Answer Quality metrics include Answer Relevance, Faithfulness, Answer Similarity, Unsuccessful Requests
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.AnswerRelevanceMetric`, :class:`ibm_watsonx_gov.metrics.FaithfulnessMetric`,
+        :class:`ibm_watsonx_gov.metrics.UnsuccessfulRequestsMetric`, :class:`ibm_watsonx_gov.metrics.AnswerSimilarityMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.ANSWER_QUALITY.get_metrics().
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Basic usage
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_answer_quality
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Usage with different thresholds and methods for some of the metrics in the group
+                .. code-block:: python
+
+                    metric_1 = FaithfulnessMetric(thresholds=MetricThreshold(type="lower_limit", value=0.5))
+                    metric_2 = AnswerRelevanceMetric(method="token_recall", thresholds=MetricThreshold(type="lower_limit", value=0.5))
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_answer_quality(metrics=[metric_1, metric_2])
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return AnswerQualityDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_answer_quality(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_content_safety(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True
+            ) -> dict:
+        """
+        An evaluation decorator for computing content safety metrics on an agentic tool.
+        Content Safety metrics include HAP, PII, Evasiveness, Harm, HarmEngagement, Jailbreak, Profanity, SexualContent, Social Bias, UnethicalBehavior and Violence
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.HAPMetric`,
+        :class:`ibm_watsonx_gov.metrics.PIIMetric`, :class:`ibm_watsonx_gov.metrics.EvasivenessMetric`, :class:`ibm_watsonx_gov.metrics.HarmMetric`,
+        :class:`ibm_watsonx_gov.metrics.HarmEngagementMetric`, :class:`ibm_watsonx_gov.metrics.JailbreakMetric`, :class:`ibm_watsonx_gov.metrics.ProfanityMetric`,
+        :class:`ibm_watsonx_gov.metrics.SexualContentMetric`, :class:`ibm_watsonx_gov.metrics.SocialBiasMetric`, :class:`ibm_watsonx_gov.metrics.UnethicalBehaviorMetric`,
+        :class:`ibm_watsonx_gov.metrics.ViolenceMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.CONTENT_SAFETY.get_metrics().
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Basic usage
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_content_safety
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Usage with different thresholds and methods for some of the metrics in the group
+                .. code-block:: python
+
+                    metric_1 = PIIMetric(thresholds=MetricThreshold(type="lower_limit", value=0.5))
+                    metric_2 = HAPMetric(thresholds=MetricThreshold(type="lower_limit", value=0.5))
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_content_safety(metrics=[metric_1, metric_2])
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return ContentSafetyDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_content_safety(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_retrieval_quality(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True
+            ) -> dict:
+        """
+        An evaluation decorator for computing retrieval quality metrics on an agentic tool.
+        Retrieval Quality metrics include Context Relevance, Retrieval Precision, Average Precision, Hit Rate, Reciprocal Rank, NDCG
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.ContextRelevanceMetric`, :class:`ibm_watsonx_gov.metrics.RetrievalPrecisionMetric`,
+        :class:`ibm_watsonx_gov.metrics.AveragePrecisionMetric`, :class:`ibm_watsonx_gov.metrics.ReciprocalRankMetric`, :class:`ibm_watsonx_gov.metrics.HitRateMetric`,
+        :class:`ibm_watsonx_gov.metrics.NDCGMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.RETRIEVAL_QUALITY.get_metrics().
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Basic usage
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_retrieval_quality
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Usage with different thresholds and methods for some of the metrics in the group
+                .. code-block:: python
+
+                    metric_1 = NDCGMetric(threshold=MetricThreshold(type="lower_limit", value=0.5))
+                    metric_2 = ContextRelevanceMetric(method="sentence_bert_mini_lm", threshold=MetricThreshold(type="lower_limit", value=0.6))
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_retrieval_quality(metrics=[metric_1, metric_2])
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return RetrievalQualityDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_retrieval_quality(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_text_grade_level(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing the text grade level metric on an agentic tool.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.TextGradeLevelMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [TextGradeLevelMetric()].
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Basic usage
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_text_grade_level
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_text_grade_level decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = TextGradeLevelMetric(thresholds=[MetricThreshold(type="lower_limit", value=6)])
+                    config = {"output_fields": ["generated_text"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_text_grade_level(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return TextGradeLevelDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_text_grade_level(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_text_reading_ease(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing the text reading ease metric on an agentic tool.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.TextReadingEaseMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to [TextReadingEaseMetric()].
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Basic usage
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_text_reading_ease
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_text_reading_ease decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = TextReadingEaseMetric(thresholds=[MetricThreshold(type="lower_limit", value=70)])
+                    config = {"output_fields": ["generated_text"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_text_reading_ease(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return TextReadingEaseDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_text_reading_ease(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_readability(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric] = [],
+            compute_real_time: Optional[bool] = True
+            ) -> dict:
+        """
+        An evaluation decorator for computing answer readability metrics on an agentic tool.
+        Readability metrics include TextReadingEaseMetric and TextGradeLevelMetric
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.TextReadingEaseMetric`, :class:`ibm_watsonx_gov.metrics.TextGradeLevelMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric], optional): The list of metrics to compute as part of this evaluator. Defaults to MetricGroup.READABILITY.get_metrics().
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Basic usage
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_readability
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Usage with different thresholds and methods for some of the metrics in the group
+                .. code-block:: python
+
+                    metric_1 = TextGradeLevelMetric(thresholds=[MetricThreshold(type="lower_limit", value=6)])
+                    metric_2 = TextReadingEaseMetric(thresholds=[MetricThreshold(type="lower_limit", value=70)])
+                    config = {"output_fields": ["generated_text"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_readability(metrics=[metric_1, metric_2], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return ReadabilityDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_readability(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_keyword_detection(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing keyword detection on an agentic tool.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.KeywordDetectionMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_keyword_detection decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    metric = KeywordDetectionMetric(keywords=["..."])
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_keyword_detection(metrics=[metric])
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_keyword_detection decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = KeywordDetectionMetric(thresholds=MetricThreshold(type="upper_limit", value=0), keywords=["..."])
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_keyword_detection(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return KeywordDetectionDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_keyword_detection(func, configuration=configuration, metrics=metrics)
+
+    def evaluate_regex(self,
+            func: Optional[Callable] = None,
+            *,
+            configuration: Optional[AgenticAIConfiguration] = None,
+            metrics: list[GenAIMetric],
+            compute_real_time: Optional[bool] = True,
+            ) -> dict:
+        """
+        An evaluation decorator for computing regex detection on an agentic tool.
+
+        For more details, see :class:`ibm_watsonx_gov.metrics.RegexDetectionMetric`
+
+        Args:
+            func (Optional[Callable], optional): The tool on which the metric is to be computed.
+            configuration (Optional[AgenticAIConfiguration], optional): The configuration specific to this evaluator. Defaults to None.
+            metrics (list[GenAIMetric]): The list of metrics to compute as part of this evaluator.
+
+        Raises:
+            Exception: If there is any error during evaluation.
+
+        Returns:
+            dict: The result of the wrapped tool.
+
+        Example:
+            1. Create evaluate_regex decorator with default parameters. By default, the metric uses the "input_text" from the graph state as the input.
+                .. code-block:: python
+
+                    metric = RegexDetectionMetric(regex_patterns=["..."])
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_regex(metrics=[metric])
+                    def agentic_tool(*args, **kwargs):
+                        pass
+
+            2. Create evaluate_regex decorator with thresholds and configuration
+                .. code-block:: python
+
+                    metric = RegexDetectionMetric(thresholds=MetricThreshold(type="upper_limit", value=0), regex_patterns=["..."])
+                    config = {"input_fields": ["input"]}
+                    configuration = AgenticAIConfiguration(**config)
+                    evaluator = AgenticEvaluator()
+                    @evaluator.evaluate_regex(metrics=[metric], configuration=configuration)
+                    def agentic_tool(*args, **kwargs):
+                        pass
+        """
+        return RegexDetectionDecorator(api_client=self.api_client,
+                configuration=self.agentic_app.metrics_configuration.configuration if self.agentic_app else None,
+                metric_results=self.__online_metric_results,
+                execution_counts=self.__execution_counts,
+                nodes_being_run=self.__nodes_being_run,
+                lock=update_lock,
+                compute_real_time=compute_real_time).evaluate_regex(func, configuration=configuration, metrics=metrics)
+
+    def generate_insights(self,
+            applies_to: list[str] = AGENTIC_RESULT_COMPONENTS,
+            top_k: int = 3,
+            llm_model=None,
+            output_format: str = "html",
+            percentile_threshold: float = 95.0,
+            metric_group_weights: Optional[dict] = None,
+            metric_weights: Optional[dict] = None):
+        """
+        Generate the top k insights from evaluation metrics based on their significance.
+
+        This method analyzes the evaluation results and identifies the most significant metrics
+        based on their values and thresholds. It can optionally generate a natural language
+        report of these insights using a provided LLM model.
+
+        Args:
+            applies_to (list[str]): The component levels at which insights should be computed.
+                Can include "conversation", "message", and/or "node".
+                Defaults to all three levels.
+            top_k (int): The number of top insights to generate. Defaults to 3.
+            llm_model (optional): A language model used to generate a natural language report
+                of the insights. If not provided, only structured insights
+                will be returned.
+            output_format (str): The format for the output. Defaults to "html".
+            percentile_threshold (float): Percentile to use as the threshold for cost/latency metrics.
+                Defaults to 95.0. Higher values indicate worse performance
+                for these metrics. For example, 95.0 means values above the
+                95th percentile are considered violations.
+            metric_group_weights (dict, optional): Custom weights for metric groups.
+                Keys are group names, values are weights (1.0-5.0).
+                1.0 is the minimum weight, 5.0 is the maximum weight.
+                Example: {"answer_quality": 2.0, "content_safety": 1.5}
+            metric_weights (dict, optional): Custom weights for individual metrics.
+                Keys are metric names, values are weights (1.0-5.0).
+                1.0 is the minimum weight, 5.0 is the maximum weight.
+                Example: {"answer_relevance": 2.0, "faithfulness": 1.8}
+
+        Returns:
+            List[dict]: A list of the top k insights, each containing:
+                - metric_name: Name of the metric
+                - applies_to: Component level the metric applies to
+                - group: The metric group to which the metric belongs
+                - violations_count: The number of times the metric value violated the threshold
+                - node_name: Name of the node (if applies_to is "node")
+                - value: The metric value
+                - threshold: The threshold dictionary containing value and type (if applicable)
+                - mmr_score: A score indicating the significance of this insight
+
+        Examples:
+            1. Generate top 3 insights across all component levels
+                .. code-block:: python
+
+                    evaluator = AgenticEvaluator()
+                    # ... run evaluation ...
+                    insights = evaluator.generate_insights()
+
+            2. Generate top 5 insights for node-level metrics only
+                .. code-block:: python
+
+                    insights = evaluator.generate_insights(
+                        applies_to=["node"],
+                        top_k=5
+                    )
+
+            3. Generate insights with natural language explanations
+                .. code-block:: python
+
+                    from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel
+
+                    llm = WxAIFoundationModel(
+                        model_id="meta-llama/llama-3-70b-instruct",
+                        project_id="your-project-id"
+                    )
+
+                    insights = evaluator.generate_insights(
+                        top_k=3,
+                        llm_model=llm
+                    )
+
+            4. Generate insights with custom metric weights
+                .. code-block:: python
+
+                    insights = evaluator.generate_insights(
+                        top_k=3,
+                        metric_group_weights={"retrieval_quality": 2.0, "content_safety": 1.5},
+                        metric_weights={"answer_relevance": 2.5, "faithfulness": 2.0}
+                    )
+
+        """
+        from ibm_watsonx_gov.utils.insights_generator import InsightsGenerator
+
+        # Get the evaluation result
+        eval_result = self.get_result()
+        if not eval_result:
+            logger.warning(
+                "No evaluation results available. Please run evaluation first.")
+            return []
+
+        # Get aggregated metrics results for the specified component levels
+        # Include individual results to compute violations_count for percentile-based metrics
+        aggregated_metrics = eval_result.get_aggregated_metrics_results(
+            applies_to=applies_to,
+            include_individual_results=True
+        )
+
+        # Use the InsightsGenerator to select top k metrics based on significance
+        insights_generator = InsightsGenerator(
+            top_k=top_k, applies_to=applies_to, metrics=aggregated_metrics, llm_model=llm_model,
+            percentile_threshold=percentile_threshold,
+            metric_group_weights=metric_group_weights, metric_weights=metric_weights)
+        top_k_metrics = insights_generator.select_top_k_metrics()
+
+        # Generate natural language insights if a model is provided
+        if llm_model and top_k_metrics:
+            result = insights_generator.generate_structured_insights(
+                top_metrics=top_k_metrics,
+                output_format=output_format
+            )
+            return result
+
+        return top_k_metrics
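Taken together, the decorator methods and generate_insights describe a decorate, run, inspect loop. The sketch below strings those documented calls together; it is illustrative rather than part of the package source, the import path is an assumption, and the way the decorated tool actually gets executed is application specific (the generate_insights docstring itself only marks that step with a comment).

    # Hypothetical end-to-end sketch; only the decorator and generate_insights
    # calls are taken from the docstrings above.
    from ibm_watsonx_gov.evaluators import AgenticEvaluator

    evaluator = AgenticEvaluator()

    @evaluator.evaluate_content_safety
    def agentic_tool(*args, **kwargs):
        ...  # tool logic, invoked when the agentic application runs

    # ... run the agentic application so the decorated tool executes ...

    # Node-level insights only, with content safety weighted slightly higher,
    # mirroring the generate_insights docstring examples.
    insights = evaluator.generate_insights(
        applies_to=["node"],
        top_k=5,
        metric_group_weights={"content_safety": 1.5},
    )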