ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from datetime import timedelta
|
|
11
|
+
from typing import Annotated, Dict, Optional
|
|
12
|
+
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.entities.agentic_app import AgenticApp
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MessageStatus, MetricGroup, MetricValueType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AgentMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.mapping import Mapping
|
|
19
|
+
from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
|
|
20
|
+
from ibm_watsonx_gov.evaluators.impl.evaluate_metrics_impl import _evaluate_metrics
|
|
21
|
+
from ibm_watsonx_gov.traces.span_util import flatten_attributes
|
|
22
|
+
from ibm_watsonx_gov.traces.trace_utils import TraceUtils
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AgenticTracesEvaluator(BaseEvaluator):
    """
    The class to evaluate agentic applications based on the traces generated.

    Attributes:
        agentic_app: Optional agentic application configuration. When it is
            not set, only the built-in message level metrics (duration and
            status) are produced.
    """
    agentic_app: Annotated[Optional[AgenticApp], Field(
        title="Agentic application configuration details", description="The agentic application configuration details.", default=None)]

    def compute_metrics(self, spans: list[dict], mapping: Mapping, **kwargs) -> list[AgentMetricResult]:
        """
        Computes the agentic metrics based on the spans/traces provided as a list.

        Args:
            spans (list[dict]): The spans on which the metrics need to be computed
            mapping (Mapping): The various mappings for finding the metric inputs.
            **kwargs: Forwarded unchanged to compute_message_level_metrics
                (which reads the optional "api_client" entry).

        Returns:
            list[AgentMetricResult]: The computed metric results
        """
        span_trees = TraceUtils.build_span_trees(
            spans=spans, agentic_app=self.agentic_app)

        metrics_result = []
        for span_tree in span_trees:
            # Process only the spans that are associated with the agent application,
            # i.e. those whose "traceloop.span.kind" attribute is "workflow".
            attrs = flatten_attributes(span_tree.span.attributes)
            if attrs.get("traceloop.span.kind") != "workflow":
                continue

            data = span_tree.get_values(mapping)
            metrics_result.extend(
                self.compute_message_level_metrics(data, **kwargs))

        return metrics_result

    def compute_message_level_metrics(self, data: Dict, **kwargs) -> list[AgentMetricResult]:
        """
        Computes the message level metrics for the values extracted from one span tree.

        A "duration" and a "status" result are always produced. When an agentic
        application is configured, the metrics from its metrics configuration
        are evaluated on the same data and appended as message level results.

        Args:
            data (Dict): The values extracted from a span tree. The keys
                "start_time", "end_time", "message_id" and "conversation_id"
                are required; "status" is optional.
            **kwargs: Only "api_client" is read; it is passed to the metrics
                evaluation.

        Returns:
            list[AgentMetricResult]: The computed message level metric results

        Raises:
            Exception: If start_time/end_time or message_id/conversation_id
                are missing from the data.
        """
        metric_results = []

        start_time = data.get("start_time")
        end_time = data.get("end_time")
        if start_time is None or end_time is None:
            raise Exception("start_time and/or end_time are missing.")

        message_id = data.get("message_id")
        conversation_id = data.get("conversation_id")
        if message_id is None or conversation_id is None:
            raise Exception(
                "message_id and/or conversation_id are missing.")

        # NOTE(review): assumes start_time/end_time are datetime-like values
        # whose difference is a timedelta - confirm against get_values().
        elapsed: timedelta = end_time - start_time
        duration_seconds = elapsed.total_seconds()

        metric_results.append(AgentMetricResult(name="duration",
                                                display_name="Message Duration",
                                                value=duration_seconds,
                                                group=MetricGroup.PERFORMANCE,
                                                applies_to="message",
                                                message_id=message_id,
                                                conversation_id=conversation_id))

        # The status falls back to UNKNOWN when the data does not carry one.
        metric_results.append(AgentMetricResult(name="status",
                                                display_name="Message Status",
                                                value_type=MetricValueType.CATEGORICAL.value,
                                                value=data.get(
                                                    "status", MessageStatus.UNKNOWN.value),
                                                group=MetricGroup.MESSAGE_COMPLETION,
                                                applies_to="message",
                                                message_id=message_id,
                                                conversation_id=conversation_id))

        # Without an application configuration there are no further metrics to evaluate.
        if not self.agentic_app:
            return metric_results

        metrics_configuration = self.agentic_app.metrics_configuration
        metric_result = _evaluate_metrics(configuration=metrics_configuration.configuration,
                                          data=data,
                                          metrics=metrics_configuration.metrics,
                                          metric_groups=metrics_configuration.metric_groups,
                                          api_client=kwargs.get("api_client"),
                                          ignore_validation_errors=True).to_dict()
        for mr in metric_result:
            # Keys coming from the evaluated metric are unpacked last, so they
            # take precedence over the message level defaults set here.
            node_result = {
                "applies_to": "message",
                "message_id": message_id,
                "conversation_id": conversation_id,
                **mr
            }

            metric_results.append(AgentMetricResult(**node_result))

        return metric_results
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
from typing_extensions import Annotated
|
|
12
|
+
|
|
13
|
+
from ibm_watsonx_gov.clients.api_client import APIClient
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseEvaluator(BaseModel):
    """
    The base class for all evaluators.

    Attributes:
        api_client: Optional IBM watsonx.governance client. Defaults to None.
    """
    # NOTE(review): arbitrary types are allowed presumably because APIClient
    # is not a pydantic model - confirm against the APIClient definition.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # `name` is not a supported Field argument (extra keyword arguments are
    # deprecated in pydantic v2), so the text is supplied as the description.
    api_client: Annotated[APIClient | None,
                          Field(title="API client",
                                description="The IBM watsonx.governance client.",
                                default=None)]
|
|
File without changes
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from pydantic import Field, PrivateAttr
|
|
12
|
+
from typing_extensions import Annotated
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
15
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup
|
|
16
|
+
from ibm_watsonx_gov.entities.evaluation_result import MetricsEvaluationResult
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
|
|
19
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class MetricsEvaluator(BaseEvaluator):
    """
    The class to evaluate the metrics and display the results.

    The inputs and the result of the most recent evaluation are kept on the
    instance so that ``display_table`` and ``display_insights`` can render them.

    Examples:
        1. Evaluate metrics by passing data as a dataframe and default configuration
            .. code-block:: python

                os.environ["WATSONX_APIKEY"] = "..."

                evaluator = MetricsEvaluator()
                df = pd.read_csv("")
                metrics = [AnswerSimilarityMetric()]

                result = evaluator.evaluate(data=df, metrics=metrics)

        2. Evaluate metrics by passing data as a json and default configuration
            .. code-block:: python

                os.environ["WATSONX_APIKEY"] = "..."

                evaluator = MetricsEvaluator()
                json_data = {"input_text": "..."}
                metrics=[HAPMetric()]

                result = evaluator.evaluate(data=json_data, metrics=metrics)

        3. Evaluate metrics by passing configuration and api_client
            .. code-block:: python

                config = GenAIConfiguration(input_fields=["question"],
                                            context_fields=["context"],
                                            output_fields=["generated_text"],
                                            reference_fields=["reference_answer"])
                wxgov_client = APIClient(credentials=Credentials(api_key=""))
                evaluator = MetricsEvaluator(configuration=config, api_client=wxgov_client)
                df = pd.read_csv("")
                metrics = [AnswerSimilarityMetric()]

                result = evaluator.evaluate(data=df, metrics=metrics)

        4. Evaluate metrics by passing metric groups
            .. code-block:: python

                os.environ["WATSONX_APIKEY"] = "..."

                evaluator = MetricsEvaluator()
                df = pd.read_csv("")
                metrics = [AnswerSimilarityMetric()]
                metric_groups = [MetricGroup.RETRIEVAL_QUALITY]

                result = evaluator.evaluate(data=df, metrics=metrics, metric_groups=metric_groups)

        5. Display the results
            .. code-block:: python

                # Get the results in the required format from the output of the evaluate method
                result.to_json()
                result.to_df()
                result.to_dict()

                # Display the results
                evaluator.display_table()
                evaluator.display_insights()
    """
    configuration: Annotated[GenAIConfiguration,
                             Field(title="Generative AI Configuration",
                                   description="The configuration for metrics evaluation.",
                                   default=GenAIConfiguration())]
    # State of the most recent evaluation; all of it is None until
    # evaluate()/evaluate_async() has been called.
    _data: Annotated[pd.DataFrame | dict | None,
                     PrivateAttr(default=None)]
    _metrics: Annotated[list[GenAIMetric] | None,
                        PrivateAttr(default=None)]
    _metric_groups: Annotated[list[MetricGroup] | None,
                              PrivateAttr(default=None)]
    _result: Annotated[MetricsEvaluationResult | None,
                       PrivateAttr(default=None)]

    def evaluate(
            self,
            data: pd.DataFrame | dict,
            metrics: list[GenAIMetric] | None = None,
            metric_groups: list[MetricGroup] | None = None,
            **kwargs) -> MetricsEvaluationResult:
        """
        Evaluate the metrics for the given data.

        Synchronous wrapper that runs ``evaluate_async`` through ``run_in_event_loop``.

        Args:
            data (pd.DataFrame | dict): The data to be evaluated.
            metrics (list[GenAIMetric], optional): The metrics to be evaluated.
                Defaults to None, which is treated as an empty list.
            metric_groups (list[MetricGroup], optional): The metric groups to be evaluated.
                Defaults to None, which is treated as an empty list.
            **kwargs: Additional keyword arguments, forwarded to ``evaluate_async``.

        Returns:
            MetricsEvaluationResult: The result of the evaluation.
        """
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            # Always hand lists to evaluate_async, as the former `[]` defaults did.
            metrics=[] if metrics is None else metrics,
            metric_groups=[] if metric_groups is None else metric_groups,
            **kwargs,
        )

    async def evaluate_async(
            self,
            data: pd.DataFrame | dict,
            metrics: list[GenAIMetric] | None = None,
            metric_groups: list[MetricGroup] | None = None,
            **kwargs
    ) -> MetricsEvaluationResult:
        """
        Asynchronously evaluate the metrics for the given data.

        The data, the resolved metrics, the metric groups and the result are
        stored on the instance for the display methods.

        Args:
            data (pd.DataFrame | dict): The data to be evaluated.
            metrics (list[GenAIMetric], optional): The metrics to be evaluated.
                Defaults to None, which is treated as an empty list.
            metric_groups (list[MetricGroup], optional): The metric groups to be evaluated.
                Defaults to None, which is treated as an empty list.
            **kwargs: Additional keyword arguments, forwarded to the evaluation implementation.

        Returns:
            MetricsEvaluationResult: The result of the evaluation.
        """
        # Imported lazily, inside the method, as in the original implementation.
        from ..evaluators.impl.evaluate_metrics_impl import (
            _evaluate_metrics_async, _resolve_metric_dependencies)

        # Fresh lists per call: a shared mutable default would otherwise be
        # stored on the instance (`_metric_groups`) and leak between calls.
        if metrics is None:
            metrics = []
        if metric_groups is None:
            metric_groups = []

        self._data = data
        self._metrics = _resolve_metric_dependencies(
            metrics=metrics, metric_groups=metric_groups
        )
        self._metric_groups = metric_groups
        self._result = await _evaluate_metrics_async(
            configuration=self.configuration,
            data=data,
            metrics=self._metrics,
            api_client=self.api_client,
            **kwargs,
        )
        return self._result

    def display_table(self):
        """
        Display the metrics result as a table.

        Uses the result and data stored by the last evaluation, so an
        evaluation must have been run first.

        Raises:
            ImportError: If the visualization module cannot be imported.
        """
        try:
            from ibm_watsonx_gov.visualizations import display_table
        except ImportError as ex:
            # Raise the hint instead of silently continuing with an unbound name.
            raise ImportError(
                "Please install the required dependencies 'ibm-watsonx-gov[visualization]' to display the results.") from ex
        display_table(self._result.to_df(data=self._data))

    def display_insights(self):
        """
        Display the metrics result in a venn diagram based on the metrics threshold.

        Uses the result, data and metrics stored by the last evaluation, so an
        evaluation must have been run first.

        Raises:
            ImportError: If the visualization module cannot be imported.
        """
        try:
            from ibm_watsonx_gov.visualizations import ModelInsights
        except ImportError as ex:
            # Raise the hint instead of silently continuing with an unbound name.
            raise ImportError(
                "Please install the required dependencies 'ibm-watsonx-gov[visualization]' to display the results.") from ex
        model_insights = ModelInsights(
            configuration=self.configuration, metrics=self._metrics)
        model_insights.display_metrics(
            metrics_result=self._result.to_df(data=self._data))
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
|
|
2
|
+
# ----------------------------------------------------------------------------------------------------
|
|
3
|
+
# IBM Confidential
|
|
4
|
+
# Licensed Materials - Property of IBM
|
|
5
|
+
# 5737-H76, 5900-A3Q
|
|
6
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
7
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
8
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
9
|
+
# ----------------------------------------------------------------------------------------------------
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
from ibm_watsonx_gov.clients.api_client import APIClient
|
|
13
|
+
from ibm_watsonx_gov.config.model_risk_configuration import \
|
|
14
|
+
ModelRiskConfiguration
|
|
15
|
+
from ibm_watsonx_gov.entities.model_risk_result import ModelRiskResult
|
|
16
|
+
from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
|
|
17
|
+
from IPython.display import display
|
|
18
|
+
from pydantic import Field, PrivateAttr
|
|
19
|
+
from typing_extensions import Annotated
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ModelRiskEvaluator(BaseEvaluator):
    """
    The class to evaluate the foundational model risk and display the results.

    Example:
        1. Basic usage
            .. code-block:: python

                configuration = ModelRiskConfiguration(
                    model_details=model_details,
                    risk_dimensions=risk_dimensions,
                    max_sample_size=max_sample_size,
                    pdf_report_output_path=pdf_report_output_path
                )
                wxgov_client = APIClient(credentials=Credentials(api_key=""))
                evaluator = ModelRiskEvaluator(
                    configuration=configuration, api_client=wxgov_client)

                result = evaluator.evaluate()

                # Get the results in the required format
                result.to_json()

                # Display the results
                evaluator.display_table()
                evaluator.download_model_risk_report()
    """
    # `Field` has no `name` argument: pydantic v2 treats unknown keyword
    # arguments as deprecated JSON-schema extras, so `title`/`description`
    # are used instead.
    configuration: Annotated[ModelRiskConfiguration,
                             Field(title="Model risk configuration",
                                   description="The configuration for model risk evaluation.")]
    api_client: Annotated[APIClient | None,
                          Field(title="API client",
                                description="The IBM watsonx.governance client.",
                                default=None)]

    # Result of the last evaluate() call; None until evaluate() has run.
    _result: Annotated[ModelRiskResult | None,
                       PrivateAttr(default=None)]

    def evaluate(self) -> ModelRiskResult:
        """
        Evaluates the risk of a Foundation model.

        The result is also stored on the instance for ``display_table`` and
        ``download_model_risk_report``.

        Returns:
            ModelRiskResult: The result of the model risk evaluation.
        """
        # Imported lazily, inside the method, as in the original implementation.
        from ibm_watsonx_gov.evaluators.impl.evaluate_model_risk_impl import \
            _evaluate_model_risk

        self._result = _evaluate_model_risk(
            self.configuration,
            self.api_client,
        )

        return self._result

    def display_table(self):
        """
        Print each risk and its benchmarks, displaying the metric dataframe of every benchmark.

        Reads the result stored by ``evaluate``, so ``evaluate`` must have been called first.
        """
        for risk in self._result.risks:
            print(f"\n--- Risk: {risk.name} ---")
            for benchmark in risk.benchmarks:
                print(f"Benchmark: {benchmark.name}")
                display(benchmark.get_metric_df())

    def download_model_risk_report(self):
        """
        Create a download link for the model risk report file and return it.

        The link is built from the ``output_file_path`` of the result stored
        by ``evaluate``, so ``evaluate`` must have been called first.
        """
        from ibm_wos_utils.joblib.utils.notebook_utils import \
            create_download_link_for_file

        return create_download_link_for_file(
            self._result.output_file_path)
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated
|
|
11
|
+
|
|
12
|
+
from pydantic import Field, PrivateAttr
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.entities.agentic_app import AgenticApp, Node
|
|
15
|
+
from ibm_watsonx_gov.entities.agentic_evaluation_result import \
|
|
16
|
+
AgenticEvaluationResult
|
|
17
|
+
from ibm_watsonx_gov.evaluators.base_evaluator import BaseEvaluator
|
|
18
|
+
from ibm_watsonx_gov.traces.span_util import flatten_attributes
|
|
19
|
+
from ibm_watsonx_gov.traces.trace_utils import TraceUtils
|
|
20
|
+
from ibm_watsonx_gov.utils.aggregation_util import \
|
|
21
|
+
get_agentic_evaluation_result
|
|
22
|
+
from ibm_watsonx_gov.utils.async_util import (gather_with_concurrency,
|
|
23
|
+
run_in_event_loop)
|
|
24
|
+
from ibm_watsonx_gov.utils.python_utils import add_if_unique
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TracesEvaluator(BaseEvaluator):
    """
    Evaluator that computes the metrics of an agentic application from the traces it generated.
    """
    agentic_app: Annotated[AgenticApp,
                           Field(title="Agentic application configuration details",
                                 description="The agentic application configuration details.")]
    # Nodes collected from the evaluated traces. The list lives on the
    # instance and is not cleared between evaluate() calls.
    __nodes: Annotated[list[Node], PrivateAttr(default=[])]

    def evaluate(self, spans: list[dict], **kwargs) -> AgenticEvaluationResult:
        """
        Compute the agentic metrics for the given spans.

        Span trees are built from the spans. For every tree whose root span
        has the ``traceloop.span.kind`` attribute equal to ``workflow`` a
        metric computation coroutine is created. All coroutines are then run
        with bounded concurrency and their outputs are merged into one result.

        Args:
            spans (list[dict]): The spans on which the metrics need to be computed.
            **kwargs: Additional keyword arguments. ``max_concurrency`` (default 10)
                limits how many traces are processed concurrently; all keyword
                arguments are also forwarded to the per-trace metric computation.

        Returns:
            AgenticEvaluationResult: The aggregated evaluation result, with the
            messages data, nodes data, metrics mapping data and nodes attached.
        """
        concurrency_limit = kwargs.get("max_concurrency", 10)
        trees = TraceUtils.build_span_trees(
            spans=spans, agentic_app=self.agentic_app)

        # One coroutine per workflow trace; other span trees are skipped.
        pending = []
        for tree in trees:
            flat_attrs = flatten_attributes(tree.span.attributes)
            if flat_attrs.get("traceloop.span.kind") == "workflow":
                pending.append(
                    TraceUtils.compute_metrics_from_trace_async_v2(
                        span_tree=tree,
                        message_io_mapping=self.agentic_app.message_io_mapping,
                        metrics_configuration=self.agentic_app.metrics_configuration,
                        api_client=self.api_client,
                        **kwargs,
                    )
                )

        # Execute the coroutines together, bounded by the concurrency limit.
        outcomes = run_in_event_loop(
            gather_with_concurrency,
            coros=pending,
            max_concurrency=concurrency_limit)

        # Merge the per-trace outputs.
        all_metrics = []
        all_messages = []
        all_node_data = []
        all_mappings = []
        for trace_metrics, trace_messages, trace_node_data, trace_mapping, trace_nodes in outcomes:
            all_metrics.extend(trace_metrics)
            all_messages.append(trace_messages)
            all_node_data.extend(trace_node_data)
            all_mappings.append(trace_mapping)

            for node in trace_nodes:
                add_if_unique(node, self.__nodes, ["name", "func_name"],
                              ["foundation_models"])

        evaluation = get_agentic_evaluation_result(
            metrics_result=all_metrics, nodes=self.__nodes)

        evaluation.messages_data = all_messages
        evaluation.nodes_data = all_node_data
        evaluation.metrics_mapping_data = all_mappings
        evaluation.nodes = self.__nodes

        return evaluation
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics import (AnswerRelevanceMetric,
|
|
19
|
+
AnswerSimilarityMetric,
|
|
20
|
+
FaithfulnessMetric,
|
|
21
|
+
UnsuccessfulRequestsMetric)
|
|
22
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AnswerQualityDecorator(BaseMetricDecorator):
    """
    Decorator provider for evaluating answer quality metrics on an agentic node.
    """

    def evaluate_answer_quality(self,
                                func: Optional[Callable] = None,
                                *,
                                configuration: Optional[AgenticAIConfiguration] = None,
                                metrics: Optional[list[GenAIMetric]] = None
                                ) -> Callable:
        """
        An evaluation decorator for computing answer quality metrics on an agentic node.

        Can be applied bare (``@evaluate_answer_quality``) or with keyword
        arguments (``@evaluate_answer_quality(configuration=..., metrics=...)``).

        Args:
            func (Optional[Callable]): The function to decorate. None when the
                decorator is used with keyword arguments only.
            configuration (Optional[AgenticAIConfiguration]): The configuration
                passed on to the metric computation.
            metrics (Optional[list[GenAIMetric]]): The metrics to compute. When
                empty or None, the metrics of ``MetricGroup.ANSWER_QUALITY`` are used.

        Returns:
            Callable: The wrapped function, or a partial of this method awaiting
            the function when ``func`` is None.

        Raises:
            Exception: Raised by the wrapped function when validation or the
                metric computation fails; the original error is chained as the cause.
        """
        if func is None:
            # Called with keyword arguments only: defer until the function arrives.
            return partial(self.evaluate_answer_quality, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = MetricGroup.ANSWER_QUALITY.get_metrics()

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(AnswerRelevanceMetric, FaithfulnessMetric, UnsuccessfulRequestsMetric, AnswerSimilarityMetric))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS,
                                 EvaluatorFields.CONTEXT_FIELDS]
                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]
                metric_references = [EvaluatorFields.REFERENCE_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs,
                                                      metric_references=metric_references,
                                                      metric_groups=[MetricGroup.ANSWER_QUALITY])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating answer quality metrics on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics import (HAPMetric, PIIMetric,
|
|
19
|
+
PromptSafetyRiskMetric)
|
|
20
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
21
|
+
from ibm_watsonx_gov.metrics.evasiveness.evasiveness_metric import \
|
|
22
|
+
EvasivenessMetric
|
|
23
|
+
from ibm_watsonx_gov.metrics.harm.harm_metric import HarmMetric
|
|
24
|
+
from ibm_watsonx_gov.metrics.harm_engagement.harm_engagement_metric import \
|
|
25
|
+
HarmEngagementMetric
|
|
26
|
+
from ibm_watsonx_gov.metrics.jailbreak.jailbreak_metric import JailbreakMetric
|
|
27
|
+
from ibm_watsonx_gov.metrics.profanity.profanity_metric import ProfanityMetric
|
|
28
|
+
from ibm_watsonx_gov.metrics.sexual_content.sexual_content_metric import \
|
|
29
|
+
SexualContentMetric
|
|
30
|
+
from ibm_watsonx_gov.metrics.social_bias.social_bias_metric import \
|
|
31
|
+
SocialBiasMetric
|
|
32
|
+
from ibm_watsonx_gov.metrics.unethical_behavior.unethical_behavior_metric import \
|
|
33
|
+
UnethicalBehaviorMetric
|
|
34
|
+
from ibm_watsonx_gov.metrics.violence.violence_metric import ViolenceMetric
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ContentSafetyDecorator(BaseMetricDecorator):
    """
    Decorator provider for evaluating content safety metrics on an agentic node.
    """

    def evaluate_content_safety(self,
                                func: Optional[Callable] = None,
                                *,
                                configuration: Optional[AgenticAIConfiguration] = None,
                                metrics: Optional[list[GenAIMetric]] = None
                                ) -> Callable:
        """
        An evaluation decorator for computing content safety metrics on an agentic node.

        Can be applied bare (``@evaluate_content_safety``) or with keyword
        arguments (``@evaluate_content_safety(configuration=..., metrics=...)``).

        Args:
            func (Optional[Callable]): The function to decorate. None when the
                decorator is used with keyword arguments only.
            configuration (Optional[AgenticAIConfiguration]): The configuration
                passed on to the metric computation.
            metrics (Optional[list[GenAIMetric]]): The metrics to compute. When
                empty or None, the metrics of ``MetricGroup.CONTENT_SAFETY`` are used.

        Returns:
            Callable: The wrapped function, or a partial of this method awaiting
            the function when ``func`` is None.

        Raises:
            Exception: Raised by the wrapped function when validation or the
                metric computation fails; the original error is chained as the cause.
        """

        if func is None:
            # Called with keyword arguments only: defer until the function arrives.
            return partial(self.evaluate_content_safety, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = MetricGroup.CONTENT_SAFETY.get_metrics()

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(PromptSafetyRiskMetric, HAPMetric, PIIMetric, HarmMetric, SocialBiasMetric, ProfanityMetric, SexualContentMetric,
                                                  UnethicalBehaviorMetric, ViolenceMetric, HarmEngagementMetric, EvasivenessMetric, JailbreakMetric))

                # Only the input fields are supplied; no output fields are passed.
                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[],
                                                      metric_groups=[MetricGroup.CONTENT_SAFETY])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating content safety metrics on {func.__name__},") from ex

        return wrapper(func)
|