ibm_watsonx_gov-1.3.3-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py

@@ -0,0 +1,219 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+
+from typing import Annotated, Literal
+
+import pandas as pd
+from lazy_imports import LazyModule, load
+from pydantic import Field, model_validator
+from typing_extensions import Self
+
+from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
+from ibm_watsonx_gov.entities.base_classes import Error
+from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
+from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
+                                                        RecordMetricResult)
+from ibm_watsonx_gov.entities.llm_judge import LLMJudge
+from ibm_watsonx_gov.entities.metric import GenAIMetric
+from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+from ibm_watsonx_gov.utils.async_util import run_in_event_loop
+from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
+from ibm_watsonx_gov.utils.validation_util import (validate_llm_as_judge,
+                                                   validate_output,
+                                                   validate_reference,
+                                                   validate_unitxt_method)
+
+unitxt_provider = LazyModule(
+    "from ibm_watsonx_gov.providers.unitxt_provider import UnitxtProvider",
+    name="lazy_unitxt_provider"
+)
+load(unitxt_provider)
+UnitxtProvider = unitxt_provider.UnitxtProvider
+
+logger = GovSDKLogger.get_logger(__name__)
+UNITXT_METRIC_NAME = "answer_correctness"
+
+unitxt_methods = ["token_recall",
+                  "bert_score_recall",
+                  "sentence_bert_mini_lm",
+                  "llm_as_judge",
+                  ]
+
+
+class AnswerSimilarityMetric(GenAIMetric):
+    """
+    Defines the Answer Similarity metric class.
+
+    The Answer Similarity metric measures the similarity between the generated text and the ground truth.
+    It can be computed using the following methods:
+
+    1. token_recall (default)
+    2. bert_score_recall
+    3. sentence_bert_mini_lm
+    4. llm_as_judge
+
+    Examples:
+        1. Create Answer Similarity metric with default parameters and compute using metrics evaluator.
+            .. code-block:: python
+
+                metric = AnswerSimilarityMetric()
+                result = MetricsEvaluator().evaluate(data={"generated_text": "...", "ground_truth": "..."},
+                                                     metrics=[metric])
+
+        2. Create Answer Similarity metric with a custom threshold and method.
+            .. code-block:: python
+
+                threshold = MetricThreshold(type="lower_limit", value=0.5)
+                method = "sentence_bert_mini_lm"
+                metric = AnswerSimilarityMetric(method=method, thresholds=[threshold])
+
+        3. Create Answer Similarity metric with llm_as_judge method.
+            .. code-block:: python
+
+                # Define LLM Judge using watsonx.ai
+                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
+                llm_judge = LLMJudge(model=WxAIFoundationModel(
+                    model_id="ibm/granite-3-3-8b-instruct",
+                    project_id="<PROJECT_ID>"
+                ))
+                metric = AnswerSimilarityMetric(llm_judge=llm_judge)
+    """
+    name: Annotated[Literal["answer_similarity"],
+                    Field(title="Name",
+                          description="The answer similarity metric name.",
+                          default="answer_similarity", frozen=True)]
+    display_name: Annotated[Literal["Answer Similarity"],
+                            Field(title="Display Name",
+                                  description="The answer similarity metric display name.",
+                                  default="Answer Similarity", frozen=True)]
+    tasks: Annotated[list[TaskType],
+                     Field(title="Tasks",
+                           description="The list of supported tasks.",
+                           default=[TaskType.RAG, TaskType.QA])]
+    is_reference_free: Annotated[bool,
+                                 Field(title="Is Reference free",
+                                       description="The flag to indicate whether this metric needs a reference for computation. This metric needs a reference value to compute.",
+                                       default=False, frozen=True)]
+    thresholds: Annotated[list[MetricThreshold],
+                          Field(title="Thresholds",
+                                description="The metric thresholds.",
+                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
+    method: Annotated[Literal["token_recall", "bert_score_recall", "sentence_bert_mini_lm", "llm_as_judge"],
+                      Field(title="Method",
+                            description="The method used to compute the metric. This field is optional and when `llm_judge` is provided, the method would be set to `llm_as_judge`.",
+                            default="token_recall")]
+    group: Annotated[MetricGroup,
+                     Field(title="Group",
+                           description="The metric group.",
+                           default=MetricGroup.ANSWER_QUALITY, frozen=True)]
+    llm_judge: Annotated[LLMJudge | None,
+                         Field(title="LLM Judge",
+                               description="The LLM judge used to compute the metric.",
+                               default=None)]
+
+    @model_validator(mode="after")
+    def set_llm_judge_default_method(self) -> Self:
+        # If llm_judge is set, set the method to llm_as_judge
+        if self.llm_judge:
+            self.method = "llm_as_judge"
+        return self
+
+    def evaluate(self,
+                 data: pd.DataFrame,
+                 configuration: GenAIConfiguration | AgenticAIConfiguration,
+                 **kwargs) -> AggregateMetricResult:
+        # When run in sync mode, block until the async evaluation is done
+        return run_in_event_loop(
+            self.evaluate_async,
+            data=data,
+            configuration=configuration,
+            **kwargs,
+        )
+
+    async def evaluate_async(self, data: pd.DataFrame,
+                             configuration: GenAIConfiguration | AgenticAIConfiguration,
+                             **kwargs) -> AggregateMetricResult:
+
+        data_cols = data.columns.to_list()
+
+        try:
+            validate_output(data_cols, configuration)
+            validate_reference(data_cols, configuration)
+            validate_unitxt_method(self.name, self.method, unitxt_methods)
+            validate_llm_as_judge(self.name, self.method,
+                                  self.llm_judge, configuration.llm_judge)
+        except ValueError as ve:
+            if kwargs.get("ignore_validation_errors"):
+                message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
+                logger.warning(message)
+                return
+            raise ve
+
+        # Separate the data into a dataframe with no None values and a dataframe with None values
+        required_fields = configuration.output_fields + configuration.reference_fields
+        mask_has_none = data[required_fields].isna().any(axis=1)
+        df_with_none = data[mask_has_none]
+        df_without_none = data[~mask_has_none]
+
+        # Compute the metric only for the dataframe with no None values
+        aggregated_metric_result = None
+        if not df_without_none.empty:
+            provider = UnitxtProvider(configuration=configuration,
+                                      metric_name=self.name,
+                                      metric_display_name=self.display_name,
+                                      metric_method=self.method,
+                                      metric_prefix="metrics.rag.external_rag",
+                                      metric_alias=UNITXT_METRIC_NAME,
+                                      metric_group=self.group,
+                                      llm_judge=self.llm_judge,
+                                      thresholds=self.thresholds,
+                                      **kwargs)
+
+            aggregated_metric_result = await provider.evaluate_async(data=df_without_none)
+
+        # Update the metric result with record level results for the records with missing values
+        if not df_with_none.empty:
+            # Create None results for records with missing values
+            none_results = []
+            for _, row in df_with_none.iterrows():
+                record_result = RecordMetricResult(
+                    name=self.name,
+                    display_name=self.display_name,
+                    method=self.method,
+                    group=self.group,
+                    value=None,
+                    record_id=row[configuration.record_id_field],
+                    thresholds=self.thresholds,
+                    errors=[Error(
+                        code="BAD_REQUEST", message_en="The value of required fields output or reference is None.")]
+                )
+                none_results.append(record_result)
+
+            # Merge the results
+            if aggregated_metric_result:
+                all_record_results = aggregated_metric_result.record_level_metrics + none_results
+                aggregated_metric_result.record_level_metrics = all_record_results
+                aggregated_metric_result.total_records = len(
+                    all_record_results)
+            else:
+                aggregated_metric_result = AggregateMetricResult(
+                    name=self.name,
+                    display_name=self.display_name,
+                    method=self.method,
+                    group=self.group,
+                    value=None,
+                    total_records=len(none_results),
+                    record_level_metrics=none_results,
+                    min=None,
+                    max=None,
+                    mean=None,
+                    thresholds=self.thresholds
+                )
+
+        return aggregated_metric_result
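The notable control flow in evaluate_async is the split between scorable and unscorable records: rows whose output or reference fields are missing are never sent to the Unitxt provider and instead receive a None-valued result carrying a BAD_REQUEST error. A minimal standalone sketch of that split, assuming hypothetical column names "generated_text" and "ground_truth":

import pandas as pd

# Hypothetical records: the second row is missing its reference value, so it is
# excluded from scoring and reported with value=None and a BAD_REQUEST error.
data = pd.DataFrame({
    "generated_text": ["Paris is the capital.", "Berlin is the capital."],
    "ground_truth": ["Paris is the capital of France.", None],
})

required_fields = ["generated_text", "ground_truth"]  # output fields + reference fields
mask_has_none = data[required_fields].isna().any(axis=1)

df_without_none = data[~mask_has_none]  # sent to the Unitxt provider for scoring
df_with_none = data[mask_has_none]      # gets record-level None results

print(len(df_without_none), len(df_with_none))  # 1 1

This design keeps partial batches scorable instead of failing the whole evaluation when a few records are incomplete.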
ibm_watsonx_gov/metrics/average_precision/__init__.py: File without changes
ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py

@@ -0,0 +1,62 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+from functools import partial
+from typing import Callable, Optional
+
+from wrapt import decorator
+
+from ibm_watsonx_gov.config.agentic_ai_configuration import \
+    AgenticAIConfiguration
+from ibm_watsonx_gov.entities.enums import EvaluatorFields
+from ibm_watsonx_gov.entities.metric import GenAIMetric
+from ibm_watsonx_gov.metrics.average_precision.average_precision_metric import \
+    AveragePrecisionMetric
+from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
+from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import \
+    ContextRelevanceMetric
+
+
+class AveragePrecisionDecorator(BaseMetricDecorator):
+    def evaluate_average_precision(self,
+                                   func: Optional[Callable] = None,
+                                   *,
+                                   configuration: Optional[AgenticAIConfiguration] = None,
+                                   metrics: list[GenAIMetric] = []
+                                   ) -> dict:
+        """
+        An evaluation decorator for computing the average precision metric on an agentic node.
+        """
+        if func is None:
+            return partial(self.evaluate_average_precision, configuration=configuration, metrics=metrics)
+
+        if not metrics:
+            metrics = [AveragePrecisionMetric()]
+
+        @decorator
+        def wrapper(func, instance, args, kwargs):
+
+            try:
+                self.validate(func=func, metrics=metrics,
+                              valid_metric_types=(AveragePrecisionMetric, ContextRelevanceMetric))
+
+                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
+                metric_outputs = [EvaluatorFields.CONTEXT_FIELDS]
+
+                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
+                                                      configuration=configuration,
+                                                      metrics=metrics,
+                                                      metric_inputs=metric_inputs,
+                                                      metric_outputs=metric_outputs)
+
+                return original_result
+            except Exception as ex:
+                raise Exception(
+                    f"There was an error while evaluating the average precision metric on {func.__name__}.") from ex
+
+        return wrapper(func)
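The per-metric decorators all follow the same "func=None plus partial" idiom, which lets them be applied both bare and parameterized. A minimal standalone sketch of the idiom, with a hypothetical evaluate_example function and a plain closure in place of wrapt.decorator:

from functools import partial
from typing import Callable, Optional


def evaluate_example(func: Optional[Callable] = None, *, label: str = "default"):
    # Parameterized use: evaluate_example(label=...) is called without func,
    # so return a partial that will receive the decorated function next.
    if func is None:
        return partial(evaluate_example, label=label)

    def wrapper(*args, **kwargs):
        print(f"[{label}] wrapping {func.__name__}")  # metric computation would go here
        return func(*args, **kwargs)

    return wrapper


@evaluate_example              # bare form: func is passed directly
def node_a(state: dict) -> dict:
    return state


@evaluate_example(label="ap")  # parameterized form: func is None on the first call
def node_b(state: dict) -> dict:
    return state


node_a({})  # [default] wrapping node_a
node_b({})  # [ap] wrapping node_b

The real decorator additionally validates the metric types and delegates to BaseMetricDecorator.compute_helper, but the dispatch on func is the same.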
ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py

@@ -0,0 +1,174 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+
+from typing import Annotated, Any, Literal
+
+import pandas as pd
+from pydantic import Field, TypeAdapter, field_validator
+
+from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
+from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
+from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
+                                                        RecordMetricResult)
+from ibm_watsonx_gov.entities.metric import GenAIMetric
+from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import (
+    CONTEXT_RELEVANCE, ContextRelevanceMetric, ContextRelevanceResult)
+
+AVERAGE_PRECISION = "average_precision"
+AVERAGE_PRECISION_DISPLAY_NAME = "Average Precision"
+
+
+class AveragePrecisionResult(RecordMetricResult):
+    name: str = AVERAGE_PRECISION
+    display_name: str = AVERAGE_PRECISION_DISPLAY_NAME
+    group: MetricGroup = MetricGroup.RETRIEVAL_QUALITY
+
+
+class AveragePrecisionMetric(GenAIMetric):
+    """
+    Defines the Average Precision metric class.
+
+    The Average Precision metric measures how well a retrieval system ranks relevant contexts.
+    The Context Relevance metric is computed as a prerequisite for this metric.
+
+    Examples:
+        1. Create Average Precision metric with default parameters and compute using metrics evaluator.
+            .. code-block:: python
+
+                metric = AveragePrecisionMetric()
+                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "..."},
+                                                     metrics=[metric])
+                # A list of contexts can also be passed as shown below
+                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
+                                                     metrics=[metric])
+
+        2. Create Average Precision metric with a custom threshold.
+            .. code-block:: python
+
+                threshold = MetricThreshold(type="lower_limit", value=0.5)
+                metric = AveragePrecisionMetric(thresholds=[threshold])
+
+        3. Create Average Precision metric with llm_as_judge method.
+            .. code-block:: python
+
+                # Define LLM Judge using watsonx.ai
+                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
+                llm_judge = LLMJudge(model=WxAIFoundationModel(
+                    model_id="ibm/granite-3-3-8b-instruct",
+                    project_id="<PROJECT_ID>"
+                ))
+                cr_metric = ContextRelevanceMetric(llm_judge=llm_judge)
+                ap_metric = AveragePrecisionMetric()
+                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
+                                                     metrics=[cr_metric, ap_metric])
+    """
+    name: Annotated[Literal["average_precision"],
+                    Field(title="Name",
+                          description="The average precision metric name.",
+                          default=AVERAGE_PRECISION, frozen=True)]
+    display_name: Annotated[Literal["Average Precision"],
+                            Field(title="Display Name",
+                                  description="The average precision metric display name.",
+                                  default=AVERAGE_PRECISION_DISPLAY_NAME, frozen=True)]
+    tasks: Annotated[list[TaskType],
+                     Field(title="Tasks",
+                           description="The list of supported tasks.",
+                           default=[TaskType.RAG])]
+    thresholds: Annotated[list[MetricThreshold],
+                          Field(title="Thresholds",
+                                description="The metric thresholds.",
+                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
+    metric_dependencies: Annotated[list[GenAIMetric],
+                                   Field(title="Metric dependencies",
+                                         description="The list of metric dependencies.",
+                                         default=[ContextRelevanceMetric()])]
+    group: Annotated[MetricGroup,
+                     Field(title="Group",
+                           description="The metric group.",
+                           default=MetricGroup.RETRIEVAL_QUALITY, frozen=True)]
+
+    @field_validator("metric_dependencies", mode="before")
+    @classmethod
+    def metric_dependencies_validator(cls, value: Any):
+        if value:
+            value = [TypeAdapter(Annotated[ContextRelevanceMetric, Field(
+                discriminator="name")]).validate_python(
+                m) for m in value]
+        return value
+
+    def evaluate(
+        self,
+        data: pd.DataFrame,
+        configuration: GenAIConfiguration | AgenticAIConfiguration,
+        metrics_result: list[AggregateMetricResult],
+        **kwargs,
+    ) -> AggregateMetricResult:
+        record_level_metrics = []
+        scores = []
+
+        context_relevance_result: list[ContextRelevanceResult] = next(
+            (metric_result.record_level_metrics for metric_result in metrics_result if metric_result.name == CONTEXT_RELEVANCE), None)
+
+        if context_relevance_result is None:
+            raise Exception(
+                f"Failed to evaluate {self.name} metric. Missing context relevance metric result.")
+
+        for relevance_result in context_relevance_result:
+            score = self.__compute(
+                relevance_scores=relevance_result.additional_info.get(
+                    "contexts_values", []),
+                threshold=self.thresholds[0].value,
+            )
+            scores.append(score)
+            record_level_metrics.append(
+                AveragePrecisionResult(
+                    method="",
+                    provider="",
+                    record_id=relevance_result.record_id,
+                    value=score,
+                    thresholds=self.thresholds
+                )
+            )
+
+        mean = sum(scores) / len(scores)
+        aggregate_metric_score = AggregateMetricResult(
+            name=self.name,
+            display_name=self.display_name,
+            method="",
+            provider="",
+            group=self.group,
+            min=min(scores),
+            max=max(scores),
+            mean=mean,
+            value=mean,
+            total_records=len(record_level_metrics),
+            record_level_metrics=record_level_metrics,
+            thresholds=self.thresholds
+        )
+
+        return aggregate_metric_score
+
+    def __compute(self, relevance_scores: list[float], threshold: float) -> float:
+        relevancy_at_k = []
+        for i, score in enumerate(relevance_scores):
+            if score >= threshold:
+                relevancy_at_k.append(i + 1)
+        total_relevant_items = len(relevancy_at_k)
+        if total_relevant_items == 0:
+            return 0
+        precision_sum = 0
+        relevant_rank = 0
+        for k in relevancy_at_k:
+            relevant_rank += 1
+            precision_at_k = relevant_rank / k
+            precision_sum += precision_at_k
+        average_precision = precision_sum / total_relevant_items
+        average_precision_rounded = round(average_precision, 1)
+        return average_precision_rounded
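The __compute method is standard average precision over the ranked contexts, with the first entry of thresholds acting as the relevance cutoff and the result rounded to one decimal place. A condensed standalone restatement with a worked example on hypothetical scores:

def average_precision(relevance_scores: list[float], threshold: float) -> float:
    # 1-based ranks whose context relevance score meets the cutoff
    relevant_ranks = [i + 1 for i, s in enumerate(relevance_scores) if s >= threshold]
    if not relevant_ranks:
        return 0
    # Average of precision@k over the relevant ranks, rounded to one decimal
    precision_sum = sum((idx + 1) / k for idx, k in enumerate(relevant_ranks))
    return round(precision_sum / len(relevant_ranks), 1)


# Hypothetical scores: ranks 1 and 3 clear the 0.7 cutoff,
# so AP = (1/1 + 2/3) / 2 ≈ 0.83, which rounds to 0.8
print(average_precision([0.9, 0.3, 0.8], threshold=0.7))  # 0.8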
ibm_watsonx_gov/metrics/base_metric_decorator.py

@@ -0,0 +1,193 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+
+from functools import partial
+from json import dumps
+from threading import Lock
+from typing import Any, Callable, Set
+
+from ibm_watsonx_gov.clients.api_client import APIClient
+from ibm_watsonx_gov.config.agentic_ai_configuration import \
+    AgenticAIConfiguration
+from ibm_watsonx_gov.entities.agentic_app import MetricsConfiguration
+from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
+from ibm_watsonx_gov.entities.evaluation_result import AgentMetricResult
+from ibm_watsonx_gov.entities.metric import GenAIMetric
+from ibm_watsonx_gov.evaluators.impl.evaluate_metrics_impl import \
+    _evaluate_metrics_async
+from ibm_watsonx_gov.utils.async_util import run_in_event_loop
+from ibm_watsonx_gov.utils.python_utils import get_argument_value
+
+try:
+    from ibm_agent_analytics.instrumentation.utils import (
+        AIEventRecorder, get_current_trace_id, record_span_attributes)
+except ImportError:
+    pass
+
+
+class BaseMetricDecorator():
+    """
+    Base class for all metric decorators.
+    """
+
+    def __init__(self, api_client: APIClient = None, configuration: AgenticAIConfiguration = None,
+                 compute_real_time: bool = True, metric_results: list[AgentMetricResult] = [],
+                 execution_counts: dict[str, dict[str, int]] = {},
+                 nodes_being_run: dict[str, Set[str]] = {}, lock: Lock = None):
+        self.api_client = api_client
+        self.configuration = configuration
+        self.compute_real_time = compute_real_time
+        self.metric_results = metric_results
+        self.execution_counts = execution_counts
+        self.nodes_being_run = nodes_being_run
+        self.lock = lock
+
+    def validate(self, *, func: Callable, metrics: list[GenAIMetric], valid_metric_types: tuple[Any]):
+        if not metrics:
+            raise ValueError(
+                "The 'metrics' argument can not be empty.")
+
+        invalid_metrics = [metric.name for metric in metrics if not isinstance(
+            metric, valid_metric_types)]
+        if len(invalid_metrics):
+            raise ValueError(
+                f"The evaluator '{func.__name__}' is not applicable for "
+                f"computing the metrics: {', '.join(invalid_metrics)}")
+
+    def compute_helper(self, *, func: Callable,
+                       args: tuple,
+                       kwargs: dict[str, Any],
+                       configuration: AgenticAIConfiguration,
+                       metrics: list[GenAIMetric],
+                       metric_inputs: list[EvaluatorFields],
+                       metric_outputs: list[EvaluatorFields],
+                       metric_references: list[EvaluatorFields] = [],
+                       metric_groups: list[MetricGroup] = []) -> dict:
+        """
+        Helper method for computing metrics.
+
+        Does the following:
+        1. Computes the node latency metric, and appends the result to the :py:attr:`AgenticEvaluation.metric_results` attribute.
+        2. Calls the original node.
+        3. Computes the list of metrics given, and appends the result to the :py:attr:`AgenticEvaluation.metric_results` attribute.
+        4. Returns the result of the original node without any changes.
+
+        Args:
+            func (Callable): The node on which the metric is to be computed.
+            args (tuple): The tuple of positional arguments passed to the node.
+            kwargs (dict[str, Any]): The dictionary of keyword arguments passed to the node.
+            configuration (AgenticAIConfiguration): The node specific configuration.
+            metrics (list[GenAIMetric]): The list of metrics to compute.
+            metric_inputs (list[EvaluatorFields]): The list of inputs for the metric.
+            metric_outputs (list[EvaluatorFields]): The list of outputs for the metric.
+            metric_references (list[EvaluatorFields], optional): The optional list of references for the metric. Defaults to [].
+
+        Raises:
+            ValueError: If the record id field is missing from the node inputs.
+
+        Returns:
+            dict: The result of the wrapped node.
+        """
+
+        get_arg_value = partial(
+            get_argument_value, func=func, args=args, kwargs=kwargs)
+
+        defaults = metric_inputs + metric_outputs + metric_references
+        _configuration = AgenticAIConfiguration.create_configuration(app_config=self.configuration,
+                                                                     method_config=configuration,
+                                                                     defaults=defaults)
+        _configuration.record_id_field = _configuration.message_id_field
+
+        _data = {}
+        # Add record id to the data
+        _field = getattr(_configuration, EvaluatorFields.MESSAGE_ID_FIELD.value,
+                         EvaluatorFields.get_default_fields_mapping()[EvaluatorFields.MESSAGE_ID_FIELD])
+
+        try:
+            _message_id_value = get_arg_value(
+                param_name=_field) or get_current_trace_id()
+        except ValueError:
+            _message_id_value = get_current_trace_id()
+
+        if _message_id_value is None:
+            raise ValueError(
+                f"The {_field} is required for evaluation. Please add it while invoking the application.")
+
+        _data[_field] = _message_id_value
+
+        if _message_id_value not in self.nodes_being_run:
+            self.nodes_being_run[_message_id_value] = set()
+        if _message_id_value not in self.execution_counts:
+            self.execution_counts[_message_id_value] = dict()
+
+        if func.__name__ not in self.nodes_being_run[_message_id_value]:
+            self.nodes_being_run[_message_id_value].add(func.__name__)
+            self.execution_counts[_message_id_value][func.__name__] = self.execution_counts[_message_id_value].get(
+                func.__name__, 0) + 1
+
+        original_result = func(*args, **kwargs)
+
+        metric_result = []
+        if self.compute_real_time:
+            for field in metric_inputs + metric_references:
+                _field = getattr(_configuration, field.value)
+                if not (isinstance(_field, list)):
+                    _field = [_field]
+                _data.update(dict(map(lambda f: (
+                    f, get_arg_value(param_name=f)), _field)))
+
+            for field in metric_outputs:
+                _field = getattr(_configuration, field.value)
+                if not (isinstance(_field, list)):
+                    _field = [_field]
+                _data.update(dict(map(lambda f: (
+                    f, original_result.get(f)), _field)))
+
+            metric_result = run_in_event_loop(
+                _evaluate_metrics_async,
+                configuration=_configuration,
+                data=_data,
+                metrics=metrics,
+                metric_groups=metric_groups,
+                api_client=self.api_client
+            )
+            metric_result = metric_result.to_dict()
+
+            for mr in metric_result:
+                node_result = {
+                    "applies_to": "node",
+                    "node_name": func.__name__,
+                    **mr
+                }
+                node_result["message_id"] = node_result["record_id"]
+                amr = AgentMetricResult(**node_result)
+
+                AIEventRecorder.record_metric(name=amr.name,
+                                              value=amr.value,
+                                              attributes={"wxgov.result.metric": amr.model_dump_json(exclude_unset=True)})
+                metrics_configuration = MetricsConfiguration(
+                    configuration=_configuration, metrics=metrics)
+                record_span_attributes({"wxgov.config.metrics." + str(type(self)).split(".")[2]: dumps({
+                    "metrics_configuration": metrics_configuration.model_dump(mode="json"),
+                    "compute_real_time": "true"
+                })})
+
+                with self.lock:
+                    self.metric_results.append(amr)
+
+        else:
+            metrics_configuration = MetricsConfiguration(
+                configuration=_configuration, metrics=metrics)
+            # Store the configuration of metrics to compute in traces
+            record_span_attributes({"wxgov.config.metrics." + str(type(self)).split(".")[2]: dumps({
+                "metrics_configuration": metrics_configuration.model_dump(mode="json"),
+                "compute_real_time": "false"
+            })})
+
+        return original_result
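compute_helper keys its bookkeeping on the message id (falling back to the current trace id): per message it records which nodes are in flight and how many executions each node has started, shared across all decorated nodes. A minimal standalone sketch of that bookkeeping, with hypothetical message ids and node names:

# Mirrors the dictionaries BaseMetricDecorator shares across decorated nodes.
execution_counts: dict[str, dict[str, int]] = {}
nodes_being_run: dict[str, set[str]] = {}


def track(message_id: str, node_name: str) -> None:
    nodes_being_run.setdefault(message_id, set())
    execution_counts.setdefault(message_id, {})
    # Count a new execution only when the node is not already marked as running
    if node_name not in nodes_being_run[message_id]:
        nodes_being_run[message_id].add(node_name)
        execution_counts[message_id][node_name] = \
            execution_counts[message_id].get(node_name, 0) + 1


track("msg-1", "retrieve")
track("msg-1", "retrieve")   # no-op: "retrieve" is already marked as running for msg-1
track("msg-1", "generate")
print(execution_counts)      # {'msg-1': {'retrieve': 1, 'generate': 1}}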
@@ -0,0 +1,8 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------