ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics import TextGradeLevelMetric, TextReadingEaseMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ReadabilityDecorator(BaseMetricDecorator):
    """Decorator mixin that computes readability metrics on an agentic node."""

    def evaluate_readability(self,
                             func: Optional[Callable] = None,
                             *,
                             configuration: Optional[AgenticAIConfiguration] = None,
                             metrics: Optional[list[GenAIMetric]] = None
                             ) -> dict:
        """
        An evaluation decorator for computing readability metric on an agentic node.

        Args:
            func: The node function to wrap. ``None`` means the decorator was
                applied with keyword arguments, in which case a ``partial`` is
                returned so it can be applied to the function afterwards.
            configuration: Optional agentic AI configuration used during evaluation.
            metrics: Metrics to compute. Defaults to the READABILITY metric group.

        Returns:
            The wrapped function's original result.

        Raises:
            Exception: If metric validation or computation fails; the original
                error is chained as the cause.
        """
        # Support both bare @evaluate_readability and parameterized
        # @evaluate_readability(configuration=..., metrics=...) usage.
        if func is None:
            return partial(self.evaluate_readability, configuration=configuration, metrics=metrics)

        # Default resolved here rather than in the signature: the previous
        # mutable default (`metrics=[]`) was shared between calls.
        if not metrics:
            metrics = MetricGroup.READABILITY.get_metrics()

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                # Only readability-family metrics are valid for this decorator.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(TextGradeLevelMetric, TextReadingEaseMetric,))

                # Readability is computed on the node's output fields only.
                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=[],
                                                      metric_outputs=metric_outputs,
                                                      metric_groups=[MetricGroup.READABILITY])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating readability metric on {func.__name__}.") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics import (AveragePrecisionMetric,
|
|
19
|
+
ContextRelevanceMetric, HitRateMetric,
|
|
20
|
+
NDCGMetric, ReciprocalRankMetric,
|
|
21
|
+
RetrievalPrecisionMetric)
|
|
22
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class RetrievalQualityDecorator(BaseMetricDecorator):
    """Decorator mixin that computes retrieval quality metrics on an agentic node."""

    def evaluate_retrieval_quality(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: Optional[list[GenAIMetric]] = None
                                   ) -> dict:
        """
        An evaluation decorator for computing retrieval quality metrics on an agentic node.

        Args:
            func: The node function to wrap. ``None`` means the decorator was
                applied with keyword arguments, in which case a ``partial`` is
                returned so it can be applied to the function afterwards.
            configuration: Optional agentic AI configuration used during evaluation.
            metrics: Metrics to compute. Defaults to the RETRIEVAL_QUALITY metric group.

        Returns:
            The wrapped function's original result.

        Raises:
            Exception: If metric validation or computation fails; the original
                error is chained as the cause.
        """
        # Support both bare and parameterized decorator application.
        if func is None:
            return partial(self.evaluate_retrieval_quality, configuration=configuration, metrics=metrics)

        # Default resolved here rather than in the signature: the previous
        # mutable default (`metrics=[]`) was shared between calls.
        if not metrics:
            metrics = MetricGroup.RETRIEVAL_QUALITY.get_metrics()

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                # Only retrieval-quality-family metrics are valid here.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(NDCGMetric, ContextRelevanceMetric, ReciprocalRankMetric, RetrievalPrecisionMetric, AveragePrecisionMetric, HitRateMetric))

                # Retrieval quality compares the node's input fields against
                # the retrieved context fields.
                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                metric_outputs = [EvaluatorFields.CONTEXT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs,
                                                      metric_groups=[MetricGroup.RETRIEVAL_QUALITY])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating retrieval quality metrics on {func.__name__}.") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields, MetricGroup
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics import CostMetric, InputTokenCountMetric, OutputTokenCountMetric
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class UsageDecorator(BaseMetricDecorator):
    """Decorator mixin that computes usage (cost/token) metrics on an agent invocation."""

    def evaluate_usage(self,
                       func: Optional[Callable] = None,
                       *,
                       configuration: Optional[AgenticAIConfiguration] = None,
                       metrics: Optional[list[GenAIMetric]] = None
                       ) -> dict:
        """
        An evaluation decorator for computing usage metric on an agent invocation.

        Args:
            func: The node function to wrap. ``None`` means the decorator was
                applied with keyword arguments, in which case a ``partial`` is
                returned so it can be applied to the function afterwards.
            configuration: Optional agentic AI configuration used during evaluation.
            metrics: Metrics to compute. Defaults to the USAGE metric group.

        Returns:
            The wrapped function's original result.

        Raises:
            Exception: If metric validation or computation fails; the original
                error is chained as the cause.
        """
        # Support both bare and parameterized decorator application.
        if func is None:
            return partial(self.evaluate_usage, configuration=configuration, metrics=metrics)

        # Default resolved here rather than in the signature: the previous
        # mutable default (`metrics=[]`) was shared between calls.
        if not metrics:
            metrics = MetricGroup.USAGE.get_metrics()

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                # Only usage-family metrics are valid for this decorator.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(CostMetric, InputTokenCountMetric, OutputTokenCountMetric))

                # Usage metrics read model usage details and token counts
                # from the invocation inputs; no output fields are needed.
                metric_inputs = [EvaluatorFields.MODEL_USAGE_DETAIL_FIELDS,
                                 EvaluatorFields.INPUT_TOKEN_COUNT_FIELDS, EvaluatorFields.OUTPUT_TOKEN_COUNT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating usage metric on {func.__name__}.") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from typing import Annotated, Union
|
|
12
|
+
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
16
|
+
|
|
17
|
+
from .answer_relevance.answer_relevance_metric import AnswerRelevanceMetric
|
|
18
|
+
from .answer_similarity.answer_similarity_metric import AnswerSimilarityMetric
|
|
19
|
+
from .average_precision.average_precision_metric import AveragePrecisionMetric
|
|
20
|
+
from .cost.cost_metric import CostMetric
|
|
21
|
+
from .duration.duration_metric import DurationMetric
|
|
22
|
+
from .evasiveness.evasiveness_metric import EvasivenessMetric
|
|
23
|
+
from .faithfulness.faithfulness_metric import FaithfulnessMetric
|
|
24
|
+
from .hap.hap_metric import HAPMetric
|
|
25
|
+
from .hap.input_hap_metric import InputHAPMetric
|
|
26
|
+
from .hap.output_hap_metric import OutputHAPMetric
|
|
27
|
+
from .harm.harm_metric import HarmMetric
|
|
28
|
+
from .harm_engagement.harm_engagement_metric import HarmEngagementMetric
|
|
29
|
+
from .hit_rate.hit_rate_metric import HitRateMetric
|
|
30
|
+
from .input_token_count.input_token_count_metric import InputTokenCountMetric
|
|
31
|
+
from .jailbreak.jailbreak_metric import JailbreakMetric
|
|
32
|
+
from .keyword_detection.keyword_detection_metric import KeywordDetectionMetric
|
|
33
|
+
from .llm_validation.llm_validation_metric import LLMValidationMetric
|
|
34
|
+
from .llmaj.llmaj_metric import LLMAsJudgeMetric
|
|
35
|
+
from .ndcg.ndcg_metric import NDCGMetric
|
|
36
|
+
from .output_token_count.output_token_count_metric import \
|
|
37
|
+
OutputTokenCountMetric
|
|
38
|
+
from .pii.input_pii_metric import InputPIIMetric
|
|
39
|
+
from .pii.output_pii_metric import OutputPIIMetric
|
|
40
|
+
from .pii.pii_metric import PIIMetric
|
|
41
|
+
from .profanity.profanity_metric import ProfanityMetric
|
|
42
|
+
from .prompt_safety_risk.prompt_safety_risk_metric import \
|
|
43
|
+
PromptSafetyRiskMetric
|
|
44
|
+
from .reciprocal_rank.reciprocal_rank_metric import ReciprocalRankMetric
|
|
45
|
+
from .regex_detection.regex_detection_metric import RegexDetectionMetric
|
|
46
|
+
from .retrieval_precision.retrieval_precision_metric import \
|
|
47
|
+
RetrievalPrecisionMetric
|
|
48
|
+
from .sexual_content.sexual_content_metric import SexualContentMetric
|
|
49
|
+
from .social_bias.social_bias_metric import SocialBiasMetric
|
|
50
|
+
from .status.status_metric import StatusMetric
|
|
51
|
+
from .text_grade_level.text_grade_level_metric import TextGradeLevelMetric
|
|
52
|
+
from .text_reading_ease.text_reading_ease_metric import TextReadingEaseMetric
|
|
53
|
+
from .tool_call_accuracy.tool_call_accuracy_metric import \
|
|
54
|
+
ToolCallAccuracyMetric
|
|
55
|
+
from .tool_call_parameter_accuracy.tool_call_parameter_accuracy_metric import \
|
|
56
|
+
ToolCallParameterAccuracyMetric
|
|
57
|
+
from .tool_call_relevance.tool_call_relevance_metric import \
|
|
58
|
+
ToolCallRelevanceMetric
|
|
59
|
+
from .tool_call_syntactic_accuracy.tool_call_syntactic_accuracy_metric import \
|
|
60
|
+
ToolCallSyntacticAccuracyMetric
|
|
61
|
+
from .topic_relevance.topic_relevance_metric import TopicRelevanceMetric
|
|
62
|
+
from .unethical_behavior.unethical_behavior_metric import \
|
|
63
|
+
UnethicalBehaviorMetric
|
|
64
|
+
from .unsuccessful_requests.unsuccessful_requests_metric import \
|
|
65
|
+
UnsuccessfulRequestsMetric
|
|
66
|
+
from .user_id.user_id_metric import UserIdMetric
|
|
67
|
+
from .violence.violence_metric import ViolenceMetric
|
|
68
|
+
|
|
69
|
+
from .context_relevance.context_relevance_metric import ContextRelevanceMetric # isort:skip
|
|
70
|
+
|
|
71
|
+
# Every concrete GenAIMetric subclass except the judge-based one participates in
# the discriminated union used for (de)serialising metric configurations; the
# pydantic discriminator is each metric's literal "name" field.
_metric_classes = tuple(
    cls for cls in GenAIMetric.__subclasses__() if cls is not LLMAsJudgeMetric
)
METRICS_UNION = Annotated[Union[_metric_classes], Field(discriminator="name")]
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.answer_relevance.answer_relevance_metric import \
|
|
20
|
+
AnswerRelevanceMetric
|
|
21
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class AnswerRelevanceDecorator(BaseMetricDecorator):
    """Decorator factory for computing the answer relevance metric on an agentic node."""

    def evaluate_answer_relevance(self,
                                  func: Optional[Callable] = None,
                                  *,
                                  configuration: Optional[AgenticAIConfiguration] = None,
                                  metrics: Optional[list[GenAIMetric]] = None
                                  ) -> dict:
        """
        An evaluation decorator for computing answer relevance metric on an agentic node.

        Args:
            func: The function to decorate. When omitted, a ``partial`` is returned so
                the decorator can also be applied with keyword arguments,
                e.g. ``@evaluate_answer_relevance(metrics=[...])``.
            configuration: Optional agentic AI configuration used for the computation.
            metrics: The metrics to compute. Defaults to a single
                ``AnswerRelevanceMetric`` with default parameters.

        Returns:
            The original result of the wrapped function, after the metric has been
            computed by ``compute_helper``.

        Raises:
            Exception: If validation or metric computation fails; the original
                error is chained as the cause.
        """
        # Support both the bare-decorator form and the parameterized form.
        if func is None:
            return partial(self.evaluate_answer_relevance, configuration=configuration, metrics=metrics)

        # NOTE: default is None (not a mutable []) to avoid the shared mutable
        # default argument pitfall.
        if not metrics:
            metrics = [AnswerRelevanceMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                # Fix: valid_metric_types must be a tuple — (AnswerRelevanceMetric)
                # without the trailing comma is just the class itself, inconsistent
                # with the sibling decorators (e.g. AnswerSimilarityDecorator).
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(AnswerRelevanceMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating answer relevance metric on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from lazy_imports import LazyModule, load
|
|
14
|
+
from pydantic import Field, model_validator
|
|
15
|
+
from typing_extensions import Self
|
|
16
|
+
|
|
17
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
18
|
+
from ibm_watsonx_gov.entities.base_classes import Error
|
|
19
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
20
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
21
|
+
RecordMetricResult)
|
|
22
|
+
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
|
|
23
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
24
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
25
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
26
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
27
|
+
from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
|
|
28
|
+
from ibm_watsonx_gov.utils.validation_util import (validate_input,
|
|
29
|
+
validate_llm_as_judge,
|
|
30
|
+
validate_output,
|
|
31
|
+
validate_small_model_method,
|
|
32
|
+
validate_unitxt_method)
|
|
33
|
+
|
|
34
|
+
# Create lazy module for Unitxt imports.
# Unitxt is heavy to import, so the provider symbols are loaded lazily and
# then re-bound as module-level names for the rest of this module to use.
unitxt_provider = LazyModule(
    "from ibm_watsonx_gov.providers.unitxt_provider import UnitxtColumnMapping",
    "from ibm_watsonx_gov.providers.unitxt_provider import UnitxtProvider",
    name="lazy_unitxt_provider"
)
load(unitxt_provider)
UnitxtColumnMapping = unitxt_provider.UnitxtColumnMapping
UnitxtProvider = unitxt_provider.UnitxtProvider

# Module-level logger for this metric module.
logger = GovSDKLogger.get_logger(__name__)
# Canonical metric name; also reused as the Unitxt metric alias below.
ANSWER_RELEVANCE = "answer_relevance"
UNITXT_METRIC_NAME = ANSWER_RELEVANCE
# Computation methods accepted by AnswerRelevanceMetric (validated at evaluate time).
unitxt_methods = [
    "token_recall",
    "llm_as_judge",
    "granite_guardian",
    "answer_relevance_model"
]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class AnswerRelevanceMetric(GenAIMetric):
    """
    Defines the Answer Relevance metric class.

    The Answer Relevance metric measures the relevance of the generated text to the given input query.
    It can be computed using the below methods:

    1. token_recall (default)
    2. llm_as_judge
    3. granite_guardian
    4. answer_relevance_model

    Examples:
        1. Create Answer Relevance metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = AnswerRelevanceMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "generated_text": "..."},
                                                     metrics=[metric])

        2. Create Answer Relevance metric with a custom thresholds and method.
            .. code-block:: python

                thresholds = [MetricThreshold(type="lower_limit", value=0.5)]
                method = "token_recall"
                metric = AnswerRelevanceMetric(
                    method=method, thresholds=thresholds)

        3. Create Answer Relevance metric with llm_as_judge method.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                    model_id="ibm/granite-3-3-8b-instruct",
                    project_id="<PROJECT_ID>"))
                metric = AnswerRelevanceMetric(llm_judge=llm_judge)

        4. Create Answer Relevance metric with granite_guardian method.
            .. code-block:: python

                metric = AnswerRelevanceMetric(method="granite_guardian")

        5. Create Answer Relevance metric with answer_relevance_model method. Currently available only in On-Prem version.
            .. code-block:: python

                metric = AnswerRelevanceMetric(method="answer_relevance_model")

    """
    name: Annotated[Literal["answer_relevance"],
                    Field(title="Name",
                          description="The answer relevance metric name.",
                          default=ANSWER_RELEVANCE, frozen=True)]
    display_name: Annotated[Literal["Answer Relevance"],
                            Field(title="Display Name",
                                  description="The answer relevance metric display name.",
                                  default="Answer Relevance", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[TaskType.RAG, TaskType.QA])]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    method: Annotated[Literal["token_recall", "llm_as_judge", "granite_guardian", "answer_relevance_model"],
                      Field(title="Method",
                            description="The method used to compute the metric. This field is optional and when `llm_judge` is provided, the method would be set to `llm_as_judge`.The `answer_relevance_model` method is currently available only in On-Prem version.",
                            default="token_recall")]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.ANSWER_QUALITY, frozen=True)]
    llm_judge: Annotated[LLMJudge | None,
                         Field(title="LLM Judge",
                               description="The LLM judge used to compute the metric.",
                               default=None)]

    @model_validator(mode="after")
    def set_llm_judge_default_method(self) -> Self:
        # If llm_judge is set, set the method to llm_as_judge
        # (runs after field validation, so it overrides any user-supplied method).
        if self.llm_judge:
            self.method = "llm_as_judge"
        return self

    def evaluate(self,
                 data: pd.DataFrame,
                 configuration: GenAIConfiguration | AgenticAIConfiguration,
                 **kwargs) -> AggregateMetricResult:
        """Synchronous entry point; delegates to :meth:`evaluate_async`."""
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )

    def __is_supported(self, **kwargs) -> bool:
        # Currently supported only in CPD and ypqa
        # NOTE(review): assumes kwargs always carries a non-None "api_client";
        # a missing client would raise AttributeError here — confirm with callers.
        return kwargs.get(
            "api_client").credentials.region == "ypqa" or kwargs.get("api_client").is_cpd

    async def evaluate_async(self, data: pd.DataFrame,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult:
        """Compute the metric for *data*, tolerating rows with missing required fields.

        Returns an AggregateMetricResult, or None when validation fails and
        ``ignore_validation_errors`` is set in kwargs.
        """
        data_cols = data.columns.to_list()
        try:
            validate_input(data_cols, configuration)
            validate_output(data_cols, configuration)
            validate_unitxt_method(self.name, self.method, unitxt_methods)
            validate_llm_as_judge(self.name, self.method,
                                  self.llm_judge, configuration.llm_judge)
            validate_small_model_method(
                self.name, self.method, self.__is_supported(**kwargs), unitxt_methods)
        except ValueError as ve:
            # Optionally downgrade validation failures to a warning and skip computation.
            if kwargs.get("ignore_validation_errors"):
                message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
                logger.warning(message)
                return
            raise ve

        # Separate the data into a dataframe with no None values and a dataframe with None values
        required_fields = configuration.input_fields + configuration.output_fields
        mask_has_none = data[required_fields].isna().any(axis=1)
        df_with_none = data[mask_has_none]
        df_without_none = data[mask_has_none == False]

        # Compute the metrics only for the dataframe with no None values
        aggregated_metric_result = None
        if not df_without_none.empty:
            # Define the mapping if the method is not using the default one
            if self.method == "token_recall":
                column_mapping = UnitxtColumnMapping(
                    answer="prediction/answer",
                    question="task_data/question",
                )
            else:
                column_mapping = UnitxtColumnMapping()
            # Detector-backed methods go through DetectorsProvider; everything
            # else (token_recall, llm_as_judge) goes through UnitxtProvider.
            if self.method in ["granite_guardian", "answer_relevance_model"]:
                kwargs["detector_params"] = {
                    "method": self.method, "threshold": 0.001}
                provider = DetectorsProvider(configuration=configuration,
                                             metric_name=self.name,
                                             metric_display_name=self.display_name,
                                             metric_method=self.method,
                                             metric_group=MetricGroup.ANSWER_QUALITY,
                                             thresholds=self.thresholds,
                                             **kwargs)
            else:
                provider = UnitxtProvider(
                    configuration=configuration,
                    metric_name=self.name,
                    metric_display_name=self.display_name,
                    metric_method=self.method,
                    metric_prefix="metrics.rag.external_rag",
                    metric_alias=UNITXT_METRIC_NAME,
                    metric_group=self.group,
                    column_mapping=column_mapping,
                    llm_judge=self.llm_judge,
                    thresholds=self.thresholds,
                    **kwargs,
                )

            aggregated_metric_result = await provider.evaluate_async(data=df_without_none)

        # Update the metric result with record level metrics results for the records with missing values
        if not df_with_none.empty:
            # Create None results for records with missing values
            none_results = []
            for _, row in df_with_none.iterrows():
                record_result = RecordMetricResult(
                    name=self.name,
                    display_name=self.display_name,
                    method=self.method,
                    group=self.group,
                    value=None,
                    record_id=row[configuration.record_id_field],
                    thresholds=self.thresholds,
                    errors=[Error(
                        code="BAD_REQUEST", message_en="The value of required fields input or output is None.")]
                )
                none_results.append(record_result)

            # Merge the results
            if aggregated_metric_result:
                # Append the None records to the computed results and fix up the count.
                all_record_results = aggregated_metric_result.record_level_metrics + none_results
                aggregated_metric_result.record_level_metrics = all_record_results
                aggregated_metric_result.total_records = len(
                    all_record_results)
            else:
                # Every row had missing values: build an all-None aggregate result.
                aggregated_metric_result = AggregateMetricResult(
                    name=self.name,
                    display_name=self.display_name,
                    method=self.method,
                    group=self.group,
                    value=None,
                    total_records=len(none_results),
                    record_level_metrics=none_results,
                    min=None,
                    max=None,
                    mean=None,
                    thresholds=self.thresholds
                )

        return aggregated_metric_result
|
|
File without changes
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.answer_similarity.answer_similarity_metric import \
|
|
20
|
+
AnswerSimilarityMetric
|
|
21
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class AnswerSimilarityDecorator(BaseMetricDecorator):
    """Decorator factory for computing the answer similarity metric on an agentic node."""

    def evaluate_answer_similarity(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: Optional[list[GenAIMetric]] = None
                                   ) -> dict:
        """
        An evaluation decorator for computing answer similarity metric on an agentic node.

        Args:
            func: The function to decorate. When omitted, a ``partial`` is returned so
                the decorator can also be applied with keyword arguments,
                e.g. ``@evaluate_answer_similarity(metrics=[...])``.
            configuration: Optional agentic AI configuration used for the computation.
            metrics: The metrics to compute. Defaults to a single
                ``AnswerSimilarityMetric`` with default parameters.

        Returns:
            The original result of the wrapped function, after the metric has been
            computed by ``compute_helper``.

        Raises:
            Exception: If validation or metric computation fails; the original
                error is chained as the cause.
        """
        # Support both the bare-decorator form and the parameterized form.
        if func is None:
            return partial(self.evaluate_answer_similarity, configuration=configuration, metrics=metrics)

        # NOTE: default is None (not a mutable []) to avoid the shared mutable
        # default argument pitfall.
        if not metrics:
            metrics = [AnswerSimilarityMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(AnswerSimilarityMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS,
                    EvaluatorFields.CONTEXT_FIELDS
                ]
                # Similarity is scored against ground-truth references.
                metric_references = [EvaluatorFields.REFERENCE_FIELDS]
                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs,
                                                      metric_references=metric_references)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating answer similarity metric on {func.__name__},") from ex

        return wrapper(func)
|