ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.harm.harm_metric import HarmMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HarmDecorator(BaseMetricDecorator):
    """Provides the ``evaluate_harm`` decorator for harm-risk evaluation of an agentic node."""

    def evaluate_harm(self,
                      func: Optional[Callable] = None,
                      *,
                      configuration: Optional[AgenticAIConfiguration] = None,
                      metrics: Optional[list[GenAIMetric]] = None
                      ) -> Callable:
        """
        An evaluation decorator for computing harm risk on an agentic node via granite guardian.

        The decorator can be applied bare (``func`` is passed directly) or with keyword
        arguments (``func`` is ``None``), in which case a partial of this method carrying
        the supplied ``configuration`` and ``metrics`` is returned and is expected to be
        called with the function to decorate.

        Args:
            func: The function to decorate. ``None`` when the decorator is used with arguments.
            configuration: Optional configuration forwarded unchanged to ``compute_helper``.
            metrics: Metrics to compute. When omitted or empty, a single default
                ``HarmMetric()`` is used. Only ``HarmMetric`` instances are passed as
                valid types to ``validate``.

        Returns:
            A ``functools.partial`` of this method when ``func`` is ``None``; otherwise the
            wrapped function produced by ``wrapt.decorator``.

        Raises:
            Exception: Raised by the wrapped function at call time when validation or
                metric computation fails; the original error is chained as ``__cause__``.
        """
        if func is None:
            # Parameterised use: defer until the function to decorate is supplied.
            return partial(self.evaluate_harm, configuration=configuration, metrics=metrics)

        # ``None`` (the default) and an empty list both fall back to the default metric.
        # A ``None`` default avoids sharing one mutable list object across calls.
        if not metrics:
            metrics = [HarmMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # validate / compute_helper are not defined in this class; presumably
                # they are inherited from BaseMetricDecorator - confirm against the base.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(HarmMetric,))

                # Harm is computed on the node inputs only; no output fields are passed.
                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating harm risk on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
# Identifier of the harm metric: used as the default of HarmMetric.name and as
# the "risk_name" detector parameter when the metric is evaluated.
HARM = "harm"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class HarmMetric(GenAIMetric):
    """
    Defines the Harm metric class.

    The Harm metric measures the risk of content considered universally harmful.
    This is a general category, which should encompass a variety of risks.
    It is computed using the granite guardian model.

    Examples:
        1. Create Harm metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = HarmMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Harm metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = HarmMetric(thresholds=[threshold])
    """
    name: Annotated[
        Literal["harm"],
        Field(default=HARM,
              frozen=True,
              title="Name",
              description="The harm metric name."),
    ]
    display_name: Annotated[
        Literal["Harm"],
        Field(default="Harm",
              frozen=True,
              title="Display Name",
              description="The harm metric display name."),
    ]
    method: Annotated[
        Literal["granite_guardian"],
        Field(default="granite_guardian",
              title="Method",
              description="The method used to compute harm metric."),
    ]
    tasks: Annotated[
        list[TaskType],
        Field(default=TaskType.values(),
              frozen=True,
              title="Tasks",
              description="The list of supported tasks."),
    ]
    thresholds: Annotated[
        list[MetricThreshold],
        Field(default=[MetricThreshold(type="upper_limit", value=0.5)],
              title="Thresholds",
              description="The metric thresholds."),
    ]
    group: Annotated[
        MetricGroup,
        Field(default=MetricGroup.CONTENT_SAFETY,
              frozen=True,
              title="Group",
              description="The metric group."),
    ]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Compute the harm metric asynchronously through the detectors provider.

        Args:
            data: Records to evaluate; its column names are validated against ``configuration``.
            configuration: Configuration used for input validation and passed to the provider.
            **kwargs: Extra options forwarded to ``DetectorsProvider``. Any
                ``detector_params`` entry is replaced with the harm risk selector.

        Returns:
            The result awaited from ``DetectorsProvider.evaluate_async``.
        """
        column_names = data.columns.to_list()
        validate_input(column_names, configuration)

        # The harm risk is always enforced, overriding a caller-supplied "detector_params".
        provider_options = {**kwargs, "detector_params": {"risk_name": HARM}}
        detector = DetectorsProvider(
            configuration=configuration,
            metric_name=self.name,
            metric_display_name=self.display_name,
            metric_method=self.method,
            metric_group=self.group,
            thresholds=self.thresholds,
            **provider_options,
        )
        return await detector.evaluate_async(data=data)

    def evaluate(self,
                 data: pd.DataFrame | dict,
                 configuration: GenAIConfiguration,
                 **kwargs):
        """
        Synchronous entry point that delegates to ``evaluate_async``.

        Args:
            data: Records to evaluate.
                NOTE(review): ``evaluate_async`` reads ``data.columns``, so a plain dict
                looks unsupported unless it is converted before reaching it - confirm.
            configuration: Configuration forwarded to ``evaluate_async``.
            **kwargs: Extra options forwarded to ``evaluate_async``.

        Returns:
            Whatever ``run_in_event_loop`` returns for the ``evaluate_async`` call.
        """
        # Sync mode: hand the coroutine function to the event-loop helper and wait for it.
        call_arguments = dict(kwargs, data=data, configuration=configuration)
        return run_in_event_loop(self.evaluate_async, **call_arguments)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.harm_engagement.harm_engagement_metric import \
|
|
21
|
+
HarmEngagementMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HarmEngagementDecorator(BaseMetricDecorator):
    """Provides the decorator used to evaluate harm engagement on an agentic node."""

    def evaluate_harm_engagement(self,
                                 func: Optional[Callable] = None,
                                 *,
                                 configuration: Optional[AgenticAIConfiguration] = None,
                                 metrics: list[GenAIMetric] = []
                                 ) -> dict:
        """
        An evaluation decorator for computing harm engagement on an agentic node via granite guardian.

        Can be applied bare (``@evaluate_harm_engagement``) or with keyword
        arguments (``@evaluate_harm_engagement(configuration=..., metrics=...)``).

        Args:
            func: The node function to decorate. When ``None`` the decorator
                was called with keyword arguments only, and a partial that
                still expects the function is returned.
            configuration: The configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute. When empty, a single default
                ``HarmEngagementMetric`` is used. The default list is never
                mutated, only rebound.

        Returns:
            The wrapped function, or a partial of this method when ``func``
            is ``None``.

        Raises:
            Exception: At call time of the wrapped function, wrapping any
                error raised while validating or computing the metric.
        """
        if func is None:
            return partial(self.evaluate_harm_engagement, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = [HarmEngagementMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # The trailing comma matters: without it the parentheses are
                # just grouping and a bare class is passed instead of a tuple.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(HarmEngagementMetric,))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating harm engagement on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
# Metric name; also sent to the detector as the risk name to score.
HARM_ENGAGEMENT = "harm_engagement"


class HarmEngagementMetric(GenAIMetric):
    """
    Defines the Harm Engagement metric class.

    The Harm Engagement metric measures the risk of an engagement or endorsement with any requests that are harmful or unethical.
    It is computed using the granite guardian model.

    Examples:
        1. Create Harm Engagement metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = HarmEngagementMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Harm Engagement metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.3)
                metric = HarmEngagementMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["harm_engagement"],
                    Field(title="Name",
                          description="The harm engagement metric name.",
                          default=HARM_ENGAGEMENT, frozen=True)]
    display_name: Annotated[Literal["Harm Engagement"],
                            Field(title="Display Name",
                                  description="The harm engagement metric display name.",
                                  default="Harm Engagement", frozen=True)]
    method: Annotated[Literal["granite_guardian"],
                      Field(title="Method",
                            description="The method used to compute harm engagement metric.",
                            default="granite_guardian")]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.5)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Asynchronously compute harm engagement through the detectors provider.

        Args:
            data: The records to evaluate; its column names are validated
                against ``configuration``.
            configuration: The configuration used for validation and passed
                on to the provider.
            **kwargs: Extra keyword arguments forwarded to
                ``DetectorsProvider``. Any ``detector_params`` entry is
                overwritten.

        Returns:
            Whatever ``DetectorsProvider.evaluate_async`` returns for ``data``.
        """
        validate_input(data.columns.to_list(), configuration)
        # The risk name tells the detector which risk to score.
        kwargs["detector_params"] = {"risk_name": HARM_ENGAGEMENT}
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """
        Synchronous wrapper around ``evaluate_async``.

        Args:
            data: The records to evaluate.
            configuration: The configuration forwarded to ``evaluate_async``.
            **kwargs: Extra keyword arguments forwarded to ``evaluate_async``.

        Returns:
            The value produced by ``run_in_event_loop`` for ``evaluate_async``.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
File without changes
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.hit_rate.hit_rate_metric import HitRateMetric
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class HitRateDecorator(BaseMetricDecorator):
    """Provides the decorator used to evaluate the hit rate metric on an agentic node."""

    def evaluate_hit_rate(self,
                          func: Optional[Callable] = None,
                          *,
                          configuration: Optional[AgenticAIConfiguration] = None,
                          metrics: list[GenAIMetric] = []
                          ) -> dict:
        """
        Decorate an agentic node so that the hit rate metric is computed for it.

        Args:
            func: The node function to decorate, or ``None`` when the
                decorator is invoked with keyword arguments only.
            configuration: The configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute; a default ``HitRateMetric`` is
                used when this is empty.

        Returns:
            The wrapped function, or a partial of this method when ``func``
            is ``None``.
        """
        # Keyword-only invocation: hand back a partial awaiting the function.
        if func is None:
            return partial(self.evaluate_hit_rate, configuration=configuration, metrics=metrics)

        selected_metrics = metrics if metrics else [HitRateMetric()]

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            try:
                self.validate(
                    func=wrapped,
                    metrics=selected_metrics,
                    valid_metric_types=(HitRateMetric,),
                )
                # The input fields and the context fields both feed the metric.
                return self.compute_helper(
                    func=wrapped,
                    args=args,
                    kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[EvaluatorFields.INPUT_FIELDS],
                    metric_outputs=[EvaluatorFields.CONTEXT_FIELDS],
                )
            except Exception as error:
                raise Exception(
                    f"There was an error while evaluating hit rate metric on {wrapped.__name__},") from error

        return wrapper(func)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from typing import Annotated, Any, Literal
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from pydantic import Field, TypeAdapter, field_validator
|
|
15
|
+
|
|
16
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
18
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
19
|
+
RecordMetricResult)
|
|
20
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
21
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
22
|
+
from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import (
|
|
23
|
+
CONTEXT_RELEVANCE, ContextRelevanceMetric, ContextRelevanceResult)
|
|
24
|
+
|
|
25
|
+
HIT_RATE = "hit_rate"
HIT_RATE_DISPLAY_NAME = "Hit Rate"


class HitRateResult(RecordMetricResult):
    """Record-level result of the Hit Rate metric."""
    name: str = HIT_RATE
    display_name: str = HIT_RATE_DISPLAY_NAME
    group: MetricGroup = MetricGroup.RETRIEVAL_QUALITY


class HitRateMetric(GenAIMetric):
    """
    Defines the Hit Rate metric class.

    The Hit Rate metric measures whether there is at least one relevant context among the retrieved contexts.
    The Context Relevance metric is computed as a pre requisite to compute this metric.

    Examples:
        1. Create Hit Rate metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = HitRateMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "..."},
                                                    metrics=[metric])
                # A list of contexts can also be passed as shown below
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                    metrics=[metric])

        2. Create Hit Rate metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = HitRateMetric(thresholds=[threshold])

        3. Create Hit Rate metric with llm_as_judge method.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                                            model_id="ibm/granite-3-3-8b-instruct",
                                            project_id="<PROJECT_ID>"
                                    ))
                cr_metric = ContextRelevanceMetric(llm_judge=llm_judge)
                hr_metric = HitRateMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                    metrics=[cr_metric, hr_metric])
    """
    name: Annotated[Literal["hit_rate"],
                    Field(title="Name",
                          description="The hit rate metric name.",
                          default=HIT_RATE, frozen=True)]
    display_name: Annotated[Literal["Hit Rate"],
                            Field(title="Display Name",
                                  description="The hit rate metric display name.",
                                  default=HIT_RATE_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[TaskType.RAG])]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    metric_dependencies: Annotated[list[GenAIMetric],
                                   Field(title="Metric dependencies",
                                         description="The list of metric dependencies",
                                         default=[ContextRelevanceMetric()])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.RETRIEVAL_QUALITY, frozen=True)]

    @field_validator("metric_dependencies", mode="before")
    @classmethod
    def metric_dependencies_validator(cls, value: Any):
        """Validate every supplied dependency as a ``ContextRelevanceMetric``.

        Empty or missing values are returned unchanged.
        """
        if value:
            # Build the adapter once instead of once per dependency.
            adapter = TypeAdapter(Annotated[ContextRelevanceMetric, Field(
                discriminator="name")])
            value = [adapter.validate_python(m) for m in value]
        return value

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        metrics_result: list[AggregateMetricResult],
        **kwargs,
    ) -> AggregateMetricResult:
        """
        Compute the hit rate from previously computed context relevance results.

        For each context relevance record, the score is 1 when at least one
        of its ``contexts_values`` reaches the value of the first configured
        threshold, otherwise 0. The aggregate value is the mean of the
        record scores.

        Args:
            data: The evaluated records (not read by this method).
            configuration: The evaluation configuration (not read by this method).
            metrics_result: The already computed aggregate results; must
                contain the context relevance metric result.
            **kwargs: Ignored.

        Returns:
            The aggregate hit rate result including the record-level results.

        Raises:
            Exception: If the context relevance result is missing or has no
                record-level results.
        """
        context_relevance_result: list[ContextRelevanceResult] = next(
            (metric_result.record_level_metrics for metric_result in metrics_result if metric_result.name == CONTEXT_RELEVANCE), None)

        if context_relevance_result is None:
            raise Exception(
                f"Failed to evaluate {self.name} metric. Missing context relevance metric result")

        # Without records the mean/min/max below are undefined; fail with a
        # descriptive message instead of a bare ZeroDivisionError.
        if not context_relevance_result:
            raise Exception(
                f"Failed to evaluate {self.name} metric. The context relevance metric result has no records")

        # The first threshold doubles as the per-context relevance cut-off.
        relevance_threshold = self.thresholds[0].value

        record_level_metrics = []
        scores = []
        for relevance_result in context_relevance_result:
            # A record without additional info is treated as having no context scores.
            additional_info = relevance_result.additional_info or {}
            score = self.__compute(
                relevance_scores=additional_info.get("contexts_values", []),
                threshold=relevance_threshold,
            )
            scores.append(score)
            record_level_metrics.append(
                HitRateResult(
                    method="",
                    provider="",
                    record_id=relevance_result.record_id,
                    value=score,
                    thresholds=self.thresholds
                )
            )

        mean = sum(scores) / len(scores)
        aggregate_metric_score = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            method="",
            provider="",
            group=self.group,
            min=min(scores),
            max=max(scores),
            mean=mean,
            value=mean,
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            thresholds=self.thresholds
        )

        return aggregate_metric_score

    def __compute(self, relevance_scores: list[float], threshold: float) -> float:
        """Return 1 when any relevance score reaches ``threshold``, otherwise 0."""
        # any() short-circuits on the first relevant context.
        return 1 if any(score >= threshold for score in relevance_scores) else 0
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.input_token_count.input_token_count_metric import \
|
|
20
|
+
InputTokenCountMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class InputTokenCountDecorator(BaseMetricDecorator):
    """Provides the decorator used to track the input token count of an agentic node."""

    def evaluate_input_token_count(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: list[GenAIMetric] = []
                                   ) -> dict:
        """
        Decorate an agentic node so that its total input token count is computed.

        Args:
            func: The node function to decorate, or ``None`` when the
                decorator is invoked with keyword arguments only.
            configuration: The configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute; a default
                ``InputTokenCountMetric`` is used when this is empty.

        Returns:
            The wrapped function, or a partial of this method when ``func``
            is ``None``.
        """
        # Keyword-only invocation: hand back a partial awaiting the function.
        if func is None:
            return partial(self.evaluate_input_token_count, configuration=configuration, metrics=metrics)

        selected_metrics = metrics if metrics else [InputTokenCountMetric()]

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            try:
                self.validate(
                    func=wrapped,
                    metrics=selected_metrics,
                    valid_metric_types=(InputTokenCountMetric,),
                )
                # Only metric inputs are supplied; no metric outputs are passed.
                return self.compute_helper(
                    func=wrapped,
                    args=args,
                    kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[EvaluatorFields.INPUT_TOKEN_COUNT_FIELDS],
                )
            except Exception as error:
                raise Exception(
                    f"There was an error while tracking total input token count on {wrapped.__name__},") from error

        return wrapper(func)
|