ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from lazy_imports import LazyModule, load
|
|
14
|
+
from pydantic import Field, model_validator
|
|
15
|
+
from typing_extensions import Self
|
|
16
|
+
|
|
17
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
18
|
+
from ibm_watsonx_gov.entities.base_classes import Error
|
|
19
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
20
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
21
|
+
RecordMetricResult)
|
|
22
|
+
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
|
|
23
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
24
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
25
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
26
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
27
|
+
from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
|
|
28
|
+
from ibm_watsonx_gov.utils.validation_util import (validate_context,
|
|
29
|
+
validate_input,
|
|
30
|
+
validate_llm_as_judge,
|
|
31
|
+
validate_output,
|
|
32
|
+
validate_small_model_method,
|
|
33
|
+
validate_unitxt_method)
|
|
34
|
+
|
|
35
|
+
# UnitxtProvider is obtained through the lazy_imports helpers instead of a
# plain import statement.
# NOTE(review): presumably this keeps the unitxt stack from being imported
# until it is first used -- confirm against the semantics of `load`.
_UNITXT_IMPORT_STATEMENT = "from ibm_watsonx_gov.providers.unitxt_provider import UnitxtProvider"
unitxt_provider = LazyModule(_UNITXT_IMPORT_STATEMENT, name="lazy_unitxt_provider")
load(unitxt_provider)
UnitxtProvider = unitxt_provider.UnitxtProvider

# Module-level logger shared by everything in this file.
logger = GovSDKLogger.get_logger(__name__)

# Canonical metric name; used as the default of FaithfulnessMetric.name.
FAITHFULNESS = "faithfulness"
|
|
44
|
+
|
|
45
|
+
# Method names accepted for the faithfulness metric. The list is handed to
# validate_unitxt_method and validate_small_model_method during evaluation,
# and mirrors the Literal choices of FaithfulnessMetric.method.
unitxt_methods = [
    "token_k_precision",      # default method of FaithfulnessMetric
    "sentence_bert_mini_lm",
    "llm_as_judge",           # selected automatically when an llm_judge is set
    "granite_guardian",
    "faithfulness_model",     # documented as On-Prem only
]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class FaithfulnessMetric(GenAIMetric):
|
|
55
|
+
"""
|
|
56
|
+
Defines the Faithfulness metric class.
|
|
57
|
+
|
|
58
|
+
The faithfulness metrics can be computed using the below methods:
|
|
59
|
+
|
|
60
|
+
1. token_k_precision (default)
|
|
61
|
+
2. sentence_bert_mini_lm
|
|
62
|
+
3. llm_as_judge
|
|
63
|
+
4. granite_guardian
|
|
64
|
+
5. faithfulness_model
|
|
65
|
+
|
|
66
|
+
Examples:
|
|
67
|
+
1. Create Faithfulness metric with default parameters.
|
|
68
|
+
.. code-block:: python
|
|
69
|
+
|
|
70
|
+
metric = FaithfulnessMetric()
|
|
71
|
+
result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "...", "generated_text": "..."},
|
|
72
|
+
metrics=[metric])
|
|
73
|
+
# A list of contexts can also be passed as shown below
|
|
74
|
+
result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."], "generated_text": "..."},
|
|
75
|
+
metrics=[metric])
|
|
76
|
+
|
|
77
|
+
2. Create Faithfulness metric with a custom threshold and method.
|
|
78
|
+
.. code-block:: python
|
|
79
|
+
|
|
80
|
+
thresholds = [MetricThreshold(type="lower_limit", value=0.5)]
|
|
81
|
+
method = "sentence_bert_mini_lm"
|
|
82
|
+
metric = FaithfulnessMetric(method=method, thresholds=thresholds)
|
|
83
|
+
|
|
84
|
+
3. Create Faithfulness metric with llm_as_judge method.
|
|
85
|
+
.. code-block:: python
|
|
86
|
+
|
|
87
|
+
# Define LLM Judge using watsonx.ai
|
|
88
|
+
# To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
|
|
89
|
+
llm_judge = LLMJudge(model=WxAIFoundationModel(
|
|
90
|
+
model_id="ibm/granite-3-3-8b-instruct",
|
|
91
|
+
project_id="<PROJECT_ID>"
|
|
92
|
+
))
|
|
93
|
+
metric = FaithfulnessMetric(llm_judge=llm_judge)
|
|
94
|
+
|
|
95
|
+
4. Create Faithfulness metric with granite_guardian method.
|
|
96
|
+
.. code-block:: python
|
|
97
|
+
|
|
98
|
+
metric = FaithfulnessMetric(method="granite_guardian")
|
|
99
|
+
|
|
100
|
+
5. Create Faithfulness metric with faithfulness_model method. Currently available only in On-Prem version.
|
|
101
|
+
.. code-block:: python
|
|
102
|
+
|
|
103
|
+
metric = FaithfulnessMetric(method="faithfulness_model")
|
|
104
|
+
"""
|
|
105
|
+
name: Annotated[Literal["faithfulness"],
|
|
106
|
+
Field(title="Name",
|
|
107
|
+
description="The faithfulness metric name.",
|
|
108
|
+
default=FAITHFULNESS, frozen=True)]
|
|
109
|
+
display_name: Annotated[Literal["Faithfulness"],
|
|
110
|
+
Field(title="Display Name",
|
|
111
|
+
description="The faithfulness metric display name.",
|
|
112
|
+
default="Faithfulness", frozen=True)]
|
|
113
|
+
tasks: Annotated[list[TaskType],
|
|
114
|
+
Field(title="Tasks",
|
|
115
|
+
description="The list of supported tasks.",
|
|
116
|
+
default=[TaskType.RAG])]
|
|
117
|
+
thresholds: Annotated[list[MetricThreshold],
|
|
118
|
+
Field(title="Thresholds",
|
|
119
|
+
description="The metric thresholds.",
|
|
120
|
+
default=[MetricThreshold(type="lower_limit", value=0.7)])]
|
|
121
|
+
method: Annotated[Literal["token_k_precision", "sentence_bert_mini_lm", "llm_as_judge", "granite_guardian", "faithfulness_model"],
|
|
122
|
+
Field(title="Method",
|
|
123
|
+
description="The method used to compute the metric. This field is optional and when `llm_judge` is provided, the method would be set to `llm_as_judge`. The `faithfulness_model` method is currently available only in On-Prem version.",
|
|
124
|
+
default="token_k_precision")]
|
|
125
|
+
group: Annotated[MetricGroup,
|
|
126
|
+
Field(title="Group",
|
|
127
|
+
description="The metric group.",
|
|
128
|
+
default=MetricGroup.ANSWER_QUALITY, frozen=True)]
|
|
129
|
+
llm_judge: Annotated[LLMJudge | None,
|
|
130
|
+
Field(title="LLM Judge",
|
|
131
|
+
description="The LLM judge used to compute the metric.",
|
|
132
|
+
default=None)]
|
|
133
|
+
|
|
134
|
+
@model_validator(mode="after")
def set_llm_judge_default_method(self) -> Self:
    """
    Force the method to ``llm_as_judge`` whenever an LLM judge is configured.

    Registered as an "after" model validator, so it operates on the
    populated model instance.

    Returns:
        Self: The same instance, with ``method`` overridden when
        ``llm_judge`` is truthy and left untouched otherwise.
    """
    # Nothing to adjust when no judge was supplied.
    if not self.llm_judge:
        return self

    # A supplied judge takes precedence over whatever method was requested.
    self.method = "llm_as_judge"
    return self
|
|
140
|
+
|
|
141
|
+
def evaluate(self,
             data: pd.DataFrame,
             configuration: GenAIConfiguration | AgenticAIConfiguration,
             **kwargs) -> AggregateMetricResult:
    """
    Synchronous entry point for computing the metric.

    Delegates to :meth:`evaluate_async` through ``run_in_event_loop`` so
    that callers in synchronous code block until the computation finishes.

    Args:
        data (pd.DataFrame): The records to evaluate.
        configuration (GenAIConfiguration | AgenticAIConfiguration): The
            configuration forwarded unchanged to ``evaluate_async``.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``evaluate_async``.

    Returns:
        AggregateMetricResult: Whatever ``run_in_event_loop`` hands back.
        NOTE(review): presumably the result of ``evaluate_async``, which
        returns ``None`` when validation fails and
        ``ignore_validation_errors`` is set -- confirm.
    """
    # Collect everything the coroutine needs, then hand it over in one call.
    forwarded_arguments = {
        "data": data,
        "configuration": configuration,
        **kwargs,
    }
    return run_in_event_loop(self.evaluate_async, **forwarded_arguments)
|
|
152
|
+
|
|
153
|
+
def __is_supported(self, **kwargs):
    """
    Tell whether the current environment is one of the supported ones.

    Currently supported only in CPD and in the ``ypqa`` region.

    Args:
        **kwargs: Expected to carry ``api_client``, an object exposing
            ``credentials.region`` and ``is_cpd``.

    Returns:
        ``False`` when no ``api_client`` was supplied; otherwise the value
        of ``region == "ypqa" or is_cpd`` for that client.
    """
    api_client = kwargs.get("api_client")
    if api_client is None:
        # Without a client the environment cannot be identified. Report
        # "not supported" instead of raising AttributeError, which would
        # bypass the caller's ValueError handling.
        return False

    return api_client.credentials.region == "ypqa" or api_client.is_cpd
|
|
157
|
+
|
|
158
|
+
async def evaluate_async(self,
                         data: pd.DataFrame,
                         configuration: GenAIConfiguration | AgenticAIConfiguration,
                         **kwargs) -> AggregateMetricResult | None:
    """
    Compute the metric asynchronously.

    Records with a missing value in any input or output field, or with every
    context field missing, are not sent to the provider. They are reported as
    record level results with a `None` value and a `BAD_REQUEST` error, and are
    appended to the provider results.

    Args:
        data (pd.DataFrame): The records to evaluate.
        configuration (GenAIConfiguration | AgenticAIConfiguration): Supplies the input, output,
            context and record id field names and the configuration level LLM judge.
        **kwargs: Forwarded to the provider. `api_client` is used for the support check and
            `ignore_validation_errors` decides how a validation failure is handled.

    Returns:
        AggregateMetricResult | None: The aggregated result. `None` is returned when validation
        fails and `ignore_validation_errors` is truthy, or when `data` has no rows.

    Raises:
        ValueError: If validation fails and `ignore_validation_errors` is not set.
    """
    data_cols = data.columns.to_list()

    try:
        validate_input(data_cols, configuration)
        validate_output(data_cols, configuration)
        validate_context(data_cols, configuration)
        validate_unitxt_method(self.name, self.method, unitxt_methods)
        validate_llm_as_judge(self.name, self.method,
                              self.llm_judge, configuration.llm_judge)
        validate_small_model_method(
            self.name, self.method, self.__is_supported(**kwargs), unitxt_methods)
    except ValueError as ve:
        if kwargs.get("ignore_validation_errors"):
            message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
            logger.warning(message)
            return None
        # Bare raise keeps the original traceback intact.
        raise

    # Separate the data into a dataframe with no None values and a dataframe with None values
    input_output_has_none = data[configuration.input_fields +
                                 configuration.output_fields].isna().any(axis=1)
    all_contexts_none = data[configuration.context_fields].isna().all(
        axis=1)
    mask_has_none = input_output_has_none | all_contexts_none
    df_with_none = data[mask_has_none]
    df_without_none = data[~mask_has_none]

    # Compute the metrics only for the dataframe with no None values
    aggregated_metric_result = None
    if not df_without_none.empty:
        if self.method in ["granite_guardian", "faithfulness_model"]:
            # NOTE(review): 0.001 is presumably a deliberately low detector threshold;
            # confirm its meaning against DetectorsProvider.
            kwargs["detector_params"] = {
                "method": self.method, "threshold": 0.001}
            provider = DetectorsProvider(configuration=configuration,
                                         metric_name=self.name,
                                         metric_display_name=self.display_name,
                                         metric_method=self.method,
                                         metric_group=self.group,
                                         thresholds=self.thresholds,
                                         **kwargs)
        else:
            provider = UnitxtProvider(configuration=configuration,
                                      metric_name=self.name,
                                      metric_display_name=self.display_name,
                                      metric_method=self.method,
                                      metric_prefix="metrics.rag.external_rag",
                                      metric_group=self.group,
                                      llm_judge=self.llm_judge,
                                      thresholds=self.thresholds,
                                      **kwargs)

        aggregated_metric_result = await provider.evaluate_async(data=df_without_none)

    # Update the metric result with record level metrics results for the records with missing values
    if not df_with_none.empty:
        # Create None results for records with missing values
        none_results = []
        for _, row in df_with_none.iterrows():
            record_result = RecordMetricResult(
                name=self.name,
                display_name=self.display_name,
                method=self.method,
                group=self.group,
                value=None,
                record_id=row[configuration.record_id_field],
                thresholds=self.thresholds,
                errors=[Error(
                    code="BAD_REQUEST", message_en="The value of required fields input, output or context is None.")]
            )
            none_results.append(record_result)

        # Merge the results
        if aggregated_metric_result:
            all_record_results = aggregated_metric_result.record_level_metrics + none_results
            aggregated_metric_result.record_level_metrics = all_record_results
            aggregated_metric_result.total_records = len(
                all_record_results)
        else:
            # Nothing could be scored, so the aggregate carries only the error records.
            aggregated_metric_result = AggregateMetricResult(
                name=self.name,
                display_name=self.display_name,
                method=self.method,
                group=self.group,
                value=None,
                total_records=len(none_results),
                record_level_metrics=none_results,
                min=None,
                max=None,
                mean=None,
                thresholds=self.thresholds
            )

    return aggregated_metric_result
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from ibm_watsonx_gov.metrics.hap.hap_metric import HAPMetric
|
|
11
|
+
from ibm_watsonx_gov.metrics.hap.input_hap_metric import InputHAPMetric
|
|
12
|
+
from ibm_watsonx_gov.metrics.hap.output_hap_metric import OutputHAPMetric
|
|
13
|
+
|
|
14
|
+
# Names exported by this package: the three HAP metric classes imported above.
__all__ = ["HAPMetric", "InputHAPMetric", "OutputHAPMetric"]
|
|
15
|
+
|
|
16
|
+
# Made with Bob
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.hap.hap_metric import HAPMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HAPDecorator(BaseMetricDecorator):
    """
    Provides the `evaluate_hap` decorator for computing HAP metrics on an agentic node.
    """

    def evaluate_hap(self, func: Optional[Callable] = None,
                     *,
                     configuration: Optional[AgenticAIConfiguration] = None,
                     metrics: Optional[list[GenAIMetric]] = None,
                     ) -> Callable:
        """
        An evaluation decorator for computing hap metric on an agentic node.

        Can be applied bare (`@evaluate_hap`) or with keyword arguments
        (`@evaluate_hap(configuration=..., metrics=...)`).

        Args:
            func (Optional[Callable]): The node function to wrap. `None` when the decorator is
                used with arguments; a partial awaiting the function is returned in that case.
            configuration (Optional[AgenticAIConfiguration]): The configuration passed on to
                `compute_helper`.
            metrics (Optional[list[GenAIMetric]]): The metrics to compute. When empty or not
                provided, a single default `HAPMetric` is used.

        Returns:
            Callable: The wrapped function, or a partial of this method when `func` is `None`.

        Raises:
            Exception: Raised from inside the wrapped call when validation or computation fails;
                the original exception is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_hap, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = [HAPMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # Pass a real one-element tuple; `(HAPMetric)` is just the class itself.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(HAPMetric,))

                # HAP is computed on the node inputs only, so no metric outputs are requested.
                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating hap metric on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
HAP = "hap"


class HAPMetric(GenAIMetric):
    """
    Defines the HAP metric class.

    The HAP metric measures if there is any toxic content that contains hate, abuse, or profanity in the input.
    It is computed using the hap model.

    Examples:
        1. Create HAP metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = HAPMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create HAP metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.5)
                metric = HAPMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["hap"],
                    Field(title="Name",
                          description="The hap metric name.",
                          default=HAP, frozen=True)]
    display_name: Annotated[Literal["HAP"],
                            Field(title="Display Name",
                                  description="The hap metric display name.",
                                  default="HAP", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.1)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> AggregateMetricResult:
        """
        Compute the HAP metric asynchronously.

        Validates the input columns against the configuration and delegates the
        computation to `DetectorsProvider`.

        Args:
            data (pd.DataFrame): The records to evaluate.
            configuration (GenAIConfiguration): The evaluation configuration.
            **kwargs: Forwarded unchanged to `DetectorsProvider`.

        Returns:
            AggregateMetricResult: The result returned by the provider.
        """
        validate_input(data.columns.to_list(), configuration)
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ) -> AggregateMetricResult:
        """
        Compute the HAP metric synchronously by running `evaluate_async` to completion.

        NOTE(review): `data` is annotated as accepting a dict, but `evaluate_async`
        reads `data.columns`; confirm callers always pass a DataFrame.

        Args:
            data (pd.DataFrame | dict): The records to evaluate.
            configuration (GenAIConfiguration): The evaluation configuration.
            **kwargs: Forwarded unchanged to `evaluate_async`.

        Returns:
            AggregateMetricResult: The result of `evaluate_async`.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
INPUT_HAP = "input_hap"


class InputHAPMetric(GenAIMetric):
    """
    Defines the Input HAP metric class.

    The Input HAP metric measures if there is any toxic content that contains hate, abuse, or profanity in the input.
    It is computed using the hap model on the input data.

    Examples:
        1. Create Input HAP metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = InputHAPMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Input HAP metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.5)
                metric = InputHAPMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["input_hap"],
                    Field(title="Name",
                          description="The input hap metric name.",
                          default=INPUT_HAP, frozen=True)]
    display_name: Annotated[Literal["Input HAP"],
                            Field(title="Display Name",
                                  description="The input hap metric display name.",
                                  default="Input HAP", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.1)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> AggregateMetricResult:
        """
        Compute the Input HAP metric asynchronously.

        Validates the input columns, runs `DetectorsProvider` under the metric
        name `hap`, then renames the aggregate and every record level result to
        this metric's name.

        Args:
            data (pd.DataFrame): The records to evaluate.
            configuration (GenAIConfiguration): The evaluation configuration.
            **kwargs: Forwarded unchanged to `DetectorsProvider`.

        Returns:
            AggregateMetricResult: The provider result, renamed to `input_hap`.
        """
        validate_input(data.columns.to_list(), configuration)
        # The provider is invoked with the generic "hap" name; results are renamed below.
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name="hap",
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        # Update the metric name to input_hap
        aggregated_metric_result.name = self.name
        for record in aggregated_metric_result.record_level_metrics:
            record.name = self.name
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ) -> AggregateMetricResult:
        """
        Compute the Input HAP metric synchronously by running `evaluate_async` to completion.

        NOTE(review): `data` is annotated as accepting a dict, but `evaluate_async`
        reads `data.columns`; confirm callers always pass a DataFrame.

        Args:
            data (pd.DataFrame | dict): The records to evaluate.
            configuration (GenAIConfiguration): The evaluation configuration.
            **kwargs: Forwarded unchanged to `evaluate_async`.

        Returns:
            AggregateMetricResult: The result of `evaluate_async`.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
103
|
+
|
|
104
|
+
# Made with Bob
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_output
|
|
23
|
+
|
|
24
|
+
OUTPUT_HAP = "output_hap"


class OutputHAPMetric(GenAIMetric):
    """
    Defines the Output HAP metric class.

    The Output HAP metric measures if there is any toxic content that contains hate, abuse, or profanity in the output.
    It is computed using the hap model on the output data.

    Examples:
        1. Create Output HAP metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = OutputHAPMetric()
                result = MetricsEvaluator().evaluate(data={"generated_text": "..."}, metrics=[metric])

        2. Create Output HAP metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.5)
                metric = OutputHAPMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["output_hap"],
                    Field(title="Name",
                          description="The output hap metric name.",
                          default=OUTPUT_HAP, frozen=True)]
    display_name: Annotated[Literal["Output HAP"],
                            Field(title="Display Name",
                                  description="The output hap metric display name.",
                                  default="Output HAP", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.1)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> AggregateMetricResult:
        """
        Compute the Output HAP metric asynchronously.

        Validates the output columns, runs `DetectorsProvider` under the metric
        name `hap` on a copy of the configuration whose `input_fields` point at
        the output fields, then renames the aggregate and every record level
        result to this metric's name.

        Args:
            data (pd.DataFrame): The records to evaluate.
            configuration (GenAIConfiguration): The evaluation configuration. It is copied,
                not modified.
            **kwargs: Forwarded unchanged to `DetectorsProvider`.

        Returns:
            AggregateMetricResult: The provider result, renamed to `output_hap`.
        """
        validate_output(data.columns.to_list(), configuration)

        # Create a modified configuration that uses output_fields as input_fields
        # This allows DetectorsProvider to process output data
        modified_config = configuration.model_copy(deep=True)
        modified_config.input_fields = configuration.output_fields

        provider = DetectorsProvider(configuration=modified_config,
                                     metric_name="hap",
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        # Update the metric name to output_hap
        aggregated_metric_result.name = self.name
        for record in aggregated_metric_result.record_level_metrics:
            record.name = self.name
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ) -> AggregateMetricResult:
        """
        Compute the Output HAP metric synchronously by running `evaluate_async` to completion.

        NOTE(review): `data` is annotated as accepting a dict, but `evaluate_async`
        reads `data.columns`; confirm callers always pass a DataFrame.

        Args:
            data (pd.DataFrame | dict): The records to evaluate.
            configuration (GenAIConfiguration): The evaluation configuration.
            **kwargs: Forwarded unchanged to `evaluate_async`.

        Returns:
            AggregateMetricResult: The result of `evaluate_async`.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
109
|
+
|
|
110
|
+
# Made with Bob
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|