ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
import ast
|
|
11
|
+
import re
|
|
12
|
+
from typing import Annotated, Literal
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
from pydantic import Field
|
|
16
|
+
|
|
17
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
18
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
19
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
20
|
+
RecordMetricResult)
|
|
21
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
22
|
+
from ibm_watsonx_gov.metrics.utils import calculate_cost
|
|
23
|
+
from ibm_watsonx_gov.utils.validation_util import validate_field
|
|
24
|
+
|
|
25
|
+
# Machine-readable identifier of the cost metric; used in this module as the
# default `name` of both CostResult and CostMetric.
COST = "cost"
|
|
26
|
+
# Human-readable label of the cost metric; used in this module as the default
# `display_name` of both CostResult and CostMetric.
COST_DISPLAY_NAME = "Cost"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CostResult(RecordMetricResult):
    # Record-level result for the cost metric. It only pins the identifying
    # fields to the module-level cost constants; everything else comes from
    # RecordMetricResult.
    name: str = COST  # metric identifier ("cost")
    display_name: str = COST_DISPLAY_NAME  # label shown to users ("Cost")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CostMetric(GenAIMetric):
    """
    Defines the Cost metric class.

    The Cost metric keeps track of the LLM usage cost for the provided token counts.

    Examples:
        1. Create Cost metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = CostMetric()
                result = MetricsEvaluator().evaluate(data={"model": "...", "prompt_tokens": "...", "completion_tokens": "..."},
                                                    metrics=[metric])
    """
    name: Annotated[Literal["cost"],
                    Field(title="name",
                          description="The cost metric name.",
                          default=COST, frozen=True)]
    display_name: Annotated[Literal["Cost"],
                            Field(title="Display Name",
                                  description="The cost metric display name.",
                                  default=COST_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.USAGE, frozen=True)]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> AggregateMetricResult:
        """
        Compute the cost for every record in ``data`` and aggregate the results.

        Args:
            data: The records to evaluate. Must contain the column named by
                ``configuration.record_id_field``; usage details are read from
                the columns matching ``configuration.model_usage_detail_fields``.
            configuration: The evaluation configuration.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A single AggregateMetricResult carrying the min/max/mean summary
            (``value`` is the mean) and one CostResult per record.
        """
        # Imported lazily, as in the original implementation.
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        validate_field("model_usage_detail_fields", configuration)
        usage_per_record = self._extract_required_dtls(data, configuration)
        record_ids = data[configuration.record_id_field].to_list()
        costs = self._evaluate(usage_per_record)

        record_level_metrics = [
            CostResult(record_id=record_id, value=cost,
                       group=MetricGroup.USAGE.value)
            for cost, record_id in zip(costs, record_ids)
        ]

        summary = get_summaries(costs)
        return AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            group=self.group,
            min=summary.get("min"),
            max=summary.get("max"),
            mean=summary.get("mean"),
            value=summary.get("mean"),
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
        )

    def _parse_usage_detail(self, raw):
        """
        Normalise one usage-detail cell to a dict, or return None if it is unusable.

        String cells are parsed with ``ast.literal_eval``; anything that is not
        (or does not parse to) a dict is rejected.
        """
        if isinstance(raw, str):
            try:
                raw = ast.literal_eval(raw)
            except (ValueError, TypeError, SyntaxError,
                    MemoryError, RecursionError):
                # literal_eval documents all of these for malformed input.
                return None
        return raw if isinstance(raw, dict) else None

    def _extract_required_dtls(self, data: pd.DataFrame, configuration: GenAIConfiguration | AgenticAIConfiguration):
        """
        Convert the dataframe to a consumable structure for calculate_cost function.

        Every column whose name fully matches one of the regular expressions in
        ``configuration.model_usage_detail_fields`` is inspected. For each row,
        the usage dicts found in those columns (in column order) that carry a
        non-empty ``"model"`` entry are collected.

        Structure (one inner list per row of ``data``):
        [
            [
                {
                    "model": "<model_name>",
                    "total_prompt_tokens": <token_count>, # optional
                    "total_completion_tokens": <token_count> # optional
                },
                ...
            ],
            ...
        ]
        """
        compiled_patterns = [re.compile(pattern)
                             for pattern in configuration.model_usage_detail_fields]

        # Keep dataframe column order (deterministic) and drop duplicates.
        # Non-string labels cannot match a regex, so they are skipped instead of
        # raising TypeError inside fullmatch().
        matching_cols = [
            col for col in dict.fromkeys(data.columns)
            if isinstance(col, str)
            and any(regex.fullmatch(col) for regex in compiled_patterns)
        ]

        # Early exit if no matches
        if not matching_cols:
            return [[] for _ in range(len(data))]

        consolidated_data = []
        for _, row in data.iterrows():
            cost_meta_data = []

            for col in matching_cols:
                # Index by label: attribute access would return a Series
                # attribute/method when the column name collides with one
                # (e.g. "name", "index", "values").
                model_data = self._parse_usage_detail(row[col])
                if model_data is None:
                    continue

                model = model_data.get("model")
                if not model or pd.isna(model):
                    continue

                cost_meta_data.append(model_data)

            consolidated_data.append(cost_meta_data)

        return consolidated_data

    def _evaluate(self, usage_data: list[list[dict]]) -> list:
        """
        Compute cost from associated properties

        Calls ``calculate_cost`` once per record with that record's list of
        usage dicts and returns the results in the same order.
        """
        return [calculate_cost(record_usage) for record_usage in usage_data]
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.duration.duration_metric import \
|
|
20
|
+
DurationMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DurationDecorator(BaseMetricDecorator):
    """Provides the ``evaluate_duration`` decorator for computing the duration metric."""

    def evaluate_duration(self,
                          func: Optional[Callable] = None,
                          *,
                          configuration: Optional[AgenticAIConfiguration] = None,
                          metrics: list[GenAIMetric] = []
                          ) -> dict:
        """
        An evaluation decorator for computing duration metric.

        The decorator can be applied directly to a callable, or invoked with
        keyword arguments only, in which case a partial that still expects
        the callable is returned.

        Args:
            func: The callable to decorate, or None when only keyword
                arguments were supplied.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute. When empty, a default
                ``DurationMetric`` is used.

        Returns:
            The decorated callable, or a partial of this method when ``func`` is None.
        """
        # Keyword-only invocation: defer until the callable itself is supplied.
        if func is None:
            return partial(self.evaluate_duration, configuration=configuration, metrics=metrics)

        selected_metrics = metrics or [DurationMetric()]

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            try:
                self.validate(func=wrapped, metrics=selected_metrics,
                              valid_metric_types=(DurationMetric,))

                # The duration metric only needs the start and end timestamps.
                return self.compute_helper(
                    func=wrapped, args=args, kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[EvaluatorFields.START_TIME_FIELD,
                                   EvaluatorFields.END_TIME_FIELD],
                )
            except Exception as ex:
                # Re-raise with the decorated function's name, keeping the cause chained.
                raise Exception(
                    f"There was an error while evaluating duration metric on {wrapped.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
18
|
+
RecordMetricResult)
|
|
19
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
20
|
+
from ibm_watsonx_gov.utils.validation_util import validate_start_time, validate_end_time
|
|
21
|
+
|
|
22
|
+
DURATION = "duration"
|
|
23
|
+
DURATION_DISPLAY_NAME = "Duration"
|
|
24
|
+
LATENCY = "latency"
|
|
25
|
+
LATENCY_DISPLAY_NAME = "Latency"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DurationMetric(GenAIMetric):
    """
    Defines the Duration metric class.

    The Duration metric measures how long it takes to process the task.
    Results are reported as "duration" when the metric applies to a message
    and as "latency" otherwise.

    Examples:
        1. Create Duration metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = DurationMetric()
                result = MetricsEvaluator().evaluate(data={"start_time": "...", "end_time": "..."},
                                                    metrics=[metric])
    """
    name: Annotated[Literal["duration", "latency"],
                    Field(title="name",
                          description="The duration metric name.",
                          default=DURATION, frozen=True)]
    display_name: Annotated[Literal["Duration", "Latency"],
                            Field(title="Display Name",
                                  description="The duration metric display name.",
                                  default=DURATION_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.PERFORMANCE, frozen=True)]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> AggregateMetricResult:
        """
        Compute per-record durations and their aggregate summary.

        Args:
            data: DataFrame containing the record id, start time and end time
                columns named by ``configuration``.
            configuration: Supplies ``record_id_field``, ``start_time_field``
                and ``end_time_field``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            An ``AggregateMetricResult`` whose ``value`` is the mean duration,
            carrying min/max/mean and the per-record results.
        """
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        data_cols = data.columns.to_list()
        validate_start_time(data_cols, configuration)
        validate_end_time(data_cols, configuration)
        record_ids = data[configuration.record_id_field].to_list()

        # Message-level results keep the metric's own name; anything else
        # (node level) is reported as latency.
        if self.applies_to == "message":
            metric_name = self.name
            metric_display_name = self.display_name
        else:  # node
            metric_name = LATENCY
            metric_display_name = LATENCY_DISPLAY_NAME

        durations = self._evaluate(data, configuration)
        record_level_metrics = [
            RecordMetricResult(record_id=record_id,
                               value=duration,
                               name=metric_name,
                               display_name=metric_display_name,
                               group=MetricGroup.PERFORMANCE.value)
            for duration, record_id in zip(durations, record_ids)
        ]
        summary = get_summaries(durations)

        # Use the same resolved name as the record-level results so the
        # aggregate and its records never disagree (e.g. "duration" vs "latency").
        return AggregateMetricResult(
            name=metric_name,
            display_name=metric_display_name,
            group=self.group,
            min=summary.get("min"),
            max=summary.get("max"),
            mean=summary.get("mean"),
            value=summary.get("mean"),
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
        )

    def _evaluate(self, data: pd.DataFrame, config: GenAIConfiguration | AgenticAIConfiguration) -> list:
        """
        Compute durations from start_time and end_time columns.

        Values that cannot be converted to numbers are coerced to NaN, so the
        corresponding duration is NaN as well.

        Returns:
            One duration per row of ``data``, in row order.
        """
        # Convert to numeric, coercing unparsable values to NaN
        start_time = pd.to_numeric(
            data[config.start_time_field], errors="coerce")
        end_time = pd.to_numeric(
            data[config.end_time_field], errors="coerce")

        # Calculate duration in seconds (convert from nanoseconds to seconds)
        durations = ((end_time - start_time) / 1e9).tolist()
        return durations
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.evasiveness.evasiveness_metric import \
|
|
21
|
+
EvasivenessMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EvasivenessDecorator(BaseMetricDecorator):
    """Provides the ``evaluate_evasiveness`` decorator for computing the evasiveness metric."""

    def evaluate_evasiveness(self,
                             func: Optional[Callable] = None,
                             *,
                             configuration: Optional[AgenticAIConfiguration] = None,
                             metrics: list[GenAIMetric] = []
                             ) -> dict:
        """
        An evaluation decorator for computing evasiveness on an agentic node via granite guardian.

        The decorator can be applied directly to a callable, or invoked with
        keyword arguments only, in which case a partial that still expects
        the callable is returned.

        Args:
            func: The callable to decorate, or None when only keyword
                arguments were supplied.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute. When empty, a default
                ``EvasivenessMetric`` is used.

        Returns:
            The decorated callable, or a partial of this method when ``func`` is None.
        """
        if func is None:
            return partial(self.evaluate_evasiveness, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = [EvasivenessMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # valid_metric_types must be a tuple: the trailing comma matters,
                # "(EvasivenessMetric)" would just be the class itself.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(EvasivenessMetric,))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                # Re-raise with the decorated function's name, keeping the cause chained.
                raise Exception(
                    f"There was an error while evaluating evasiveness on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
EVASIVENESS = "evasiveness"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class EvasivenessMetric(GenAIMetric):
    """
    Defines the Evasiveness metric class.

    The Evasiveness metric measures the risk of avoiding engagement without providing a sufficient reason.
    It is computed using the granite guardian model.

    Examples:
        1. Create Evasiveness metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = EvasivenessMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Evasiveness metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = EvasivenessMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["evasiveness"],
                    Field(title="Name",
                          description="The evasiveness metric name.",
                          default=EVASIVENESS, frozen=True)]
    display_name: Annotated[Literal["Evasiveness"],
                            Field(title="Display Name",
                                  description="The evasiveness metric display name.",
                                  default="Evasiveness", frozen=True)]
    method: Annotated[Literal["granite_guardian"],
                      Field(title="Method",
                            description="The method used to compute evasiveness metric.",
                            default="granite_guardian")]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.5)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Validate the input columns and compute evasiveness through ``DetectorsProvider``.

        Any caller-supplied ``detector_params`` keyword is replaced with the
        evasiveness risk name before the provider is created.
        """
        validate_input(data.columns.to_list(), configuration)

        # Always force the evasiveness risk, overriding any incoming detector_params.
        provider_options = {**kwargs, "detector_params": {"risk_name": EVASIVENESS}}
        detector = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **provider_options)
        return await detector.evaluate_async(data=data)

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """
        Synchronous entry point: run ``evaluate_async`` through ``run_in_event_loop``
        and return its result.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(self.evaluate_async, data=data, configuration=configuration, **kwargs)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
|
|
2
|
+
# ----------------------------------------------------------------------------------------------------
|
|
3
|
+
# IBM Confidential
|
|
4
|
+
# Licensed Materials - Property of IBM
|
|
5
|
+
# 5737-H76, 5900-A3Q
|
|
6
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
7
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
8
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
9
|
+
# ----------------------------------------------------------------------------------------------------
|
|
10
|
+
|
|
11
|
+
from functools import partial
|
|
12
|
+
from typing import Callable, Optional
|
|
13
|
+
|
|
14
|
+
from wrapt import decorator
|
|
15
|
+
|
|
16
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
17
|
+
AgenticAIConfiguration
|
|
18
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
19
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
20
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
21
|
+
from ibm_watsonx_gov.metrics.faithfulness.faithfulness_metric import \
|
|
22
|
+
FaithfulnessMetric
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class FaithfulnessDecorator(BaseMetricDecorator):
    """Provides the ``evaluate_faithfulness`` decorator for computing the faithfulness metric."""

    def evaluate_faithfulness(self,
                              func: Optional[Callable] = None,
                              *,
                              configuration: Optional[AgenticAIConfiguration] = None,
                              metrics: list[GenAIMetric] = []
                              ) -> dict:
        """
        An evaluation decorator for computing faithfulness metric on an agentic node.

        The decorator can be applied directly to a callable, or invoked with
        keyword arguments only, in which case a partial that still expects
        the callable is returned.

        Args:
            func: The callable to decorate, or None when only keyword
                arguments were supplied.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute. When empty, a default
                ``FaithfulnessMetric`` is used.

        Returns:
            The decorated callable, or a partial of this method when ``func`` is None.
        """
        # Keyword-only invocation: defer until the callable itself is supplied.
        if func is None:
            return partial(self.evaluate_faithfulness, configuration=configuration, metrics=metrics)

        selected_metrics = metrics or [FaithfulnessMetric()]

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            try:
                self.validate(func=wrapped, metrics=selected_metrics,
                              valid_metric_types=(FaithfulnessMetric,))

                # Faithfulness reads the input and context fields and the output field.
                return self.compute_helper(
                    func=wrapped, args=args, kwargs=kwargs,
                    configuration=configuration,
                    metrics=selected_metrics,
                    metric_inputs=[EvaluatorFields.INPUT_FIELDS,
                                   EvaluatorFields.CONTEXT_FIELDS],
                    metric_outputs=[EvaluatorFields.OUTPUT_FIELDS],
                )
            except Exception as ex:
                # Re-raise with the decorated function's name, keeping the cause chained.
                raise Exception(
                    f"There was an error while evaluating faithfulness metric on {wrapped.__name__},") from ex

        return wrapper(func)
|