ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import re
|
|
14
|
+
from pydantic import Field
|
|
15
|
+
|
|
16
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
18
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
19
|
+
RecordMetricResult)
|
|
20
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
21
|
+
from ibm_watsonx_gov.utils.validation_util import validate_field
|
|
22
|
+
|
|
23
|
+
INPUT_TOKEN_COUNT = "input_token_count"
|
|
24
|
+
INPUT_TOKEN_COUNT_DISPLAY_NAME = "Input Token Count"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class InputTokenCountResult(RecordMetricResult):
    # Record-level result of the input token count metric.
    # It only overrides the defaults of the two identifying fields below so
    # callers do not have to pass them when building a result.

    # Metric identifier, defaults to the module constant INPUT_TOKEN_COUNT.
    name: str = INPUT_TOKEN_COUNT
    # Human readable metric name, defaults to INPUT_TOKEN_COUNT_DISPLAY_NAME.
    display_name: str = INPUT_TOKEN_COUNT_DISPLAY_NAME
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class InputTokenCountMetric(GenAIMetric):
    """
    Defines the Input token count metric class.

    The Input token count metric keeps track of the LLM input token count.
    For every record, the values of all the columns whose names fully match one of the
    regular expressions in ``configuration.input_token_count_fields`` are summed up.

    Examples:
        1. Create Input token count metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = InputTokenCountMetric()
                result = MetricsEvaluator().evaluate(data={"prompt_tokens": "..."},
                                                    metrics=[metric])
    """
    name: Annotated[Literal["input_token_count"],
                    Field(title="name",
                          description="The input token count metric name.",
                          default=INPUT_TOKEN_COUNT, frozen=True)]
    display_name: Annotated[Literal["Input Token Count"],
                            Field(title="Display Name",
                                  description="The input token count metric display name.",
                                  default=INPUT_TOKEN_COUNT_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.USAGE, frozen=True)]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> list[AggregateMetricResult]:
        """
        Compute the input token count for every record and aggregate the values.

        Args:
            data (pd.DataFrame): The records to evaluate. It must contain the column named by
                ``configuration.record_id_field``.
            configuration (GenAIConfiguration | AgenticAIConfiguration): The configuration providing
                ``record_id_field`` and ``input_token_count_fields``.
            **kwargs: Accepted for signature compatibility; not used by this metric.

        Returns:
            The aggregated result holding min, max, mean and the record level results.
        """
        # NOTE(review): a single AggregateMetricResult is returned although the annotation
        # declares a list. The annotation is left untouched so the declared interface does
        # not change -- confirm against the base class before aligning the two.
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        validate_field("input_token_count_fields", configuration)
        # Record ids are read before fillna so missing ids are not turned into 0.
        record_ids = data[configuration.record_id_field].to_list()
        # Missing token counts contribute 0 to the row-wise sum.
        data = data.fillna(0)
        input_tokens = self._evaluate(data, configuration)
        record_level_metrics = [
            InputTokenCountResult(record_id=record_id,
                                  value=token, group=MetricGroup.USAGE.value)
            for token, record_id in zip(input_tokens, record_ids)
        ]
        summary = get_summaries(input_tokens)
        aggregate_metric_scores = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            group=self.group,
            min=summary.get("min"),
            max=summary.get("max"),
            mean=summary.get("mean"),
            # The mean is reported as the overall metric value.
            value=summary.get("mean"),
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
        )

        return aggregate_metric_scores

    def _evaluate(self, data: pd.DataFrame, config: GenAIConfiguration | AgenticAIConfiguration) -> list:
        """
        Track total input token.

        Args:
            data (pd.DataFrame): The records holding the token count columns.
            config (GenAIConfiguration | AgenticAIConfiguration): The configuration whose
                ``input_token_count_fields`` lists the regular expressions selecting the columns.

        Returns:
            list: One value per row, the sum of all the matched columns of that row.

        Raises:
            re.error: If one of the configured patterns is not a valid regular expression.
        """
        # Dict keys remove duplicates (several patterns may match the same column) while
        # keeping the first-match order, so the selected columns are deterministic.
        matched_cols: dict = {}

        for pattern in config.input_token_count_fields:
            # Compile once per pattern, then test every column label against it.
            regex = re.compile(pattern)
            for col in data.columns:
                # Only string labels can match a regex; fullmatch raises TypeError on
                # anything else (e.g. integer column labels), so those are skipped.
                if isinstance(col, str) and regex.fullmatch(col):
                    matched_cols[col] = None

        # Sum across these columns row-wise
        return data[list(matched_cols)].sum(axis=1).tolist()
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.jailbreak.jailbreak_metric import JailbreakMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class JailbreakDecorator(BaseMetricDecorator):
    # Provides the decorator used to evaluate the jailbreak metric on a decorated callable.

    def evaluate_jailbreak(self,
                           func: Optional[Callable] = None,
                           *,
                           configuration: Optional[AgenticAIConfiguration] = None,
                           metrics: list[GenAIMetric] = []
                           ) -> dict:
        """
        An evaluation decorator for computing jailbreak on an agentic node via granite guardian.

        It can be applied bare or with keyword arguments. When it is called without ``func``,
        a partial of this method carrying the given arguments is returned, which then
        receives the function to decorate.

        Args:
            func (Optional[Callable]): The function to decorate. None when the decorator is
                used with arguments.
            configuration (Optional[AgenticAIConfiguration]): The configuration forwarded to
                ``compute_helper``.
            metrics (list[GenAIMetric]): The metrics to compute. When empty, a default
                ``JailbreakMetric()`` is used.

        Returns:
            The decorated function, or a partial awaiting the function.
            NOTE(review): the ``dict`` annotation does not describe either of these values.

        Raises:
            Exception: If validation or the metric computation fails; the original error
                is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_jailbreak, configuration=configuration, metrics=metrics)

        # The default list is only ever replaced here, never mutated, so the shared
        # default argument stays empty across calls.
        selected_metrics = metrics if metrics else [JailbreakMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                # A single class is passed here (the original parentheses did not make a tuple).
                self.validate(func=func, metrics=selected_metrics,
                              valid_metric_types=JailbreakMetric)

                # Only the input fields are requested for this metric; no output fields.
                return self.compute_helper(func=func, args=args, kwargs=kwargs,
                                           configuration=configuration,
                                           metrics=selected_metrics,
                                           metric_inputs=[EvaluatorFields.INPUT_FIELDS],
                                           metric_outputs=[])
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating jailbreak on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
# Default value of JailbreakMetric.name; also sent to the detectors provider as the "risk_name" detector parameter.
JAILBREAK = "jailbreak"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class JailbreakMetric(GenAIMetric):
    """
    Defines the Jailbreak metric class.

    The Jailbreak metric measures the risk of deliberate circumvention of AI systems built-in safeguards or ethical guidelines. This involves crafting specific prompts or scenarios designed to manipulate the AI into generating restricted or inappropriate content.
    It is computed using the granite guardian model.

    Examples:
        1. Create Jailbreak metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = JailbreakMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Jailbreak metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = JailbreakMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["jailbreak"],
                    Field(title="Name",
                          description="The jailbreak metric name.",
                          default=JAILBREAK, frozen=True)]
    display_name: Annotated[Literal["Jailbreak"],
                            Field(title="Display Name",
                                  description="The jailbreak metric display name.",
                                  default="Jailbreak", frozen=True)]
    method: Annotated[Literal["granite_guardian"],
                      Field(title="Method",
                            description="The method used to compute jailbreak metric.",
                            default="granite_guardian")]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.5)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Compute the jailbreak metric asynchronously through the detectors provider.

        Args:
            data: The records to evaluate. Its column names are checked against
                ``configuration`` by ``validate_input``.
            configuration: The configuration handed to ``validate_input`` and
                to ``DetectorsProvider``.
            **kwargs: Extra keyword arguments forwarded to ``DetectorsProvider``.
                Any caller-supplied ``detector_params`` entry is overwritten.

        Returns:
            The result of ``DetectorsProvider.evaluate_async`` for ``data``.
        """
        validate_input(data.columns.to_list(), configuration)
        # The risk to score is fixed for this metric, so it replaces whatever the caller passed.
        kwargs["detector_params"] = {"risk_name": JAILBREAK}
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """
        Synchronous entry point: runs ``evaluate_async`` via ``run_in_event_loop``.

        NOTE(review): ``evaluate_async`` reads ``data.columns``; confirm that dict
        input is converted to a DataFrame before it reaches that call.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.keyword_detection.keyword_detection_metric import \
|
|
21
|
+
KeywordDetectionMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class KeywordDetectionDecorator(BaseMetricDecorator):
    """Provides the decorator used to compute keyword detection on an agentic node."""

    def evaluate_keyword_detection(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: list[GenAIMetric]
                                   ) -> dict:
        """
        An evaluation decorator for computing keyword detection on an agentic node.

        Args:
            func: The node function to wrap, or ``None`` when the decorator is
                invoked with keyword arguments only.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute; validated against ``KeywordDetectionMetric``.

        Returns:
            The wrapped function, or a partial of this method when ``func`` is ``None``.

        Raises:
            Exception: Raised when the wrapped function is called and validation
                or metric computation fails; the original error is chained as the cause.
        """
        if func is None:
            # Keyword-only invocation: hand back a partial that waits for the function.
            return partial(self.evaluate_keyword_detection,
                           configuration=configuration,
                           metrics=metrics)

        allowed_metric_types = (KeywordDetectionMetric,)

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(func=func,
                              metrics=metrics,
                              valid_metric_types=allowed_metric_types)

                # Only input fields are requested for this metric; no output fields.
                return self.compute_helper(func=func,
                                           args=args,
                                           kwargs=kwargs,
                                           configuration=configuration,
                                           metrics=metrics,
                                           metric_inputs=[EvaluatorFields.INPUT_FIELDS],
                                           metric_outputs=[])
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating keyword detection on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
# Passed to DetectorsProvider as metric_name; note it differs from the metric's own name field ("keyword_detection").
KEYWORD = "keyword"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class KeywordDetectionMetric(GenAIMetric):
    """
    Defines the Keyword Detection metric class.

    The Keyword detection metric detects specific keyword(s) when they are mentioned explicitly in natural language.

    Examples:
        1. Create keyword detection metric and compute using metrics evaluator.
            .. code-block:: python

                metric = KeywordDetectionMetric(case_sensitive=True, keywords=["..."])
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Keyword detection metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0)
                metric = KeywordDetectionMetric(thresholds=[threshold], keywords=["..."])
    """
    name: Annotated[Literal["keyword_detection"],
                    Field(title="Name",
                          description="The keyword detection metric name.",
                          default="keyword_detection", frozen=True)]
    display_name: Annotated[Literal["Keyword Detection"],
                            Field(title="Display Name",
                                  description="The keyword metric display name.",
                                  default="Keyword Detection", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0)])]
    # No "group" field is declared on this class; MetricGroup.CONTENT_SAFETY is
    # passed to the provider directly in evaluate_async.

    case_sensitive: Annotated[bool, Field(title="Case Sensitive",
                                          default=False,
                                          description="Specifies whether keyword matching is case-sensitive. If enabled, matches will be case-sensitive.")]

    keywords: Annotated[list[str], Field(title="Keyword Strings",
                                         default=None,
                                         description="List of keywords to match against the input text.")]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Compute keyword detection asynchronously through the detectors provider.

        Args:
            data: The records to evaluate. Its column names are checked against
                ``configuration`` by ``validate_input``.
            configuration: The configuration handed to ``validate_input`` and
                to ``DetectorsProvider``.
            **kwargs: Extra keyword arguments forwarded to ``DetectorsProvider``.
                Any caller-supplied ``detector_params`` entry is overwritten.

        Returns:
            The result of ``DetectorsProvider.evaluate_async`` for ``data``.

        Raises:
            AssertionError: If ``keywords`` is ``None`` or empty.
        """
        # keywords defaults to None, so the requirement is enforced here at evaluation time.
        if not self.keywords:
            raise AssertionError(
                "The keywords field is required, but was missing from the input.")

        validate_input(data.columns.to_list(), configuration)
        kwargs["detector_params"] = {"case_sensitive": self.case_sensitive,
                                     "keywords": self.keywords}

        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=KEYWORD,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=MetricGroup.CONTENT_SAFETY,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """
        Synchronous entry point: runs ``evaluate_async`` via ``run_in_event_loop``.

        NOTE(review): ``evaluate_async`` reads ``data.columns``; confirm that dict
        input is converted to a DataFrame before it reaches that call.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Dict, List
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class EvaluationCriterion:
    """A single evaluation criterion: a name plus a textual description.

    Both fields default to ``None`` but must be ``str`` once the instance is
    built, so constructing without arguments raises ``TypeError``.
    """
    name: str = None
    description: str = None

    def __post_init__(self):
        """Reject any instance whose name or description is not a ``str``."""
        name, description = self.name, self.description
        # name is checked first, so it is the one reported when both are invalid.
        if not isinstance(name, str):
            raise TypeError(
                f"name must be str, got {type(name).__name__}")
        if not isinstance(description, str):
            raise TypeError(
                f"description must be str, got {type(description).__name__}")

    @classmethod
    def from_dict(cls, criterion_dict):
        """Build a criterion from a mapping holding 'name' and 'description' keys.

        Raises:
            ValueError: If either key is absent from ``criterion_dict``.
        """
        if "name" not in criterion_dict:
            raise ValueError(
                f"criterion_dict must contain key 'name', got {criterion_dict}")
        if "description" not in criterion_dict:
            raise ValueError(
                f"criterion_dict must contain key 'description', got {criterion_dict}")
        criterion_name = criterion_dict.get("name")
        criterion_description = criterion_dict.get("description")
        return cls(criterion_name, criterion_description)

    def to_str(self):
        """Return the criterion rendered as ``"<name>: <description>"``."""
        return f"{self.name}: {self.description}"

    def to_dict(self):
        """Return the criterion as a dict with 'name' and 'description' keys."""
        return dict(name=self.name, description=self.description)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class EvaluationCriteria:
    """A list of ``EvaluationCriterion`` objects with dict, list and string conversions."""
    criteria_list: List[EvaluationCriterion]

    @classmethod
    def from_dict(cls, criteria_dict: Dict[str, str]):
        """ create from dictionary: {"name":"value"} """
        built = []
        for name, description in criteria_dict.items():
            built.append(EvaluationCriterion.from_dict(
                {"name": name, "description": description}))
        return cls(built)

    @classmethod
    def from_list_of_dicts(cls, criteria_dict: List[Dict[str, str]]):
        """ create from dictionary. each dict should be of format: {"name":"...", "description":""}"""
        built = []
        for entry in criteria_dict:
            built.append(EvaluationCriterion.from_dict(entry))
        return cls(built)

    def to_str(self):
        """Return every criterion's ``to_str()`` text, one per line."""
        rendered = [criterion.to_str() for criterion in self.criteria_list]
        return "\n".join(rendered)

    def to_list_of_dicts(self):
        """Return every criterion's ``to_dict()`` result, in list order."""
        as_dicts = []
        for criterion in self.criteria_list:
            as_dicts.append(criterion.to_dict())
        return as_dicts

    def to_dict(self):
        """Return a ``{name: description}`` mapping; a repeated name keeps the last description."""
        mapping = {}
        for criterion in self.criteria_list:
            mapping[criterion.name] = criterion.description
        return mapping
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_default_evaluation_criteria():
    """Return an ``EvaluationCriteria`` built from the module-level ``default_eval_criteria_dict``."""
    defaults = default_eval_criteria_dict
    return EvaluationCriteria.from_dict(defaults)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Default criteria, keyed by criterion name with the guiding question as the value.
default_eval_criteria_dict = {
    "Adherence to Instructions and Relevance": (
        "Does the model follow the given instructions (if any) and provide a relevant response to the input?"
    ),
    "Accuracy & Completeness": (
        "Is the response factually correct (if applicable) and does it fully address the input's request without "
        "omitting critical details?"
    ),
    "Coherence & Clarity": (
        "Does the response make sense, follow a logical flow, and is it easy to understand?"
    ),
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from enum import Enum
|
|
11
|
+
|
|
12
|
+
# Identifier string for LLM validation (presumably the metric's name; confirm against the metric class).
LLMValidation = "llm_validation"
|
|
13
|
+
# NOTE(review): presumably a cap applied to evaluation text during synthesis; the unit is not visible here, confirm at the use site.
max_eval_text_for_synthesis = 150
|
|
14
|
+
# NOTE(review): presumably the minimum count needed before recurring issues are reported; confirm at the use site.
min_recurrent_evaluation_issues = 5
|
|
15
|
+
|
|
16
|
+
class LLMValidationFields(Enum):
    """String field names used by LLM validation.

    NOTE(review): these are presumably record or column keys; confirm against
    the code that reads and writes them.
    """

    # Model input and output text.
    INPUT_FIELD = "model_prompt"
    OUTPUT_FIELD = "model_output"

    # Per-record evaluation text, score and summary.
    TEXT_FIELD = "evaluation_text"
    SCORE_FIELD = "evaluation_score"
    SUMMARY_FIELD = "evaluation_summary"

    # Recurring issues and their ids.
    RECURRING_ISSUE_FIELD = "recurring_issues"
    RECURRING_ISSUE_IDS_FIELD = "recurring_issues_ids"

    # Evaluation criteria.
    EVALUATION_CRITERIA_FIELD = "evaluation_criteria"
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics import LLMValidationMetric
|
|
20
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LLMValidationDecorator(BaseMetricDecorator):
    """Provides the decorator used to compute the LLM validation metric on an agentic node."""

    def evaluate_general_quality_with_llm(self,
                                          func: Optional[Callable] = None,
                                          *,
                                          configuration: Optional[AgenticAIConfiguration] = None,
                                          metrics: list[GenAIMetric],
                                          ) -> dict:
        """
        An evaluation decorator for computing the LLM validation metric on an agentic node.

        Args:
            func: The node function to wrap, or ``None`` when the decorator is
                invoked with keyword arguments only.
            configuration: Optional configuration forwarded to ``compute_helper``.
            metrics: The metrics to compute; validated against ``LLMValidationMetric``.

        Returns:
            The wrapped function, or a partial of this method when ``func`` is ``None``.

        Raises:
            Exception: Raised when the wrapped function is called and validation
                or metric computation fails; the original error is chained as the cause.
        """
        if func is None:
            # Keyword-only invocation: hand back a partial that waits for the function.
            return partial(self.evaluate_general_quality_with_llm,
                           configuration=configuration,
                           metrics=metrics)

        allowed_metric_types = (LLMValidationMetric,)

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(func=func,
                              metrics=metrics,
                              valid_metric_types=allowed_metric_types)

                # Input fields are requested as inputs; output and prompt fields as outputs.
                return self.compute_helper(func=func,
                                           args=args,
                                           kwargs=kwargs,
                                           configuration=configuration,
                                           metrics=metrics,
                                           metric_inputs=[EvaluatorFields.INPUT_FIELDS],
                                           metric_outputs=[EvaluatorFields.OUTPUT_FIELDS,
                                                           EvaluatorFields.PROMPT_FIELD])
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating evaluate_general_quality_with_llm metric on {func.__name__},") from ex

        return wrapper(func)
|