ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADP Schedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
14
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
15
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
16
|
+
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
19
|
+
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
|
|
20
|
+
ToolCallMetricProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
|
|
23
|
+
from ibm_watsonx_gov.utils.validation_util import (validate_input,
|
|
24
|
+
validate_llm_as_judge,
|
|
25
|
+
validate_tool_calls)
|
|
26
|
+
from pydantic import Field
|
|
27
|
+
|
|
28
|
+
# Canonical identifier for this metric; also used as the pydantic field default.
TOOL_CALL_RELEVANCE = "tool_call_relevance"

# Module-level logger for validation warnings emitted during evaluation.
logger = GovSDKLogger.get_logger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ToolCallRelevanceMetric(GenAIMetric):
    """
    ToolCallRelevanceMetric assesses whether a function call correctly implements
    the user's immediate request as the appropriate next step in the conversation.
    It compares the call against all available functions in the tool inventory to
    determine if the selection aligns with user intent and context.

    The ToolCallRelevanceMetric is computed using llm_as_judge.

    Examples:
        1. Create ToolCallRelevanceMetric by passing the basic configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price])
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                metrics = [ToolCallRelevanceMetric(llm_judge=llm_judge)]
                result = evaluator.evaluate(data=df, metrics=metrics)

        2. Create ToolCallRelevanceMetric by passing custom tool calls field in configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price],
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                metrics = [ToolCallRelevanceMetric(llm_judge=llm_judge)]
                result = evaluator.evaluate(data=df, metrics=metrics)

        3. Create ToolCallRelevanceMetric with a custom threshold.
            .. code-block:: python

                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                # The metric produces a relevance score where higher is better,
                # so a violation is a value BELOW the limit. Note the field is
                # ``thresholds`` and takes a list.
                thresholds = [MetricThreshold(type="lower_limit", value=0.8)]
                metric = ToolCallRelevanceMetric(llm_judge=llm_judge, thresholds=thresholds)
    """

    # Fixed metric identity; ``Literal`` + ``frozen`` keep these from being overridden.
    name: Annotated[Literal["tool_call_relevance"],
                    Field(title="Metric Name",
                          description="The name of metric.",
                          default=TOOL_CALL_RELEVANCE)]
    display_name: Annotated[Literal["Tool Call Relevance"],
                            Field(title="Display Name",
                                  description="The tool call relevance metric display name.",
                                  default="Tool Call Relevance", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Task Type",
                           description="The generative task type.",
                           default=[TaskType.RAG])]
    group: Annotated[MetricGroup,
                     Field(default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]

    llm_judge: Annotated[LLMJudge | None,
                         Field(description="The LLM judge used to compute the metric.",
                               default=None)]

    method: Annotated[Literal["llm_as_judge"],
                      Field(title="Computation Method",
                            description="The method used to compute the metric.",
                            default="llm_as_judge")]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Metric threshold",
                                description="Value that defines the violation limit for the metric",
                                default=[MetricThreshold(type="lower_limit", value=0.8)])]
    # Name under which llmevalkit exposes this metric's computation.
    metric_mapping_name: Annotated[Literal["function_selection_appropriateness"],
                                   Field(title="Metric Mapping Name",
                                         description="The mapping name of metric with llmevalkit.",
                                         default="function_selection_appropriateness")]

    async def evaluate_async(self, data: pd.DataFrame,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult | None:
        """
        Evaluate the data for ToolCallRelevanceMetric.

        Args:
            data (pd.DataFrame): Data to be evaluated.
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration.
            **kwargs: Additional keyword arguments. Set ``ignore_validation_errors``
                truthy to skip this metric (with a warning) instead of raising when
                input validation fails.

        Returns:
            AggregateMetricResult | None: The computed metrics, or ``None`` when
            validation failed and ``ignore_validation_errors`` was set.

        Raises:
            ValueError: If validation fails and ``ignore_validation_errors`` is not set.
        """
        data_cols = data.columns.to_list()

        try:
            # Fail fast: the required tool-call/input columns must exist and an
            # LLM judge must be configured before any metric computation starts.
            validate_tool_calls(data_cols, configuration)
            validate_input(data_cols, configuration)
            validate_llm_as_judge(self.name, self.method,
                                  self.llm_judge, configuration.llm_judge)
        except ValueError as ve:
            if kwargs.get("ignore_validation_errors"):
                message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
                logger.warning(message)
                return None
            # Bare raise preserves the original traceback of the validation error.
            raise

        tool_call_provider = ToolCallMetricProvider(
            configuration=configuration, metric=self)
        # Only the function-selection (relevance) metric is requested; general and
        # per-parameter metrics are handled by their own metric classes.
        metric_config = {
            "general_metrics": None,
            "function_metrics": [self.metric_mapping_name],
            "parameter_metrics": None,
            "transform_enabled": False
        }
        metric_result = await tool_call_provider.compute_metrics(
            data, syntactic_only=False, metric_result_mapping_name="function_selection", **metric_config)

        return metric_result

    def evaluate(
            self,
            data: pd.DataFrame | dict,
            configuration: GenAIConfiguration | AgenticAIConfiguration,
            **kwargs,
    ):
        """
        Synchronous wrapper around :meth:`evaluate_async`.

        Blocks the caller until the asynchronous evaluation completes and
        returns its result.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
File without changes
|
ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.tool_call_syntactic_accuracy.tool_call_syntactic_accuracy_metric import \
|
|
20
|
+
ToolCallSyntacticAccuracyMetric
|
|
21
|
+
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
|
|
22
|
+
ToolCallMetricProvider
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ToolCallSyntacticAccuracyDecorator(BaseMetricDecorator):
    """Decorator factory for computing the tool call syntactic accuracy metric on an agentic node."""

    def evaluate_tool_call_syntactic_accuracy(self,
                                              func: Optional[Callable] = None,
                                              *,
                                              configuration: Optional[AgenticAIConfiguration] = None,
                                              metrics: Optional[list[GenAIMetric]] = None
                                              ) -> dict:
        """
        An evaluation decorator for computing tool_call_syntactic_accuracy metric on an agentic node.

        Args:
            func: The agentic node function to decorate. When ``None``, a
                partial is returned so the decorator can be applied with
                keyword arguments only.
            configuration: Optional metric configuration for the node.
            metrics: Metrics to compute; defaults to a single
                ``ToolCallSyntacticAccuracyMetric`` instance.

        Returns:
            The decorated function.

        Raises:
            Exception: If validation or metric computation fails; the
                original error is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_tool_call_syntactic_accuracy,
                           configuration=configuration, metrics=metrics)

        # Use a None sentinel instead of a mutable default argument ([] was
        # shared across all calls) and build a fresh metric list per call.
        if not metrics:
            metrics = [ToolCallSyntacticAccuracyMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(ToolCallSyntacticAccuracyMetric,))

                metric_outputs = [
                    EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]

                # When the tools are supplied as callables, convert them to the
                # schema representation expected by the metric provider.
                if isinstance(configuration.tools, list) and all(callable(item) for item in configuration.tools):
                    configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
                        configuration.tools)

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=[],
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating tool call syntactic metric on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
14
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
15
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
16
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
17
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
18
|
+
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
|
|
19
|
+
ToolCallMetricProvider
|
|
20
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
21
|
+
from ibm_watsonx_gov.utils.validation_util import validate_tool_calls
|
|
22
|
+
from pydantic import Field
|
|
23
|
+
|
|
24
|
+
TOOL_CALLING_SYNTACTIC_ACCURACY = "tool_call_syntactic_accuracy"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ToolCallSyntacticAccuracyMetric(GenAIMetric):
    """
    .. deprecated:: 1.2.0
        Use :class:`ibm_watsonx_gov.metrics.ToolCallAccuracyMetric` with syntactic method instead.

    Metric scoring the syntactic correctness of tool calls.

    Each tool call found in the data is validated against the schema of the
    tools available in the configuration; only syntactic checks are performed.

    Examples:
        1. Create the metric with a basic configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price])
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                result = evaluator.evaluate(
                    data=df, metrics=[ToolCallSyntacticAccuracyMetric()])

        2. Create the metric with a custom tool calls field in the configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price],
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                result = evaluator.evaluate(
                    data=df, metrics=[ToolCallSyntacticAccuracyMetric()])

        3. Create the metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.8)
                metric = ToolCallSyntacticAccuracyMetric(thresholds=[threshold])
    """

    # Fixed metric identifier (pinned by the Literal type).
    name: Annotated[
        Literal["tool_call_syntactic_accuracy"],
        Field(title="Metric Name",
              description="The name of metric.",
              default=TOOL_CALLING_SYNTACTIC_ACCURACY)]
    # Human readable metric name; immutable.
    display_name: Annotated[
        Literal["Tool Call Syntactic Accuracy"],
        Field(title="Display Name",
              description="The tool call syntactic accuracy metric display name.",
              default="Tool Call Syntactic Accuracy", frozen=True)]
    # Generative tasks this metric applies to.
    tasks: Annotated[
        list[TaskType],
        Field(title="Task Type",
              description="The generative task type.",
              default=[TaskType.RAG])]
    # Metric group; immutable.
    group: Annotated[
        MetricGroup,
        Field(title="Group",
              description="The metric group.",
              default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]
    # Computation method (only the syntactic check is supported).
    method: Annotated[
        Literal["syntactic_check"],
        Field(title="Computation Method",
              description="The method used to compute the metric.",
              default="syntactic_check")]
    # Violation limits for the metric value.
    thresholds: Annotated[
        list[MetricThreshold],
        Field(title="Metric threshold",
              description="Value that defines the violation limit for the metric",
              default=[MetricThreshold(type="lower_limit", value=0.7)])]

    async def evaluate_async(self, data: pd.DataFrame | dict,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult:
        """
        Evaluate the data for ToolCallSyntacticAccuracyMetric.

        Args:
            data (pd.DataFrame | dict): Data to be evaluated.
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration.

        Returns:
            AggregateMetricResult: The computed metrics.
        """
        # Validate the tool calls field in the data and the tools in the
        # configuration before computing anything.
        validate_tool_calls(data.columns.to_list(), configuration)

        # Delegate the actual computation to the tool call metric provider.
        provider = ToolCallMetricProvider(configuration=configuration, metric=self)
        return await provider.compute_metrics(data)

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ):
        """Synchronous wrapper: block until :meth:`evaluate_async` completes."""
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.topic_relevance.topic_relevance_metric import \
|
|
21
|
+
TopicRelevanceMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TopicRelevanceDecorator(BaseMetricDecorator):
    """Decorator factory for computing the topic relevance metric on an agentic node."""

    def evaluate_topic_relevance(self,
                                 func: Optional[Callable] = None,
                                 *,
                                 configuration: Optional[AgenticAIConfiguration] = None,
                                 metrics: list[GenAIMetric],
                                 ) -> dict:
        """
        An evaluation decorator for computing topic relevance metric on an agentic node.

        Args:
            func: The agentic node function to decorate. When ``None``, a
                partial is returned so the decorator can be applied with
                keyword arguments only.
            configuration: Optional metric configuration for the node.
            metrics: Metrics to compute. Required — TopicRelevanceMetric needs
                a system_prompt, so no default instance can be constructed here.

        Returns:
            The decorated function.

        Raises:
            Exception: If validation or metric computation fails; the
                original error is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_topic_relevance, configuration=configuration, metrics=metrics)

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # Pass valid_metric_types as a tuple; the original passed a
                # bare class because of a missing trailing comma, which was
                # inconsistent with the sibling decorators.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(TopicRelevanceMetric,))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating topic relevance metric on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
TOPIC_RELEVANCE = "topic_relevance"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TopicRelevanceMetric(GenAIMetric):
    """
    Defines the TopicRelevance metric class.

    The TopicRelevance metric evaluates how closely the input content aligns
    with the topic specified by the system_prompt.

    Note: system_prompt is mandatory.

    Examples:
        1. Create TopicRelevance metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = TopicRelevanceMetric(system_prompt="...")
                result = MetricsEvaluator().evaluate(
                    data={"input_text": "..."}, metrics=[metric])

        2. Create TopicRelevance metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = TopicRelevanceMetric(thresholds=[threshold],
                                              system_prompt="...")
    """
    # Fixed metric identifier; immutable.
    name: Annotated[Literal["topic_relevance"],
                    Field(title="Name",
                          description="The topic relevance metric name.",
                          default=TOPIC_RELEVANCE, frozen=True)]
    # Human readable metric name; immutable.
    display_name: Annotated[Literal["Topic Relevance"],
                            Field(title="Display Name",
                                  description="The topic relevance metric display name.",
                                  default="Topic Relevance", frozen=True)]
    # Violation limits for the metric value.
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    # All task types are supported.
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    # TODO uncomment when the metric is pushed to prod
    # group: Annotated[MetricGroup, Field(title="Group",
    #                                     description="The metric group.",
    #                                     default=MetricGroup.CONTENT_SAFETY, frozen=True)]
    # Mandatory: the topic to score against is defined by this prompt.
    system_prompt: Annotated[str, Field(title="System Prompt",
                                        description="The AI model system prompt which contains instructions to define its overall behavior.")]

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Evaluate the data for TopicRelevanceMetric.

        Args:
            data (pd.DataFrame | dict): Data to be evaluated.
            configuration (GenAIConfiguration): Metrics configuration.
            **kwargs: Additional keyword arguments forwarded to the detectors provider.

        Returns:
            list[AggregateMetricResult]: The computed metrics.

        Raises:
            AssertionError: If ``system_prompt`` is empty. (Kept as
                AssertionError for backward compatibility with existing
                callers; ValueError would be the conventional choice.)
        """
        if not self.system_prompt:
            raise AssertionError(
                "The system_prompt field is required but was missing from the input.")

        # NOTE(review): assumes data is a DataFrame here (dict input would
        # fail on .columns) — confirm against the evaluator's call path.
        validate_input(data.columns.to_list(), configuration)
        # Set system_prompt as part of the detector parameters.
        kwargs["detector_params"] = {"system_prompt": self.system_prompt}
        # self.method is expected to be supplied by the GenAIMetric base class.
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=MetricGroup.CONTENT_SAFETY,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = provider.evaluate(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """Synchronous wrapper: block until :meth:`evaluate_async` completes."""
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.unethical_behavior.unethical_behavior_metric import \
|
|
21
|
+
UnethicalBehaviorMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class UnethicalBehaviorDecorator(BaseMetricDecorator):
    """Decorator factory for computing the unethical behavior metric on an agentic node."""

    def evaluate_unethical_behavior(self,
                                    func: Optional[Callable] = None,
                                    *,
                                    configuration: Optional[AgenticAIConfiguration] = None,
                                    metrics: Optional[list[GenAIMetric]] = None
                                    ) -> dict:
        """
        An evaluation decorator for computing unethical behavior on an agentic node via granite guardian.

        Args:
            func: The agentic node function to decorate. When ``None``, a
                partial is returned so the decorator can be applied with
                keyword arguments only.
            configuration: Optional metric configuration for the node.
            metrics: Metrics to compute; defaults to a single
                ``UnethicalBehaviorMetric`` instance.

        Returns:
            The decorated function.

        Raises:
            Exception: If validation or metric computation fails; the
                original error is chained as the cause.
        """
        if func is None:
            return partial(self.evaluate_unethical_behavior, configuration=configuration, metrics=metrics)

        # Use a None sentinel instead of a mutable default argument ([] was
        # shared across all calls) and build a fresh metric list per call.
        if not metrics:
            metrics = [UnethicalBehaviorMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # Tuple form fixes the original's missing comma, which passed a
                # bare class instead of a tuple of valid metric types.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(UnethicalBehaviorMetric,))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating unethical behavior on {func.__name__},") from ex

        return wrapper(func)