ibm_watsonx_gov-1.3.3-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py
ADDED
@@ -0,0 +1,123 @@

# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------

from typing import Annotated, Literal

import pandas as pd
import textstat
from pydantic import Field

from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
                                                        RecordMetricResult)
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
from ibm_watsonx_gov.utils.python_utils import replace_none_with_empty_string
from ibm_watsonx_gov.utils.validation_util import validate_output

TEXT_READING_EASE = "text_reading_ease"
TEXT_READING_EASE_DISPLAY_NAME = "Text Reading Ease"
FLESCH_READING_EASE = "flesch_reading_ease"
TEXTSTAT = "textstat"


class TextReadingEaseResult(RecordMetricResult):
    name: str = TEXT_READING_EASE
    display_name: str = TEXT_READING_EASE_DISPLAY_NAME
    provider: str = TEXTSTAT
    method: str = FLESCH_READING_EASE


class TextReadingEaseMetric(GenAIMetric):
    """
    Defines the Text Reading Ease metric class.

    The Text Reading Ease metric measures how readable the text is.
    It is computed using the flesch_reading_ease method.
    The score ranges broadly from 0 to 100, where a higher score indicates that a text is easier to read.

    Examples:
        1. Create Text Reading Ease metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = TextReadingEaseMetric()
                result = MetricsEvaluator().evaluate(data={"generated_text": "..."},
                                                     metrics=[metric])

        2. Create Text Reading Ease metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=70)
                metric = TextReadingEaseMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["text_reading_ease"],
                    Field(title="name",
                          description="The text reading ease metric name.",
                          default=TEXT_READING_EASE, frozen=True)]
    display_name: Annotated[Literal["Text Reading Ease"],
                            Field(title="Display Name",
                                  description="The text reading ease metric display name.",
                                  default=TEXT_READING_EASE_DISPLAY_NAME, frozen=True)]
    method: Annotated[Literal["flesch_reading_ease"],
                      Field(title="Method",
                            description="The method used to compute text reading ease metric.",
                            default=FLESCH_READING_EASE)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=70)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.READABILITY, frozen=True)]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> list[AggregateMetricResult]:
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        validate_output(data.columns.to_list(), configuration)
        predictions = data[configuration.output_fields[0]].to_list()
        record_ids = data[configuration.record_id_field].to_list()
        replace_none_with_empty_string(predictions)

        all_scores = self._compute(predictions=predictions)
        record_level_metrics = [
            TextReadingEaseResult(record_id=record_id,
                                  value=score, thresholds=self.thresholds, group=MetricGroup.READABILITY.value)
            for score, record_id in zip(all_scores, record_ids)
        ]
        summary = get_summaries(all_scores)
        aggregate_metric_scores = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            provider=TEXTSTAT,
            method=self.method,
            group=self.group,
            min=summary.get("min"),
            max=summary.get("max"),
            mean=summary.get("mean"),
            value=summary.get("mean"),
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            thresholds=self.thresholds,
        )

        return aggregate_metric_scores

    def _compute(self, predictions: list) -> list:
        return [textstat.flesch_reading_ease(pred) for pred in predictions]
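
For context, here is a minimal standalone sketch of the computation this metric wraps (the textstat.flesch_reading_ease call from _compute()); the DataFrame columns are illustrative, not the package's required schema:

import pandas as pd
import textstat

df = pd.DataFrame({
    "record_id": ["r1", "r2"],
    "generated_text": [
        "The cat sat on the mat.",
        "Notwithstanding the aforementioned stipulations, compliance therewith remains obligatory.",
    ],
})

# Higher Flesch score = easier to read; short, common words score near 100.
scores = [textstat.flesch_reading_ease(text) for text in df["generated_text"]]
print(dict(zip(df["record_id"], scores)))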
ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py
File without changes
ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py
ADDED
@@ -0,0 +1,67 @@

# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------
from functools import partial
from typing import Callable, Optional

from wrapt import decorator

from ibm_watsonx_gov.config.agentic_ai_configuration import \
    AgenticAIConfiguration
from ibm_watsonx_gov.entities.enums import EvaluatorFields
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
from ibm_watsonx_gov.metrics.tool_call_accuracy.tool_call_accuracy_metric import \
    ToolCallAccuracyMetric
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
    ToolCallMetricProvider


class ToolCallAccuracyDecorator(BaseMetricDecorator):
    def evaluate_tool_call_accuracy(self,
                                    func: Optional[Callable] = None,
                                    *,
                                    configuration: Optional[AgenticAIConfiguration] = None,
                                    metrics: list[GenAIMetric] = []
                                    ) -> dict:
        """
        An evaluation decorator for computing the tool call accuracy metric on an agentic node.
        """
        if func is None:
            return partial(self.evaluate_tool_call_accuracy, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = [ToolCallAccuracyMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):
            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(ToolCallAccuracyMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [
                    EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]

                if isinstance(configuration.tools, list) and all(callable(item) for item in configuration.tools):
                    configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
                        configuration.tools)
                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating the tool call accuracy metric on {func.__name__}.") from ex

        return wrapper(func)
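
The `if func is None: return partial(...)` guard above implements the optional-argument decorator pattern, which lets the same decorator be applied bare or with keyword arguments. A standalone sketch of the same idiom, independent of this package:

from functools import partial

def log_calls(func=None, *, prefix="call"):
    # Called with keyword arguments only: func is None, so return a partial
    # that waits for the function to decorate.
    if func is None:
        return partial(log_calls, prefix=prefix)

    def wrapper(*args, **kwargs):
        print(f"{prefix}: {func.__name__}")
        return func(*args, **kwargs)
    return wrapper

@log_calls                  # bare usage: func is passed directly
def greet():
    return "hello"

@log_calls(prefix="node")   # parameterized usage: goes through the partial
def answer():
    return 42

greet(); answer()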
ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py
ADDED
@@ -0,0 +1,162 @@

# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------

from typing import Annotated, Literal

import pandas as pd
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
    ToolCallMetricProvider
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
from ibm_watsonx_gov.utils.gov_sdk_logger import GovSDKLogger
from ibm_watsonx_gov.utils.validation_util import validate_tool_calls, validate_input
from pydantic import Field

logger = GovSDKLogger.get_logger(__name__)
TOOL_CALL_ACCURACY = "tool_call_accuracy"
FUNCTION_CALL = "function_call"


class ToolCallAccuracyMetric(GenAIMetric):
    """
    ToolCallAccuracyMetric checks whether the tool call in the LLM response is
    syntactically correct and semantically meaningful, given the user's query and
    the available tool definitions.

    The ToolCallAccuracyMetric can be computed using the following methods:

    1. syntactic (default)
    2. granite_guardian

    Examples:
        1. Create ToolCallAccuracyMetric by passing the basic configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price])
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")

                metrics = [ToolCallAccuracyMetric()]
                result = evaluator.evaluate(data=df, metrics=metrics)

        2. Create ToolCallAccuracyMetric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.8)
                metric = ToolCallAccuracyMetric(thresholds=[threshold])

        3. Create ToolCallAccuracyMetric by passing a custom tool calls field in the configuration.
            .. code-block:: python

                test_data = {"input_text": "What's the latest on Tesla today?",
                             "tools_used": [{"name": "get_weather", "args": {"location": "Tesla"}, "id": "0724", "type": "tool_call"}]}

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price],
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                metrics = [ToolCallAccuracyMetric()]
                result = evaluator.evaluate(data=test_data, metrics=metrics)

        4. Create ToolCallAccuracyMetric by passing a list of dictionary items as the tools field in the configuration.
            .. code-block:: python

                available_tools = [{"type": "function",
                                    "function": {"name": "f1_name",
                                                 "description": "f1_description.",
                                                 "parameters": {"parameter1": {"description": "parameter_description",
                                                                               "type": "parameter_type",
                                                                               "default": "default_value"}}}}]
                config = GenAIConfiguration(tools=available_tools,
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")

                metrics = [ToolCallAccuracyMetric()]
                result = evaluator.evaluate(data=df, metrics=metrics)
    """

    name: Annotated[Literal["tool_call_accuracy"], Field(title="Metric Name",
                                                         description="The tool call accuracy metric name.",
                                                         default=TOOL_CALL_ACCURACY)]
    display_name: Annotated[Literal["Tool Call Accuracy"], Field(title="Display Name",
                                                                 description="The tool call accuracy metric display name.",
                                                                 default="Tool Call Accuracy", frozen=True)]
    tasks: Annotated[list[TaskType], Field(title="Task Type",
                                           description="The generative task type.",
                                           default=[TaskType.RAG])]
    group: Annotated[MetricGroup, Field(
        default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]

    method: Annotated[Literal["syntactic", "granite_guardian"], Field(title="Computation Method",
                                                                      description="The method used to compute the metric.",
                                                                      default="syntactic")]
    thresholds: Annotated[list[MetricThreshold], Field(title="Metric threshold",
                                                       description="Value that defines the violation limit for the metric.",
                                                       default=[MetricThreshold(
                                                           type="lower_limit", value=0.7)]
                                                       )]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:

        data_cols = data.columns.to_list()

        try:
            validate_tool_calls(data_cols, configuration)
            validate_input(data_cols, configuration)
        except ValueError as ve:
            if kwargs.get("ignore_validation_errors"):
                message = f"Skipping '{self.name}' computation because the validation failed. Details: {str(ve)}"
                logger.warning(message)
                return
            raise ve

        if self.method == "granite_guardian":
            kwargs["detector_params"] = {
                "risk_name": FUNCTION_CALL, "threshold": 0.001}
            tool_call_provider = DetectorsProvider(configuration=configuration,
                                                   metric_name=self.name,
                                                   metric_display_name=self.display_name,
                                                   metric_method=self.method,
                                                   metric_group=self.group,
                                                   thresholds=self.thresholds,
                                                   **kwargs)
            metric_result = await tool_call_provider.evaluate_async(data=data)
        elif self.method == "syntactic":
            tool_call_provider = ToolCallMetricProvider(
                configuration=configuration, metric=self)

            # Compute the metrics
            metric_result = await tool_call_provider.compute_metrics(data)
        return metric_result

    def evaluate(self, data: pd.DataFrame | dict,
                 configuration: GenAIConfiguration | AgenticAIConfiguration,
                 **kwargs):
        """
        Evaluate the data for ToolCallAccuracyMetric.

        Args:
            data (pd.DataFrame | dict): Data to be evaluated
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration
            **kwargs: Additional keyword arguments

        Returns:
            AggregateMetricResult: The computed metrics
        """
        # If run in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
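
Beyond the docstring examples, the method field and the kwargs handling in evaluate_async suggest usage like the following sketch; get_weather, df, and config are assumptions carried over from the docstring, not definitions made here:

# Select the granite_guardian detector method instead of the default syntactic
# check; ignore_validation_errors makes evaluate() log and skip instead of
# raising when required columns are missing (see evaluate_async above).
metric = ToolCallAccuracyMetric(method="granite_guardian")
config = GenAIConfiguration(tools=[get_weather], tool_calls_field="tools_used")
result = metric.evaluate(data=df, configuration=config,
                         ignore_validation_errors=True)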
ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py
File without changes
ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py
ADDED
@@ -0,0 +1,68 @@

# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------
from functools import partial
from typing import Callable, Optional

from ibm_watsonx_gov.config.agentic_ai_configuration import \
    AgenticAIConfiguration
from ibm_watsonx_gov.entities.enums import EvaluatorFields
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
from ibm_watsonx_gov.metrics.tool_call_parameter_accuracy.tool_call_parameter_accuracy_metric import \
    ToolCallParameterAccuracyMetric
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
    ToolCallMetricProvider
from wrapt import decorator


class ToolCallParameterAccuracyDecorator(BaseMetricDecorator):
    def evaluate_tool_call_parameter_accuracy(self,
                                              func: Optional[Callable] = None,
                                              *,
                                              configuration: Optional[AgenticAIConfiguration] = None,
                                              metrics: list[GenAIMetric] = []
                                              ) -> dict:
        """
        An evaluation decorator for computing the tool call parameter accuracy metric on an agentic node.
        """
        if func is None:
            return partial(self.evaluate_tool_call_parameter_accuracy, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = [ToolCallParameterAccuracyMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(ToolCallParameterAccuracyMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [
                    EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]

                if isinstance(configuration.tools, list) and all(callable(item) for item in configuration.tools):
                    configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
                        configuration.tools)

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating the tool call parameter accuracy metric on {func.__name__}.") from ex

        return wrapper(func)
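
The get_tools_list_schema call above converts plain Python callables into tool schemas before evaluation. A rough standalone sketch of that kind of conversion; build_schema is hypothetical, and the output shape is assumed from the dictionary example in the ToolCallAccuracyMetric docstring:

import inspect

def build_schema(fn):
    # Derive a minimal OpenAI-style tool schema from a callable's signature.
    params = {name: {"description": "", "type": "string"}
              for name in inspect.signature(fn).parameters}
    return {"type": "function",
            "function": {"name": fn.__name__,
                         "description": (fn.__doc__ or "").strip(),
                         "parameters": params}}

def get_weather(location):
    """Get the current weather for a location."""

print(build_schema(get_weather))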
ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py
ADDED
@@ -0,0 +1,151 @@

# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------

from typing import Annotated, Literal

import pandas as pd
from pydantic import Field

from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
    ToolCallMetricProvider
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
from ibm_watsonx_gov.utils.validation_util import validate_tool_calls

TOOL_CALL_PARAMETER_ACCURACY = "tool_call_parameter_accuracy"


class ToolCallParameterAccuracyMetric(GenAIMetric):
    """
    ToolCallParameterAccuracyMetric assesses whether ALL parameter values
    in a function call are directly supported by the conversation history
    or API specifications. It identifies hallucinated values, missing
    information, format errors, and contradictory values.

    The ToolCallParameterAccuracyMetric is computed using llm_as_judge.

    Examples:
        1. Create ToolCallParameterAccuracyMetric by passing the basic configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price])
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                metrics = [ToolCallParameterAccuracyMetric(llm_judge=llm_judge)]
                result = evaluator.evaluate(data=df, metrics=metrics)

        2. Create ToolCallParameterAccuracyMetric by passing a custom tool calls field in the configuration.
            .. code-block:: python

                config = GenAIConfiguration(tools=[get_weather, fetch_stock_price],
                                            tool_calls_field="tools_used")
                evaluator = MetricsEvaluator(configuration=config)
                df = pd.read_csv("")
                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                metrics = [ToolCallParameterAccuracyMetric(llm_judge=llm_judge)]
                result = evaluator.evaluate(data=df, metrics=metrics)

        3. Create ToolCallParameterAccuracyMetric with a custom threshold.
            .. code-block:: python

                llm_judge = LLMJudge(
                    model=WxAIFoundationModel(
                        model_id="meta-llama/llama-3-3-70b-instruct",
                        project_id=os.getenv("WATSONX_PROJECT_ID"),
                    )
                )
                threshold = MetricThreshold(type="upper_limit", value=0.8)
                metric = ToolCallParameterAccuracyMetric(llm_judge=llm_judge, thresholds=[threshold])
    """

    name: Annotated[Literal["tool_call_parameter_accuracy"], Field(title="Metric Name",
                                                                   description="The name of the metric.",
                                                                   default=TOOL_CALL_PARAMETER_ACCURACY)]
    display_name: Annotated[Literal["Tool Call Parameter Accuracy"], Field(title="Display Name",
                                                                           description="The tool call parameter accuracy metric display name.",
                                                                           default="Tool Call Parameter Accuracy", frozen=True)]
    tasks: Annotated[list[TaskType], Field(title="Task Type",
                                           description="The generative task type.",
                                           default=[TaskType.RAG])]
    group: Annotated[MetricGroup, Field(
        default=MetricGroup.TOOL_CALL_QUALITY, frozen=True)]

    llm_judge: Annotated[LLMJudge | None, Field(
        description="The LLM judge used to compute the metric.", default=None)]

    method: Annotated[Literal["llm_as_judge"], Field(title="Computation Method",
                                                     description="The method used to compute the metric.",
                                                     default="llm_as_judge")]
    thresholds: Annotated[list[MetricThreshold], Field(title="Metric threshold",
                                                       description="Value that defines the violation limit for the metric.",
                                                       default=[MetricThreshold(
                                                           type="lower_limit", value=0.8)]
                                                       )]
    metric_mapping_name: Annotated[Literal["general_hallucination_check"], Field(title="Metric Mapping Name",
                                                                                 description="The mapping name of the metric in llmevalkit.",
                                                                                 default="general_hallucination_check")]

    async def evaluate_async(self, data: pd.DataFrame | dict,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult:
        """
        Evaluate the data for ToolCallParameterAccuracyMetric.

        Args:
            data (pd.DataFrame | dict): Data to be evaluated
            configuration (GenAIConfiguration | AgenticAIConfiguration): Metrics configuration
            **kwargs: Additional keyword arguments

        Returns:
            AggregateMetricResult: The computed metrics
        """
        data_cols = data.columns.to_list()
        validate_tool_calls(data_cols, configuration)

        tool_call_provider = ToolCallMetricProvider(
            configuration=configuration, metric=self)
        metric_config = {
            "general_metrics": [self.metric_mapping_name],
            "function_metrics": None,
            "parameter_metrics": None,
            "transform_enabled": False
        }
        metric_result = await tool_call_provider.compute_metrics(
            data, syntactic_only=False, metric_result_mapping_name="general", **metric_config)

        return metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ):
        # If run in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
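
evaluate() above delegates to run_in_event_loop, a sync-over-async bridge. A standalone sketch of the same pattern using only the standard library (asyncio.run is a simplification; the real helper presumably also handles the case of an already-running loop):

import asyncio

async def evaluate_async(data):
    await asyncio.sleep(0)           # stand-in for the real async computation
    return {"total_records": len(data)}

def evaluate(data):
    # Block until the coroutine finishes, mirroring the run_in_event_loop call.
    return asyncio.run(evaluate_async(data))

print(evaluate([1, 2, 3]))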
ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py
File without changes
ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py
ADDED
@@ -0,0 +1,71 @@

# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------
from functools import partial
from typing import Callable, Optional

from ibm_watsonx_gov.config.agentic_ai_configuration import \
    AgenticAIConfiguration
from ibm_watsonx_gov.entities.enums import EvaluatorFields
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
from ibm_watsonx_gov.metrics.tool_call_relevance.tool_call_relevance_metric import \
    ToolCallRelevanceMetric
from ibm_watsonx_gov.providers.tool_call_metric_provider import \
    ToolCallMetricProvider
from ibm_watsonx_gov.utils.python_utils import parse_functions_to_openai_schema
from wrapt import decorator


class ToolCallRelevanceDecorator(BaseMetricDecorator):
    def evaluate_tool_call_relevance(self,
                                     func: Optional[Callable] = None,
                                     *,
                                     configuration: Optional[AgenticAIConfiguration] = None,
                                     metrics: list[GenAIMetric] = [
                                         ToolCallRelevanceMetric()
                                     ]
                                     ) -> dict:
        """
        An evaluation decorator for computing the tool call relevance metric on an agentic node.
        """
        if func is None:
            return partial(self.evaluate_tool_call_relevance, configuration=configuration, metrics=metrics)

        if not metrics:
            metrics = [ToolCallRelevanceMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(ToolCallRelevanceMetric,))

                metric_inputs = [
                    EvaluatorFields.INPUT_FIELDS
                ]
                metric_outputs = [
                    EvaluatorFields.TOOL_CALLS_FIELD, EvaluatorFields.OUTPUT_FIELDS]

                if isinstance(configuration.tools, list) and all(callable(item) for item in configuration.tools):
                    configuration.tools = ToolCallMetricProvider.get_tools_list_schema(
                        configuration.tools)

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating the tool call relevance metric on {func.__name__}.") from ex

        return wrapper(func)
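
All of the decorators in this diff build their wrapper with wrapt's @decorator, whose wrapper function receives (func, instance, args, kwargs) and preserves the wrapped function's signature and metadata. A standalone sketch of that mechanism:

from wrapt import decorator

@decorator
def traced(func, instance, args, kwargs):
    # wrapt passes the positional args as a tuple and keyword args as a dict,
    # so the wrapped function is invoked by unpacking both.
    print(f"entering {func.__name__}")
    result = func(*args, **kwargs)
    print(f"leaving {func.__name__}")
    return result

@traced
def node(state):
    return {"output": state["input"].upper()}

print(node({"input": "hi"}))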