ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any, Dict, List, Optional, Union
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from ..types import (
|
|
7
|
+
ParameterComparisonResult,
|
|
8
|
+
ToolCallComparisonResult,
|
|
9
|
+
ComparisonConfig,
|
|
10
|
+
ParameterStatus,
|
|
11
|
+
ToolSpecFunction,
|
|
12
|
+
ToolSpecParameter,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BaseComparator(ABC):
|
|
19
|
+
"""Abstract base class for tool call comparison strategies."""
|
|
20
|
+
|
|
21
|
+
def __init__(self, config: ComparisonConfig):
|
|
22
|
+
self.config = config
|
|
23
|
+
|
|
24
|
+
@abstractmethod
def compare_parameter(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
    custom_instructions: Optional[str] = None,
) -> ParameterComparisonResult:
    """Compare one parameter of the predicted call with the ground truth.

    Abstract hook: concrete comparators supply the actual strategy. The
    base implementation does nothing and returns ``None``.

    Args:
        param_name: Name of the parameter being compared.
        predicted_value: Value found in the predicted tool call.
        ground_truth_value: Value found in the ground-truth tool call.
        context: Optional extra information for the comparison; its
            contents are defined by the concrete comparator.
        custom_instructions: Optional free-text guidance; how it is used
            is up to the concrete comparator.

    Returns:
        The comparison result for this parameter (from subclasses).
    """
    return None
|
|
35
|
+
|
|
36
|
+
@abstractmethod
def compare_function_name(
    self,
    predicted_name: str,
    ground_truth_name: str,
    context: Optional[Dict[str, Any]] = None,
) -> float:
    """Score how similar the predicted function name is to the expected one.

    Abstract hook: concrete comparators supply the actual strategy. The
    base implementation does nothing and returns ``None``.

    Args:
        predicted_name: Function name taken from the predicted tool call.
        ground_truth_name: Function name taken from the ground-truth call.
        context: Optional extra information for the comparison; its
            contents are defined by the concrete comparator.

    Returns:
        A similarity score (from subclasses). ``compare_tool_calls``
        treats a score of at least 0.95 as a name match.
    """
    return None
|
|
45
|
+
|
|
46
|
+
def _extract_tool_spec(
|
|
47
|
+
self, function_name: str, tool_specs: Optional[List[Dict[str, Any]]]
|
|
48
|
+
) -> Optional[ToolSpecFunction]:
|
|
49
|
+
"""Extract tool specification for the given function."""
|
|
50
|
+
if not tool_specs:
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
for spec in tool_specs:
|
|
54
|
+
if spec.get("function", {}).get("name") == function_name:
|
|
55
|
+
return ToolSpecFunction.from_openai_spec(spec)
|
|
56
|
+
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
def _resolve_parameters_with_defaults(
|
|
60
|
+
self, provided_params: Dict[str, Any], tool_spec: Optional[ToolSpecFunction]
|
|
61
|
+
) -> Dict[str, Any]:
|
|
62
|
+
"""Resolve parameters by applying defaults from tool specification."""
|
|
63
|
+
resolved = provided_params.copy()
|
|
64
|
+
|
|
65
|
+
if tool_spec:
|
|
66
|
+
for param_def in tool_spec.parameters:
|
|
67
|
+
if param_def.name not in resolved and param_def.default is not None:
|
|
68
|
+
resolved[param_def.name] = param_def.default
|
|
69
|
+
|
|
70
|
+
return resolved
|
|
71
|
+
|
|
72
|
+
def _determine_parameter_status(
    self,
    param_name: str,
    predicted_params: Dict[str, Any],
    ground_truth_params: Dict[str, Any],
    predicted_resolved: Dict[str, Any],
    ground_truth_resolved: Dict[str, Any],
) -> ParameterStatus:
    """Classify how ``param_name`` appears in the two tool calls.

    Args:
        param_name: Parameter to classify.
        predicted_params: Arguments explicitly given in the predicted call.
        ground_truth_params: Arguments explicitly given in the ground truth.
        predicted_resolved: Predicted arguments after resolution (in
            ``compare_tool_calls`` these come from
            ``_resolve_parameters_with_defaults``).
        ground_truth_resolved: Ground-truth arguments after resolution.

    Returns:
        ``BOTH_PRESENT`` when both calls give the parameter explicitly.
        When only the prediction gives it: ``GT_DEFAULT`` if the resolved
        ground truth contains it, else ``GT_MISSING``. When only the ground
        truth gives it: ``PRED_DEFAULT`` if the resolved prediction
        contains it, else ``PRED_MISSING``. When neither gives it:
        ``BOTH_DEFAULT`` if both resolved mappings contain it, else
        ``BOTH_MISSING``.
    """
    # All four membership tests are evaluated up front, as in the original.
    explicit_in_pred = param_name in predicted_params
    explicit_in_gt = param_name in ground_truth_params
    resolved_in_pred = param_name in predicted_resolved
    resolved_in_gt = param_name in ground_truth_resolved

    if explicit_in_pred:
        if explicit_in_gt:
            return ParameterStatus.BOTH_PRESENT
        if resolved_in_gt:
            return ParameterStatus.GT_DEFAULT
        return ParameterStatus.GT_MISSING

    if explicit_in_gt:
        if resolved_in_pred:
            return ParameterStatus.PRED_DEFAULT
        return ParameterStatus.PRED_MISSING

    # Neither call passed the parameter explicitly.
    if resolved_in_pred and resolved_in_gt:
        return ParameterStatus.BOTH_DEFAULT
    return ParameterStatus.BOTH_MISSING
|
|
105
|
+
|
|
106
|
+
def _normalize_value(self, value: Any, expected_type: Optional[str] = None) -> Any:
|
|
107
|
+
"""Normalize values for comparison (e.g., string numbers to numbers)."""
|
|
108
|
+
if not self.config.normalize_types:
|
|
109
|
+
return value
|
|
110
|
+
|
|
111
|
+
if expected_type == "integer" and isinstance(value, str):
|
|
112
|
+
try:
|
|
113
|
+
return int(value)
|
|
114
|
+
except ValueError:
|
|
115
|
+
pass
|
|
116
|
+
elif expected_type == "boolean" and isinstance(value, str):
|
|
117
|
+
return value.lower() in ("true", "1", "yes", "on")
|
|
118
|
+
elif expected_type == "number" and isinstance(value, str):
|
|
119
|
+
try:
|
|
120
|
+
return float(value)
|
|
121
|
+
except ValueError:
|
|
122
|
+
pass
|
|
123
|
+
elif expected_type == "array" and isinstance(value, str):
|
|
124
|
+
try:
|
|
125
|
+
return json.loads(value)
|
|
126
|
+
except (json.JSONDecodeError, ValueError):
|
|
127
|
+
pass
|
|
128
|
+
elif expected_type == "object" and isinstance(value, str):
|
|
129
|
+
try:
|
|
130
|
+
return json.loads(value)
|
|
131
|
+
except (json.JSONDecodeError, ValueError):
|
|
132
|
+
pass
|
|
133
|
+
|
|
134
|
+
return value
|
|
135
|
+
|
|
136
|
+
def _get_parameter_weight(self, param_name: str) -> float:
|
|
137
|
+
"""Get weight for a specific parameter."""
|
|
138
|
+
for weight_config in self.config.parameter_weights:
|
|
139
|
+
if weight_config.name == param_name:
|
|
140
|
+
return weight_config.weight
|
|
141
|
+
return 1.0
|
|
142
|
+
|
|
143
|
+
def _is_critical_parameter(self, param_name: str) -> bool:
|
|
144
|
+
"""Check if a parameter is marked as critical."""
|
|
145
|
+
for weight_config in self.config.parameter_weights:
|
|
146
|
+
if weight_config.name == param_name:
|
|
147
|
+
return weight_config.is_critical
|
|
148
|
+
return False
|
|
149
|
+
|
|
150
|
+
def _calculate_weighted_score(
|
|
151
|
+
self, param_results: List[ParameterComparisonResult]
|
|
152
|
+
) -> float:
|
|
153
|
+
"""Calculate weighted parameter score considering importance."""
|
|
154
|
+
if not param_results:
|
|
155
|
+
return 1.0
|
|
156
|
+
|
|
157
|
+
total_weight = 0
|
|
158
|
+
weighted_sum = 0
|
|
159
|
+
critical_failures = 0
|
|
160
|
+
|
|
161
|
+
for result in param_results:
|
|
162
|
+
weight = self._get_parameter_weight(result.parameter_name)
|
|
163
|
+
total_weight += weight
|
|
164
|
+
weighted_sum += result.score * weight
|
|
165
|
+
|
|
166
|
+
# Check for critical parameter failures
|
|
167
|
+
if (
|
|
168
|
+
self._is_critical_parameter(result.parameter_name)
|
|
169
|
+
and not result.is_match
|
|
170
|
+
):
|
|
171
|
+
critical_failures += 1
|
|
172
|
+
|
|
173
|
+
base_score = weighted_sum / total_weight if total_weight > 0 else 0
|
|
174
|
+
|
|
175
|
+
# Apply critical failure penalty
|
|
176
|
+
if critical_failures > 0:
|
|
177
|
+
penalty = self.config.critical_parameter_penalty * critical_failures
|
|
178
|
+
base_score *= 1 - penalty
|
|
179
|
+
|
|
180
|
+
return max(0.0, min(1.0, base_score))
|
|
181
|
+
|
|
182
|
+
def compare_tool_calls(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
) -> ToolCallComparisonResult:
    """Compare a predicted tool call against its ground-truth call.

    Steps performed:

    1. Read the function names from ``call["function"]["name"]`` and
       score them with ``compare_function_name``; a score of at least
       0.95 is treated as a name match.
    2. Look up the tool specification for the ground-truth name,
       falling back to the predicted name.
    3. Read ``call["function"]["arguments"]`` from both calls. JSON
       strings are decoded; ``None`` and payloads that are not
       mapping-like are replaced by an empty dict instead of failing.
    4. Resolve both argument sets against the specification defaults.
    5. Compare every selected parameter with ``compare_parameter`` and
       attach resolved values, status and definition to each result.
    6. Combine the name and parameter scores with the configured
       weights, then scale the result down for required parameters
       missing from both resolved sets (never below 0.0).

    Args:
        predicted_call: Tool call produced by the model.
        ground_truth_call: Reference tool call.
        conversation_history: Optional history, passed through to the
            per-parameter comparison context.
        tool_specs: Optional tool specifications used to find the
            specification of the called function.

    Returns:
        A ``ToolCallComparisonResult`` holding the scores, per-parameter
        results, explanation and metadata of the comparison.
    """

    def _load_arguments(raw: Any, label: str) -> Dict[str, Any]:
        """Normalize an ``arguments`` payload to a mapping."""
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except json.JSONDecodeError:
                logger.warning(f"Failed to parse {label} parameters: {raw}")
                return {}
        if raw is None:
            # Explicit null arguments mean "no arguments".
            return {}
        if not (hasattr(raw, "keys") and hasattr(raw, "get")):
            # Valid JSON that is not an object (list, number, ...) cannot
            # be compared parameter by parameter.
            logger.warning(
                f"Ignoring {label} parameters of unsupported type "
                f"{type(raw).__name__}"
            )
            return {}
        return raw

    # Tolerate a missing or null "function" entry.
    pred_function = predicted_call.get("function") or {}
    gt_function = ground_truth_call.get("function") or {}

    # Extract function names
    pred_name = pred_function.get("name", "")
    gt_name = gt_function.get("name", "")

    # Compare function names
    fn_score = self.compare_function_name(pred_name, gt_name)
    fn_match = fn_score >= 0.95  # High threshold for exact match

    # Extract tool specification (ground truth first, then prediction)
    tool_spec = self._extract_tool_spec(
        gt_name, tool_specs
    ) or self._extract_tool_spec(pred_name, tool_specs)

    # Extract and parse parameters
    pred_params = _load_arguments(pred_function.get("arguments", {}), "predicted")
    gt_params = _load_arguments(gt_function.get("arguments", {}), "ground truth")

    # Resolve parameters with defaults
    pred_resolved = self._resolve_parameters_with_defaults(pred_params, tool_spec)
    gt_resolved = self._resolve_parameters_with_defaults(gt_params, tool_spec)

    # Determine all parameters to compare
    params_to_compare = self.config.parameters_to_compare
    if params_to_compare is None:
        if self.config.include_default_parameters:
            # Include all parameters that appear in either call or have defaults
            params_to_compare = set(pred_resolved.keys()) | set(gt_resolved.keys())
        else:
            # Only explicit parameters
            params_to_compare = set(pred_params.keys()) | set(gt_params.keys())

    # Find missing required parameters and unexpected parameters
    missing_required = []
    unexpected_params = []

    if tool_spec:
        required_params = {p.name for p in tool_spec.parameters if p.required}
        all_defined_params = {p.name for p in tool_spec.parameters}

        # A required parameter counts as missing only when neither
        # resolved argument set contains it.
        for req_param in required_params:
            if req_param not in pred_resolved and req_param not in gt_resolved:
                missing_required.append(req_param)

        # Check for unexpected parameters
        for param_name in params_to_compare:
            if param_name not in all_defined_params:
                unexpected_params.append(param_name)

    # Compare each parameter
    param_results = []
    context = {
        "conversation_history": conversation_history,
        "tool_specs": tool_specs,
        "tool_spec": tool_spec,
        "predicted_call": predicted_call,
        "ground_truth_call": ground_truth_call,
        "function_name": gt_name or pred_name,
    }

    for param_name in params_to_compare:
        pred_resolved_val = pred_resolved.get(param_name)
        gt_resolved_val = gt_resolved.get(param_name)

        # Get parameter definition from tool spec
        param_def = None
        if tool_spec:
            param_def = next(
                (p for p in tool_spec.parameters if p.name == param_name), None
            )

        # Determine parameter status
        param_status = self._determine_parameter_status(
            param_name, pred_params, gt_params, pred_resolved, gt_resolved
        )

        # Enhanced context for this parameter
        param_context = context.copy()
        param_context.update(
            {
                "parameter_definition": param_def.dict() if param_def else None,
                "parameter_status": param_status,
                "predicted_resolved": pred_resolved_val,
                "ground_truth_resolved": gt_resolved_val,
            }
        )

        # NOTE(review): "custom_instructions" is never stored in
        # ``context`` within this method, so this always passes None.
        param_result = self.compare_parameter(
            param_name,
            pred_resolved_val,
            gt_resolved_val,
            param_context,
            custom_instructions=context.get("custom_instructions"),
        )

        # Enhance result with additional information
        param_result.predicted_resolved_value = pred_resolved_val
        param_result.ground_truth_resolved_value = gt_resolved_val
        param_result.parameter_status = param_status
        param_result.parameter_definition = param_def.dict() if param_def else None
        param_result.is_required = param_def.required if param_def else False
        param_result.default_value = param_def.default if param_def else None

        param_results.append(param_result)

    # Calculate overall score using weighted approach
    param_score = self._calculate_weighted_score(param_results)

    overall_score = (
        self.config.weight_function_name * fn_score
        + self.config.weight_parameters * param_score
    )

    # Apply penalties for missing required parameters
    if missing_required:
        penalty = len(missing_required) * self.config.missing_parameter_penalty
        overall_score *= 1 - penalty
        overall_score = max(0.0, overall_score)

    # Generate overall explanation
    overall_explanation = self._generate_overall_explanation(
        fn_match,
        fn_score,
        param_results,
        overall_score,
        missing_required,
        unexpected_params,
    )

    return ToolCallComparisonResult(
        predicted_call=predicted_call,
        ground_truth_call=ground_truth_call,
        function_name_match=fn_match,
        function_name_score=fn_score,
        parameter_results=param_results,
        overall_score=overall_score,
        overall_explanation=overall_explanation,
        strategy_used=self.config.strategy,
        missing_required_params=missing_required,
        unexpected_params=unexpected_params,
        metadata={
            "tool_spec_used": tool_spec.dict() if tool_spec else None,
            "parameters_compared": list(params_to_compare),
            "default_parameters_included": self.config.include_default_parameters,
        },
    )
|
|
353
|
+
|
|
354
|
+
def _generate_overall_explanation(
    self,
    fn_match: bool,
    fn_score: float,
    param_results: List[ParameterComparisonResult],
    overall_score: float,
    missing_required: List[str],
    unexpected_params: List[str],
) -> str:
    """Build a one-line, human-readable summary of a comparison.

    The summary is a space-joined sequence of sentences covering, in
    order: the function-name verdict, the parameter match count with a
    per-status breakdown and the mismatched names, any missing required
    or unexpected parameters, and finally the overall score.

    Args:
        fn_match: Whether the function names were judged to match.
        fn_score: Function-name similarity, shown when names differ.
        param_results: Per-parameter results; each needs ``is_match``,
            ``parameter_name`` and a ``parameter_status`` with a
            ``value`` attribute.
        overall_score: Final score, rendered with two decimals.
        missing_required: Names of missing required parameters.
        unexpected_params: Names of parameters not in the tool spec.

    Returns:
        The assembled explanation string.
    """
    # Function name analysis
    name_sentence = (
        "Function names match exactly."
        if fn_match
        else f"Function names differ (similarity: {fn_score:.2f})."
    )
    sentences = [name_sentence]

    # Parameter analysis
    if param_results:
        failed_names = [
            item.parameter_name for item in param_results if not item.is_match
        ]
        total = len(param_results)
        sentences.append(f"Parameters: {total - len(failed_names)}/{total} matches.")

        # Tally results per status, keeping first-seen order. The tally
        # cannot be empty here because param_results is non-empty.
        tally = {}
        for item in param_results:
            status = item.parameter_status
            tally[status] = tally.get(status, 0) + 1
        breakdown = ", ".join(
            f"{status.value}: {count}" for status, count in tally.items()
        )
        sentences.append(f"Parameter status breakdown: {breakdown}")

        if failed_names:
            sentences.append(f"Mismatched parameters: {', '.join(failed_names)}")

    # Issues
    if missing_required:
        sentences.append(
            f"Missing required parameters: {', '.join(missing_required)}"
        )
    if unexpected_params:
        sentences.append(f"Unexpected parameters: {', '.join(unexpected_params)}")

    sentences.append(f"Overall similarity score: {overall_score:.2f}")
    return " ".join(sentences)
|