ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json
ADDED
|
@@ -0,0 +1,613 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"name": "parameter_hallucination_check",
|
|
4
|
+
"task_description": "You are an expert evaluator assessing whether a **specific parameter value**-identified as `parameter_name`-in a tool call is **grounded** in the provided conversation history or tool specification, or whether it is **hallucinated**.\n\nYour task is to make a **strictly evidence-based** judgment. Evaluate this parameter alone-ignore other parameters or external knowledge. The value must:\n- Be explicitly supported by the dialogue, prior tool calls, or tool specification\n- Be appropriate for the context and user intent\n- Respect constraint rules defined in the specification\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value is grounded in the conversation or specification.\n---\n\n### Rating Scale\n\nAssign a score from 1 to 5 based on how clearly the value is grounded:\n\n**5 - Perfectly Grounded** \nExplicitly stated in conversation or matches a clearly documented and contextually appropriate default.\n\n**4 - Mostly Grounded** \nNot directly quoted but clearly follows from the conversation or is a minor variation on a documented default.\n\n**3 - Ambiguous** \nRequires non-trivial inference or transformation to relate the value to the dialogue or specification.\n\n**2 - Mostly Ungrounded** \nLoosely related, incorrectly formatted, or partially contradicts the spec or context.\n\n**1 - Completely Ungrounded** \nInvented, unrelated, or clearly in conflict with the dialogue or specification.\n\nBe conservative in your scoring: Use 1-3 for weak or unverified grounding, and 4-5 only when evidence is explicit or clearly reliable.\n\n---\n\n### Acceptable Sources for Grounding\nA value may be considered grounded if it comes from:\n- Explicit user input\n- Clearly implied user intent (minimal inference)\n- Assistant statements explicitly confirmed by the user\n- Outputs of previous tool calls\n- Documented default values in the tool specification\n\n---\n\n### Ungrounded Patterns\nMark a value ungrounded if:\n- It is absent from both conversation and tool spec\n- It depends on vague or missing input (e.g., \"tomorrow\" without a date format)\n- It uses the wrong type, unit, or format\n- It contradicts other parameters or tool constraints\n- It inaccurately rephrases or transforms the user's input\n\n---\n\n### Handling Defaults\nA default value may be accepted only if:\n- It is explicitly documented in the tool spec\n- It fits the context of the user's request\n- It does not contradict stated or implied intent\nOtherwise, treat the value as ungrounded.\n\n---\n\n### Parameter Relationships\nEnsure this parameter does not contradict other parameters in the same call.\nExample: If `add_day=false` and `day=\"Sunday\"` but the spec says `add_day` must be true to use `day`, the `day` value is ungrounded.\n\n---\n\n### Final Guideline\nLarge language models frequently hallucinate parameter values or apply defaults inappropriately. Your careful and conservative evaluation prevents propagation of such errors.",
|
|
5
|
+
"jsonschema": {
|
|
6
|
+
"title": "parameter_hallucination_check",
|
|
7
|
+
"description": "Assessment of tool call parameter hallucination, following the rubric defined above.",
|
|
8
|
+
"type": "object",
|
|
9
|
+
"additionalProperties": false,
|
|
10
|
+
"properties": {
|
|
11
|
+
"evidence": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"description": "Provide EXACT quotes from the conversation history (user messages, assistant responses, previous tool outputs) or tool specification that directly support your assessment. For grounded values, cite the specific text where this value originates. For ungrounded values, demonstrate the absence of supporting evidence or cite contradictory information."
|
|
14
|
+
},
|
|
15
|
+
"explanation": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"description": "Explain in detail why the specific parameter value is grounded or ungrounded. Reference exact sources from the conversation or tool specification that either support or contradict the parameter value. Address: 1) Where the value originated from, 2) Whether this source is sufficient for grounding, 3) Any format or type compliance issues, 4) Any contradictions with other parameters or specifications, and 5) For default values, whether they're documented and appropriate for this context."
|
|
18
|
+
},
|
|
19
|
+
"output": {
|
|
20
|
+
"type": "integer",
|
|
21
|
+
"minimum": 1,
|
|
22
|
+
"maximum": 5,
|
|
23
|
+
"threshold_low": 4,
|
|
24
|
+
"threshold_high": 5,
|
|
25
|
+
"description": "Parameter Hallucination Score (1-5):\n\n- 5: Perfectly grounded\n The parameter value is directly quoted in the conversation or exactly matches a documented default that fits the user's intent.\n Example:\n User: \"Book a hotel in Rome.\"\n → Tool call: search_hotels(city=\"Rome\")\n\n- 4: Grounded via logical reasoning\n The value is not directly quoted but clearly follows from the conversation through straightforward, logic-based inference.\n Example:\n User: \"I'd like a room for me and my wife.\"\n → Tool call: search_hotels(guests=2)\n\n- 3: Ambiguous\n The value might relate to the conversation or domain context, but grounding requires assumptions or nontrivial interpretation not clearly supported by evidence.\n Example:\n User: \"Let's go somewhere warm.\"\n → Tool call: search_hotels(city=\"Barcelona\")\n\n- 2: Likely hallucinated\n The value is loosely connected to the topic but lacks grounding in the conversation or tool specification, or only partially aligns.\n Example:\n User: \"Find hotels.\"\n → Tool call: search_hotels(city=\"Berlin\")\n (No city mentioned)\n\n- 1: Clearly hallucinated\n The parameter value is invented, irrelevant, or contradicts the dialogue or tool specification.\n Example:\n User: \"Book a hotel.\"\n → Tool call: search_hotels(city=\"Mars\")"
|
|
26
|
+
},
|
|
27
|
+
"confidence": {
|
|
28
|
+
"type": "number",
|
|
29
|
+
"minimum": 0,
|
|
30
|
+
"maximum": 1,
|
|
31
|
+
"threshold_low": 0,
|
|
32
|
+
"threshold_high": 1,
|
|
33
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
34
|
+
},
|
|
35
|
+
"correction": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"properties": {
|
|
38
|
+
"reason_types": {
|
|
39
|
+
"type": "array",
|
|
40
|
+
"description": "Types of issues with the parameter value, if any. Use one or more of these values: FORMAT_ERROR (wrong format or type), MISSING_INFORMATION (needs more data), PARAMETER_CONTRADICTION (conflicts with other parameters), DEFAULT_ISSUE (inappropriate default), OTHER (explain in reasons).",
|
|
41
|
+
"items": {
|
|
42
|
+
"type": "string",
|
|
43
|
+
"enum": [
|
|
44
|
+
"FORMAT_ERROR",
|
|
45
|
+
"MISSING_INFORMATION",
|
|
46
|
+
"PARAMETER_CONTRADICTION",
|
|
47
|
+
"DEFAULT_ISSUE",
|
|
48
|
+
"OTHER"
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
},
|
|
52
|
+
"reasons": {
|
|
53
|
+
"type": "string",
|
|
54
|
+
"description": "Concise explanation of the specific issues with the parameter value."
|
|
55
|
+
},
|
|
56
|
+
"parameter": {
|
|
57
|
+
"type": "object",
|
|
58
|
+
"description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.",
|
|
59
|
+
"additionalProperties": true
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"description": "For well-grounded values: Provide an empty object {}. For ungrounded values: Provide an object with reason_types, reasons, and parameter correction.",
|
|
63
|
+
"required": []
|
|
64
|
+
},
|
|
65
|
+
"actionable_recommendations": {
|
|
66
|
+
"type": "array",
|
|
67
|
+
"description": "Provide specific, actionable recommendations for the agent developer to prevent parameter value hallucination. Only provide recommendations when problems are detected. Include multiple detailed suggestions where appropriate, such as:\n\n1. PARAMETER_DOCUMENTATION: Suggest specific clarifications to parameter descriptions that would help prevent hallucination, e.g., 'Update the \"username\" parameter description to: \"Must be explicitly provided by the user in conversation. No default value is permitted. Do not infer or guess usernames.\"'\n\n2. EXPLICIT_VALUE_EXTRACTION: Recommend concrete techniques for extracting parameter values reliably from user input, like 'Implement a structured extraction pattern for the \"count\" parameter that requires an explicit number from the user before proceeding, such as \"How many tweets would you like to see? (default is 10)\"'\n\n3. DEFAULT_VALUE_HANDLING: Propose specific parameter documentation improvements for default values, e.g., 'Add to the \"count\" parameter description: \"Default: 10 if not specified by user. Never invent a count value; use default or ask user.\"'\n\n4. FORMAT_SPECIFICATION_IMPROVEMENTS: Suggest clearer format requirements in parameter documentation, like 'For the \"date\" parameter, specify: \"Must be in ISO format YYYY-MM-DD. Example: 2024-06-22. Do not use other date formats.\"'\n\n5. TYPE_VALIDATION_RULES: Propose validation logic for parameter types, such as 'Implement pre-call validation that ensures integer parameters like \"count\" are provided as numbers without quotes or text.'\n\n6. SYSTEM_PROMPT_ADDITIONS: Recommend specific additions to the system prompt that would help the agent avoid hallucination, e.g., 'Add to the system prompt: \"For parameters like 'count', always use explicitly provided values from the user. If not specified, use documented defaults or ask for clarification.\"'\n\n7. PARAMETER_NAMING_CONVENTIONS: Suggest improvements to parameter naming that would make their purpose clearer, e.g., 'Rename \"target_lang\" to \"language_code\" to clarify it should be an ISO code, not a full language name.'\n\n8. PRE_CALL_REFLECTION_STRATEGIES: Recommend strategies for the agent to reflect on parameter values before making tool calls, such as 'Implement a pre-call reflection step that checks if all required parameters are grounded in user input before proceeding with the function call.'\n\n9. PARAMETER_SOURCE_TRACKING: Suggest implementing a mechanism to track where each parameter value originated from (e.g., user input, default value) to improve transparency and debugging.\n\n10. CLARIFICATION_PROMPTS: Recommend adding clarification prompts when parameter values are ambiguous or missing, e.g., 'If the \"count\" parameter is not specified, ask the user: \"How many items would you like to retrieve? (default is 10)\"'\n\n11. OTHER: Any other specific recommendations that would help prevent similar parameter hallucination issues in the future.",
|
|
68
|
+
"items": {
|
|
69
|
+
"type": "object",
|
|
70
|
+
"properties": {
|
|
71
|
+
"recommendation": {
|
|
72
|
+
"type": "string",
|
|
73
|
+
"description": "A specific, actionable recommendation to improve the agent's parameter handling process.",
|
|
74
|
+
"enum": [
|
|
75
|
+
"PARAMETER_DOCUMENTATION",
|
|
76
|
+
"EXPLICIT_VALUE_EXTRACTION",
|
|
77
|
+
"DEFAULT_VALUE_HANDLING",
|
|
78
|
+
"FORMAT_SPECIFICATION_IMPROVEMENTS",
|
|
79
|
+
"TYPE_VALIDATION_RULES",
|
|
80
|
+
"SYSTEM_PROMPT_ADDITIONS",
|
|
81
|
+
"PARAMETER_NAMING_CONVENTIONS",
|
|
82
|
+
"PRE_CALL_REFLECTION_STRATEGIES",
|
|
83
|
+
"PARAMETER_SOURCE_TRACKING",
|
|
84
|
+
"CLARIFICATION_PROMPTS",
|
|
85
|
+
"OTHER"
|
|
86
|
+
]
|
|
87
|
+
},
|
|
88
|
+
"details": {
|
|
89
|
+
"type": "string",
|
|
90
|
+
"description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve parameter handling, and any relevant examples or best practices."
|
|
91
|
+
}
|
|
92
|
+
},
|
|
93
|
+
"required": [
|
|
94
|
+
"recommendation",
|
|
95
|
+
"details"
|
|
96
|
+
]
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
},
|
|
100
|
+
"required": [
|
|
101
|
+
"explanation",
|
|
102
|
+
"evidence",
|
|
103
|
+
"output",
|
|
104
|
+
"confidence",
|
|
105
|
+
"correction",
|
|
106
|
+
"actionable_recommendations"
|
|
107
|
+
]
|
|
108
|
+
},
|
|
109
|
+
"examples": [
|
|
110
|
+
{
|
|
111
|
+
"user_kwargs": {
|
|
112
|
+
"conversation_context": [
|
|
113
|
+
{
|
|
114
|
+
"role": "user",
|
|
115
|
+
"content": "Translate 'hello' to Spanish."
|
|
116
|
+
}
|
|
117
|
+
],
|
|
118
|
+
"tool_inventory": [
|
|
119
|
+
{
|
|
120
|
+
"type": "function",
|
|
121
|
+
"function": {
|
|
122
|
+
"name": "translate_text",
|
|
123
|
+
"description": "Translate given text",
|
|
124
|
+
"parameters": {
|
|
125
|
+
"type": "object",
|
|
126
|
+
"properties": {
|
|
127
|
+
"text": {
|
|
128
|
+
"type": "string",
|
|
129
|
+
"description": "The text to be translated"
|
|
130
|
+
},
|
|
131
|
+
"target_lang": {
|
|
132
|
+
"type": "string",
|
|
133
|
+
"description": "The target language code, e.g., 'en' for English"
|
|
134
|
+
}
|
|
135
|
+
},
|
|
136
|
+
"required": [
|
|
137
|
+
"text",
|
|
138
|
+
"target_lang"
|
|
139
|
+
]
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
],
|
|
144
|
+
"tool_call": {
|
|
145
|
+
"id": "call_001",
|
|
146
|
+
"type": "function",
|
|
147
|
+
"function": {
|
|
148
|
+
"name": "translate_text",
|
|
149
|
+
"arguments": "{ \"text\": \"hello\", \"target_lang\": \"es\" }"
|
|
150
|
+
}
|
|
151
|
+
},
|
|
152
|
+
"parameter_name": "target_lang",
|
|
153
|
+
"parameter_value": "es"
|
|
154
|
+
},
|
|
155
|
+
"output": {
|
|
156
|
+
"evidence": "User request: \"Translate 'hello' to Spanish.\" Tool specification for translate_text includes parameter target_lang of type string.",
|
|
157
|
+
"explanation": "The parameter value target_lang='es' is properly grounded in the conversation. The user explicitly requested translation \"to Spanish\" and the agent correctly converted this to the ISO language code 'es' for Spanish. This conversion from the common language name to its standard code is a reasonable and expected transformation that requires minimal inference. The tool specification indicates target_lang should be a string, and 'es' is the appropriate ISO code for Spanish.",
|
|
158
|
+
"output": 5,
|
|
159
|
+
"confidence": 0.96,
|
|
160
|
+
"correction": {},
|
|
161
|
+
"actionable_recommendations": []
|
|
162
|
+
}
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
"user_kwargs": {
|
|
166
|
+
"conversation_context": [
|
|
167
|
+
{
|
|
168
|
+
"role": "user",
|
|
169
|
+
"content": "Fetch my latest tweets."
|
|
170
|
+
}
|
|
171
|
+
],
|
|
172
|
+
"tool_inventory": [
|
|
173
|
+
{
|
|
174
|
+
"type": "function",
|
|
175
|
+
"function": {
|
|
176
|
+
"name": "get_tweets",
|
|
177
|
+
"description": "Retrieve recent tweets",
|
|
178
|
+
"parameters": {
|
|
179
|
+
"type": "object",
|
|
180
|
+
"properties": {
|
|
181
|
+
"username": {
|
|
182
|
+
"type": "string",
|
|
183
|
+
"description": "The Twitter username to fetch tweets for"
|
|
184
|
+
},
|
|
185
|
+
"count": {
|
|
186
|
+
"type": "integer",
|
|
187
|
+
"description": "The number of recent tweets to retrieve"
|
|
188
|
+
}
|
|
189
|
+
},
|
|
190
|
+
"required": [
|
|
191
|
+
"username",
|
|
192
|
+
"count"
|
|
193
|
+
]
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
],
|
|
198
|
+
"tool_call": {
|
|
199
|
+
"id": "call_001",
|
|
200
|
+
"type": "function",
|
|
201
|
+
"function": {
|
|
202
|
+
"name": "get_tweets",
|
|
203
|
+
"arguments": "{ \"username\": \"elonmusk\", \"count\": 20 }"
|
|
204
|
+
}
|
|
205
|
+
},
|
|
206
|
+
"parameter_name": "count",
|
|
207
|
+
"parameter_value": 20
|
|
208
|
+
},
|
|
209
|
+
"output": {
|
|
210
|
+
"evidence": "User request: \"Fetch my latest tweets.\" - contains no mention of any specific count or number of tweets. Tool specification does not document a default value for the count parameter.",
|
|
211
|
+
"explanation": "The parameter value count=20 is ungrounded and appears to be hallucinated. The user requested \"latest tweets\" but did not specify any count or quantity. The tool specification defines count as an integer parameter but does not document a default value. The agent appears to have arbitrarily chosen the value 20 without any basis in the conversation or tool specification. Without user specification or a documented default, the agent should either ask the user how many tweets they want to see or use a reasonable, documented default value if one exists.",
|
|
212
|
+
"output": 1,
|
|
213
|
+
"confidence": 0.95,
|
|
214
|
+
"correction": {
|
|
215
|
+
"reason_types": [
|
|
216
|
+
"MISSING_INFORMATION"
|
|
217
|
+
],
|
|
218
|
+
"reasons": "Count value 20 is invented without user specification or documented default.",
|
|
219
|
+
"parameter": {
|
|
220
|
+
"need_more_information": "How many tweets would you like to retrieve? Please specify a number."
|
|
221
|
+
}
|
|
222
|
+
},
|
|
223
|
+
"actionable_recommendations": [
|
|
224
|
+
{
|
|
225
|
+
"recommendation": "PARAMETER_DOCUMENTATION",
|
|
226
|
+
"details": "Update the 'count' parameter description to: 'Number of tweets to retrieve. Default: 10 if not specified. Do not invent count values; use the default if not explicitly provided by the user.'"
|
|
227
|
+
},
|
|
228
|
+
{
|
|
229
|
+
"recommendation": "SYSTEM_PROMPT_ADDITIONS",
|
|
230
|
+
"details": "Add to the system prompt: 'For the get_tweets function, only use explicitly provided count values from the user. When count is not specified, always use the default value of 10. Never invent arbitrary count values.'"
|
|
231
|
+
},
|
|
232
|
+
{
|
|
233
|
+
"recommendation": "PRE_CALL_REFLECTION_STRATEGIES",
|
|
234
|
+
"details": "Implement a pre-call reflection step that checks if all required parameters are grounded in user input before proceeding with the function call."
|
|
235
|
+
}
|
|
236
|
+
]
|
|
237
|
+
}
|
|
238
|
+
},
|
|
239
|
+
{
|
|
240
|
+
"user_kwargs": {
|
|
241
|
+
"conversation_context": [
|
|
242
|
+
{
|
|
243
|
+
"role": "user",
|
|
244
|
+
"content": "Can you show me just a few of my latest messages?"
|
|
245
|
+
}
|
|
246
|
+
],
|
|
247
|
+
"tool_inventory": [
|
|
248
|
+
{
|
|
249
|
+
"type": "function",
|
|
250
|
+
"function": {
|
|
251
|
+
"name": "get_messages",
|
|
252
|
+
"description": "Retrieve user's messages",
|
|
253
|
+
"parameters": {
|
|
254
|
+
"type": "object",
|
|
255
|
+
"properties": {
|
|
256
|
+
"user_id": {
|
|
257
|
+
"type": "integer",
|
|
258
|
+
"description": "The ID of the user whose messages are being retrieved"
|
|
259
|
+
},
|
|
260
|
+
"limit": {
|
|
261
|
+
"type": "integer",
|
|
262
|
+
"description": "The maximum number of messages to retrieve"
|
|
263
|
+
}
|
|
264
|
+
},
|
|
265
|
+
"required": [
|
|
266
|
+
"user_id",
|
|
267
|
+
"limit"
|
|
268
|
+
]
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
],
|
|
273
|
+
"tool_call": {
|
|
274
|
+
"id": "call_002",
|
|
275
|
+
"type": "function",
|
|
276
|
+
"function": {
|
|
277
|
+
"name": "get_messages",
|
|
278
|
+
"arguments": "{ \"user_id\": 456, \"limit\": 3 }"
|
|
279
|
+
}
|
|
280
|
+
},
|
|
281
|
+
"parameter_name": "limit",
|
|
282
|
+
"parameter_value": 3
|
|
283
|
+
},
|
|
284
|
+
"output": {
|
|
285
|
+
"evidence": "User said: 'just a few of my latest messages.' The phrase 'a few' is vague and subjective, and while 3 is a reasonable interpretation, it is not explicitly stated.",
|
|
286
|
+
"explanation": "The value limit=3 is a plausible interpretation of the user's request for 'a few' messages. However, the term is ambiguous and does not unambiguously support the specific value of 3. The assistant made a judgment call based on conversational context, but without explicit grounding or a documented default, the score is 3.",
|
|
287
|
+
"output": 3,
|
|
288
|
+
"confidence": 0.79,
|
|
289
|
+
"correction": {
|
|
290
|
+
"reason_types": [
|
|
291
|
+
"MISSING_INFORMATION"
|
|
292
|
+
],
|
|
293
|
+
"reasons": "The user's phrasing is vague and does not explicitly support the value 3. Clarification is needed to ensure the correct value is used.",
|
|
294
|
+
"parameter": {
|
|
295
|
+
"need_more_information": "You asked for 'a few' messages. Could you specify how many exactly you'd like to see?"
|
|
296
|
+
}
|
|
297
|
+
},
|
|
298
|
+
"actionable_recommendations": [
|
|
299
|
+
{
|
|
300
|
+
"recommendation": "PRE_CALL_REFLECTION_STRATEGIES",
|
|
301
|
+
"details": "Encourage the assistant to clarify vague quantifiers like 'a few' with a specific number before making a tool call."
|
|
302
|
+
},
|
|
303
|
+
{
|
|
304
|
+
"recommendation": "CLARIFICATION_PROMPTS",
|
|
305
|
+
"details": "Prompt the user for an exact number when their request includes ambiguous terms such as 'some' or 'a few.'"
|
|
306
|
+
}
|
|
307
|
+
]
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
]
|
|
311
|
+
},
|
|
312
|
+
{
|
|
313
|
+
"name": "parameter_value_format_alignment",
|
|
314
|
+
"task_description": "Evaluate whether the specific parameter value (named in \"parameter_name\") conforms exactly to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter value. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided to ensure it meets the exact type, format, and unit requirements. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value aligns with the specification.\n---\n\n### 1. Data Type Compliance (Output = 4-5)\n- Check that the value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility with other parameter values in the same call\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: Value exactly matches required type, format, and units\n- **4 - Minor Deviation**: Value is fundamentally correct but with minor formatting issues\n- **3 - Moderate Deviation**: Type or format is partially incorrect; may cause issues\n- **2 - Major Deviation**: Type and format are significantly wrong; likely to fail\n- **1 - Complete Mismatch**: Value is entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and the value seems plausible, prefer a higher score (4-5)\n- If the spec is clear and the value deviates, assign a lower score (1-3)\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.",
|
|
315
|
+
"jsonschema": {
|
|
316
|
+
"title": "parameter_value_format_alignment",
|
|
317
|
+
"description": "Assessment of a parameter value's compliance with required type, format, and unit specifications, based on the rubric above.",
|
|
318
|
+
"type": "object",
|
|
319
|
+
"additionalProperties": false,
|
|
320
|
+
"properties": {
|
|
321
|
+
"evidence": {
|
|
322
|
+
"type": "string",
|
|
323
|
+
"description": "Quote the specification's type/format definition and include the actual parameter value provided. Cite EXACT text from the tool specification that supports your judgment."
|
|
324
|
+
},
|
|
325
|
+
"explanation": {
|
|
326
|
+
"type": "string",
|
|
327
|
+
"description": "Explain clearly why the parameter value conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong."
|
|
328
|
+
},
|
|
329
|
+
"output": {
|
|
330
|
+
"type": "integer",
|
|
331
|
+
"minimum": 1,
|
|
332
|
+
"maximum": 5,
|
|
333
|
+
"threshold_low": 4,
|
|
334
|
+
"threshold_high": 5,
|
|
335
|
+
"description": "An integer from 1 to 5 indicating how well the parameter value conforms to the required type, format, and unit as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: The value exactly matches the expected type, format, and unit. Example: expected date in 'YYYY-MM-DD', value is '2025-08-05'.\n\n4 - Minor Deviation: The value has small formatting issues (e.g., missing leading zeros) but is still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: The value partially matches the expected format but is likely to be rejected or misinterpreted by automatic processing. Example: expected 'YYYY-MM-DD', value is 'August 5, 2025'.\n\n2 - Major Deviation: The value significantly violates the expected type, format, or unit and is very likely to fail. Example: expected 'YYYY-MM-DD', value is '08/05/2025'.\n\n1 - Complete Mismatch: The value is entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'."
|
|
336
|
+
},
|
|
337
|
+
"confidence": {
|
|
338
|
+
"type": "number",
|
|
339
|
+
"minimum": 0,
|
|
340
|
+
"maximum": 1,
|
|
341
|
+
"threshold_low": 0,
|
|
342
|
+
"threshold_high": 1,
|
|
343
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
344
|
+
},
|
|
345
|
+
"correction": {
|
|
346
|
+
"type": "object",
|
|
347
|
+
"description": "If output >= 4, return {}. If output <= 3, provide reason_types, explanation, and a corrected parameter.",
|
|
348
|
+
"properties": {
|
|
349
|
+
"reason_types": {
|
|
350
|
+
"type": "array",
|
|
351
|
+
"description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.",
|
|
352
|
+
"items": {
|
|
353
|
+
"type": "string",
|
|
354
|
+
"enum": [
|
|
355
|
+
"TYPE_ERROR",
|
|
356
|
+
"FORMAT_ERROR",
|
|
357
|
+
"UNIT_ERROR",
|
|
358
|
+
"PATTERN_ERROR",
|
|
359
|
+
"CONSISTENCY_ERROR",
|
|
360
|
+
"OTHER"
|
|
361
|
+
]
|
|
362
|
+
}
|
|
363
|
+
},
|
|
364
|
+
"reasons": {
|
|
365
|
+
"type": "string",
|
|
366
|
+
"description": "Short explanation of the specific issue(s) with the value's format, type, or unit."
|
|
367
|
+
},
|
|
368
|
+
"parameter": {
|
|
369
|
+
"type": "object",
|
|
370
|
+
"description": "An object containing the corrected parameter value in the form: { \"<parameter_name>\": <corrected_value> }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.",
|
|
371
|
+
"additionalProperties": true
|
|
372
|
+
}
|
|
373
|
+
},
|
|
374
|
+
"required": []
|
|
375
|
+
},
|
|
376
|
+
"actionable_recommendations": {
|
|
377
|
+
"type": "array",
|
|
378
|
+
"description": "Provide actionable suggestions for avoiding this type of error in the future, only if there is an issue. Address root causes like unclear format specs, missing examples, or lack of validation. Include suggestions such as:\n\n1. PARAMETER_FORMAT_DOCUMENTATION: Clarify or expand parameter format expectations in the API documentation.\n2. PARAMETER_EXAMPLES: Add example values with correct formatting in the parameter documentation.\n3. PARAMETER_VALIDATION: Introduce regex or rule-based format validation in the API to catch errors before tool calls.\n4. PARAMETER_CONVERSION: Add conversion helpers for user input to ensure correct formatting.\n5. UNIT_STANDARDS: Standardize unit expectations and flag missing or incorrect units in the tools implementation.\n\n6. PARAMETER_NAMING_CONVENTIONS: Suggest clearer parameter names that indicate expected formats or units.\n7. SYSTEM_PROMPT_ADDITIONS: Recommend specific additions to the system prompt that would help the agent avoid similar format issues in the future.\n8. PRE_CALL_REFLECTION_STRATEGIES: Propose strategies for the agent to reflect on parameter values before making tool calls, such as checking if all required parameters are grounded in user input.\n9. PARAMETER_SOURCE_TRACKING: Suggest implementing a mechanism to track where each parameter value originated from (e.g., user input, default value) to improve transparency and debugging.\n10. CLARIFICATION_PROMPTS: Recommend adding clarification prompts when parameter values are ambiguous or missing, e.g., asking the user for specific formats or units.\n\n11. OTHER: Any other specific recommendations that would help prevent similar format issues in the future.",
|
|
379
|
+
"items": {
|
|
380
|
+
"type": "object",
|
|
381
|
+
"properties": {
|
|
382
|
+
"recommendation": {
|
|
383
|
+
"type": "string",
|
|
384
|
+
"description": "A specific, actionable recommendation to improve the agent's parameter handling process.",
|
|
385
|
+
"enum": [
|
|
386
|
+
"PARAMETER_FORMAT_DOCUMENTATION",
|
|
387
|
+
"PARAMETER_EXAMPLES",
|
|
388
|
+
"PARAMETER_VALIDATION",
|
|
389
|
+
"PARAMETER_CONVERSION",
|
|
390
|
+
"UNIT_STANDARDS",
|
|
391
|
+
"PARAMETER_NAMING_CONVENTIONS",
|
|
392
|
+
"SYSTEM_PROMPT_ADDITIONS",
|
|
393
|
+
"PRE_CALL_REFLECTION_STRATEGIES",
|
|
394
|
+
"PARAMETER_SOURCE_TRACKING",
|
|
395
|
+
"CLARIFICATION_PROMPTS",
|
|
396
|
+
"OTHER"
|
|
397
|
+
]
|
|
398
|
+
},
|
|
399
|
+
"details": {
|
|
400
|
+
"type": "string",
|
|
401
|
+
"description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve parameter handling, and any relevant examples or best practices."
|
|
402
|
+
}
|
|
403
|
+
},
|
|
404
|
+
"required": [
|
|
405
|
+
"recommendation",
|
|
406
|
+
"details"
|
|
407
|
+
]
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
},
|
|
411
|
+
"required": [
|
|
412
|
+
"explanation",
|
|
413
|
+
"evidence",
|
|
414
|
+
"output",
|
|
415
|
+
"confidence",
|
|
416
|
+
"correction",
|
|
417
|
+
"actionable_recommendations"
|
|
418
|
+
]
|
|
419
|
+
},
|
|
420
|
+
"examples": [
|
|
421
|
+
{
|
|
422
|
+
"user_kwargs": {
|
|
423
|
+
"conversation_context": [
|
|
424
|
+
{
|
|
425
|
+
"role": "user",
|
|
426
|
+
"content": "Set a short countdown for 12 minutes."
|
|
427
|
+
}
|
|
428
|
+
],
|
|
429
|
+
"tool_inventory": [
|
|
430
|
+
{
|
|
431
|
+
"type": "function",
|
|
432
|
+
"function": {
|
|
433
|
+
"name": "set_timer",
|
|
434
|
+
"description": "Set a countdown timer",
|
|
435
|
+
"parameters": {
|
|
436
|
+
"type": "object",
|
|
437
|
+
"properties": {
|
|
438
|
+
"duration": {
|
|
439
|
+
"type": "string",
|
|
440
|
+
"description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')"
|
|
441
|
+
}
|
|
442
|
+
},
|
|
443
|
+
"required": [
|
|
444
|
+
"duration"
|
|
445
|
+
]
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
}
|
|
449
|
+
],
|
|
450
|
+
"tool_call": {
|
|
451
|
+
"id": "call_001",
|
|
452
|
+
"type": "function",
|
|
453
|
+
"function": {
|
|
454
|
+
"name": "set_timer",
|
|
455
|
+
"arguments": "{ \"duration\": \"12 minutes\" }"
|
|
456
|
+
}
|
|
457
|
+
},
|
|
458
|
+
"parameter_name": "duration",
|
|
459
|
+
"parameter_value": "12 minutes"
|
|
460
|
+
},
|
|
461
|
+
"output": {
|
|
462
|
+
"evidence": "The tool spec states that 'duration' must be a string in the format 'number + unit', with examples such as '5 minutes' and '30 seconds'. The value '12 minutes' follows this format precisely: it consists of a number followed by a space and a valid time unit.",
|
|
463
|
+
"explanation": "The parameter value '12 minutes' aligns perfectly with the required type, structure, and unit conventions defined in the spec. While not explicitly given as an example, it is fully consistent with the format and requires no normalization or interpretation.",
|
|
464
|
+
"output": 5,
|
|
465
|
+
"confidence": 0.99,
|
|
466
|
+
"correction": {},
|
|
467
|
+
"actionable_recommendations": []
|
|
468
|
+
}
|
|
469
|
+
},
|
|
470
|
+
{
|
|
471
|
+
"user_kwargs": {
|
|
472
|
+
"conversation_context": [
|
|
473
|
+
{
|
|
474
|
+
"role": "user",
|
|
475
|
+
"content": "Start a countdown for 5 minutes."
|
|
476
|
+
}
|
|
477
|
+
],
|
|
478
|
+
"tool_inventory": [
|
|
479
|
+
{
|
|
480
|
+
"type": "function",
|
|
481
|
+
"function": {
|
|
482
|
+
"name": "set_timer",
|
|
483
|
+
"description": "Set a countdown timer",
|
|
484
|
+
"parameters": {
|
|
485
|
+
"type": "object",
|
|
486
|
+
"properties": {
|
|
487
|
+
"duration": {
|
|
488
|
+
"type": "string",
|
|
489
|
+
"description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')"
|
|
490
|
+
}
|
|
491
|
+
},
|
|
492
|
+
"required": [
|
|
493
|
+
"duration"
|
|
494
|
+
]
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
],
|
|
499
|
+
"tool_call": {
|
|
500
|
+
"id": "call_001",
|
|
501
|
+
"type": "function",
|
|
502
|
+
"function": {
|
|
503
|
+
"name": "set_timer",
|
|
504
|
+
"arguments": "{ \"duration\": \"300000\" }"
|
|
505
|
+
}
|
|
506
|
+
},
|
|
507
|
+
"parameter_name": "duration",
|
|
508
|
+
"parameter_value": "300000"
|
|
509
|
+
},
|
|
510
|
+
"output": {
|
|
511
|
+
"evidence": "The tool specification defines 'duration' as a string following the 'number + unit' format (e.g., '5 minutes'). The value '300000' is a string, but it lacks a time unit and does not follow the documented pattern.",
|
|
512
|
+
"explanation": "The value is likely intended to represent milliseconds, but the format is both undocumented and inconsistent with the specification. It fails both unit and pattern requirements, and cannot be reliably interpreted as valid input without assumptions or external conventions.",
|
|
513
|
+
"output": 2,
|
|
514
|
+
"confidence": 0.95,
|
|
515
|
+
"correction": {
|
|
516
|
+
"reason_types": [
|
|
517
|
+
"FORMAT_ERROR",
|
|
518
|
+
"UNIT_ERROR"
|
|
519
|
+
],
|
|
520
|
+
"reasons": "Missing explicit time unit and nonconformant format. Specification requires 'number + unit' format, which this value does not follow.",
|
|
521
|
+
"parameter": {
|
|
522
|
+
"duration": "5 minutes"
|
|
523
|
+
}
|
|
524
|
+
},
|
|
525
|
+
"actionable_recommendations": [
|
|
526
|
+
{
|
|
527
|
+
"recommendation": "PARAMETER_FORMAT_DOCUMENTATION",
|
|
528
|
+
"details": "Update the 'duration' parameter documentation to be more explicit: 'Duration must be provided as a string with a numeric value followed by a space and then a time unit (e.g., \"5 minutes\", \"30 seconds\"). Millisecond values without units are not supported. Raw numeric values without units will be rejected.'"
|
|
529
|
+
},
|
|
530
|
+
{
|
|
531
|
+
"recommendation": "PARAMETER_VALIDATION",
|
|
532
|
+
"details": "Implement regex validation for the 'duration' parameter that enforces the 'number + unit' format with a required space between them, e.g., /^\\d+\\s+[a-z]+$/i, and clearly rejects pure numeric values with an informative error message."
|
|
533
|
+
},
|
|
534
|
+
{
|
|
535
|
+
"recommendation": "PARAMETER_CONVERSION",
|
|
536
|
+
"details": "Add a conversion helper that automatically formats numeric-only values to the required format by appending appropriate units based on the magnitude of the number (e.g., convert '300000' to '5 minutes'). This would prevent tool call failures while maintaining the intended functionality."
|
|
537
|
+
}
|
|
538
|
+
]
|
|
539
|
+
}
|
|
540
|
+
},
|
|
541
|
+
{
|
|
542
|
+
"user_kwargs": {
|
|
543
|
+
"conversation_context": [
|
|
544
|
+
{
|
|
545
|
+
"role": "user",
|
|
546
|
+
"content": "Set a timer for five minutes please."
|
|
547
|
+
}
|
|
548
|
+
],
|
|
549
|
+
"tool_inventory": [
|
|
550
|
+
{
|
|
551
|
+
"type": "function",
|
|
552
|
+
"function": {
|
|
553
|
+
"name": "set_timer",
|
|
554
|
+
"description": "Set a countdown timer",
|
|
555
|
+
"parameters": {
|
|
556
|
+
"type": "object",
|
|
557
|
+
"properties": {
|
|
558
|
+
"duration": {
|
|
559
|
+
"type": "string",
|
|
560
|
+
"description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')"
|
|
561
|
+
}
|
|
562
|
+
},
|
|
563
|
+
"required": [
|
|
564
|
+
"duration"
|
|
565
|
+
]
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
],
|
|
570
|
+
"tool_call": {
|
|
571
|
+
"id": "call_001",
|
|
572
|
+
"type": "function",
|
|
573
|
+
"function": {
|
|
574
|
+
"name": "set_timer",
|
|
575
|
+
"arguments": "{ \"duration\": \"5Minutes\" }"
|
|
576
|
+
}
|
|
577
|
+
},
|
|
578
|
+
"parameter_name": "duration",
|
|
579
|
+
"parameter_value": "5Minutes"
|
|
580
|
+
},
|
|
581
|
+
"output": {
|
|
582
|
+
"evidence": "The specification explicitly defines the duration format as 'number + unit' with examples like '5 minutes'. The given value '5Minutes' omits the required space, resulting in a concatenated string.",
|
|
583
|
+
"explanation": "The input conveys the intended meaning and includes all required semantic elements (number and unit), but the lack of a space between them breaks the documented pattern. This deviation is moderate-it introduces a risk of failure in strict parsers that expect a space-separated format.",
|
|
584
|
+
"output": 3,
|
|
585
|
+
"confidence": 0.89,
|
|
586
|
+
"correction": {
|
|
587
|
+
"reason_types": [
|
|
588
|
+
"FORMAT_ERROR"
|
|
589
|
+
],
|
|
590
|
+
"reasons": "Value contains both components but violates the explicit formatting requirement of space separation.",
|
|
591
|
+
"parameter": {
|
|
592
|
+
"duration": "5 minutes"
|
|
593
|
+
}
|
|
594
|
+
},
|
|
595
|
+
"actionable_recommendations": [
|
|
596
|
+
{
|
|
597
|
+
"recommendation": "PARAMETER_FORMAT_DOCUMENTATION",
|
|
598
|
+
"details": "Enhance the documentation for the 'duration' parameter to explicitly highlight the importance of the space separator: 'Duration MUST include a space between the numeric value and the unit (e.g., \"5 minutes\" not \"5minutes\"). The space separator is required for proper parsing.'"
|
|
599
|
+
},
|
|
600
|
+
{
|
|
601
|
+
"recommendation": "PARAMETER_VALIDATION",
|
|
602
|
+
"details": "Implement regex-based validation that specifically checks for the presence of a space between the number and unit, e.g., /^(\\d+)\\s+(\\w+)$/, and provides a clear error message when this pattern is violated: 'Duration format error: Please ensure there is a space between the number and unit (e.g., \"5 minutes\").'"
|
|
603
|
+
},
|
|
604
|
+
{
|
|
605
|
+
"recommendation": "PARAMETER_CONVERSION",
|
|
606
|
+
"details": "Add automatic reformatting that can detect and correct common format errors like missing spaces between numbers and units, ensuring the API remains robust against minor formatting issues."
|
|
607
|
+
}
|
|
608
|
+
]
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
]
|
|
612
|
+
}
|
|
613
|
+
]
|