ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
_JSON_TYPE_ALIASES = {
|
|
5
|
+
"str": "string",
|
|
6
|
+
"string": "string",
|
|
7
|
+
"int": "integer",
|
|
8
|
+
"integer": "integer",
|
|
9
|
+
"float": "number",
|
|
10
|
+
"double": "number",
|
|
11
|
+
"number": "number",
|
|
12
|
+
"bool": "boolean",
|
|
13
|
+
"boolean": "boolean",
|
|
14
|
+
"dict": "object",
|
|
15
|
+
"object": "object",
|
|
16
|
+
"list": "array",
|
|
17
|
+
"array": "array",
|
|
18
|
+
"null": "null",
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
_ALLOWED_JSON_TYPES = set(_JSON_TYPE_ALIASES.values())
|
|
22
|
+
|
|
23
|
+
_NAME_RE = re.compile(r"^[A-Za-z0-9_]{1,64}$")
|
|
24
|
+
|
|
25
|
+
def _sanitize_name(name: str) -> str:
|
|
26
|
+
name = re.sub(r"[^A-Za-z0-9_]", "_", str(name))
|
|
27
|
+
if not name:
|
|
28
|
+
name = "tool"
|
|
29
|
+
if name[0].isdigit():
|
|
30
|
+
name = f"fn_{name}"
|
|
31
|
+
return name[:64]
|
|
32
|
+
|
|
33
|
+
def _normalize_type(t):
    """Normalize a JSON Schema ``type`` value to the allowed canonical names.

    Accepts a single type name or a list of type names.  String entries are
    mapped through ``_JSON_TYPE_ALIASES``; anything unknown is dropped (list
    form) or replaced with ``"string"`` (scalar form).  An empty result falls
    back to ``["string"]`` / ``"string"``.
    """
    if isinstance(t, list):
        # Canonicalize string entries only: non-string entries could never
        # match _ALLOWED_JSON_TYPES, and unhashable ones (e.g. dicts) would
        # previously crash the alias lookup.
        canon = [_JSON_TYPE_ALIASES.get(x, x) for x in t if isinstance(x, str)]
        # Dedupe while preserving first-seen order.  The previous set-based
        # dedupe produced an ordering that could vary between interpreter
        # runs due to hash randomization.
        valid = [x for x in dict.fromkeys(canon) if x in _ALLOWED_JSON_TYPES]
        return valid or ["string"]
    if isinstance(t, str):
        canonical = _JSON_TYPE_ALIASES.get(t, t)
        return canonical if canonical in _ALLOWED_JSON_TYPES else "string"
    # Any other value (None, int, dict, ...) is replaced with a safe default.
    return "string"
|
|
40
|
+
|
|
41
|
+
def _clean_description(desc):
|
|
42
|
+
if desc is None:
|
|
43
|
+
return None
|
|
44
|
+
s = str(desc).strip()
|
|
45
|
+
# collapse excessive whitespace
|
|
46
|
+
s = re.sub(r"\s+\n", "\n", s)
|
|
47
|
+
s = re.sub(r"\n\s+", "\n", s)
|
|
48
|
+
return s
|
|
49
|
+
|
|
50
|
+
def _fix_schema(schema, notes, path="parameters"):
    """Return a repaired copy of a JSON-Schema-like dict.

    Recursively normalizes ``type`` values, object ``properties`` and
    ``required`` lists, array ``items``, and ``description`` strings.  Every
    repair performed is appended to *notes* as a human-readable entry,
    prefixed with the dotted *path* of the node that was touched.
    """
    if not isinstance(schema, dict):
        notes.append(f"{path}: non-dict schema replaced with empty object")
        return {"type": "object"}

    fixed = dict(schema)

    # Canonicalize the declared type (handles shorthands like "str").
    if "type" in fixed:
        fixed["type"] = _normalize_type(fixed["type"])

    node_type = fixed.get("type")
    if node_type == "object":
        raw_props = fixed.get("properties", {})
        if not isinstance(raw_props, dict):
            notes.append(f"{path}.properties: not a dict, replaced with empty dict")
            raw_props = {}
        # Recurse into every property; bare values become {"type": value}.
        fixed["properties"] = {
            key: _fix_schema(
                value if isinstance(value, dict) else {"type": value},
                notes,
                f"{path}.properties.{key}",
            )
            for key, value in raw_props.items()
        }

        # "required" may only list string keys that actually exist in
        # the (already repaired) properties.
        if "required" in fixed:
            required = fixed["required"]
            if not isinstance(required, list):
                notes.append(f"{path}.required: not a list, removed")
                fixed.pop("required", None)
            else:
                kept = [
                    entry
                    for entry in required
                    if isinstance(entry, str) and entry in fixed["properties"]
                ]
                if kept != required:
                    notes.append(f"{path}.required: pruned invalid entries")
                fixed["required"] = kept

        # "additionalProperties", if present, is passed through untouched.
    elif node_type == "array":
        items = fixed.get("items")
        if isinstance(items, dict):
            fixed["items"] = _fix_schema(items, notes, f"{path}.items")
        else:
            notes.append(f"{path}.items: missing or invalid, set to permissive object")
            fixed["items"] = {}

    # Tidy the description, noting when normalization changed it.
    if "description" in fixed:
        tidied = _clean_description(fixed.get("description"))
        if tidied != fixed.get("description"):
            notes.append(f"{path}.description: normalized whitespace")
        fixed["description"] = tidied

    return fixed
|
|
105
|
+
|
|
106
|
+
def to_valid_openai_tool(spec: dict):
    """Repair an arbitrary tool specification into a valid OpenAI tool spec.

    Accepts either a full ``{"type": "function", "function": {...}}``
    envelope or a bare function block.  Fixes the function name, the
    description, and the parameter schema, and returns a tuple
    ``(fixed_spec, notes)`` where *notes* lists every repair applied.

    Raises:
        TypeError: if *spec* is not a dict.
    """
    if not isinstance(spec, dict):
        raise TypeError("spec must be a dict")

    notes = []
    working = copy.deepcopy(spec)

    # Accept both the wrapped envelope and a bare function block.
    wrapped = (
        working.get("type") == "function"
        and isinstance(working.get("function"), dict)
    )
    fn_block = working["function"] if wrapped else working

    # --- name ---
    name = fn_block.get("name")
    if not name:
        notes.append("function.name missing, set to 'tool'")
        name = "tool"
    sanitized = _sanitize_name(name)
    if sanitized != name:
        notes.append(f"function.name sanitized to '{sanitized}'")
        name = sanitized

    # --- description ---
    raw_desc = fn_block.get("description")
    description = "" if raw_desc is None else _clean_description(raw_desc)
    if not isinstance(description, str):
        notes.append("function.description not a string, coerced")
        description = str(description)

    # --- parameters ---
    params = fn_block.get("parameters")
    if not isinstance(params, dict):
        # A missing parameters key is silently defaulted; any other
        # non-dict value is noted before being replaced.
        if params is not None:
            notes.append("function.parameters not a dict, replaced with empty object schema")
        params = {}
    # The top-level parameters node must be an object schema.
    if params.get("type") != "object":
        params["type"] = "object"
    params.setdefault("properties", {})

    envelope = {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": _fix_schema(params, notes, "parameters"),
        },
    }
    return envelope, notes
|
|
162
|
+
|
|
163
|
+
# --------- Example usage ---------
if __name__ == "__main__":
    # A deliberately malformed spec: an invalid character in the name,
    # "str" type shorthands, and a "required" entry naming a property
    # that does not exist.
    example = {
        "type": "function",
        "function": {
            "name": "GenerateKyvernoTool!",
            "description": "The tool to generate a Kyverno policy.\n",
            "parameters": {
                "type": "object",
                "properties": {
                    "sentence": {"type": "str", "description": "...\n"},
                    "policy_file": {"type": "str", "description": "filepath."},
                    "current_policy_file": {"type": "str", "description": "optional", "default": ""}
                },
                "required": ["sentence", "policy_file", "nonexistent_param"]
            }
        }
    }

    repaired, repair_notes = to_valid_openai_tool(example)
    print(repaired)
    print("Notes:")
    for note in repair_notes:
        print("-", note)
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
# Function-Calling Reflection Pipeline
|
|
2
|
+
|
|
3
|
+
This directory implements a full **pre-call reflection** workflow for conversational agents making API (function) calls. It leverages:
|
|
4
|
+
|
|
5
|
+
- **Static schema checks** - Ensure that calls conform exactly to the API schema and naming rules.
|
|
6
|
+
- **Semantic LLM-driven metrics** - Evaluate the deeper meaning, context alignment, and correctness of calls beyond syntax.
|
|
7
|
+
- **Optional unit-conversion transforms** via code generation
|
|
8
|
+
|
|
9
|
+
All LLM and metric logic lives inside this package—no external frameworks are required.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Table of Contents
|
|
14
|
+
|
|
15
|
+
1. [Syntactic Checks](#syntactic-checks)
|
|
16
|
+
2. [Semantic Metrics](#semantic-metrics)
|
|
17
|
+
3. [Quickstart](#quickstart)
|
|
18
|
+
4. [Directory Structure](#directory-structure)
|
|
19
|
+
5. [ReflectionPipeline API](#reflectionpipeline-api)
|
|
20
|
+
- `static_only`
|
|
21
|
+
- `semantic_sync` / `semantic_async`
|
|
22
|
+
- `run_sync` / `run_async`
|
|
23
|
+
6. [Example Usage](#example-usage)
|
|
24
|
+
7. [Custom Metrics](#custom-metrics)
|
|
25
|
+
8. [Transform-Enabled Mode](#transform-enabled-mode)
|
|
26
|
+
9. [Error Handling & Logging](#error-handling--logging)
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
## Syntactic Checks
|
|
32
|
+
|
|
33
|
+
These catch straightforward, schema-level errors against your API specification:
|
|
34
|
+
|
|
35
|
+
* **NonExistentFunction**
|
|
36
|
+
|
|
37
|
+
*Description:* The function name does not appear in the API spec.
|
|
38
|
+
|
|
39
|
+
*Mistake Example:* Calling `get_customer_profile` when only `get_user_profile` is defined.
|
|
40
|
+
|
|
41
|
+
* **NonExistentParameter**
|
|
42
|
+
|
|
43
|
+
*Description:* One or more parameters are not defined for the chosen function.
|
|
44
|
+
|
|
45
|
+
*Mistake Example:* Using `user` in `get_user_profile(user=42)` when the function expects `user_id`.
|
|
46
|
+
|
|
47
|
+
* **IncorrectParameterType**
|
|
48
|
+
|
|
49
|
+
*Description:* Provided parameter values do not match the expected types.
|
|
50
|
+
|
|
51
|
+
*Mistake Example:* Passing `"true"` (string) to a boolean parameter `is_active`, instead of `true`.
|
|
52
|
+
|
|
53
|
+
* **MissingRequiredParameter**
|
|
54
|
+
|
|
55
|
+
*Description:* A required parameter is omitted.
|
|
56
|
+
|
|
57
|
+
*Mistake Example:* Calling `list_events(start_date="2025-05-01")` without the required `end_date`.
|
|
58
|
+
|
|
59
|
+
* **AllowedValuesViolation**
|
|
60
|
+
|
|
61
|
+
*Description:* A parameter value falls outside its allowed enumeration.
|
|
62
|
+
|
|
63
|
+
*Mistake Example:* Passing `"urgent"` to `priority` when only `"low"`, `"medium"`, or `"high"` are allowed.
|
|
64
|
+
|
|
65
|
+
* **JsonSchemaValidation**
|
|
66
|
+
|
|
67
|
+
*Description:* The API call does not conform to the provided JSON Schema
|
|
68
|
+
|
|
69
|
+
Note that we flag type errors in **IncorrectParameterType**, and all other validations (that are not type or Enum) are under **JsonSchemaValidation**.
|
|
70
|
+
|
|
71
|
+
*Examples of Checked Constraints:*
|
|
72
|
+
* Numeric constraints: minimum, maximum, exclusiveMinimum, exclusiveMaximum, multipleOf
|
|
73
|
+
* String constraints: minLength, maxLength, pattern, format (e.g., email, date, URI)
|
|
74
|
+
* Array constraints: items, minItems, maxItems, uniqueItems, contains
|
|
75
|
+
|
|
76
|
+
* **EmptyApiSpec**
|
|
77
|
+
|
|
78
|
+
*Description:* There are no API specifications provided or they are invalid
|
|
79
|
+
|
|
80
|
+
* **InvalidApiSpec**
|
|
81
|
+
|
|
82
|
+
*Description:* The API specifications provided are not valid Tool or ToolSpec instances
|
|
83
|
+
|
|
84
|
+
* **InvalidToolCall**
|
|
85
|
+
|
|
86
|
+
*Description:* The provided ToolCall is not a valid instance of ToolCall
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
# Semantic Metrics
|
|
91
|
+
|
|
92
|
+
Each semantic metric outputs a JSON object with fields customized in the JSONL definition files:
|
|
93
|
+
|
|
94
|
+
* **explanation**: Detailed reasoning behind the judgment.
|
|
95
|
+
* **evidence**: Exact conversation or spec excerpts supporting the assessment.
|
|
96
|
+
* **output**: Numeric rating on a 1-5 scale (5=best, 1=worst).
|
|
97
|
+
* **confidence**: Judge's confidence in the assessment (0.0-1.0).
|
|
98
|
+
* **correction**: Structured object containing issue types, explanations, and suggested fixes.
|
|
99
|
+
* **actionable_recommendation**: Specific developer guidance when issues are detected.
|
|
100
|
+
|
|
101
|
+
You can add, remove, or modify metrics by editing the JSONL definitions.
|
|
102
|
+
|
|
103
|
+
### 2.1 Function Selection Metric
|
|
104
|
+
|
|
105
|
+
Assesses whether this function call correctly implements the user's immediate request as the appropriate next step in the conversation. Compares against all available functions in the tool inventory to determine if the selection aligns with user intent and context.
|
|
106
|
+
|
|
107
|
+
**Rating Scale:**
|
|
108
|
+
- 5: Perfect match for user request
|
|
109
|
+
- 4: Good match with minor misalignment
|
|
110
|
+
- 3: Adequate match (threshold for acceptability)
|
|
111
|
+
- 2: Poor match for user request
|
|
112
|
+
- 1: Completely irrelevant function
|
|
113
|
+
|
|
114
|
+
*Mistake Example:* User: "What time is it in Tokyo?" Call: `translate_text(text="Hello", target_language="en")` instead of `get_time(timezone="Tokyo")`.
|
|
115
|
+
|
|
116
|
+
### 2.2 Agentic Metric
|
|
117
|
+
|
|
118
|
+
Evaluates whether a tool call satisfies prerequisite constraints and relationships defined in conversation history and tool inventory. Checks for explicit prerequisites, tool sequencing requirements, redundancy, parameter completeness, and parameter value relationships.
|
|
119
|
+
|
|
120
|
+
**Rating Scale:**
|
|
121
|
+
- 5: All agentic constraints satisfied
|
|
122
|
+
- 4: Minor insignificant issues that don't block execution
|
|
123
|
+
- 3: Significant issues requiring additional information
|
|
124
|
+
- 2: Major issues preventing proper execution
|
|
125
|
+
- 1: Completely inappropriate given context
|
|
126
|
+
|
|
127
|
+
*Mistake Example:* User: "Translate 'Hola' to English." Call: `translate_text(text="Hola", target="en")` when the tool description explicitly requires a prior call to `detect_language(text="Hola")`.
|
|
128
|
+
|
|
129
|
+
### 2.3 Grounding Metrics
|
|
130
|
+
|
|
131
|
+
#### 2.3.1 General Parameter Value Grounding
|
|
132
|
+
|
|
133
|
+
Assesses whether ALL parameter values in a function call are directly supported by conversation history or API specifications. Identifies hallucinated values, missing information, format errors, and contradictory values.
|
|
134
|
+
|
|
135
|
+
**Rating Scale:**
|
|
136
|
+
- 5: All parameter values correctly grounded and formatted
|
|
137
|
+
- 4: Some values may need more information but not hallucinated
|
|
138
|
+
- 3: Some values hallucinated or have format errors
|
|
139
|
+
- 2: Multiple values incorrect or contradictory
|
|
140
|
+
- 1: All values incorrect or missing
|
|
141
|
+
|
|
142
|
+
*Mistake Example:* User: "Fetch my profile." Call: `get_user_profile(user_id=42)` when no user ID was mentioned in conversation or available from context.
|
|
143
|
+
|
|
144
|
+
#### 2.3.2 Individual Parameter Hallucination Check
|
|
145
|
+
|
|
146
|
+
Evaluates whether a SPECIFIC parameter value is grounded in evidence or hallucinated. Checks sources, format compliance, value relationships, and default handling.
|
|
147
|
+
|
|
148
|
+
**Rating Scale:**
|
|
149
|
+
- 5: Perfectly grounded in conversation or documented defaults
|
|
150
|
+
- 4: Mostly grounded with minimal inference
|
|
151
|
+
- 3: Ambiguously grounded requiring substantial inference
|
|
152
|
+
- 2: Mostly ungrounded with tenuous connection
|
|
153
|
+
- 1: Completely hallucinated with no basis
|
|
154
|
+
|
|
155
|
+
*Mistake Example:* User: "Fetch my latest tweets." Call: `get_tweets(username="elonmusk", count=20)` when count was not specified by user and has no documented default.
|
|
156
|
+
|
|
157
|
+
#### 2.3.3 Value Format Alignment
|
|
158
|
+
|
|
159
|
+
Checks if a specific parameter value exactly conforms to required type, format, and unit conventions in the API specification.
|
|
160
|
+
|
|
161
|
+
**Rating Scale:**
|
|
162
|
+
- 5: Perfect alignment with specified type, format, units
|
|
163
|
+
- 4: Minor deviation unlikely to affect function
|
|
164
|
+
- 3: Moderate deviation that might affect function
|
|
165
|
+
- 2: Major deviation likely to cause function failure
|
|
166
|
+
- 1: Complete mismatch certain to cause failure
|
|
167
|
+
|
|
168
|
+
*Mistake Example:* User: "Start a countdown for 5 minutes." Call: `set_timer(duration="300000")` instead of `set_timer(duration="5 minutes")`.
|
|
169
|
+
|
|
170
|
+
## Use Cases
|
|
171
|
+
|
|
172
|
+
For different use cases, we suggest executing different metrics, as follows:
|
|
173
|
+
|
|
174
|
+
### Fast Track Single-Turn
|
|
175
|
+
|
|
176
|
+
Execute (1) function selection (2) global parameter value grounding
|
|
177
|
+
|
|
178
|
+
### Slow Track Single-Turn
|
|
179
|
+
|
|
180
|
+
Execute (1) function selection (2) per-parameter hallucination check (3) per-parameter value format check
|
|
181
|
+
|
|
182
|
+
### Fast Track Agentic
|
|
183
|
+
|
|
184
|
+
Execute (1) function selection (2) global agentic metric (3) global parameter value grounding
|
|
185
|
+
|
|
186
|
+
### Slow Track Agentic
|
|
187
|
+
|
|
188
|
+
Execute (1) function selection (2) global agentic metric (3) per-parameter hallucination check (4) per-parameter value format check
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
**Customization:** Modify metrics, thresholds, and fields by editing your JSONL configuration files.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Quickstart
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
pip install llmevalkit[litellm] # or your preferred extras
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
from llmevalkit.llm.registry import get_llm
|
|
204
|
+
from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
|
|
205
|
+
|
|
206
|
+
# 1) Pick your LLM provider and initialize clients
|
|
207
|
+
MetricsClient = get_llm("litellm.watsonx.output_val")
|
|
208
|
+
CodegenClient = get_llm("litellm.watsonx.output_val")
|
|
209
|
+
metrics_client = MetricsClient(model_name="meta-llama/llama-3-3-70b-instruct")
|
|
210
|
+
codegen_client = CodegenClient(model_name="meta-llama/llama-3-3-70b-instruct")
|
|
211
|
+
|
|
212
|
+
# 2) Create pipeline (loads bundled metrics JSONL by default)
|
|
213
|
+
pipeline = ReflectionPipeline(
|
|
214
|
+
metrics_client=metrics_client,
|
|
215
|
+
codegen_client=codegen_client,
|
|
216
|
+
transform_enabled=False
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
# 3) Define your API specs (OpenAI-style function definitions)
|
|
220
|
+
apis_specs = [
|
|
221
|
+
{ "type":"function", "function": { ... } },
|
|
222
|
+
...
|
|
223
|
+
]
|
|
224
|
+
|
|
225
|
+
# 4) Provide a tool_call and context
|
|
226
|
+
call = {
|
|
227
|
+
"id":"1","type":"function",
|
|
228
|
+
"function":{"name":"get_weather","arguments":{"location":"Berlin"}}
|
|
229
|
+
}
|
|
230
|
+
context = "User: What's the weather in Berlin?"
|
|
231
|
+
|
|
232
|
+
# 5) Run end-to-end reflection
|
|
233
|
+
result = pipeline.run_sync(
|
|
234
|
+
conversation=context,
|
|
235
|
+
inventory=apis_specs,
|
|
236
|
+
call=call,
|
|
237
|
+
continue_on_static=False,
|
|
238
|
+
retries=2
|
|
239
|
+
)
|
|
240
|
+
print(result.model_dump_json(indent=2))
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## Directory Structure
|
|
246
|
+
|
|
247
|
+
```
|
|
248
|
+
src/llmevalkit/function_calling/
|
|
249
|
+
├── __init__.py
|
|
250
|
+
├── metrics/ <- MetricPrompt templates & JSONL definitions
|
|
251
|
+
│ ├── base.py
|
|
252
|
+
│ ├── loader.py
|
|
253
|
+
│ ├── function_call/
|
|
254
|
+
│ │ ├── general.py
|
|
255
|
+
│ │ └── general_metrics.jsonl
|
|
256
|
+
│ ├── function_selection/
|
|
257
|
+
│ │ ├── function_selection.py
|
|
258
|
+
│ │ └── function_selection_metrics.jsonl
|
|
259
|
+
│ └── parameter/
|
|
260
|
+
│ ├── parameter.py
|
|
261
|
+
│ └── parameter_metrics.jsonl
|
|
262
|
+
├── pipeline/
|
|
263
|
+
│ ├── adapters.py <- API-spec / call normalization
|
|
264
|
+
│ ├── pipeline.py <- High-level ReflectionPipeline
|
|
265
|
+
│ ├── semantic_checker.py <- Core LLM metrics orchestration
|
|
266
|
+
│ ├── static_checker.py <- JSONSchema-based validation
|
|
267
|
+
│ ├── transformation_prompts.py <- Unit-conversion prompts
|
|
268
|
+
│ └── types.py <- Pydantic models for inputs & outputs
|
|
269
|
+
└── examples/
|
|
270
|
+
└── function_calling/
|
|
271
|
+
└── pipeline.py <- Complete runnable example
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## ReflectionPipeline API
|
|
277
|
+
|
|
278
|
+
### Initialization
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
ReflectionPipeline(
|
|
282
|
+
metrics_client: LLMClient,
|
|
283
|
+
codegen_client: LLMClient,
|
|
284
|
+
transform_enabled: bool = False,
|
|
285
|
+
general_metrics: Optional[Path] = None,
|
|
286
|
+
function_metrics: Optional[Path] = None,
|
|
287
|
+
parameter_metrics: Optional[Path] = None,
|
|
288
|
+
transform_examples: Optional[Dict[str,str]] = None,
|
|
289
|
+
)
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
- **`metrics_client`**: llmevalkit LLM client for semantic metrics (e.g. output-validating OpenAI or LiteLLM).
|
|
293
|
+
- **`codegen_client`**: llmevalkit LLM client for code generation (required if `transform_enabled=True`).
|
|
294
|
+
- **`*_metrics`**: override paths to your own JSONL metric definitions (otherwise uses `metrics/.../*.jsonl`).
|
|
295
|
+
- **`transform_enabled`**: whether to run unit-conversion checks.
|
|
296
|
+
|
|
297
|
+
### `static_only(conversation, inventory, call) → StaticResult`
|
|
298
|
+
|
|
299
|
+
- Runs pure JSON-schema validation on `call` against `inventory` specs.
|
|
300
|
+
- Checks required parameters, types, enums, etc.
|
|
301
|
+
|
|
302
|
+
### `semantic_sync(conversation, inventory, call, retries=1) → SemanticResult`
|
|
303
|
+
|
|
304
|
+
- Runs LLM-driven metric evaluations **synchronously**.
|
|
305
|
+
- Returns per-category semantic results.
|
|
306
|
+
|
|
307
|
+
### `semantic_async(conversation, inventory, call, retries=1, max_parallel=10) → SemanticResult`
|
|
308
|
+
|
|
309
|
+
- Same as above, but issues LLM calls in parallel.
|
|
310
|
+
|
|
311
|
+
### `run_sync(conversation, inventory, call, continue_on_static=False, retries=1) → PipelineResult`
|
|
312
|
+
|
|
313
|
+
- Full pipeline:
|
|
314
|
+
1. Static checks
|
|
315
|
+
2. Semantic metrics (if static passes or `continue_on_static=True`)
|
|
316
|
+
3. Aggregates final `PipelineResult` with `static`, `semantic`, and `overall_valid`.
|
|
317
|
+
|
|
318
|
+
### `run_async(...)`
|
|
319
|
+
|
|
320
|
+
- Asynchronous equivalent of `run_sync`.
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## Example Usage
|
|
325
|
+
|
|
326
|
+
See `examples/function_calling/pipeline/example.py` for a complete, runnable demo:
|
|
327
|
+
|
|
328
|
+
```bash
|
|
329
|
+
python examples/function_calling/pipeline/example.py
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
It will:
|
|
333
|
+
|
|
334
|
+
1. Define three sample functions (`get_weather`, `create_event`, `translate_text`).
|
|
335
|
+
2. Initialize Watsonx clients.
|
|
336
|
+
3. Run sync reflection for valid and invalid calls.
|
|
337
|
+
4. Print nicely formatted JSON results.
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
## Custom Metrics
|
|
342
|
+
|
|
343
|
+
By default we ship three JSONL files under `metrics/...`:
|
|
344
|
+
|
|
345
|
+
- **General**: overall call quality
|
|
346
|
+
- **Function-Selection**: was the right function chosen?
|
|
347
|
+
- **Parameter**: correctness of each parameter value
|
|
348
|
+
|
|
349
|
+
Each line in a `.jsonl` file is a JSON object:
|
|
350
|
+
|
|
351
|
+
```jsonc
|
|
352
|
+
// general_metrics.json
|
|
353
|
+
{"name":"Clarity", "description":"Rate clarity of the intent","schema":{...},
|
|
354
|
+
"thresholds":{"output":[0,1],"confidence":[0,1]},
|
|
355
|
+
"examples":[
|
|
356
|
+
{"user_kwargs":{...}, "output":{...}}
|
|
357
|
+
]}
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
To add your own:
|
|
361
|
+
|
|
362
|
+
1. Create a new `.jsonl` in any folder.
|
|
363
|
+
2. Pass its path into the pipeline constructor:
|
|
364
|
+
|
|
365
|
+
```python
|
|
366
|
+
pipeline = ReflectionPipeline(
|
|
367
|
+
metrics_client=...,
|
|
368
|
+
codegen_client=...,
|
|
369
|
+
general_metrics="path/to/my_general.json",
|
|
370
|
+
function_metrics="path/to/my_func.json",
|
|
371
|
+
parameter_metrics="path/to/my_param.json",
|
|
372
|
+
)
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
3. Follow the same JSONL format:
|
|
376
|
+
- `schema`: valid JSON-Schema object
|
|
377
|
+
- `thresholds`: dict of numeric field thresholds
|
|
378
|
+
- `examples`: few-shot examples validating against that schema
|
|
379
|
+
|
|
380
|
+
---
|
|
381
|
+
|
|
382
|
+
## Transform-Enabled Mode
|
|
383
|
+
|
|
384
|
+
If you want automated unit conversions:
|
|
385
|
+
|
|
386
|
+
```python
|
|
387
|
+
pipeline = ReflectionPipeline(
|
|
388
|
+
metrics_client=metrics_client,
|
|
389
|
+
codegen_client=codegen_client,
|
|
390
|
+
transform_enabled=True,
|
|
391
|
+
transform_examples=my_transform_examples_dict,
|
|
392
|
+
)
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
- Uses two additional LLM prompts (in `transformation_prompts.py`):
|
|
396
|
+
1. **Extract units** from context
|
|
397
|
+
2. **Generate transformation code**
|
|
398
|
+
|
|
399
|
+
- Finally executes the generated code in-process and reports a `TransformResult` per parameter.
|
|
400
|
+
|
|
401
|
+
---
|
|
402
|
+
|
|
403
|
+
## Error Handling & Logging
|
|
404
|
+
|
|
405
|
+
- Each stage wraps exceptions with clear, contextual messages.
|
|
406
|
+
- The LLM clients emit optional hooks (`hooks=[...]`) for tracing or metrics.
|
|
407
|
+
- In semantic phases, malformed or missing fields result in per-metric errors rather than crashing the entire pipeline.
|
|
408
|
+
|
|
409
|
+
---
|
|
410
|
+
|
|
411
|
+
Enjoy robust, end-to-end reflection on your function calls—static and semantic—powered entirely by `llmevalkit`!
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from .metrics import (
|
|
2
|
+
GeneralMetricsPrompt,
|
|
3
|
+
FunctionSelectionPrompt,
|
|
4
|
+
ParameterMetricsPrompt,
|
|
5
|
+
TrajectoryReflectionPrompt,
|
|
6
|
+
get_general_metrics_prompt,
|
|
7
|
+
get_parameter_metrics_prompt,
|
|
8
|
+
get_trajectory_reflection_prompt,
|
|
9
|
+
load_prompts_from_jsonl,
|
|
10
|
+
load_prompts_from_list,
|
|
11
|
+
load_prompts_from_metrics,
|
|
12
|
+
PromptKind,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"GeneralMetricsPrompt",
|
|
17
|
+
"FunctionSelectionPrompt",
|
|
18
|
+
"ParameterMetricsPrompt",
|
|
19
|
+
"TrajectoryReflectionPrompt",
|
|
20
|
+
"get_general_metrics_prompt",
|
|
21
|
+
"get_parameter_metrics_prompt",
|
|
22
|
+
"get_trajectory_reflection_prompt",
|
|
23
|
+
"load_prompts_from_jsonl",
|
|
24
|
+
"load_prompts_from_list",
|
|
25
|
+
"load_prompts_from_metrics",
|
|
26
|
+
"PromptKind",
|
|
27
|
+
]
|