ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of a publicly available package version that has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py

@@ -0,0 +1,144 @@
+import asyncio
+from typing import Any, Awaitable, Callable, Dict, List, Optional, TypeVar, Union, Tuple
+
+from pydantic import BaseModel
+
+Prompt = Union[str, List[Dict[str, Any]]]
+PromptAndSchema = Tuple[Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]]
+SyncGen = Callable[[Prompt], Union[str, Any]]
+BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
+AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
+AsyncBatchGen = Callable[[List[Prompt]], Awaitable[List[Union[str, Any]]]]
+
+T = TypeVar("T")
+
+
+class PromptResult(BaseModel):
+    """
+    Holds the prompt sent and the response (or error).
+    """
+
+    prompt: Prompt
+    response: Optional[Any] = None
+    error: Optional[str] = None
+
+
+class PromptRunner:
+    """
+    Runs a collection of prompts through various generation strategies.
+
+    Attributes:
+        prompts: the list of prompts to run.
+    """
+
+    def __init__(
+        self, prompts: Optional[List[Union[Prompt, PromptAndSchema]]] = None
+    ) -> None:
+        """
+        Args:
+            prompts: initial list of prompts (strings or chat messages).
+        """
+        self.prompts: List[Union[Prompt, PromptAndSchema]] = prompts or []
+
+    def add_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+        """Append a prompt to the runner."""
+        self.prompts.append(prompt)
+
+    def remove_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+        """Remove a prompt (first occurrence)."""
+        self.prompts.remove(prompt)
+
+    def clear_prompts(self) -> None:
+        """Remove all prompts."""
+        self.prompts.clear()
+
+    def get_prompt_and_schema(
+        self, prompt: Union[Prompt, PromptAndSchema]
+    ) -> Tuple[Prompt, Optional[Dict[str, Any]]]:
+        """
+        Extract the prompt and schema from a Prompt object.
+
+        Args:
+            prompt: The prompt to extract from.
+
+        Returns:
+            Tuple of (prompt, schema).
+        """
+        if isinstance(prompt, tuple):
+            return prompt[0], prompt[1]
+        return prompt, None
+
+    def run_all(
+        self,
+        gen_fn: SyncGen,
+        prompt_param_name: str = "prompt",
+        schema_param_name: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[PromptResult]:
+        """
+        Run each prompt through a synchronous single-prompt generator.
+
+        Args:
+            gen_fn: Callable taking one Prompt, returning str or Any.
+            prompt_param_name: Name of the parameter for the prompt.
+            schema_param_name: Name of the parameter for the schema.
+            kwargs: Additional arguments to pass to the function.
+
+        Returns:
+            List of PromptResult.
+        """
+        results: List[PromptResult] = []
+        for p in self.prompts:
+            try:
+                prompt, schema = self.get_prompt_and_schema(p)
+                args = {prompt_param_name: prompt, **kwargs}
+                if schema_param_name and schema:
+                    args[schema_param_name] = schema
+                resp = gen_fn(**args)
+                results.append(PromptResult(prompt=prompt, response=resp))
+            except Exception as e:
+                results.append(PromptResult(prompt=prompt, error=str(e)))
+        return results
+
+    async def run_async(
+        self,
+        async_fn: AsyncGen,
+        max_parallel: int = 10,
+        prompt_param_name: str = "prompt",
+        schema_param_name: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[PromptResult]:
+        """
+        Run each prompt through an async single-prompt generator with concurrency limit.
+        Results are returned in the same order as self.prompts.
+
+        Args:
+            async_fn: Async callable taking one Prompt, returning str or Any.
+            max_parallel: Max concurrent tasks.
+            prompt_param_name: Name of the parameter for the prompt.
+            schema_param_name: Name of the parameter for the schema.
+            kwargs: Additional arguments to pass to the async function.
+
+        Returns:
+            List of PromptResult.
+        """
+        semaphore = asyncio.Semaphore(max_parallel)
+
+        async def _run_one(index: int, p: Prompt) -> Tuple[int, PromptResult]:
+            async with semaphore:
+                try:
+                    prompt, schema = self.get_prompt_and_schema(p)
+                    args = {prompt_param_name: prompt, **kwargs}
+                    if schema_param_name and schema:
+                        args[schema_param_name] = schema
+                    resp = await async_fn(**args)
+                    return index, PromptResult(prompt=prompt, response=resp)
+                except Exception as e:
+                    return index, PromptResult(prompt=prompt, error=str(e))
+
+        tasks = [
+            asyncio.create_task(_run_one(i, p)) for i, p in enumerate(self.prompts)
+        ]
+        indexed_results = await asyncio.gather(*tasks)
+        # Sort results to match original order
+        indexed_results.sort(key=lambda x: x[0])
+        return [res for _, res in indexed_results]
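Usage follows directly from the code above. A minimal sketch (the echo generators are hypothetical stand-ins for real LLM calls, and the import path is assumed from the wheel layout listed earlier):

    import asyncio

    from ibm_watsonx_gov.providers.llmevalkit.prompt.runner import PromptRunner

    def echo_gen(prompt):
        # Stand-in for a real generation call; accepts the default
        # prompt_param_name keyword ("prompt").
        return f"echo: {prompt}"

    async def echo_gen_async(prompt):
        await asyncio.sleep(0)  # simulate I/O latency
        return f"echo: {prompt}"

    runner = PromptRunner(prompts=["Hello", [{"role": "user", "content": "Hi"}]])
    sync_results = runner.run_all(echo_gen)
    async_results = asyncio.run(runner.run_async(echo_gen_async, max_parallel=2))
    for r in async_results:  # order matches runner.prompts
        print(r.response or r.error)

Note that errors never propagate out of run_all or run_async; each failure is captured per prompt in PromptResult.error, so one bad prompt does not abort the batch.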
ibm_watsonx_gov/providers/tool_call_metric_provider.py

@@ -0,0 +1,455 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADP Schedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+import json
+from functools import lru_cache
+
+import pandas as pd
+from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
+from llmevalkit.function_calling.pipeline.types import ToolCall, ToolSpec
+
+from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
+from ibm_watsonx_gov.entities.base_classes import Error
+from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
+                                                        RecordMetricResult)
+from ibm_watsonx_gov.entities.metric import GenAIMetric
+from ibm_watsonx_gov.utils.python_utils import (
+    get, parse_functions_to_openai_schema)
+
+
+class ToolCallMetricProvider():
+    """
+    Base class for Tool Call Metrics Computation.
+    """
+
+    def __init__(self, configuration: GenAIConfiguration | AgenticAIConfiguration, metric: GenAIMetric):
+        """
+        Initialize the ToolCallMetricProvider with the configuration.
+
+        Args:
+            configuration (GenAIConfiguration | AgenticAIConfiguration): The configuration for the metric computation.
+            metric (GenAIMetric): The metric to be computed.
+        """
+        self.configuration = configuration
+        self.metric = metric
+
+    def pre_process(self, data: pd.DataFrame):
+        """
+        Preprocess the dataframe and tool list for metrics computation.
+
+        Args:
+            data (pd.DataFrame): Input dataframe
+
+        Returns:
+            pd.DataFrame: Processed dataframe
+        """
+        # Get the specification of tools used in the application
+        # in the proper format if it is a list of Callables
+        if isinstance(self.configuration.tools, list) and all(callable(item) for item in self.configuration.tools):
+            self.configuration.tools = self.get_tools_list_schema(
+                self.configuration.tools)
+
+        if self.configuration.available_tools_field and self.configuration.available_tools_field in data.columns:
+            data[self.configuration.available_tools_field] = data[self.configuration.available_tools_field].apply(
+                lambda x: json.loads(x) if isinstance(x, str) else x)
+
+        # TODO: Add validation for the tool_call_field data schema
+        tool_call_field = self.configuration.tool_calls_field
+        if tool_call_field:
+            data[tool_call_field] = data[tool_call_field].apply(
+                lambda x: json.loads(x) if isinstance(x, str) else x)
+        return data
+
+    @staticmethod
+    def get_tools_list_schema(tools: list) -> list:
+        """
+        Convert the list of callable objects to the
+        format needed for the TCH computation.
+
+        Args:
+            tools (list): List of Callable objects
+
+        Returns:
+            list: List of dictionaries containing the tool
+                specifications
+        """
+        tools_specifications = []
+        for func in tools:
+            tool_schema = parse_functions_to_openai_schema(func)
+            if not tool_schema:
+                continue
+            tools_specifications.append(ToolSpec.model_validate(tool_schema))
+
+        return tools_specifications
+
+    async def compute_metrics(self, data: pd.DataFrame, syntactic_only: bool = True, metric_result_mapping_name: str = None, **kwargs):
+        """
+        Compute the Tool Call Metrics for the given data.
+
+        Args:
+            data (pd.DataFrame): Input data including the tools used for the application
+            syntactic_only (bool): If True, compute only syntactic metrics.
+            metric_result_mapping_name (str): The llmevalkit mapping name for the metric result
+            kwargs: Additional keyword arguments for the pipeline
+
+        Returns:
+            list: List of metrics calculated for each record
+        """
+        try:
+
+            data = self.pre_process(data)
+            tool_calls_field = self.configuration.tool_calls_field
+            record_id_field = self.configuration.record_id_field
+            record_level_metrics = []
+
+            # Do not compute metrics if llm_judge is not set
+            # and a non-syntactic metric is requested
+            if not getattr(self.metric, "llm_judge", None) and not syntactic_only:
+                return []
+
+            for _, row in data.iterrows():
+
+                available_tools = self.configuration.tools or row.get(
+                    self.configuration.available_tools_field, [])
+                if not all(isinstance(t, ToolSpec) for t in available_tools):
+                    available_tools = [ToolSpec.model_validate(
+                        func) for func in available_tools]
+
+                tool_calls = self.extract_tool_calls_from_response(
+                    row[tool_calls_field])
+
+                if not available_tools:
+                    record_level_metrics.append({
+                        "value": None,  # No available tools: score is None because no score can be computed
+                        "record_id": row[record_id_field],
+                        "errors": [Error(code="BAD_REQUEST", message_en="The list of available tools is empty.")]
+                    })
+                    continue
+
+                if not tool_calls:
+                    record_level_metrics.append({
+                        "value": None,  # No tool calls: score is None because no score can be computed
+                        "record_id": row[record_id_field],
+                        "errors": [Error(code="BAD_REQUEST", message_en="The list of tool calls made by LLM is empty.")]
+                    })
+                    continue
+
+                if syntactic_only:
+                    tool_call_level_explanation = self.compute_syntactic_metrics(
+                        data=row, tool_calls=tool_calls, available_tools=available_tools)
+                    record_level_metrics.append({
+                        "value": 0.0 if tool_call_level_explanation else 1.0,
+                        "record_id": row[record_id_field],
+                        "explanations": tool_call_level_explanation
+                    })
+                else:
+                    tool_call_level_explanation = await self.compute_semantic_metrics(
+                        data=row, tool_calls=tool_calls, available_tools=available_tools, metric_result_mapping_name=metric_result_mapping_name, **kwargs)
+                    record_level_metrics.append({
+                        "value": min(entry.get("value") for entry in tool_call_level_explanation),
+                        "errors": [Error(code="REQUEST_FAILED", message_en=entry.get("error")) for entry in
+                                   tool_call_level_explanation if entry.get("error")],
+                        "record_id": row[record_id_field],
+                        "explanations": tool_call_level_explanation
+                    })
+
+            metric_result = self.post_process(
+                record_level_metrics, syntactic_only=syntactic_only)
+
+            return metric_result
+        except Exception as ex:
+            raise Exception(
+                f"Error while computing metrics: '{self.metric.name}' using '{self.metric.method}'. Reason: {str(ex)}") from ex
+
+    def compute_syntactic_metrics(self, data: pd.DataFrame, tool_calls: list, available_tools: list):
+        """
+        Compute the Tool Call Metrics for the given data
+        in static mode.
+
+        Args:
+            data (pd.DataFrame): Input data including the tools used for the application
+            tool_calls (list): List of tool calls made by the LLM
+
+        Returns:
+            list: List of metrics calculated for each record
+        """
+        tool_call_level_explanation = []
+        for call in tool_calls:
+            explanations = ReflectionPipeline.static_only(
+                inventory=available_tools, call=ToolCall.model_validate(call))
+            explanations = explanations.model_dump()
+            if explanations.get("final_decision") is False:
+                tool_call_level_explanation.append({
+                    "tool_name": call.get("function").get("name"),
+                    "hallucinations": {
+                        key: val for key, val in explanations["metrics"].items() if not val["valid"]
+                    }
+                })
+        return tool_call_level_explanation
+
+    async def compute_semantic_metrics(self, data: pd.DataFrame, tool_calls: list, available_tools: list, metric_result_mapping_name: str, **kwargs):
+        """
+        Compute the Tool Call Metrics for the given data
+        in semantic mode.
+
+        Args:
+            data (pd.DataFrame): Input data including the tools used for the application
+            tool_calls (list): List of tool calls made by the LLM
+            metric_result_mapping_name (str): The llmevalkit mapping name for the metric result
+            kwargs: Additional keyword arguments for the pipeline
+
+        Returns:
+            list: List of metrics calculated for each record
+        """
+        tool_call_level_explanation = []
+        metrics_client = self.get_llm_metric_client()
+        pipeline = ReflectionPipeline(
+            metrics_client=metrics_client,
+            **kwargs
+        )
+        for call in tool_calls:
+            result = await pipeline.semantic_async(
+                conversation=data[self.configuration.input_fields[0]],
+                inventory=available_tools,
+                call=ToolCall.model_validate(call),
+                retries=2
+            )
+
+            explanations = get(
+                result.model_dump(), f"{metric_result_mapping_name}.metrics.{self.metric.metric_mapping_name}")
+
+            error = get(explanations, "error")
+            if explanations:
+                tool_call_level_explanation.append({
+                    "tool_name": get(call, "function.name"),
+                    "value": float(get(explanations, "raw_response.output", 0.0))/5 if not error else None,
+                    "error": error,
+                    "explanation": get(explanations, "raw_response.explanation"),
+                    "evidence": get(explanations, "raw_response.evidence"),
+                    "correction": get(explanations, "raw_response.correction")
+                })
+        return tool_call_level_explanation
+
+    @staticmethod
+    def extract_tool_calls_from_response(tool_calls_response) -> list:
+        """
+        Extracts the tool calls from the response.
+
+        Args:
+            tool_calls_response (Any): The tool calls response;
+                can be a list of dictionaries, an AIMessage object
+                or a dictionary
+
+        Returns:
+            list: List of OpenAI-formatted tool calls
+        """
+        if isinstance(tool_calls_response, dict):
+            tool_calls = get(tool_calls_response, "kwargs.tool_calls")
+        elif hasattr(tool_calls_response, "tool_calls"):
+            tool_calls = tool_calls_response.tool_calls
+        else:
+            tool_calls = tool_calls_response
+
+        if tool_calls is None:
+            tool_calls = []
+        converted = []
+        for call in tool_calls:
+            # Check if the tool call is already in the required format, else convert it
+            if (isinstance(call, dict) and
+                    "id" in call and
+                    call.get("type") == "function" and
+                    isinstance(call.get("function"), dict) and
+                    "name" in call["function"] and
+                    "arguments" in call["function"]):
+                converted.append(call)
+            else:
+                converted.append({
+                    "id": call["id"],
+                    "type": "function",
+                    "function": {
+                        "name": call["name"],
+                        "arguments": json.dumps(call["args"])
+                    }
+                })
+        return converted
+
+    def post_process(self, results: list, syntactic_only: bool = True):
+        """
+        Post-process the computed metrics to get the aggregated result and
+        record-level metric results in the proper format.
+
+        Args:
+            results (list): Computed record-level metric results
+            syntactic_only (bool): If True, aggregate as the fraction of invalid records
+
+        Returns:
+            AggregateMetricResult: The AggregateMetricResult object containing the calculated
+                metrics information
+        """
+
+        # Preparing the record level metrics
+        record_level_metrics: list[RecordMetricResult] = []
+
+        for row in results:
+            record_level_metrics.append(
+                RecordMetricResult(
+                    name=self.metric.name,
+                    display_name=self.metric.display_name,
+                    method=self.metric.method,
+                    value=row.get("value"),
+                    provider="ibm",
+                    errors=row.get("errors", []),
+                    group=self.metric.group,
+                    record_id=row["record_id"],
+                    thresholds=self.metric.thresholds,
+                    additional_info={"explanations": row.get("explanations")}
+                )
+            )
+
+        # Get the number of violated records and the min/max values
+        values = [item.get("value") or 0.0 for item in results]
+        min_value = min(values, default=0.0)
+        max_value = max(values, default=0.0)
+        if syntactic_only:
+            count_invalid = sum(val == 0.0 for val in values)
+            value = int(count_invalid)/int(len(results))
+        else:
+            value = sum(values)/len(values) if values else 0.0
+
+        # Creating the AggregateMetricResult
+        aggregated_result = AggregateMetricResult(
+            name=self.metric.name,
+            display_name=self.metric.display_name,
+            method=self.metric.method,
+            provider="ibm",
+            group=self.metric.group,
+            value=value,
+            total_records=len(results),
+            record_level_metrics=record_level_metrics,
+            min=min_value,
+            max=max_value,
+            thresholds=self.metric.thresholds
+        )
+
+        # Return the aggregated result
+        return aggregated_result
+
+    @staticmethod
+    @lru_cache(maxsize=128)
+    def _create_client_impl(
+        provider: str,
+        model_id: str,
+        project_id: str,
+        space_id: str,
+        credentials_json: str
+    ):
+        """
+        Cached static method for creating LLM clients.
+
+        Args:
+            provider: Provider name
+            model_id: Model identifier
+            project_id: Project ID (empty string if None)
+            space_id: Space ID (empty string if None)
+            credentials_json: JSON string of credentials
+
+        Returns:
+            Configured LLM client
+        """
+        from llmevalkit.llm import get_llm
+
+        if provider == "ibm_watsonx.ai":
+            from llmevalkit.llm.providers.ibm_watsonx_ai.ibm_watsonx_ai import \
+                WatsonxLLMClientOutputVal
+
+            provider_kwargs = json.loads(credentials_json)
+            provider_kwargs["model_id"] = model_id
+
+            if project_id:
+                provider_kwargs["project_id"] = project_id
+            if space_id:
+                provider_kwargs["space_id"] = space_id
+
+            return WatsonxLLMClientOutputVal(**provider_kwargs)
+
+        elif provider == "openai":
+            MetricsClientCls = get_llm("openai.async")
+            return MetricsClientCls(model_name=model_id)
+
+        elif provider == "wxo_ai_gateway":
+            from llmevalkit.llm.providers.wxo_ai_gateway.wxo_ai_gateway import \
+                WxoAIGatewayClientOutputVal
+
+            provider_kwargs = json.loads(credentials_json)
+            return WxoAIGatewayClientOutputVal(**provider_kwargs)
+        else:
+            raise ValueError(f"Unsupported provider: {provider}")
+
+    def get_llm_metric_client(self):
+        """
+        Get or create a cached LLM metrics client.
+
+        Returns:
+            Cached or newly created LLM client
+        """
+        llm_judge = self.metric.llm_judge
+
+        # Extract hashable parameters
+        provider = llm_judge.get_model_provider()
+        model_id = getattr(llm_judge.model, 'model_id', None) or ""
+        project_id = getattr(llm_judge.model, 'project_id', None) or ""
+        space_id = getattr(llm_judge.model, 'space_id', None) or ""
+
+        credentials = llm_judge.model.provider.credentials.model_dump(
+            exclude_none=True, exclude_unset=True
+        )
+        credentials_json = json.dumps(credentials, sort_keys=True)
+
+        # Call the cached method with hashable parameters
+        return ToolCallMetricProvider._create_client_impl(
+            provider, model_id, project_id, space_id, credentials_json
+        )
+
+    def extract_parameter_info(self, data, metric_mapping_name):
+        """
+        Extract parameter metrics into a list.
+
+        Args:
+            data (dict): Response data to be extracted
+            metric_mapping_name (str): Metric mapping name
+
+        Returns:
+            list: List of parameter-based explanations
+        """
+        result = {
+            "is_issue": False,
+            "raw_response": []
+        }
+
+        for param_name, param_data in data.get("parameter", {}).items():
+            metrics = get(param_data, f"metrics.{metric_mapping_name}")
+            raw_response = metrics['raw_response']
+            is_issue = metrics.get('is_issue', False)
+
+            if is_issue:
+                result["is_issue"] = True
+
+            param_info = {
+                "parameter": param_name,
+                "explanation": raw_response['explanation'],
+                "evidence": raw_response['evidence'],
+                "output": raw_response['output'],
+                "confidence": raw_response['confidence'],
+                "correction": raw_response['correction'],
+                "is_issue": is_issue
+            }
+
+            result["raw_response"].append(param_info)
+
+        return result
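Two usage notes on this provider. First, extract_tool_calls_from_response normalizes several input shapes into the OpenAI tool-call format; a minimal sketch (assumes the wheel is installed; the sample call ID and tool name are made up):

    from ibm_watsonx_gov.providers.tool_call_metric_provider import ToolCallMetricProvider

    # LangChain-style entries ({"id", "name", "args"}) are converted.
    langchain_style = [
        {"id": "call_1", "name": "get_weather", "args": {"city": "Pune"}}]
    # Entries already in OpenAI format pass through unchanged.
    openai_style = [{
        "id": "call_1",
        "type": "function",
        "function": {"name": "get_weather", "arguments": '{"city": "Pune"}'},
    }]

    print(ToolCallMetricProvider.extract_tool_calls_from_response(langchain_style))
    print(ToolCallMetricProvider.extract_tool_calls_from_response(openai_style))

Second, note the caching design in _create_client_impl: lru_cache requires hashable arguments, so get_llm_metric_client serializes the credentials dict to a canonical JSON string (sort_keys=True) before the cached call. A self-contained sketch of the same pattern:

    import json
    from functools import lru_cache

    @lru_cache(maxsize=128)
    def make_client(provider: str, credentials_json: str) -> dict:
        # Stand-in for real client construction; runs once per unique input.
        print(f"building client for {provider}")
        return {"provider": provider, **json.loads(credentials_json)}

    creds = {"api_key": "xxx", "url": "https://example.com"}
    key = json.dumps(creds, sort_keys=True)
    a = make_client("openai", key)
    b = make_client("openai", key)  # cache hit: nothing printed, same object
    assert a is b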
Binary file (ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so): contents not shown.
ibm_watsonx_gov/tools/__init__.py

@@ -0,0 +1,10 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADP Schedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+
+from .core.tool_loader import load_tool
ibm_watsonx_gov/tools/clients/__init__.py

@@ -0,0 +1,11 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADP Schedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+from ..entities.ai_tools import ToolRegistrationPayload, ToolUpdatePayload
+from .ai_tool_client import (delete_tool, delete_tool_with_name, get_tool,
+                             get_tool_info, list_tools, register_tool)
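Taken together, these two __init__ modules define the public tools surface of the package. A minimal sketch of the resulting imports (assumes the wheel is installed; call signatures are not shown in this diff, so arguments are omitted):

    # Loader re-exported by ibm_watsonx_gov/tools/__init__.py
    from ibm_watsonx_gov.tools import load_tool

    # Payload entities and CRUD helpers re-exported by
    # ibm_watsonx_gov/tools/clients/__init__.py
    from ibm_watsonx_gov.tools.clients import (
        ToolRegistrationPayload,
        ToolUpdatePayload,
        delete_tool,
        delete_tool_with_name,
        get_tool,
        get_tool_info,
        list_tools,
        register_tool,
    )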