ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
from typing import Any, Dict, Optional, List
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from .base import BaseComparator
|
|
5
|
+
from .exact_match import ExactMatchComparator
|
|
6
|
+
from .fuzzy_string import FuzzyStringComparator
|
|
7
|
+
from .llm_judge import LLMJudgeComparator
|
|
8
|
+
from ..types import (
|
|
9
|
+
ParameterComparisonResult,
|
|
10
|
+
ComparisonStrategy,
|
|
11
|
+
ParameterStatus,
|
|
12
|
+
ComparisonConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Import code agent conditionally to avoid dependency issues
try:
    from .code_agent import CodeAgentComparator

    # Flag consumed by HybridComparator.__init__ to decide whether the
    # code-agent stage can be constructed at all.
    CODE_AGENT_AVAILABLE = True
except ImportError:
    # Optional dependency missing: the hybrid pipeline silently skips the
    # code-agent comparison stage instead of failing at import time.
    CODE_AGENT_AVAILABLE = False

# Module-level logger (standard library convention: one logger per module).
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HybridComparator(BaseComparator):
|
|
27
|
+
"""
|
|
28
|
+
Hybrid comparator that uses multiple strategies and picks the best result.
|
|
29
|
+
|
|
30
|
+
Strategy order:
|
|
31
|
+
1. Exact match (if perfect match found)
|
|
32
|
+
2. Fuzzy string similarity (for near matches)
|
|
33
|
+
3. LLM judge (for semantic understanding) - now with enhanced capabilities
|
|
34
|
+
4. Code agent (for complex programmatic analysis) - if available
|
|
35
|
+
|
|
36
|
+
Enhanced LLM Judge capabilities:
|
|
37
|
+
- Supports custom instructions for specialized evaluation scenarios
|
|
38
|
+
- Supports custom schemas for tailored response formats
|
|
39
|
+
- Both sync and async operation modes
|
|
40
|
+
- Bulk and individual parameter comparison modes
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
def __init__(self, config: ComparisonConfig, llm_client=None):
    """Build the hybrid comparator and eagerly construct its sub-comparators.

    Args:
        config: Comparison configuration shared by every sub-comparator.
        llm_client: Optional LLM client. When absent, only the
            deterministic (exact / fuzzy) strategies are available.
    """
    super().__init__(config)
    self.llm_client = llm_client

    # Deterministic comparators require no external client and always exist.
    self.exact_comparator = ExactMatchComparator(config)
    self.fuzzy_comparator = FuzzyStringComparator(config)

    # LLM-backed comparators are optional: a value of None marks a stage
    # as unavailable, and construction failures are logged, not raised.
    self.llm_comparator = None
    self.code_agent_comparator = None
    if llm_client:
        try:
            self.llm_comparator = LLMJudgeComparator(config, llm_client)
        except Exception as e:
            logger.warning(f"Failed to initialize LLM comparator: {e}")
        # Code agent additionally needs its optional import to have succeeded.
        if CODE_AGENT_AVAILABLE:
            try:
                self.code_agent_comparator = CodeAgentComparator(config, llm_client)
            except Exception as e:
                logger.warning(f"Failed to initialize Code Agent comparator: {e}")
|
|
66
|
+
|
|
67
|
+
def compare_parameter(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[Dict[str, Any]] = None,
) -> ParameterComparisonResult:
    """Compare one predicted parameter value to its ground truth using a
    cascade of strategies, returning the best single result.

    Strategies run in order (exact -> fuzzy -> LLM judge -> code agent);
    each failure is logged and skipped rather than raised. An exact-match
    score >= 0.95 short-circuits the cascade. The LLM judge only runs when
    no prior score reached 0.8; the code agent only when none reached 0.85.

    Args:
        param_name: Name of the parameter under comparison.
        predicted_value: Value produced by the model.
        ground_truth_value: Expected reference value.
        context: Optional extra info; `parameter_status` is read from it
            in the all-strategies-failed fallback.
        custom_instructions: Forwarded to every sub-comparator.
        custom_schema: Forwarded only to the LLM judge.

    Returns:
        The winning `ParameterComparisonResult`, with its strategy rewritten
        to HYBRID and the strategy list appended to its explanation. Note:
        the returned object is the sub-comparator's result mutated in place.
    """

    results = []
    strategies_used = []

    # 1. Try exact match first
    try:
        exact_result = self.exact_comparator.compare_parameter(
            param_name,
            predicted_value,
            ground_truth_value,
            context,
            custom_instructions,
        )
        results.append(exact_result)
        strategies_used.append(ComparisonStrategy.EXACT_MATCH)

        # If exact match is perfect, return it
        # (short-circuit: no point paying for fuzzy/LLM/code-agent stages)
        if exact_result.score >= 0.95:
            exact_result.comparison_strategy = ComparisonStrategy.HYBRID
            return exact_result

    except Exception as e:
        logger.warning(f"Exact match comparison failed: {e}")

    # 2. Try fuzzy string matching (always attempted when exact was not decisive)
    try:
        fuzzy_result = self.fuzzy_comparator.compare_parameter(
            param_name,
            predicted_value,
            ground_truth_value,
            context,
            custom_instructions,
        )
        results.append(fuzzy_result)
        strategies_used.append(ComparisonStrategy.FUZZY_STRING)

    except Exception as e:
        logger.warning(f"Fuzzy string comparison failed: {e}")

    # 3. Try LLM judge if available and other methods haven't given high confidence
    # (threshold 0.8; also runs when every earlier strategy raised)
    if self.llm_comparator and (not results or max(r.score for r in results) < 0.8):
        try:
            llm_result = self.llm_comparator.compare_parameter(
                param_name,
                predicted_value,
                ground_truth_value,
                context,
                custom_instructions,
                custom_schema,
            )
            results.append(llm_result)
            strategies_used.append(ComparisonStrategy.LLM_JUDGE)

        except Exception as e:
            logger.warning(f"LLM judge comparison failed: {e}")

    # 4. Try Code Agent if available and other methods haven't given high confidence
    # (stricter threshold 0.85, so it may run even after a decent LLM score)
    if self.code_agent_comparator and (
        not results or max(r.score for r in results) < 0.85
    ):
        try:
            # Code agent typically doesn't support custom_schema, so pass other params
            code_result = self.code_agent_comparator.compare_parameter(
                param_name,
                predicted_value,
                ground_truth_value,
                context,
                custom_instructions,
            )
            results.append(code_result)
            strategies_used.append(ComparisonStrategy.CODE_AGENT)

        except Exception as e:
            logger.warning(f"Code agent comparison failed: {e}")

    # Select the best result
    if not results:
        # Fallback to basic exact match
        # (synthetic zero-score result; every strategy either raised or was skipped)
        return ParameterComparisonResult(
            parameter_name=param_name,
            predicted_value=predicted_value,
            ground_truth_value=ground_truth_value,
            predicted_resolved_value=predicted_value,
            ground_truth_resolved_value=ground_truth_value,
            parameter_status=(
                context.get("parameter_status", ParameterStatus.BOTH_PRESENT)
                if context
                else ParameterStatus.BOTH_PRESENT
            ),
            comparison_strategy=ComparisonStrategy.HYBRID,
            score=0.0,
            explanation="All comparison strategies failed",
            is_match=False,
            confidence=0.1,
            error_type="comparison_failed",
        )

    # Choose best result based on combination of score and confidence
    best_result = self._select_best_result(results, strategies_used)

    # Update the strategy to reflect hybrid approach
    best_result.comparison_strategy = ComparisonStrategy.HYBRID

    # Enhance explanation with strategy information
    strategy_names = [s.value for s in strategies_used]
    best_result.explanation += (
        f" (Hybrid strategies used: {', '.join(strategy_names)})"
    )

    return best_result
|
|
186
|
+
|
|
187
|
+
def _select_best_result(
    self,
    results: List[ParameterComparisonResult],
    strategies: List[ComparisonStrategy],
) -> ParameterComparisonResult:
    """Pick the strongest comparison outcome across strategies.

    Each candidate is ranked by a weighted blend of its score (60%) and
    confidence (20%), scaled by a per-strategy preference factor, with
    small bonuses for near-perfect scores and very high confidence.
    """

    # Nothing to rank when only one strategy produced an outcome.
    if len(results) == 1:
        return results[0]

    # Preference multipliers: exact matches are trusted most, fuzzy least.
    # Unknown strategies fall back to a neutral 0.5.
    preference = {
        ComparisonStrategy.EXACT_MATCH: 1.0,
        ComparisonStrategy.CODE_AGENT: 0.95,
        ComparisonStrategy.LLM_JUDGE: 0.9,
        ComparisonStrategy.FUZZY_STRING: 0.7,
    }

    def _rank(outcome, strategy):
        # Weighted blend of score and confidence, scaled by preference.
        value = (outcome.score * 0.6 + outcome.confidence * 0.2) * preference.get(
            strategy, 0.5
        )
        if outcome.score >= 0.95:
            value += 0.1  # bonus for (near-)exact matches
        if outcome.confidence >= 0.9:
            value += 0.05  # bonus for high-confidence results
        return value

    winner = None
    winner_rank = -1
    for outcome, strategy in zip(results, strategies):
        candidate_rank = _rank(outcome, strategy)
        if candidate_rank > winner_rank:
            winner_rank = candidate_rank
            winner = outcome

    # `winner` can only be None if `strategies` was empty; fall back safely.
    return winner or results[0]
|
|
226
|
+
|
|
227
|
+
def compare_function_name(
    self,
    predicted_name: str,
    ground_truth_name: str,
    context: Optional[Dict[str, Any]] = None,
) -> float:
    """Hybrid function name comparison.

    Returns the exact-match score when it is (near-)perfect; otherwise
    falls through to fuzzy matching, optionally refined by the LLM judge
    (when the running score is below 0.8) and the code agent (below 0.85).
    The highest score seen wins; comparator failures are logged, not raised.
    """

    # Exact comparison short-circuits everything when it is decisive.
    exact = self.exact_comparator.compare_function_name(
        predicted_name, ground_truth_name, context
    )
    if exact >= 0.95:
        return exact

    # Fuzzy matching seeds the running best score.
    best = self.fuzzy_comparator.compare_function_name(
        predicted_name, ground_truth_name, context
    )

    # LLM judge only consulted when fuzzy matching was inconclusive.
    if self.llm_comparator and best < 0.8:
        try:
            best = max(
                best,
                self.llm_comparator.compare_function_name(
                    predicted_name, ground_truth_name, context
                ),
            )
        except Exception as e:
            logger.warning(f"LLM function name comparison failed: {e}")

    # Code agent consulted when the score is still below its threshold.
    if self.code_agent_comparator and best < 0.85:
        try:
            best = max(
                best,
                self.code_agent_comparator.compare_function_name(
                    predicted_name, ground_truth_name, context
                ),
            )
        except Exception as e:
            logger.warning(f"Code agent function name comparison failed: {e}")

    return best
|
|
271
|
+
|
|
272
|
+
async def compare_parameter_async(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[Dict[str, Any]] = None,
) -> ParameterComparisonResult:
    """Async hybrid parameter comparison with all enhanced features.

    Runs up to four strategies in escalating order of cost — exact match,
    fuzzy string, LLM judge (async), code agent — collecting each outcome,
    then delegates to ``_select_best_result`` to pick the winner. An exact
    match with score >= 0.95 short-circuits immediately. Every strategy
    failure is logged and swallowed so one comparator cannot sink the
    whole comparison; if all strategies fail, a zero-score fallback
    ``ParameterComparisonResult`` is returned.

    NOTE(review): here ``custom_schema`` is typed ``Optional[Dict]`` while
    the tool-call methods take ``Optional[str]`` — confirm intended types.
    """

    # Outcomes and their originating strategies, kept index-aligned for
    # _select_best_result(results, strategies_used).
    results = []
    strategies_used = []

    # 1. Try exact match first
    try:
        exact_result = self.exact_comparator.compare_parameter(
            param_name,
            predicted_value,
            ground_truth_value,
            context,
            custom_instructions,
        )
        results.append(exact_result)
        strategies_used.append(ComparisonStrategy.EXACT_MATCH)

        # If exact match is perfect, return it
        if exact_result.score >= 0.95:
            exact_result.comparison_strategy = ComparisonStrategy.HYBRID
            return exact_result

    except Exception as e:
        logger.warning(f"Exact match comparison failed: {e}")

    # 2. Try fuzzy string matching
    try:
        fuzzy_result = self.fuzzy_comparator.compare_parameter(
            param_name,
            predicted_value,
            ground_truth_value,
            context,
            custom_instructions,
        )
        results.append(fuzzy_result)
        strategies_used.append(ComparisonStrategy.FUZZY_STRING)

    except Exception as e:
        logger.warning(f"Fuzzy string comparison failed: {e}")

    # 3. Try LLM judge async if available and other methods haven't given high confidence
    # (i.e. no result yet, or best score so far is below 0.8)
    if self.llm_comparator and (not results or max(r.score for r in results) < 0.8):
        try:
            # Only the LLM judge receives custom_schema at parameter level.
            llm_result = await self.llm_comparator.compare_parameter_async(
                param_name,
                predicted_value,
                ground_truth_value,
                context,
                custom_instructions,
                custom_schema,
            )
            results.append(llm_result)
            strategies_used.append(ComparisonStrategy.LLM_JUDGE)

        except Exception as e:
            logger.warning(f"Async LLM judge comparison failed: {e}")

    # 4. Try Code Agent async if available and other methods haven't given high confidence
    # (threshold 0.85 here, slightly stricter than the LLM gate above)
    if self.code_agent_comparator and (
        not results or max(r.score for r in results) < 0.85
    ):
        try:
            # Check if code agent has async support
            if hasattr(self.code_agent_comparator, "compare_parameter_async"):
                code_result = (
                    await self.code_agent_comparator.compare_parameter_async(
                        param_name,
                        predicted_value,
                        ground_truth_value,
                        context,
                        custom_instructions,
                    )
                )
            else:
                # Fallback to sync version
                code_result = self.code_agent_comparator.compare_parameter(
                    param_name,
                    predicted_value,
                    ground_truth_value,
                    context,
                    custom_instructions,
                )
            results.append(code_result)
            strategies_used.append(ComparisonStrategy.CODE_AGENT)

        except Exception as e:
            logger.warning(f"Async code agent comparison failed: {e}")

    # Select the best result
    if not results:
        # Fallback to basic exact match
        # Every strategy failed: synthesize a zero-score, low-confidence
        # result so callers always get a ParameterComparisonResult.
        return ParameterComparisonResult(
            parameter_name=param_name,
            predicted_value=predicted_value,
            ground_truth_value=ground_truth_value,
            predicted_resolved_value=predicted_value,
            ground_truth_resolved_value=ground_truth_value,
            parameter_status=(
                context.get("parameter_status", ParameterStatus.BOTH_PRESENT)
                if context
                else ParameterStatus.BOTH_PRESENT
            ),
            comparison_strategy=ComparisonStrategy.HYBRID,
            score=0.0,
            explanation="All async comparison strategies failed",
            is_match=False,
            confidence=0.1,
            error_type="comparison_failed",
        )

    # Choose best result based on combination of score and confidence
    best_result = self._select_best_result(results, strategies_used)

    # Update the strategy to reflect hybrid approach
    best_result.comparison_strategy = ComparisonStrategy.HYBRID

    # Enhance explanation with strategy information
    strategy_names = [s.value for s in strategies_used]
    best_result.explanation += (
        f" (Async hybrid strategies used: {', '.join(strategy_names)})"
    )

    return best_result
|
|
404
|
+
|
|
405
|
+
async def compare_tool_calls_async(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[str] = None,
) -> Any:
    """Async hybrid tool call comparison with enhanced LLM Judge features.

    Tool-call level comparison prioritizes LLM-based strategies: the
    custom-schema LLM path (when a schema is supplied), then the standard
    async LLM judge, then the code agent (async when it supports it), and
    finally the base-class comparison. Failures are logged and the next
    strategy is tried.
    """

    # Arguments shared by every comparator in the chain, in call order.
    common_args = (
        predicted_call,
        ground_truth_call,
        conversation_history,
        tool_specs,
        custom_instructions,
    )

    # Custom-schema LLM path is the most feature-rich; try it first.
    if self.llm_comparator and custom_schema:
        try:
            return await self.llm_comparator.compare_tool_calls_with_custom_schema(
                *common_args,
                custom_schema,
            )
        except Exception as e:
            logger.warning(f"Custom schema LLM comparison failed: {e}")

    # Standard async LLM judge.
    if self.llm_comparator:
        try:
            return await self.llm_comparator.compare_tool_calls_async(*common_args)
        except Exception as e:
            logger.warning(f"Async LLM tool call comparison failed: {e}")

    # Code agent: prefer its async entry point, fall back to sync.
    if self.code_agent_comparator:
        try:
            if hasattr(self.code_agent_comparator, "compare_tool_calls_async"):
                return await self.code_agent_comparator.compare_tool_calls_async(
                    *common_args
                )
            return self.code_agent_comparator.compare_tool_calls(*common_args)
        except Exception as e:
            logger.warning(f"Async code agent tool call comparison failed: {e}")

    # Last resort: base-class comparison (no custom instructions/schema).
    return await super().compare_tool_calls_async(
        predicted_call, ground_truth_call, conversation_history, tool_specs
    )
|
|
472
|
+
|
|
473
|
+
def compare_tool_calls(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    custom_instructions: Optional[str] = None,
    custom_schema: Optional[str] = None,
) -> Any:
    """Sync hybrid tool call comparison with enhanced LLM Judge features.

    Tool-call level comparison prioritizes LLM-based strategies: the LLM
    judge first (which alone receives the custom schema), then the code
    agent, then the base-class comparison. Each failure is logged and the
    next strategy is tried.
    """

    # Arguments shared by the comparators, in call order.
    common_args = (
        predicted_call,
        ground_truth_call,
        conversation_history,
        tool_specs,
        custom_instructions,
    )

    # LLM judge first; it is the only sync strategy taking custom_schema.
    if self.llm_comparator:
        try:
            return self.llm_comparator.compare_tool_calls(
                *common_args,
                custom_schema,
            )
        except Exception as e:
            logger.warning(f"LLM tool call comparison failed: {e}")

    # Code agent as the second option.
    if self.code_agent_comparator:
        try:
            return self.code_agent_comparator.compare_tool_calls(*common_args)
        except Exception as e:
            logger.warning(f"Code agent tool call comparison failed: {e}")

    # Last resort: base-class comparison (no custom instructions/schema).
    return super().compare_tool_calls(
        predicted_call, ground_truth_call, conversation_history, tool_specs
    )
|