ibm-watsonx-gov 1.3.3 (cp313-cp313-macosx_11_0_arm64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py

@@ -0,0 +1,816 @@
+import re
+import math
+import asyncio
+import json
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from llmevalkit.llm import LLMClient
+from llmevalkit.metrics import MetricRunner, MetricRunResult
+from llmevalkit.function_calling import load_prompts_from_list, PromptKind
+from llmevalkit.function_calling.pipeline.adapters import (
+    BaseAdapter,
+    OpenAIAdapter,
+)
+from llmevalkit.function_calling.pipeline.types import (
+    ToolSpec,
+    ToolCall,
+    TransformResult,
+    SemanticCategoryResult,
+    SemanticResult,
+)
+from llmevalkit.function_calling.pipeline.transformation_prompts import (
+    GENERATE_CODE_SYSTEM,
+    GENERATE_CODE_USER,
+    GENERATE_CODE_SCHEMA,
+    build_multi_extract_units_schema,
+    MULTI_EXTRACT_UNITS_SYSTEM,
+    MULTI_EXTRACT_UNITS_USER,
+)
+
+from llmevalkit.function_calling import (
+    GeneralMetricsPrompt,
+    FunctionSelectionPrompt,
+    ParameterMetricsPrompt,
+    TrajectoryReflectionPrompt,
+)
+
+
+class SemanticChecker:
+    """
+    Orchestrates semantic metrics (and optional unit-transforms)
+    for a single function call.
+
+    Args:
+        general_metrics: JSON-schema dicts for general metrics.
+        function_metrics: JSON-schema dicts for function-selection metrics.
+        parameter_metrics: JSON-schema dicts for parameter-level metrics.
+        metrics_client: an llmevalkit LLMClient for metric evaluation.
+        codegen_client: an llmevalkit LLMClient for transformation codegen.
+        transform_enabled: whether to run unit-conversion checks.
+    """
+
+    def __init__(
+        self,
+        metrics_client: LLMClient,
+        *,
+        general_metrics: Optional[List[Dict[str, Any]]] = None,
+        function_metrics: Optional[List[Dict[str, Any]]] = None,
+        parameter_metrics: Optional[List[Dict[str, Any]]] = None,
+        trajectory_metrics: Optional[List[Dict[str, Any]]] = None,
+        codegen_client: Optional[LLMClient] = None,
+        transform_enabled: Optional[bool] = False,
+    ) -> None:
+        # Validate clients
+        # if not isinstance(metrics_client, LLMClient):
+        #     raise TypeError("metrics_client must be an llmevalkit LLMClient")
+        self.metrics_client = metrics_client
+
+        self.transform_enabled = transform_enabled
+        self.codegen_client = codegen_client
+        # if not codegen_client or not isinstance(codegen_client, LLMClient):
+        #     self.codegen_client = metrics_client
+
+        self.general_prompts = []
+        if general_metrics is not None:
+            self.general_prompts = load_prompts_from_list(
+                general_metrics, PromptKind.GENERAL
+            )
+
+        self.function_prompts = []
+        if function_metrics is not None:
+            self.function_prompts = load_prompts_from_list(
+                function_metrics, PromptKind.FUNCTION_SELECTION
+            )
+
+        self.parameter_prompts = []
+        if parameter_metrics is not None:
+            self.parameter_prompts = load_prompts_from_list(
+                parameter_metrics, PromptKind.PARAMETER
+            )
+
+        self.trajectory_prompts = []
+        if trajectory_metrics is not None:
+            self.trajectory_prompts = load_prompts_from_list(
+                trajectory_metrics, PromptKind.TRAJECTORY
+            )
+
+    def _make_adapter(self, apis_specs, tool_call):
+        first = apis_specs[0]
+        if isinstance(first, ToolSpec):
+            return OpenAIAdapter(apis_specs, tool_call)
+        raise TypeError("Unsupported spec type")
+
+    def _collect_params(self, adapter: BaseAdapter) -> Dict[str, Any]:
+        """
+        Return a mapping of every parameter name in the spec inventory
+        to its value from the call (or defaulted if missing).
+        """
+        call_args = adapter.get_parameters()
+        merged: Dict[str, Any] = {}
+        # Find the function in the inventory
+        function_parameters = (
+            adapter.get_tool_spec(adapter.get_function_name())
+            .get("parameters", {})
+            .get("properties", {})
+        )
+
+        for pname, pschema in function_parameters.items():
+            if pname in call_args:
+                merged[pname] = call_args[pname]
+            elif "default" in pschema:
+                merged[pname] = pschema["default"]
+            else:
+                merged[pname] = (
+                    f"Default value from parameter description (if defined): '{pschema.get('description', 'No description provided')}'"
+                    f" Otherwise, by the default value of type: {pschema.get('type', 'object')}"
+                )
+        return merged
+
+    def extract_all_units_sync(
+        self,
+        context: Union[str, List[Dict[str, str]]],
+        adapter: BaseAdapter,
+        params: List[str],
+        retries: int = 1,
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Synchronously extract user_value/user_units_or_format/spec_units_or_format for every parameter in `params`
+        by issuing a single LLM call.
+        Returns a dict mapping each parameter name to its classification object.
+        """
+        # Build the combined JSON Schema requiring one object per parameter
+        multi_schema = build_multi_extract_units_schema(params)
+        schema_str = json.dumps(multi_schema, indent=2)
+
+        # Build the "full_spec" JSON Schema snippet for all parameters
+        full_spec_json = json.dumps(
+            adapter.get_tool_spec(adapter.get_function_name()).model_dump(),
+            indent=2,
+        )
+
+        # Format system and user prompts
+        system_prompt = MULTI_EXTRACT_UNITS_SYSTEM.format(schema=schema_str)
+        user_prompt = MULTI_EXTRACT_UNITS_USER.format(
+            context=context,
+            full_spec=full_spec_json,
+            parameter_names=", ".join(params),
+        )
+
+        # Single synchronous LLM call
+        try:
+            response: Dict[str, Any] = self.metrics_client.generate(
+                prompt=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                schema=multi_schema,
+                retries=retries,
+            )
+        except Exception:
+            response = {
+                pname: {
+                    "user_value": None,
+                    "user_units_or_format": None,
+                    "spec_units_or_format": None,
+                }
+                for pname in params
+            }
+
+        return response
+
+    def run_sync(
+        self,
+        apis_specs: List[ToolSpec],
+        tool_call: ToolCall,
+        context: Union[str, List[Dict[str, str]]],
+        retries: int = 1,
+        transform_enabled: Optional[bool] = None,
+    ) -> SemanticResult:
+        """
+        Synchronous semantic-only evaluation.
+
+        Returns a SemanticResult:
+        {
+            "general": {metric_name: result, …} or None
+            "function_selection": {…} or None
+            "parameter": {param_name: {metric_name: result}, …} or None
+            "transform": {param_name: TransformResult, …} or None
+        }
+        """
+        # 1) Normalize via adapter
+        adapter = self._make_adapter(apis_specs, tool_call)
+        tools_inventory_summary = adapter.get_tools_inventory_summary()
+        call_dict = adapter.get_call_dict()
+        fn_name = adapter.get_function_name()
+        cur_tool_spec = adapter.get_tool_spec(fn_name)
+        params = self._collect_params(adapter)
+
+        if transform_enabled is not None:
+            old_transform_enabled = self.transform_enabled
+            self.transform_enabled = transform_enabled
+
+        # 2) GENERAL METRICS
+        general_results: Optional[SemanticCategoryResult]
+        entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = []
+        for prompt in self.general_prompts:
+            entries.append(
+                (
+                    prompt,
+                    {
+                        "conversation_context": context,
+                        "tool_inventory": cur_tool_spec,
+                        "tool_call": call_dict,
+                    },
+                )
+            )
+        if entries:
+            try:
+                runner = MetricRunner(entries)
+                sync_results = runner.run_all(
+                    self.metrics_client.generate,
+                    prompt_param_name="prompt",
+                    schema_param_name="schema",
+                    retries=retries,
+                )
+                general_results = SemanticCategoryResult.from_results(sync_results)
+            except Exception as e:
+                general_results = {"error": str(e)}
+        else:
+            general_results = None
+
+        # 3) FUNCTION-SELECTION METRICS
+        function_results: Optional[SemanticCategoryResult]
+        func_entries: List[Tuple[FunctionSelectionPrompt, Dict[str, Any]]] = []
+        for prompt in self.function_prompts:
+            func_entries.append(
+                (
+                    prompt,
+                    {
+                        "conversation_context": context,
+                        "tools_inventory": tools_inventory_summary,
+                        "proposed_tool_call": call_dict,
+                        "selected_function": fn_name,
+                    },
+                )
+            )
+        if func_entries:
+            try:
+                runner = MetricRunner(func_entries)
+                sync_results = runner.run_all(
+                    self.metrics_client.generate,
+                    prompt_param_name="prompt",
+                    schema_param_name="schema",
+                    retries=retries,
+                )
+                function_results = SemanticCategoryResult.from_results(sync_results)
+            except Exception as e:
+                function_results = {"error": str(e)}
+        else:
+            function_results = None
+
+        # 4) PARAMETER-LEVEL METRICS
+        parameter_results: Optional[Dict[str, SemanticCategoryResult]] = {}
+        for pname, pval in params.items():
+            # Each parameter has its own prompts
+            try:
+                param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+                for prompt in self.parameter_prompts:
+                    param_entries.append(
+                        (
+                            prompt,
+                            {
+                                "conversation_context": context,
+                                "tool_inventory": cur_tool_spec,
+                                "tool_call": call_dict,
+                                "parameter_name": pname,
+                                "parameter_value": pval,
+                            },
+                        )
+                    )
+                runner = MetricRunner(param_entries)
+                sync_results = runner.run_all(
+                    self.metrics_client.generate,
+                    prompt_param_name="prompt",
+                    schema_param_name="schema",
+                    retries=retries,
+                )
+                parameter_results[pname] = SemanticCategoryResult.from_results(
+                    sync_results
+                )
+            except Exception as e:
+                parameter_results[pname] = {"error": str(e)}
+
+        if not parameter_results:
+            parameter_results = None
+
+        # Base SemanticResult without transforms
+        result = SemanticResult(
+            general=general_results,
+            function_selection=function_results,
+            parameter=parameter_results,
+        )
+
+        # 5) OPTIONAL TRANSFORMS
+        params = adapter.get_parameters()
+        if self.transform_enabled and params:
+            if transform_enabled is not None:
+                self.transform_enabled = old_transform_enabled
+
+            transform_out: Dict[str, TransformResult] = {}
+
+            # 5a) Extract units for all parameters in one synchronous call
+            units_map = self.extract_all_units_sync(
+                context=context,
+                adapter=adapter,
+                params=list(params.keys()),
+                retries=retries,
+            )
+
+            # 5b) Generate code & execute for each parameter needing conversion
+            for pname, units in units_map.items():
+                user_units = units.get("user_units_or_format") or ""
+                spec_units = units.get("spec_units_or_format") or ""
+                user_value = units.get("user_value")
+                transformation_summary = units.get("transformation_summary", "")
+                gen_code = ""
+
+                # Only generate code if user_units differs from spec_units and user_value is present
+                if (
+                    user_units
+                    and user_value is not None
+                    and spec_units
+                    and (user_units != spec_units)
+                ):
+                    try:
+                        prompt = GENERATE_CODE_USER.format(
+                            old_value=user_value,
+                            old_units=user_units,
+                            transformed_value=str(params[pname]),
+                            transformed_units=spec_units,
+                            transformed_type=type(params[pname]).__name__,
+                            transformation_summary=transformation_summary,
+                        )
+                        gen_code = self.codegen_client.generate(
+                            prompt=[
+                                {"role": "system", "content": GENERATE_CODE_SYSTEM},
+                                {"role": "user", "content": prompt},
+                            ],
+                            schema=GENERATE_CODE_SCHEMA,
+                            retries=retries,
+                        ).get("generated_code", "")
+                    except Exception:
+                        gen_code = ""
+
+                # 5c) Execute & validate
+                tr = self._execute_code_and_validate(
+                    code=gen_code,
+                    user_val=str(user_value or ""),
+                    api_val=str(params[pname]),
+                    units=units,
+                )
+                transform_out[pname] = tr
+
+            if transform_out:
+                result.transform = transform_out
+            else:
+                result.transform = None
+
+        return result
+
+    async def extract_all_units(
+        self,
+        context: Union[str, List[Dict[str, str]]],
+        adapter: BaseAdapter,
+        params: List[str],
+        retries: int = 1,
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Call the LLM once to extract user_value/user_units_or_format/spec_units_or_format
+        for every parameter in `params`. Returns a dict:
+        { parameter_name: {"user_value": ..., "user_units_or_format": ..., "spec_units_or_format": ...}, ... }
+        """
+        # 1) Build the JSON Schema that requires one object per parameter
+        multi_schema = build_multi_extract_units_schema(params)
+        schema_str = json.dumps(multi_schema, indent=2)
+
+        # 2) Build the "full_spec" JSON Schema snippet for all parameters
+        full_spec_json = json.dumps(
+            adapter.get_tool_spec(adapter.get_function_name()),
+            indent=2,
+        )
+
+        # 3) Fill in system and user prompts
+        system_prompt = MULTI_EXTRACT_UNITS_SYSTEM.format(schema=schema_str)
+
+        user_prompt = MULTI_EXTRACT_UNITS_USER.format(
+            context=context,
+            full_spec=full_spec_json,
+            parameter_names=", ".join(params),
+        )
+
+        # 4) Fire a single async LLM call
+        try:
+            response: Dict[str, Any] = await self.metrics_client.generate_async(
+                prompt=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                schema=multi_schema,
+                retries=retries,
+            )
+        except Exception:
+            # If the LLM fails, default to no-information for each parameter
+            response = {
+                pname: {
+                    "user_value": None,
+                    "user_units_or_format": None,
+                    "spec_units_or_format": None,
+                }
+                for pname in params
+            }
+
+        return response
+
+    async def run_async(
+        self,
+        apis_specs: List[ToolSpec],
+        tool_call: ToolCall,
+        context: Union[str, List[Dict[str, str]]],
+        retries: int = 1,
+        max_parallel: int = 10,
+        transform_enabled: Optional[bool] = None,
+    ) -> SemanticResult:
+        """
+        Asynchronous semantic-only evaluation with concurrency.
+        Returns a SemanticResult with:
+        - general: results of general metrics
+        - function_selection: results of function-selection metrics
+        - parameter: results of parameter-level metrics
+        - transform: (optional) unit-conversion transforms if enabled
+        """
+        adapter = self._make_adapter(apis_specs, tool_call)
+        tools_inventory_summary = adapter.get_tools_inventory_summary()
+        call_dict = adapter.get_call_dict()
+        fn_name = adapter.get_function_name()
+        cur_tool_spec = adapter.get_tool_spec(fn_name)
+        params = self._collect_params(adapter)
+
+        # Handle optional override of transform_enabled
+        if transform_enabled is not None:
+            old_transform_enabled = self.transform_enabled
+            self.transform_enabled = transform_enabled
+
+        # 2) GENERAL METRICS
+        general_results: SemanticCategoryResult = {}
+        general_entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = []
+        general_async_results: List[MetricRunResult] = []
+
+        for prompt in self.general_prompts:
+            general_entries.append(
+                (
+                    prompt,
+                    {
+                        "conversation_context": context,
+                        "tool_inventory": cur_tool_spec,
+                        "tool_call": call_dict,
+                    },
+                )
+            )
+
+        # 3) FUNCTION-SELECTION METRICS
+        function_results: SemanticCategoryResult = {}
+        func_entries: List[Tuple[FunctionSelectionPrompt, Dict[str, Any]]] = []
+        function_async_results: List[MetricRunResult] = []
+
+        for prompt in self.function_prompts:
+            func_entries.append(
+                (
+                    prompt,
+                    {
+                        "conversation_context": context,
+                        "tools_inventory": tools_inventory_summary,
+                        "proposed_tool_call": call_dict,
+                        "selected_function": fn_name,
+                    },
+                )
+            )
+
+        # 4) PARAMETER-LEVEL METRICS
+        parameter_results: Dict[str, SemanticCategoryResult] = {}
+        parameter_async_results: Dict[str, List[MetricRunResult]] = {}
+        param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+
+        for pname, pval in params.items():
+            for prompt in self.parameter_prompts:
+                param_entries.append(
+                    (
+                        prompt,
+                        {
+                            "conversation_context": context,
+                            "tool_inventory": cur_tool_spec,
+                            "tool_call": call_dict,
+                            "parameter_name": pname,
+                            "parameter_value": pval,
+                        },
+                    )
+                )
+
+        # Run all metric prompts in parallel (up to max_parallel)
+        try:
+            all_entries = general_entries + func_entries + param_entries
+            runner = MetricRunner(all_entries)
+            async_results = await runner.run_async(
+                self.metrics_client.generate_async,
+                prompt_param_name="prompt",
+                schema_param_name="schema",
+                retries=retries,
+                max_parallel=max_parallel,
+            )
+
+            # Split the results back into categories
+            for entry, result in zip(all_entries, async_results):
+                prompt_obj, ctx_dict = entry
+                if isinstance(prompt_obj, GeneralMetricsPrompt) and isinstance(
+                    result, MetricRunResult
+                ):
+                    general_async_results.append(result)
+                elif isinstance(prompt_obj, FunctionSelectionPrompt) and isinstance(
+                    result, MetricRunResult
+                ):
+                    function_async_results.append(result)
+                elif isinstance(prompt_obj, ParameterMetricsPrompt) and isinstance(
+                    result, MetricRunResult
+                ):
+                    pname = ctx_dict["parameter_name"]
+                    parameter_async_results.setdefault(pname, []).append(result)
+
+            # Aggregate general results
+            if general_async_results:
+                general_results = SemanticCategoryResult.from_results(
+                    general_async_results
+                )
+            else:
+                general_results = None
+
+            # Aggregate function-selection results
+            if function_async_results:
+                function_results = SemanticCategoryResult.from_results(
+                    function_async_results
+                )
+            else:
+                function_results = None
+
+            # Aggregate parameter-level results
+            if parameter_async_results:
+                for pname, results in parameter_async_results.items():
+                    if results:
+                        parameter_results[pname] = SemanticCategoryResult.from_results(
+                            results
+                        )
+                    else:
+                        parameter_results[pname] = None
+            else:
+                parameter_results = None
+
+        except Exception as e:
+            # In case any metric-run fails, record the error
+            general_results = {"error": str(e)}
+            function_results = {"error": str(e)}
+            parameter_results = {"error": str(e)}
+
+        # Construct the base SemanticResult
+        result = SemanticResult(
+            general=general_results,
+            function_selection=function_results,
+            parameter=parameter_results,
+        )
+
+        # -------------------------------------------------------------------
+        # 5) Optional TRANSFORMS: Unit extraction & code generation
+        # -------------------------------------------------------------------
+        params = adapter.get_parameters()
+        if self.transform_enabled and params:
+            # Restore transform_enabled if overridden
+            if transform_enabled is not None:
+                self.transform_enabled = old_transform_enabled
+
+            # 5.1) Extract units for ALL parameters in one LLM call
+            units_map = await self.extract_all_units(
+                context=context,
+                adapter=adapter,
+                params=list(params.keys()),
+                retries=retries,
+            )
+
+            # 5.2) Generate conversion code for parameters that need it
+            code_tasks: Dict[str, asyncio.Task] = {}
+            for pname, units in units_map.items():
+                user_units = units.get("user_units_or_format") or ""
+                spec_units = units.get("spec_units_or_format") or ""
+                user_value = units.get("user_value")
+                transformation_summary = units.get("transformation_summary", "")
+                if (
+                    user_units
+                    and user_value
+                    and spec_units
+                    and (user_units != spec_units)
+                ):
+                    # Generate code only if units differ and value is present
+                    prompt = GENERATE_CODE_USER.format(
+                        old_value=user_value,
+                        old_units=user_units,
+                        transformed_value=str(params[pname]),
+                        transformed_units=spec_units,
+                        transformed_type=type(params[pname]).__name__,
+                        transformation_summary=transformation_summary,
+                    )
+                    code_tasks[pname] = asyncio.create_task(
+                        self.codegen_client.generate_async(
+                            prompt=[
+                                {"role": "system", "content": GENERATE_CODE_SYSTEM},
+                                {"role": "user", "content": prompt},
+                            ],
+                            schema=GENERATE_CODE_SCHEMA,
+                            retries=retries,
+                        )
+                    )
+
+            # 5.3) Await up to max_parallel code-generation tasks
+            semaphore = asyncio.Semaphore(max_parallel)
+
+            async def run_with_semaphore(task: asyncio.Task):
+                async with semaphore:
+                    return await task
+
+            wrapped_code_tasks = [
+                asyncio.create_task(run_with_semaphore(t)) for t in code_tasks.values()
+            ]
+            try:
+                code_responses = await asyncio.gather(*wrapped_code_tasks)
+            except Exception:
+                # If code generation fails, set all to None
+                code_responses = [None] * len(wrapped_code_tasks)
+
+            # 5.4) Map code responses back to parameter names
+            code_map: Dict[str, Dict[str, Any]] = {}
+            for pname, response in zip(code_tasks.keys(), code_responses):
+                if response is not None:
+                    code_map[pname] = response
+
+            # 5.5) Execute generated code and validate conversions
+            transform_map: Dict[str, TransformResult] = {}
+            for pname, code_resp in code_map.items():
+                gen_code = code_resp.get("generated_code", "")
+                units_info = units_map[pname]
+                if not gen_code:
+                    transform_map[pname] = TransformResult(
+                        units=units_info,
+                        generated_code="",
+                        execution_success=False,
+                        correct=True,
+                        execution_output=None,
+                        correction=None,
+                        error="No code generated",
+                    )
+                    continue
+
+                tr = self._execute_code_and_validate(
+                    code=gen_code,
+                    user_val=str(units_info.get("user_value") or ""),
+                    api_val=str(params[pname]),
+                    units=units_info,
+                )
+                transform_map[pname] = tr
+
+            if transform_map:
+                result.transform = transform_map
+            else:
+                result.transform = None
+
+        return result
+
+    def _execute_code_and_validate(
+        self,
+        code: str,
+        user_val: str,
+        api_val: str,
+        units: Dict[str, Any],
+    ) -> TransformResult:
+        """
+        Strip code fences, install imports, exec code, compare, return TransformResult.
+        """
+        clean = re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()
+
+        # install imports
+        for mod in set(
+            re.findall(r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE)
+        ):
+            try:
+                __import__(mod)
+            except ImportError as e:
+                return TransformResult(
+                    units=units,
+                    generated_code=clean,
+                    execution_success=False,
+                    correct=True,
+                    execution_output=None,
+                    correction=None,
+                    error=f"Error: {e}. Could not import module '{mod}'. Please install the package and try again,"
+                    " or run the generated code manually:\n"
+                    f"transformation_code({user_val}) == convert_example_str_transformed_to_transformed_type({api_val})",
+                )
+
+        ns: Dict[str, Any] = {}
+        try:
+            exec(clean, ns)
+            fn_t = ns.get("transformation_code")
+            fn_c = ns.get("convert_example_str_transformed_to_transformed_type")
+            if not callable(fn_t) or not callable(fn_c):
+                raise ValueError("Generated code missing required functions")
+
+            out_t = fn_t(user_val)
+            out_c = fn_c(api_val)
+            if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
+                success = math.isclose(out_t, out_c, abs_tol=1e-3)
+            else:
+                success = str(out_t) == str(out_c)
+
+            correction = None
+            if not success:
+                correction = (
+                    f"The transformation code validation found an issue with the units transformation "
+                    f"of the parameter.\n"
+                    f"The user request value is '{user_val}' with units '{units.get('user_units_or_format')}' and "
+                    f"the API call value is '{api_val}' with units '{units.get('spec_units_or_format')}'.\n"
+                    f"Expected transformation is '{out_t}' based on the code.\n"
+                )
+
+            correct = correction is None
+
+            return TransformResult(
+                units=units,
+                generated_code=clean,
+                execution_success=True,
+                correct=correct,
+                execution_output={"transformed": out_t, "converted": out_c},
+                correction=correction,
+                error=None,
+            )
+        except Exception as e:
+            return TransformResult(
+                units=units,
+                generated_code=clean,
+                execution_success=False,
+                correct=True,
+                execution_output=None,
+                correction=None,
+                error=str(e),
+            )
+
+    async def run_trajectory_async(
+        self,
+        trajectory: Union[str, List[Dict[str, str]]],
+        tool_inventory: List[Dict[str, Any]],
+        retries: int = 1,
+        max_parallel: int = 10,
+    ) -> Optional[SemanticCategoryResult]:
+        """
+        Asynchronous trajectory evaluation.
+        """
+        trajectory_results: Optional[SemanticCategoryResult]
+        entries: List[Tuple[TrajectoryReflectionPrompt, Dict[str, Any]]] = []
+        for prompt in self.trajectory_prompts:
+            entries.append(
+                (
+                    prompt,
+                    {
+                        "trajectory": trajectory,
+                        "tool_inventory": tool_inventory,
+                    },
+                )
+            )
+
+        if not entries:
+            return None
+
+        try:
+            runner = MetricRunner(entries)
+            async_results = await runner.run_async(
+                self.metrics_client.generate_async,
+                prompt_param_name="prompt",
+                schema_param_name="schema",
+                retries=retries,
+                max_parallel=max_parallel,
+            )
+            trajectory_results = SemanticCategoryResult.from_results(async_results)
+        except Exception as e:
+            trajectory_results = {"error": str(e)}
+
+        return trajectory_results
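For orientation, a minimal standalone sketch of the validation mechanics that _execute_code_and_validate applies in the diff above: strip markdown fences from LLM-generated conversion code, exec it in a scratch namespace, and compare the two conversion functions numerically. The sample generated_code below is invented for illustration; the fence-stripping regex, the two required function names, and the 1e-3 tolerance are taken directly from the method.

import math
import re

# Stand-in for LLM-generated conversion code (invented for illustration).
# The two function names are the ones _execute_code_and_validate looks up
# in the executed namespace.
generated_code = '''```python
def transformation_code(user_val):
    # user asked for "5 miles"; the spec expects kilometers
    return float(user_val) * 1.60934

def convert_example_str_transformed_to_transformed_type(api_val):
    return float(api_val)
```'''

# Same fence-stripping regex as in the method above.
clean = re.sub(r"^```(?:python)?|```$", "", generated_code, flags=re.MULTILINE).strip()

ns = {}
exec(clean, ns)
out_t = ns["transformation_code"]("5")          # value as the user stated it
out_c = ns["convert_example_str_transformed_to_transformed_type"]("8.04672")

# Numeric outputs pass if they agree within an absolute tolerance of 1e-3;
# non-numeric outputs are compared as strings in the original.
print("validated:", math.isclose(out_t, out_c, abs_tol=1e-3))

A mismatch at this comparison is what populates TransformResult.correction in the code above, while exceptions during exec or lookup surface through TransformResult.error instead.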