ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
--- /dev/null
+++ b/ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py
@@ -0,0 +1,262 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# OCO Source Materials
+# 5900-A3Q, 5737-H76
+# Copyright IBM Corp. 2025
+# The source code for this program is not published or otherwise divested of its trade
+# secrets, irrespective of what has been deposited with the U.S. Copyright Office.
+# ----------------------------------------------------------------------------------------------------
+
+from ibm_watsonx_ai.foundation_models.prompts.prompt_template import (
+    DetachedPromptTemplate, PromptTemplate)
+
+from ibm_watsonx_gov.entities.container import (BaseMonitor, ProjectContainer,
+                                                SpaceContainer)
+from ibm_watsonx_gov.entities.credentials import Credentials
+from ibm_watsonx_gov.entities.enums import EvaluationStage
+from ibm_watsonx_gov.entities.prompt_setup import PromptSetup
+from ibm_watsonx_gov.prompt_evaluator.impl.prompt_evaluator_impl import \
+    PromptEvaluatorImpl
+
+
+class PromptEvaluator:
+    """
+    PromptEvaluator is a class that sets up a prompt template and evaluates the risks associated with it.
+
+    Example
+    -------
+    .. code-block:: python
+
+        # Create the prompt evaluator
+        evaluator = PromptEvaluator(
+            credentials=Credentials(api_key="")
+        )
+
+        # Create the prompt setup
+        prompt_setup = PromptSetup(
+            task_type=TaskType.RAG,
+            question_field="question",
+            context_fields=["context1"],
+            label_column="answer",
+        )
+
+        # Create the prompt template
+        prompt_template = PromptTemplate(
+            name="test",
+            description="description",
+            input_variables=["question", "context1"],
+            input_text="Answer the below question from the given context only and do not use the knowledge outside the context. Context: {context1} Question: {question} Answer:",
+            model_id="ibm/granite-3-3-8b-instruct",
+            task_ids=[TaskType.RAG.value]
+        )
+
+        # Provide the development container details
+        development_container = ProjectContainer(
+            container_id="3acf420f-526a-4007-abe7-78a03435aac2",
+            monitors=[
+                GenerativeAIQualityMonitor(),
+            ]
+        )
+
+        # Evaluate the risk based on the provided dataset
+        evaluator.evaluate_risk(
+            prompt_setup=prompt_setup,
+            prompt_template=prompt_template,
+            containers=[development_container],
+            environments=[EvaluationStage.DEVELOPMENT],
+            input_file_path="./rag_dataset.csv",
+        )
+
+        # Show the evaluation result
+        evaluator.get_monitor_metrics(
+            monitor=BaseMonitor(monitor_name="generative_ai_quality"),
+            environment=EvaluationStage.DEVELOPMENT,
+            show_table=True,
+        )
+
+        evaluator.get_dataset_records(
+            dataset_type="gen_ai_quality_metrics",
+            environment=EvaluationStage.DEVELOPMENT,
+            show_table=True,
+        )
+
+    """
+
+    def __init__(self, credentials: Credentials | None = None):
+        """
+        Initializes the prompt evaluator with the provided credentials.
+
+        Args:
+            credentials (Credentials): The credentials required for authentication and authorization.
+        """
+        self.__evaluator = PromptEvaluatorImpl(credentials)
+
+    def e2e_prompt_evaluation(
+        self,
+        config: dict[str, any],
+        input_file_path: str = None,
+    ):
+        """
+        Method to set up and evaluate the prompt template end to end with a simplified interface.
+
+        Examples:
+
+        .. code-block:: python
+
+            # Create the prompt evaluator
+            evaluator = PromptEvaluator(
+                credentials=Credentials(api_key="")
+            )
+
+            # detached prompt configuration example
+            detached_prompt_config = {
+                "prompt_setup": {
+                    "problem_type": TaskType.RAG.value,
+                    "context_fields": ["context1"],
+                },
+                "development_project_id": "3acf420f-526a-4007-abe7-78a03435aac2",
+                "detached_prompt_template": {
+                    "name": "detached prompt experiment",
+                    "model_id": "ibm/granite-3-2-8b-instruct",
+                    "input_text": "Answer the below question from the given context only and do not use the knowledge outside the context. Context: {context1} Question: {question} Answer:",
+                    "input_variables": ["question", "context1"],
+                    "detached_model_url": "https://us-south.ml.cloud.ibm.com/ml/v1/deployments/insurance_test_deployment/text/generation?version=2021-05-01",
+                    "task_ids": [TaskType.RAG.value],
+                }
+            }
+
+            # prompt configuration example
+            prompt_config = {
+                "prompt_setup": {
+                    "problem_type": TaskType.RAG.value,
+                    "context_fields": ["context1"],
+                },
+                "development_project_id": "3acf420f-526a-4007-abe7-78a03435aac2",
+                "prompt_template": {
+                    "name": "prompt experiment",
+                    "model_id": "ibm/granite-3-2-8b-instruct",
+                    "input_text": "Answer the below question from the given context only and do not use the knowledge outside the context. Context: {context1} Question: {question} Answer:",
+                    "input_variables": ["question", "context1"],
+                    "task_ids": [TaskType.RAG.value],
+                },
+                # optional usecase configuration
+                "ai_usecase_id": "b1504848-3cf9-4ab9-9d46-d688e34a0295",
+                "catalog_id": "7bca9a52-7c90-4fb4-b3ef-3194e25a8452",  # same as inventory_id
+                "approach_id": "80b3a883-015f-498a-86f3-55ba74b5374b",
+                "approach_version": "0.0.2",
+            }
+
+            # Evaluate the risk based on the provided dataset
+            evaluator.e2e_prompt_evaluation(
+                config=prompt_config,
+                input_file_path="./rag_dataset.csv",
+            )
+
+            # Show the evaluation result
+            evaluator.get_monitor_metrics(
+                monitor=BaseMonitor(monitor_name="generative_ai_quality"),
+                environment=EvaluationStage.DEVELOPMENT,
+                show_table=True,
+            )
+
+            evaluator.get_dataset_records(
+                dataset_type="gen_ai_quality_metrics",
+                environment=EvaluationStage.DEVELOPMENT,
+                show_table=True,
+            )
+
+        Args:
+            config (dict[str, any]): configurations dictionary
+            input_file_path (str, optional): Path to the input to evaluate. This can be a local file or a link to a file. The prompt template evaluation will be skipped if this argument is not set.
+        """
+        self.__evaluator.e2e_prompt_evaluation(config, input_file_path)
+
+    def evaluate_risk(
+        self,
+        prompt_setup: PromptSetup,
+        containers: list[ProjectContainer | SpaceContainer],
+        input_file_path: str,
+        prompt_template: PromptTemplate | DetachedPromptTemplate = None,
+        prompt_template_id: str = None,
+        environments: list[EvaluationStage] = [EvaluationStage.DEVELOPMENT],
+    ):
+        """
+        Evaluate the risk of a given input file path for a list of containers. Note: either prompt_template or prompt_template_id should be provided.
+
+        Args:
+            prompt_setup (PromptSetup): The prompt setup details.
+            prompt_template (PromptTemplate | DetachedPromptTemplate, optional): The prompt template to use for evaluation.
+            prompt_template_id (str, optional): The prompt template id to use for evaluation.
+            containers (list[ProjectContainer | SpaceContainer]): The containers details.
+            input_file_path (str): The path to the input file to evaluate.
+            environments (list[EvaluationStage], optional): The list of evaluation stages to do the evaluation in. Defaults to [EvaluationStage.DEVELOPMENT].
+        """
+        self.__evaluator.evaluate_risk(
+            prompt_setup=prompt_setup,
+            prompt_template=prompt_template,
+            prompt_template_id=prompt_template_id,
+            containers=containers,
+            evaluation_stages=environments,
+            input_file_path=input_file_path,
+        )
+
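Since `evaluate_risk` accepts either an in-memory template or the ID of an existing prompt template asset, the ID-based call path (not shown in the class docstring) would look roughly like the sketch below. The asset ID is a hypothetical placeholder, and `prompt_setup` and `development_container` are assumed to be defined as in the class docstring.

    # Minimal sketch of the ID-based call path. The asset ID is a hypothetical
    # placeholder; prompt_setup and development_container are assumed to be
    # defined as in the class docstring above.
    evaluator.evaluate_risk(
        prompt_setup=prompt_setup,
        prompt_template_id="<prompt-template-asset-id>",  # hypothetical ID
        containers=[development_container],
        environments=[EvaluationStage.DEVELOPMENT],
        input_file_path="./rag_dataset.csv",
    )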
+    def get_monitor_metrics(
+        self,
+        monitor: BaseMonitor,
+        environment: EvaluationStage = EvaluationStage.DEVELOPMENT,
+        show_table: bool = False,
+    ):
+        """
+        Get monitor metrics for a given monitor in a specific environment.
+
+        Args:
+            monitor (BaseMonitor): The monitor to get the metrics for.
+            environment (EvaluationStage, optional): The monitor environment. Defaults to EvaluationStage.DEVELOPMENT.
+            show_table (bool, optional): Flag to print the result table. Defaults to False.
+
+        Returns:
+            dict[str, any]: Monitor metrics dictionary
+        """
+        return self.__evaluator.get_monitor_metrics(
+            evaluation_stage=environment,
+            monitor=monitor,
+            show_table=show_table,
+        )
+
+    def get_dataset_records(
+        self,
+        dataset_type: str,
+        environment: EvaluationStage = EvaluationStage.DEVELOPMENT,
+        show_table: bool = False,
+    ) -> dict[str, any]:
+        """
+        Retrieve dataset records for a given dataset type and environment.
+
+        Args:
+            dataset_type (str): The type of dataset to retrieve records for.
+            environment (EvaluationStage, optional): The environment to retrieve records from. Defaults to EvaluationStage.DEVELOPMENT.
+            show_table (bool, optional): Whether to display the dataset records as a table. Defaults to False.
+
+        Returns:
+            dict[str, any]: A dictionary containing the dataset records.
+        """
+        return self.__evaluator.get_dataset_records(
+            evaluation_stage=environment,
+            dataset_type=dataset_type,
+            show_table=show_table,
+        )
+
+    def get_prompt_template_id(
+        self,
+        environment: EvaluationStage = EvaluationStage.DEVELOPMENT,
+    ) -> str:
+        """
+        Retrieves the prompt template ID based on the specified environment.
+
+        Args:
+            environment (EvaluationStage, optional): The environment for which to retrieve the prompt template ID.
+                Defaults to EvaluationStage.DEVELOPMENT.
+
+        Returns:
+            str: The prompt template ID corresponding to the specified environment.
+        """
+        return self.__evaluator.get_prompt_template_id(
+            environment=environment
+        )
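A usage note on this last method: once `evaluate_risk` has created the prompt template asset for a stage, `get_prompt_template_id` reads its ID back, which is handy for wiring the asset into later stages or external tooling. A minimal sketch, assuming the `evaluator` from the docstring examples above:

    # Fetch the ID of the prompt template asset created during the
    # development-stage evaluation (assumes evaluate_risk has already run).
    pta_id = evaluator.get_prompt_template_id(
        environment=EvaluationStage.DEVELOPMENT
    )
    print(f"Development prompt template asset: {pta_id}")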
Binary file
--- /dev/null
+++ b/ibm_watsonx_gov/providers/detectors_provider.py
@@ -0,0 +1,415 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025 All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADP Schedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+import asyncio
+import json
+from typing import Optional
+
+import aiohttp
+import pandas as pd
+from ibm_watson_openscale import APIClient as WOSClient
+
+from ibm_watsonx_gov.clients.usage_client import validate_usage_client
+from ibm_watsonx_gov.config import GenAIConfiguration
+from ibm_watsonx_gov.entities.base_classes import Error
+from ibm_watsonx_gov.entities.enums import (EvaluationProvider,
+                                            GraniteGuardianRisks, MetricGroup)
+from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
+                                                        RecordMetricResult)
+from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+from ibm_watsonx_gov.providers.tool_call_metric_provider import \
+    ToolCallMetricProvider
+from ibm_watsonx_gov.utils.async_util import run_in_event_loop
+from ibm_watsonx_gov.utils.python_utils import (get_authenticator_token,
+                                                transform_str_to_list)
+
+ACCURACY_METRICS = ["topic_relevance", "tool_call_accuracy"]
+RAG_EVIDENCE_METRICS = ["faithfulness",
+                        "context_relevance"]
+RAG_SMALL_MODELS = ["faithfulness_model", "context_relevance_model"]
+EVIDENCES = ["hap", "pii", "keyword", "regex"]
+
+
+class DetectorsProvider():
+    # Status codes for BAD_GATEWAY, SERVICE_UNAVAILABLE and GATEWAY_TIMEOUT
+    RETRY_AFTER_STATUS_CODES = [502, 503, 504]
+    RETRY_COUNT = 3
+    BACK_OFF_FACTOR = 1
+
+    def __init__(
+        self,
+        configuration: GenAIConfiguration,
+        metric_name: str,
+        metric_display_name: str,
+        metric_method: Optional[str] = None,
+        metric_group: MetricGroup = None,
+        thresholds: list[MetricThreshold] = [],
+        **kwargs,
+    ) -> None:
+        if not kwargs.get("api_client"):
+            raise ValueError(
+                f"The IBM watsonx.governance SDK must be initialized to compute {metric_name} using {metric_method}. Please initialize the client to proceed, or remove this metric from the evaluation.")
+        if not isinstance(kwargs.get("api_client").wos_client, WOSClient):
+            raise ValueError(
+                f"A watsonx.governance service instance id is required to compute {metric_name} using {metric_method}. You can set the service instance ID using the `WXG_SERVICE_INSTANCE_ID` environment variable in the notebook and retry.")
+        base = self.__get_base_url(metric_name)
+        self.base_url = base.format(
+            self.get_detector_url(kwargs.get("api_client")))
+        self.configuration: GenAIConfiguration = configuration
+        self.configuration_: dict[str, any] = {}
+        self.metric_name = metric_name
+        self.metric_display_name = metric_display_name
+        self.metric_method = metric_method
+        self.metric_group = metric_group
+        self.service_instance_id = self.get_service_instance_id(
+            kwargs.get("api_client"))
+        self.thresholds = thresholds
+        self.detector_params = kwargs.get("detector_params", None)
+        validate_usage_client(kwargs.get("usage_client"))
+
+    def evaluate(self, data: pd.DataFrame) -> AggregateMetricResult:
+        return run_in_event_loop(
+            self.evaluate_async,
+            data=data
+        )
+
+    async def evaluate_async(self, data: pd.DataFrame) -> AggregateMetricResult:
+        """
+        Entry point method to compute the configured detectors-based metrics.
+
+        Args:
+            data: Input test data
+        """
+        try:
+            json_payloads, record_ids = self.__pre_process_data(data=data)
+            result = await self.__compute_metric(json_payloads)
+            aggregated_result = self.__post_process(result, record_ids)
+            return aggregated_result
+
+        except Exception as e:
+            raise Exception(
+                f"Error while computing metrics: {self.metric_name}. Reason: {str(e)}")
+
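`evaluate` is a thin synchronous wrapper that drives `evaluate_async` through `run_in_event_loop`, so the provider works from both plain scripts and async code. A sketch of the two call styles, assuming `provider` is an already-constructed `DetectorsProvider` and `df` a prepared `pandas.DataFrame`:

    import asyncio

    # Sketch only: `provider` and `df` are assumed to exist (see __init__
    # above for the arguments DetectorsProvider requires).

    # From synchronous code, the wrapper runs the coroutine to completion.
    result = provider.evaluate(df)

    # From asynchronous code, await the coroutine directly.
    async def main():
        return await provider.evaluate_async(df)

    result = asyncio.run(main())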
+    def __pre_process_data(self, data: pd.DataFrame):
+        """
+        Creates payload for each row in the test data.
+        """
+        # read data based on the metric.
+        input_content = data[self.configuration.input_fields[0]].to_list()
+        output_content, context_content, tool_calls_content, tools_catalog_content = None, None, None, None
+
+        if self.metric_name in ["answer_relevance", "faithfulness"]:
+            output_content = data[self.configuration.output_fields[0]].to_list(
+            )
+
+        if self.metric_name in ["context_relevance", "faithfulness"]:
+            if len(self.configuration.context_fields) > 1:
+                context_content = data[self.configuration.context_fields].values.tolist(
+                )
+            elif len(self.configuration.context_fields) == 1:
+                context_content = data[self.configuration.context_fields[0]].apply(
+                    transform_str_to_list).tolist()
+
+        if self.metric_name == "tool_call_accuracy":
+            # Get the tools catalog i.e., a list of available tools
+            tools_catalog_content = self.get_tools_catalog_content()
+            # Get the tool calls list
+            tool_calls_content = self.get_tool_calls_content(data)
+
+        payloads_json = self.__get_json_payloads(
+            input_content, output_content, context_content, tools_catalog_content, tool_calls_content)
+        record_ids = data[self.configuration.record_id_field].to_list()
+        return payloads_json, record_ids
+
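The columns read here are named by the `GenAIConfiguration` fields (`input_fields`, `output_fields`, `context_fields`, `record_id_field`), so the test data must be shaped to match. A hypothetical frame for a `faithfulness` run, assuming a configuration with `input_fields=["question"]`, `output_fields=["generated_text"]`, `context_fields=["context"]` and `record_id_field="record_id"`:

    import pandas as pd

    # Hypothetical test data for a faithfulness evaluation; the column names
    # must match the configured field names listed above.
    df = pd.DataFrame({
        "record_id": ["r1", "r2"],
        "question": ["What is the grace period?", "What is covered?"],
        "generated_text": ["The grace period is 30 days.", "Fire and theft."],
        # A single context column may hold a JSON-encoded list of passages;
        # it is parsed with transform_str_to_list during pre-processing.
        "context": ['["The grace period is 30 days."]',
                    '["Fire and theft are covered."]'],
    })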
+    async def send_with_retries(self, payload, session: aiohttp.ClientSession):
+        """
+        Asynchronously calls the detections API with retries and returns the responses.
+        Returns an error if all retries fail or an exception is caught.
+        """
+        for attempt in range(self.RETRY_COUNT):
+            try:
+                async with session.post(
+                    url=self.base_url,
+                    headers=self.__get_headers(),
+                    data=payload,
+                    ssl=self.verify
+                ) as response:
+
+                    response_status = response.status
+                    response_text = await response.text()
+
+                    if response_status == 200:
+                        return json.loads(response_text)
+
+                    elif response_status in self.RETRY_AFTER_STATUS_CODES and attempt < self.RETRY_COUNT - 1:
+                        await asyncio.sleep(self.BACK_OFF_FACTOR * (2 ** attempt))
+                        continue  # retry
+                    else:
+                        return {
+                            "error": Error(
+                                code=str(response_status),
+                                message_en=str(json.loads(response_text))
+                            )
+                        }
+
+            except Exception as e:
+                return {
+                    "error": Error(
+                        code="REQUEST_FAILED",
+                        message_en=str(e)
+                    )
+                }
+
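With the class defaults (`RETRY_COUNT = 3`, `BACK_OFF_FACTOR = 1`), only 502/503/504 responses are retried, and the sleep before retrying attempt `a` is `BACK_OFF_FACTOR * 2**a`; the final attempt returns the error instead of sleeping. A quick check of the schedule:

    RETRY_COUNT = 3
    BACK_OFF_FACTOR = 1

    # Sleeps taken after attempts that are still allowed to retry
    # (the last attempt, attempt == RETRY_COUNT - 1, returns the error).
    delays = [BACK_OFF_FACTOR * (2 ** attempt)
              for attempt in range(RETRY_COUNT - 1)]
    print(delays)  # [1, 2] -> at most two sleeps, roughly 3 s of backoff total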
+    async def __compute_metric(self, api_payloads: list):
+        async with aiohttp.ClientSession() as session:
+            tasks = [self.send_with_retries(payload, session)
+                     for payload in api_payloads]
+            responses = await asyncio.gather(*tasks, return_exceptions=True)
+            return responses
+
+    def __get_additional_info(self, results) -> list:
+        info = []
+        if self.metric_name in RAG_EVIDENCE_METRICS:
+            for result in results:
+                info.append(result["evidence"][0])
+        else:
+            for result in results:
+                info.append({"text": result["text"], "score": result["score"],
+                             "start": result["start"], "end": result["end"]})
+        if info:
+            return info
+        return []
+
+    def __post_process(self, results: list, record_ids: list) -> AggregateMetricResult:
+        """
+        Process the responses and aggregate the results.
+        """
+        record_level_metrics: list[RecordMetricResult] = []
+        values = []
+        errors = []
+        for result, record_id in zip(results, record_ids):
+            if self.metric_name == "keyword":
+                metric_name = "keyword_detection"
+            elif self.metric_name == "regex":
+                metric_name = "regex_detection"
+            else:
+                metric_name = self.metric_name
+            record_data = {
+                "name": metric_name,
+                "display_name": self.metric_display_name,
+                "method": self.metric_method,
+                "provider": EvaluationProvider.DETECTORS.value,
+                "group": self.metric_group,
+                "record_id": record_id,
+                "thresholds": self.thresholds,
+            }
+
+            if "error" in result:
+                record_level_metrics.append(RecordMetricResult(
+                    **record_data,
+                    value=None,
+                    errors=[Error(code=result["error"].code,
+                                  message_en=str(result["error"].message_en))]
+                ))
+                errors.append(Error(code=result["error"].code,
+                                    message_en=str(result["error"].message_en)))
+            else:
+                value = 0
+                if len(result["detections"]) > 0:
+                    # Return the highest of all detections' scores.
+                    score_list = []
+                    for detection in result["detections"]:
+                        score_list.append(detection["score"])
+                    score = max(score_list)
+                    value = round(
+                        1 - score if self.metric_name in ACCURACY_METRICS else score, 4)
+                # Provide evidence for a few of the metrics
+                evidences = None
+                if self.metric_name in EVIDENCES or (self.metric_name in RAG_EVIDENCE_METRICS and self.metric_method in RAG_SMALL_MODELS):
+                    evidences = self.__get_additional_info(
+                        result["detections"])
+
+                record_level_metrics.append(RecordMetricResult(
+                    **record_data,
+                    value=value,
+                    **({"evidences": evidences} if evidences else {})
+                ))
+                values.append(value)
+
+        # creating AggregateMetricResult
+        if values:
+            mean_val = round(sum(values) / len(values), 4)
+            min_val = min(values)
+            max_val = max(values)
+            value = mean_val
+            error_info = {}
+        else:
+            mean_val = min_val = max_val = None
+            value = "Error"
+            error_info = {"errors": errors}
+        aggregated_result = AggregateMetricResult(
+            name=self.metric_name,
+            display_name=self.metric_display_name,
+            method=self.metric_method,
+            group=self.metric_group,
+            provider=EvaluationProvider.DETECTORS.value,
+            value=value,
+            total_records=len(results),
+            record_level_metrics=record_level_metrics,
+            min=min_val,
+            max=max_val,
+            mean=mean_val,
+            thresholds=self.thresholds,
+            **error_info
+        )
+
+        # return the aggregated result
+        return aggregated_result
+
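Note the inversion in the per-record value above: detectors report detection (risk) scores, but the `ACCURACY_METRICS` (`topic_relevance`, `tool_call_accuracy`) are surfaced as accuracies, so the highest detection score is flipped with `1 - score`. A small worked example of that computation:

    # Worked example of the per-record value computed in __post_process.
    ACCURACY_METRICS = ["topic_relevance", "tool_call_accuracy"]

    def record_value(metric_name: str, detections: list[dict]) -> float:
        if not detections:
            return 0
        score = max(d["score"] for d in detections)
        return round(1 - score if metric_name in ACCURACY_METRICS else score, 4)

    print(record_value("hap", [{"score": 0.91}]))                 # 0.91: risk score kept as-is
    print(record_value("tool_call_accuracy", [{"score": 0.91}]))  # 0.09: inverted to an accuracy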
+    def __get_json_payloads(self, input_contents: list, output_contents: list | None, context_contents: list | None, tools_catalog_content: list | None, tool_calls_content: list | None) -> list:
+        # Method to create the request payload.
+        json_payloads = []
+        metric_name = self.set_metric_name(self.metric_name)
+
+        if self.metric_name == "answer_relevance":
+            for (input, output) in zip(input_contents, output_contents):
+                payload_json = {
+                    "detectors": {
+                        metric_name: self.detector_params or {}
+                    },
+                    "prompt": input,
+                    "generated_text": output
+                }
+                json_payloads.append(json.dumps(payload_json))
+
+        elif self.metric_name == "context_relevance":
+            for (input, context) in zip(input_contents, context_contents):
+                payload_json = {
+                    "detectors": {
+                        metric_name: self.detector_params or {}
+                    },
+                    "input": input,
+                    "context_type": "docs",
+                    "context": context
+                }
+                json_payloads.append(json.dumps(payload_json))
+
+        elif self.metric_name == "faithfulness":
+            for (output, context) in zip(output_contents, context_contents):
+                payload_json = {
+                    "detectors": {
+                        metric_name: self.detector_params or {}
+                    },
+                    "input": output,
+                    "context_type": "docs",
+                    "context": context
+                }
+                json_payloads.append(json.dumps(payload_json))
+
+        elif self.metric_name == "tool_call_accuracy":
+            for (input, tool_call) in zip(input_contents, tool_calls_content):
+                payload_json = {
+                    "detectors": {
+                        metric_name: self.detector_params or {}
+                    },
+                    "messages": [{"content": input, "role": "user"},
+                                 {"tool_calls": tool_call, "role": "assistant"}],
+                    "tools": tools_catalog_content
+                }
+                json_payloads.append(json.dumps(payload_json))
+        else:
+            for input in input_contents:
+                payload_json = {
+                    "detectors": {
+                        metric_name: self.detector_params or {}
+                    },
+                    "input": input
+                }
+                json_payloads.append(json.dumps(payload_json))
+        return json_payloads
+
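For the default branch (HAP, PII and the other single-input detectors) each request body therefore reduces to a `detectors` map plus the raw input, while the context-based metrics add `context_type` and `context`. Illustrative payloads (values hypothetical; the detector key is whatever `set_metric_name` returns for the metric):

    import json

    # Illustrative request bodies matching the shapes built above.
    hap_payload = json.dumps({
        "detectors": {"hap": {}},  # detector_params, if any, would go here
        "input": "some user input text",
    })

    context_relevance_payload = json.dumps({
        "detectors": {"context_relevance": {}},  # key per set_metric_name
        "input": "What is the grace period?",
        "context_type": "docs",
        "context": ["The grace period is 30 days."],
    })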
+    def __get_headers(self):
+        # Method to create request headers
+        headers = {}
+        headers["Content-Type"] = "application/json"
+        headers["Authorization"] = f"Bearer {get_authenticator_token(self.wos_client.authenticator)}"
+        headers["x-governance-instance-id"] = self.service_instance_id
+        headers["origin"] = "sdk"
+        return headers
+
+    def get_detector_url(self, api_client):
+        """
+        Sets the wos_client and returns the service url
+        """
+        self.wos_client = api_client.wos_client
+        self.verify = not api_client.credentials.disable_ssl
+        if api_client.credentials.version:
+            return api_client.credentials.url
+        else:
+            from ibm_watsonx_gov.utils.url_mapping import WOS_URL_MAPPING
+            urls = WOS_URL_MAPPING.get(api_client.credentials.url)
+            return urls.wml_url
+
+    def get_service_instance_id(self, api_client):
+        """
+        Sets the wos_client and returns the service instance id
+        """
+        self.wos_client = api_client.wos_client
+        return self.wos_client.service_instance_id
+
+    def set_metric_name(self, metric_name):
+        """
+        Sets the metric name to 'granite_guardian' for Granite Guardian risks
+        """
+        # Set metric name to harm for computing PSR using GG
+        if self.metric_name == "prompt_safety_risk" and self.metric_method == "granite_guardian":
+            metric_name = "harm"
+        metric_name = "granite_guardian" if metric_name in GraniteGuardianRisks.values() else metric_name
+        return metric_name
+
+    def __get_base_url(self, metric_name):
+        """
+        Returns the inference proxy end-point to be invoked based on the metric.
+        """
+        if metric_name == "answer_relevance":
+            return "{}/ml/v1/text/detection/generated?version=2023-10-25"
+        elif metric_name in ["context_relevance", "faithfulness"]:
+            return "{}/ml/v1/text/detection/context?version=2023-10-25"
+        elif metric_name == "tool_call_accuracy":
+            return "{}/ml/v1/text/detection/chat?version=2023-10-25"
+        else:
+            return "{}/ml/v1/text/detection?version=2023-10-25"
+
+    def get_tool_calls_content(self, data):
+        tool_calls_content = []
+        if self.configuration.tool_calls_field:
+            data[self.configuration.tool_calls_field] = data[self.configuration.tool_calls_field].apply(
+                lambda x: json.loads(x) if isinstance(x, str) else x)
+            for _, row in data.iterrows():
+                tool_calls = ToolCallMetricProvider.extract_tool_calls_from_response(
+                    row[self.configuration.tool_calls_field])
+                for tc in tool_calls:
+                    if isinstance(tc["function"]["arguments"], str):
+                        tc["function"]["arguments"] = json.loads(
+                            tc["function"]["arguments"])
+                tool_calls_content.append(tool_calls)
+
+        return tool_calls_content
+
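The `tool_calls` column may arrive as a JSON string; it is decoded, the calls are pulled out via `ToolCallMetricProvider.extract_tool_calls_from_response`, and any string-valued `arguments` are parsed into objects. A hypothetical entry of `tool_calls_content` for one record after this normalization:

    # Hypothetical normalized record: OpenAI-style tool calls with the
    # arguments decoded from JSON strings into dictionaries.
    one_record_tool_calls = [
        {
            "id": "call_1",             # hypothetical call id
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical tool name
                "arguments": {"location": "Bengaluru", "unit": "celsius"},
            },
        }
    ]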
+    def get_tools_catalog_content(self):
+        # Get the specification of tools used in the application
+        # in proper format if it is a list of Callable
+        if isinstance(self.configuration.tools, list) and all(callable(item) for item in self.configuration.tools):
+            tools_catalog_content = ToolCallMetricProvider.get_tools_list_schema(
+                self.configuration.tools)
+            # converting the model to a json object
+            tools_catalog_content = [
+                tool_spec.model_dump() for tool_spec in tools_catalog_content]
+        else:
+            tools_catalog_content = self.configuration.tools
+        return tools_catalog_content
Binary file