ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Any, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field, TypeAdapter, field_validator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
18
|
+
RecordMetricResult)
|
|
19
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
20
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
21
|
+
from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import (
|
|
22
|
+
CONTEXT_RELEVANCE, ContextRelevanceMetric, ContextRelevanceResult)
|
|
23
|
+
|
|
24
|
+
# Canonical metric identifier and human-readable display name, shared by the
# result and metric classes below and used for lookup in aggregate results.
RECIPROCAL_RANK = "reciprocal_rank"
RECIPROCAL_RANK_DISPLAY_NAME = "Reciprocal Rank"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ReciprocalRankResult(RecordMetricResult):
    """Record-level result row for the Reciprocal Rank metric.

    Pins the metric name, display name and group so each record result is
    self-describing; every other field (record_id, value, thresholds, ...)
    comes from ``RecordMetricResult``.
    """
    name: str = RECIPROCAL_RANK
    display_name: str = RECIPROCAL_RANK_DISPLAY_NAME
    group: MetricGroup = MetricGroup.RETRIEVAL_QUALITY
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ReciprocalRankMetric(GenAIMetric):
    """
    Defines the Reciprocal Rank metric class.

    The Reciprocal Rank metric measures the reciprocal rank of the first relevant context.
    The Context Relevance metric is computed as a pre requisite to compute this metric.

    Examples:
        1. Create Reciprocal Rank metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = ReciprocalRankMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "..."},
                                                     metrics=[metric])
                # A list of contexts can also be passed as shown below
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                     metrics=[metric])

        2. Create Reciprocal Rank metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = ReciprocalRankMetric(thresholds=[threshold])

        3. Create Reciprocal Rank metric with llm_as_judge method.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                    model_id="ibm/granite-3-3-8b-instruct",
                    project_id="<PROJECT_ID>"
                ))
                cr_metric = ContextRelevanceMetric(llm_judge=llm_judge)
                rr_metric = ReciprocalRankMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                     metrics=[cr_metric, rr_metric])
    """
    name: Annotated[Literal["reciprocal_rank"],
                    Field(title="Name",
                          description="The reciprocal rank metric name.",
                          default=RECIPROCAL_RANK, frozen=True)]
    display_name: Annotated[Literal["Reciprocal Rank"],
                            Field(title="Display Name",
                                  description="The reciprocal rank metric display name.",
                                  default=RECIPROCAL_RANK_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[TaskType.RAG])]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    metric_dependencies: Annotated[list[GenAIMetric],
                                   Field(title="Metric dependencies",
                                         description="The list of metric dependencies",
                                         default=[ContextRelevanceMetric()])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.RETRIEVAL_QUALITY, frozen=True)]

    @field_validator("metric_dependencies", mode="before")
    @classmethod
    def metric_dependencies_validator(cls, value: Any):
        """Coerce raw dependency entries (e.g. dicts) into ContextRelevanceMetric
        instances, discriminated by their ``name`` field."""
        if value:
            value = [TypeAdapter(Annotated[ContextRelevanceMetric, Field(
                discriminator="name")]).validate_python(
                m) for m in value]
        return value

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        metrics_result: list[AggregateMetricResult],
        **kwargs,
    ) -> AggregateMetricResult:
        """Compute the reciprocal rank for each record from the previously
        computed context relevance results and aggregate them.

        Args:
            data: The input dataframe (unused directly; scores are derived
                from the context relevance results).
            configuration: The evaluation configuration.
            metrics_result: Previously computed aggregate results; must
                contain the context relevance result.

        Returns:
            AggregateMetricResult: min/max/mean over the per-record
            reciprocal rank scores.

        Raises:
            Exception: If the context relevance result is missing or empty.
        """
        record_level_metrics = []
        scores = []

        context_relevance_result: list[ContextRelevanceResult] = next(
            (metric_result.record_level_metrics for metric_result in metrics_result if metric_result.name == CONTEXT_RELEVANCE), None)

        if context_relevance_result is None:
            raise Exception(
                f"Failed to evaluate {self.name} metric. Missing context relevance metric result")
        # Guard against an empty record list: without this, the mean
        # computation below would fail with an opaque ZeroDivisionError.
        if not context_relevance_result:
            raise Exception(
                f"Failed to evaluate {self.name} metric. The context relevance metric result contains no records")

        for relevance_result in context_relevance_result:
            score = self.__compute(
                relevance_scores=relevance_result.additional_info.get(
                    "contexts_values", []),
                threshold=self.thresholds[0].value,
            )
            scores.append(score)
            record_level_metrics.append(
                ReciprocalRankResult(
                    method="",
                    provider="",
                    record_id=relevance_result.record_id,
                    value=score,
                    thresholds=self.thresholds,
                )
            )

        mean = sum(scores) / len(scores)
        aggregate_metric_score = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            method="",
            provider="",
            group=self.group,
            min=min(scores),
            max=max(scores),
            mean=mean,
            value=mean,
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            thresholds=self.thresholds,
        )

        return aggregate_metric_score

    def __compute(self, relevance_scores: list[float], threshold: float) -> float:
        """Return 1/rank of the first context whose relevance score meets the
        threshold, or 0.0 when no context qualifies."""
        for rank, score in enumerate(relevance_scores, start=1):
            if score >= threshold:
                return 1 / rank
        return 0.0
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.regex_detection.regex_detection_metric import \
|
|
21
|
+
RegexDetectionMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RegexDetectionDecorator(BaseMetricDecorator):
    """Decorator mixin that wires regex detection evaluation into an agentic node."""

    def evaluate_regex(self,
                       func: Optional[Callable] = None,
                       *,
                       configuration: Optional[AgenticAIConfiguration] = None,
                       metrics: list[GenAIMetric]
                       ) -> dict:
        """
        An evaluation decorator for computing regex detection on an agentic node.
        """
        # Support parameterized usage: when called without the target function,
        # return a partially applied decorator capturing configuration/metrics.
        if func is None:
            return partial(self.evaluate_regex, configuration=configuration, metrics=metrics)

        @decorator
        def wrapper(wrapped, instance, args, kwargs):
            try:
                # Reject anything that is not a regex detection metric up front.
                self.validate(func=wrapped, metrics=metrics,
                              valid_metric_types=(RegexDetectionMetric,))

                # Regex detection inspects only the node's input fields.
                inputs = [EvaluatorFields.INPUT_FIELDS]

                return self.compute_helper(func=wrapped, args=args, kwargs=kwargs,
                                           configuration=configuration,
                                           metrics=metrics,
                                           metric_inputs=inputs,
                                           metric_outputs=[])
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating regex detection on {wrapped.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
REGEX = "regex"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class RegexDetectionMetric(GenAIMetric):
    """
    Defines the Regex Detection metric class.

    The Regex detection metric detects specific regex pattern(s) when they are mentioned explicitly in natural language.

    Examples:
        1. Create regex detection metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = RegexDetectionMetric(regex_patterns=["..."])
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create regex detection metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0)
                metric = RegexDetectionMetric(thresholds=[threshold], regex_patterns=["..."])
    """
    name: Annotated[Literal["regex_detection"],
                    Field(title="Name",
                          description="The regex detection metric name.",
                          default="regex_detection", frozen=True)]
    display_name: Annotated[Literal["Regex Detection"],
                            Field(title="Display Name",
                                  description="The regex detection metric display name.",
                                  default="Regex Detection", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0)])]
    # NOTE(review): the group field is currently disabled; the group is passed
    # directly to DetectorsProvider in evaluate_async instead.
    # group: Annotated[MetricGroup,
    #                  Field(title="Group",
    #                        description="The metric group.",
    #                        default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    # Optional at construction time so a clear error can be raised at
    # evaluation time when the patterns are missing.
    regex_patterns: Annotated[list[str] | None,
                              Field(title="Regex Patterns",
                                    default=None,
                                    description="List of regex patterns to match against the input text.")]

    async def evaluate_async(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Compute the regex detection metric asynchronously via the detectors provider.

        Args:
            data: The input records; must contain the columns named in ``configuration``.
            configuration: The Gen AI configuration describing the input fields.
            **kwargs: Extra provider arguments; ``detector_params`` is populated here.

        Returns:
            The aggregated metric results returned by the detectors provider.

        Raises:
            AssertionError: If ``regex_patterns`` was not provided.
        """
        if not self.regex_patterns:
            raise AssertionError(
                "The regex_patterns field is required, but was missing from the input.")

        validate_input(data.columns.to_list(), configuration)
        kwargs["detector_params"] = {"regex_patterns": self.regex_patterns}

        # self.method presumably comes from the GenAIMetric base class — confirm.
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=REGEX,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=MetricGroup.CONTENT_SAFETY,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """Synchronous wrapper around :meth:`evaluate_async`; blocks until done."""
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
File without changes
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import \
|
|
20
|
+
ContextRelevanceMetric
|
|
21
|
+
from ibm_watsonx_gov.metrics.retrieval_precision.retrieval_precision_metric import \
|
|
22
|
+
RetrievalPrecisionMetric
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class RetrievalPrecisionDecorator(BaseMetricDecorator):
    """Decorator mixin for computing the retrieval precision metric on an agentic node."""

    def evaluate_retrieval_precision(self,
                                     func: Optional[Callable] = None,
                                     *,
                                     configuration: Optional[AgenticAIConfiguration] = None,
                                     metrics: Optional[list[GenAIMetric]] = None
                                     ) -> dict:
        """
        An evaluation decorator for computing retrieval precision metric on an agentic node.

        Args:
            func: The agentic node function to wrap. When omitted, a partially
                applied decorator is returned for parameterized use.
            configuration: Optional agentic AI configuration for the evaluation.
            metrics: Metrics to compute; defaults to ``[RetrievalPrecisionMetric()]``.

        Returns:
            The wrapped function's original result.
        """
        if func is None:
            return partial(self.evaluate_retrieval_precision, configuration=configuration, metrics=metrics)

        # Default built per call; the previous `metrics=[]` default was a
        # mutable-default-argument pitfall. `None` is falsy, so callers that
        # omitted metrics get identical behavior.
        if not metrics:
            metrics = [RetrievalPrecisionMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # Context relevance is accepted too: it is the prerequisite metric.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(RetrievalPrecisionMetric, ContextRelevanceMetric))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                metric_outputs = [EvaluatorFields.CONTEXT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating retrieval precision metric on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Any, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field, TypeAdapter, field_validator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
18
|
+
RecordMetricResult)
|
|
19
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
20
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
21
|
+
from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import (
|
|
22
|
+
CONTEXT_RELEVANCE, ContextRelevanceMetric, ContextRelevanceResult)
|
|
23
|
+
|
|
24
|
+
RETRIEVAL_PRECISION = "retrieval_precision"
|
|
25
|
+
RETRIEVAL_PRECISION_DISPLAY_NAME = "Retrieval Precision"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RetrievalPrecisionResult(RecordMetricResult):
    """Record-level result for the retrieval precision metric."""
    name: str = RETRIEVAL_PRECISION  # canonical metric identifier
    display_name: str = RETRIEVAL_PRECISION_DISPLAY_NAME  # human-readable name
    group: MetricGroup = MetricGroup.RETRIEVAL_QUALITY  # metric family grouping
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class RetrievalPrecisionMetric(GenAIMetric):
    """
    Defines the Retrieval Precision metric class.

    The Retrieval Precision metric measures the fraction of retrieved contexts
    that are relevant, out of the total number of contexts retrieved.
    The Context Relevance metric is computed as a prerequisite to compute this metric.

    Examples:
        1. Create Retrieval Precision metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = RetrievalPrecisionMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "..."},
                                                     metrics=[metric])
                # A list of contexts can also be passed as shown below
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                     metrics=[metric])

        2. Create Retrieval Precision metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = RetrievalPrecisionMetric(thresholds=[threshold])

        3. Create Retrieval Precision metric with llm_as_judge method.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                    model_id="ibm/granite-3-3-8b-instruct",
                    project_id="<PROJECT_ID>"
                ))
                cr_metric = ContextRelevanceMetric(llm_judge=llm_judge)
                ap_metric = RetrievalPrecisionMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
                                                     metrics=[cr_metric, ap_metric])
    """
    name: Annotated[Literal["retrieval_precision"],
                    Field(title="Name",
                          description="The retrieval precision metric name.",
                          default=RETRIEVAL_PRECISION, frozen=True)]
    display_name: Annotated[Literal["Retrieval Precision"],
                            Field(title="Display Name",
                                  description="The retrieval precision metric display name.",
                                  default=RETRIEVAL_PRECISION_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[TaskType.RAG])]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    metric_dependencies: Annotated[list[GenAIMetric],
                                   Field(title="Metric dependencies",
                                         description="The list of metric dependencies",
                                         default=[ContextRelevanceMetric()])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.RETRIEVAL_QUALITY, frozen=True)]

    @field_validator("metric_dependencies", mode="before")
    @classmethod
    def metric_dependencies_validator(cls, value: Any):
        """Coerce raw dependency payloads into ContextRelevanceMetric instances."""
        if value:
            value = [TypeAdapter(Annotated[ContextRelevanceMetric, Field(
                discriminator="name")]).validate_python(m) for m in value]
        return value

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        metrics_result: list[AggregateMetricResult],
        **kwargs,
    ) -> AggregateMetricResult:
        """
        Compute retrieval precision from precomputed context relevance results.

        Args:
            data: The evaluation records (the relevance scores themselves are
                read from ``metrics_result``, not recomputed from the data).
            configuration: The evaluation configuration.
            metrics_result: Aggregate results that must contain a context
                relevance entry (name ``context_relevance``).

        Returns:
            An AggregateMetricResult with per-record and aggregate precision.

        Raises:
            Exception: If the context relevance result is missing or empty.
        """
        record_level_metrics = []
        scores = []

        context_relevance_result: list[ContextRelevanceResult] = next(
            (metric_result.record_level_metrics for metric_result in metrics_result if metric_result.name == CONTEXT_RELEVANCE), None)

        if context_relevance_result is None:
            raise Exception(
                f"Failed to evaluate {self.name} metric. Missing context relevance metric result")

        for relevance_result in context_relevance_result:
            # The first threshold is used as the relevance cut-off.
            score = self.__compute(
                relevance_scores=relevance_result.additional_info.get(
                    "contexts_values", []),
                threshold=self.thresholds[0].value,
            )
            scores.append(score)
            record_level_metrics.append(
                RetrievalPrecisionResult(
                    method="",
                    provider="",
                    record_id=relevance_result.record_id,
                    value=score,
                    thresholds=self.thresholds,
                )
            )

        # Guard against an empty context relevance result; without this the
        # mean computation below raises an opaque ZeroDivisionError.
        if not scores:
            raise Exception(
                f"Failed to evaluate {self.name} metric. The context relevance metric result contains no records")

        mean = sum(scores) / len(scores)
        aggregate_metric_score = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            method="",
            provider="",
            group=self.group,
            min=min(scores),
            max=max(scores),
            mean=mean,
            value=mean,
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            thresholds=self.thresholds,
        )

        return aggregate_metric_score

    def __compute(self, relevance_scores: list[float], threshold: float) -> float:
        """
        Return the fraction of retrieved contexts whose relevance score meets
        the threshold, rounded to 4 decimal places. Returns 0 when no
        contexts were retrieved.
        """
        total_no_of_contexts = len(relevance_scores)
        if total_no_of_contexts == 0:
            return 0
        relevant_count = sum(
            1 for score in relevance_scores if score >= threshold)
        return round(relevant_count / total_no_of_contexts, 4)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.sexual_content.sexual_content_metric import \
|
|
21
|
+
SexualContentMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SexualContentDecorator(BaseMetricDecorator):
    """Decorator mixin for computing sexual content detection on an agentic node."""

    def evaluate_sexual_content(self,
                                func: Optional[Callable] = None,
                                *,
                                configuration: Optional[AgenticAIConfiguration] = None,
                                metrics: Optional[list[GenAIMetric]] = None
                                ) -> dict:
        """
        An evaluation decorator for computing sexual content on an agentic node via granite guardian.

        Args:
            func: The agentic node function to wrap. When omitted, a partially
                applied decorator is returned for parameterized use.
            configuration: Optional agentic AI configuration for the evaluation.
            metrics: Metrics to compute; defaults to ``[SexualContentMetric()]``.

        Returns:
            The wrapped function's original result.
        """
        if func is None:
            return partial(self.evaluate_sexual_content, configuration=configuration, metrics=metrics)

        # Default built per call; the previous `metrics=[]` default was a
        # mutable-default-argument pitfall. `None` is falsy, so callers that
        # omitted metrics get identical behavior.
        if not metrics:
            metrics = [SexualContentMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # Pass a real one-element tuple: the original passed
                # (SexualContentMetric), which is just the class, not a tuple —
                # inconsistent with the sibling decorators.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(SexualContentMetric,))

                # Sexual content detection inspects only the node's input fields.
                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating sexual content on {func.__name__},") from ex

        return wrapper(func)
|