ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
SEXUAL_CONTENT = "sexual_content"


class SexualContentMetric(GenAIMetric):
    """
    Defines the Sexual Content metric class.

    The Sexual Content metric measures the risk of content explicitly related to sexual activities, anatomy, or desires. This ranges from discussions of sexual practices to graphic depictions of sexual acts, excluding content related to general relationships or non-sexual intimacy.
    It is computed using the granite guardian model.

    Examples:
        1. Create Sexual Content metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = SexualContentMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Sexual Content metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = SexualContentMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["sexual_content"],
                    Field(title="Name",
                          description="The sexual content metric name.",
                          default=SEXUAL_CONTENT, frozen=True)]
    display_name: Annotated[Literal["Sexual Content"],
                            Field(title="Display Name",
                                  description="The sexual content metric display name.",
                                  default="Sexual Content", frozen=True)]
    method: Annotated[Literal["granite_guardian"],
                      Field(title="Method",
                            # Fixed copy-paste error: this description previously
                            # referred to the harm metric.
                            description="The method used to compute the sexual content metric.",
                            default="granite_guardian")]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.5)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
            self,
            data: pd.DataFrame,
            configuration: GenAIConfiguration,
            **kwargs
    ) -> list[AggregateMetricResult]:
        """
        Compute the sexual content metric asynchronously.

        Args:
            data (pd.DataFrame): Input data containing the columns required by the configuration.
            configuration (GenAIConfiguration): The metric evaluation configuration.
            **kwargs: Additional arguments forwarded to the detectors provider.

        Returns:
            list[AggregateMetricResult]: The aggregated metric results.
        """
        validate_input(data.columns.to_list(), configuration)
        # The granite guardian detector is parameterized with the risk to detect.
        kwargs["detector_params"] = {"risk_name": SEXUAL_CONTENT}
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
            self,
            data: pd.DataFrame | dict,
            configuration: GenAIConfiguration,
            **kwargs,
    ):
        """
        Compute the sexual content metric synchronously.

        Args:
            data (pd.DataFrame | dict): Input data containing the fields required by the configuration.
            configuration (GenAIConfiguration): The metric evaluation configuration.
            **kwargs: Additional arguments forwarded to ``evaluate_async``.
        """
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from functools import partial
|
|
11
|
+
from typing import Callable, Optional
|
|
12
|
+
|
|
13
|
+
from wrapt import decorator
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
16
|
+
AgenticAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
20
|
+
from ibm_watsonx_gov.metrics.harm.harm_metric import HarmMetric
|
|
21
|
+
from ibm_watsonx_gov.metrics.social_bias.social_bias_metric import \
|
|
22
|
+
SocialBiasMetric
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SocialBiasDecorator(BaseMetricDecorator):
    """Decorator helper that computes the social bias metric on agentic node functions."""

    def evaluate_social_bias(self,
                             func: Optional[Callable] = None,
                             *,
                             configuration: Optional[AgenticAIConfiguration] = None,
                             metrics: Optional[list[GenAIMetric]] = None
                             ) -> dict:
        """
        An evaluation decorator for computing social bias on an agentic node via granite guardian.

        Args:
            func: The node function to decorate. When omitted, a partial is
                returned so the decorator can be applied with keyword arguments.
            configuration: Optional agentic AI configuration for the evaluation.
            metrics: The metrics to compute; defaults to a single ``SocialBiasMetric``.
        """
        # Support both @evaluate_social_bias and
        # @evaluate_social_bias(configuration=..., metrics=...) usage.
        if func is None:
            return partial(self.evaluate_social_bias, configuration=configuration, metrics=metrics)

        # Default is None rather than a mutable [] so a single list instance is
        # not shared across decorator applications.
        if not metrics:
            metrics = [SocialBiasMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # valid_metric_types must be a one-element tuple; the previous
                # (SocialBiasMetric) without a trailing comma was just the class.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(SocialBiasMetric,))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=[])

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating social bias on {func.__name__}.") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
from ibm_watsonx_gov.utils.validation_util import validate_input
|
|
23
|
+
|
|
24
|
+
SOCIAL_BIAS = "social_bias"


class SocialBiasMetric(GenAIMetric):
    """
    Defines the Social Bias metric class.

    The Social Bias metric measures the risk of systemic prejudice against groups based on shared identity or characteristics, often stemming from stereotypes or cultural influences. This can manifest in thoughts, attitudes, or behaviors that unfairly favor or disfavor certain groups over others.
    It is computed using the granite guardian model.

    Examples:
        1. Create Social Bias metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = SocialBiasMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."}, metrics=[metric])

        2. Create Social Bias metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=0.5)
                metric = SocialBiasMetric(thresholds=[threshold])
    """
    # Metric identifier; frozen so instances cannot override it.
    name: Annotated[Literal["social_bias"],
                    Field(title="Name",
                          description="The social bias metric name.",
                          default=SOCIAL_BIAS, frozen=True)]
    display_name: Annotated[Literal["Social Bias"],
                            Field(title="Display Name",
                                  description="The social bias metric display name.",
                                  default="Social Bias", frozen=True)]
    # Description fixed: it previously said "harm metric" (copy-paste error).
    method: Annotated[Literal["granite_guardian"],
                      Field(title="Method",
                            description="The method used to compute the social bias metric.",
                            default="granite_guardian")]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.5)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """Compute the social bias metric asynchronously via the detectors provider.

        Args:
            data: The records to evaluate.
            configuration: The evaluation configuration (provides input fields).
            **kwargs: Extra arguments forwarded to ``DetectorsProvider``.

        Returns:
            The aggregated metric result produced by the provider.
        """
        validate_input(data.columns.to_list(), configuration)
        # Tell the detectors provider which granite guardian risk to score.
        kwargs["detector_params"] = {"risk_name": SOCIAL_BIAS}
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name=self.name,
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """Synchronous wrapper around :meth:`evaluate_async`."""
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
File without changes
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
14
|
+
from ibm_watsonx_gov.entities.enums import (CategoryClassificationType,
|
|
15
|
+
MessageStatus, MetricGroup,
|
|
16
|
+
MetricValueType, TaskType)
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
18
|
+
RecordMetricResult)
|
|
19
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
20
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
21
|
+
from pydantic import Field
|
|
22
|
+
|
|
23
|
+
STATUS = "status"


class StatusMetric(GenAIMetric):
    """
    Defines the Status metric class.

    The Status metric measures the status of the message processing, which can be one of the following values:
        - successful
        - failure
        - unknown

    Examples:
        1. Create Status metric with default parameters and compute using metrics AgenticEvaluator.
            .. code-block:: python

                agent_app = AgenticApp(name="Rag agent",
                                       metrics_configuration=MetricsConfiguration(metrics=[
                                           StatusMetric()]))

                evaluator = AgenticEvaluator(agentic_app=agent_app)
                evaluator.start_run()
                result = rag_app.invoke({"input_text": "What is concept drift?", "ground_truth": "Concept drift occurs when the statistical properties of the target variable change over time, causing a machine learning model’s predictions to become less accurate."})
                evaluator.end_run()
    """
    # Metric identifier; frozen so instances cannot override it.
    name: Annotated[Literal["status"],
                    Field(title="Name",
                          description="The status metric name.",
                          default=STATUS, frozen=True)]
    # Human-readable name used in reports (frozen).
    display_name: Annotated[Literal["Status"],
                            Field(title="Display Name",
                                  description="The status metric display name.",
                                  default="Status", frozen=True)]
    # This metric applies to every supported task type.
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.MESSAGE_COMPLETION, frozen=True)]
    # Maps each categorical status label into favourable / unfavourable /
    # neutral buckets so downstream aggregation can classify record results.
    category_classification: Annotated[dict[str, list[str]], Field(
        title="Category Classification",
        description="The category classification of the metrics values.",
        default={
            CategoryClassificationType.FAVOURABLE.value: [MessageStatus.SUCCESSFUL.value],
            CategoryClassificationType.UNFAVOURABLE.value: [MessageStatus.FAILURE.value],
            CategoryClassificationType.NEUTRAL.value: [
                MessageStatus.UNKNOWN.value]
        },
    )]

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> list[AggregateMetricResult]:
        """Build one categorical record result per row and aggregate them.

        NOTE(review): iterates via ``data.iterrows()``, so a DataFrame is
        expected here despite the ``pd.DataFrame | dict`` annotation — confirm.
        """
        record_level_metrics: list[RecordMetricResult] = []
        for _, row in data.iterrows():
            record_level_metrics.append(
                RecordMetricResult(
                    name=self.name,
                    display_name=self.display_name,
                    # NOTE(review): `method` is not declared on this class;
                    # presumably inherited from GenAIMetric — confirm.
                    method=self.method,
                    # Missing or falsy status values fall back to "unknown".
                    label=row.get(
                        configuration.status_field) or MessageStatus.UNKNOWN.value,
                    # Categorical metric: the label carries the result, so the
                    # numeric value is None.
                    value=None,
                    category_classification=self.category_classification,
                    group=self.group,
                    record_id=row[configuration.record_id_field],
                    value_type=MetricValueType.CATEGORICAL.value)
            )

        aggregated_metric_result = AggregateMetricResult.create(
            record_level_metrics)
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        """Synchronous wrapper around :meth:`evaluate_async`."""
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.text_grade_level.text_grade_level_metric import \
|
|
20
|
+
TextGradeLevelMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TextGradeLevelDecorator(BaseMetricDecorator):
    """Decorator provider for computing the text grade level metric on agentic nodes."""

    def evaluate_text_grade_level(self,
                                  func: Optional[Callable] = None,
                                  *,
                                  configuration: Optional[AgenticAIConfiguration] = None,
                                  metrics: Optional[list[GenAIMetric]] = None
                                  ) -> dict:
        """
        An evaluation decorator for computing text grade level metric on an agentic node.

        Args:
            func: The node function to decorate. When omitted, a partial is
                returned so the decorator can be applied with keyword arguments.
            configuration: Optional agentic AI configuration for the evaluation.
            metrics: The metrics to compute. Defaults to ``[TextGradeLevelMetric()]``.

        Returns:
            The decorated function's original result.

        Raises:
            Exception: Wraps any error raised during validation or metric
                computation, chained to the original cause.
        """
        # Support usage both as a bare decorator and with keyword arguments.
        if func is None:
            return partial(self.evaluate_text_grade_level, configuration=configuration, metrics=metrics)

        # None default instead of a mutable `[]` default argument; fall back
        # to the default metric when nothing was supplied.
        if not metrics:
            metrics = [TextGradeLevelMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(TextGradeLevelMetric,))

                # Readability is computed on the node's output fields only.
                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=[],
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating text grade level metric on {func.__name__}.") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import textstat
|
|
14
|
+
from pydantic import Field
|
|
15
|
+
|
|
16
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
18
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
19
|
+
RecordMetricResult)
|
|
20
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
21
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
22
|
+
from ibm_watsonx_gov.utils.python_utils import replace_none_with_empty_string
|
|
23
|
+
from ibm_watsonx_gov.utils.validation_util import validate_output
|
|
24
|
+
|
|
25
|
+
# Metric identity constants shared by the record result and the metric class.
TEXT_GRADE_LEVEL = "text_grade_level"
TEXT_GRADE_LEVEL_DISPLAY_NAME = "Text Grade Level"
FLESCH_KINCAID_GRADE = "flesch_kincaid_grade"
TEXTSTAT = "textstat"


class TextGradeLevelResult(RecordMetricResult):
    """Record-level result for the text grade level metric.

    Pre-populates the identity fields (name, display name, provider and
    method) so callers only need to supply the record id, value and
    thresholds.
    """
    name: str = TEXT_GRADE_LEVEL
    display_name: str = TEXT_GRADE_LEVEL_DISPLAY_NAME
    provider: str = TEXTSTAT
    method: str = FLESCH_KINCAID_GRADE
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class TextGradeLevelMetric(GenAIMetric):
    """
    Defines the Text Grade Level metric class.

    The Text Grade Level metric measures the approximate reading US grade level of a text.
    It is computed using the flesch_kincaid_grade method.
    Its possible values typically range from 0 to 12+

    - Negative scores are rare and only occur with artificially simple texts.
    - No strict upper limit—some highly complex texts can score 30+, but these are extremely hard to read.

    Examples:
        1. Create Text Grade Level metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = TextGradeLevelMetric()
                result = MetricsEvaluator().evaluate(data={"generated_text": "..."},
                                                     metrics=[metric])

        2. Create Text Grade Level metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="lower_limit", value=6)
                metric = TextGradeLevelMetric(thresholds=[threshold])
    """
    # Title fixed to "Name" for consistency with every sibling field/metric.
    name: Annotated[Literal["text_grade_level"],
                    Field(title="Name",
                          description="The text grade level metric name.",
                          default=TEXT_GRADE_LEVEL, frozen=True)]
    display_name: Annotated[Literal["Text Grade Level"],
                            Field(title="Display Name",
                                  description="The text grade level metric display name.",
                                  default=TEXT_GRADE_LEVEL_DISPLAY_NAME, frozen=True)]
    method: Annotated[Literal["flesch_kincaid_grade"],
                      Field(title="Method",
                            description="The method used to compute text grade level metric.",
                            default=FLESCH_KINCAID_GRADE)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.READABILITY, frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=6)])]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> list[AggregateMetricResult]:
        """Compute the text grade level for each record and aggregate the scores.

        Args:
            data: The records to evaluate; must contain the configured output
                and record-id columns.
            configuration: The evaluation configuration.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            The aggregate result with per-record scores attached.
            NOTE(review): a single AggregateMetricResult is returned despite
            the list annotation — confirm callers expect this.
        """
        # Imported locally — presumably to avoid a circular import at module
        # load time; TODO confirm.
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        validate_output(data.columns.to_list(), configuration)
        predictions = data[configuration.output_fields[0]].to_list()
        record_ids = data[configuration.record_id_field].to_list()
        # textstat cannot score None; missing outputs are treated as empty text.
        replace_none_with_empty_string(predictions)

        all_scores = self._compute(predictions=predictions)
        # (Removed a dead `record_level_metrics = []` assignment that was
        # immediately shadowed; use self.group instead of a hard-coded group.)
        record_level_metrics = [
            TextGradeLevelResult(record_id=record_id,
                                 value=score, thresholds=self.thresholds,
                                 group=self.group.value)
            for score, record_id in zip(all_scores, record_ids)
        ]
        summary = get_summaries(all_scores)
        aggregate_metric_scores = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            provider=TEXTSTAT,
            method=self.method,
            group=self.group,
            min=summary.get("min"),
            max=summary.get("max"),
            mean=summary.get("mean"),
            # The aggregate value reported for this metric is the mean score.
            value=summary.get("mean"),
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            thresholds=self.thresholds,
        )

        return aggregate_metric_scores

    def _compute(self, predictions: list) -> list:
        """Return the Flesch-Kincaid grade level for each prediction."""
        return [textstat.flesch_kincaid_grade(pred) for pred in predictions]
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.text_reading_ease.text_reading_ease_metric import \
|
|
20
|
+
TextReadingEaseMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TextReadingEaseDecorator(BaseMetricDecorator):
    """Decorator provider for computing the text reading ease metric on agentic nodes."""

    def evaluate_text_reading_ease(self,
                                   func: Optional[Callable] = None,
                                   *,
                                   configuration: Optional[AgenticAIConfiguration] = None,
                                   metrics: Optional[list[GenAIMetric]] = None
                                   ) -> dict:
        """
        An evaluation decorator for computing text reading ease metric on an agentic node.

        Args:
            func: The node function to decorate. When omitted, a partial is
                returned so the decorator can be applied with keyword arguments.
            configuration: Optional agentic AI configuration for the evaluation.
            metrics: The metrics to compute. Defaults to ``[TextReadingEaseMetric()]``.

        Returns:
            The decorated function's original result.

        Raises:
            Exception: Wraps any error raised during validation or metric
                computation, chained to the original cause.
        """
        # Support usage both as a bare decorator and with keyword arguments.
        if func is None:
            return partial(self.evaluate_text_reading_ease, configuration=configuration, metrics=metrics)

        # None default instead of a mutable `[]` default argument; fall back
        # to the default metric when nothing was supplied.
        if not metrics:
            metrics = [TextReadingEaseMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(TextReadingEaseMetric,))

                # Readability is computed on the node's output fields only.
                metric_outputs = [EvaluatorFields.OUTPUT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=[],
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating text reading ease metric on {func.__name__}.") from ex

        return wrapper(func)
|