ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import \
|
|
20
|
+
ContextRelevanceMetric
|
|
21
|
+
from ibm_watsonx_gov.metrics.ndcg.ndcg_metric import NDCGMetric
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NDCGDecorator(BaseMetricDecorator):
    def evaluate_ndcg(self,
                      func: Optional[Callable] = None,
                      *,
                      configuration: Optional[AgenticAIConfiguration] = None,
                      metrics: Optional[list[GenAIMetric]] = None
                      ) -> dict:
        """
        An evaluation decorator for computing ndcg metric on an agentic node.

        Args:
            func: The agentic node function to wrap. When ``None`` (decorator
                used with keyword arguments), a partial is returned so the
                decorator can be applied in a second step.
            configuration: Optional agentic AI configuration used when
                computing the metric.
            metrics: The metrics to compute. Defaults to ``[NDCGMetric()]``
                when empty or not provided.

        Raises:
            Exception: If the metric computation fails; the original error is
                chained as the cause.
        """
        if func is None:
            # Decorator was applied with keyword arguments only; defer wrapping.
            return partial(self.evaluate_ndcg, configuration=configuration, metrics=metrics)

        # NOTE: use a None sentinel instead of a mutable default argument so the
        # default list is never shared across calls.
        if not metrics:
            metrics = [NDCGMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(NDCGMetric, ContextRelevanceMetric))

                metric_inputs = [EvaluatorFields.INPUT_FIELDS]
                metric_outputs = [EvaluatorFields.CONTEXT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs,
                                                      metric_outputs=metric_outputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while evaluating ndcg metric on {func.__name__}.") from ex

        return wrapper(func)
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Any, Literal
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from pydantic import Field, TypeAdapter, field_validator
|
|
15
|
+
from sklearn.metrics import ndcg_score
|
|
16
|
+
|
|
17
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
18
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
19
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
20
|
+
RecordMetricResult)
|
|
21
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
22
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
23
|
+
from ibm_watsonx_gov.metrics.context_relevance.context_relevance_metric import (
|
|
24
|
+
CONTEXT_RELEVANCE, ContextRelevanceMetric, ContextRelevanceResult)
|
|
25
|
+
|
|
26
|
+
NDCG = "ndcg"
|
|
27
|
+
NDCG_DISPLAY_NAME = "Normalized Discounted Cumulative Gain"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class NDCGResult(RecordMetricResult):
    """Record-level result for the NDCG (Normalized Discounted Cumulative Gain) metric."""
    name: str = NDCG
    display_name: str = NDCG_DISPLAY_NAME
    # NDCG measures ranking quality of retrieved contexts, hence the group.
    group: MetricGroup = MetricGroup.RETRIEVAL_QUALITY
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class NDCGMetric(GenAIMetric):
|
|
37
|
+
"""
|
|
38
|
+
Defines the NDCG(Normalized Discounted Cumulative Gain) metric class.
|
|
39
|
+
|
|
40
|
+
The Normalized Discounted Cumulative Gain metric measures the ranking quality of the retrieved contexts.
|
|
41
|
+
The Context Relevance metric is computed as a pre requisite to compute this metric.
|
|
42
|
+
|
|
43
|
+
Examples:
|
|
44
|
+
1. Create NDCG metric with default parameters and compute using metrics evaluator.
|
|
45
|
+
.. code-block:: python
|
|
46
|
+
|
|
47
|
+
metric = NDCGMetric()
|
|
48
|
+
result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": "..."},
|
|
49
|
+
metrics=[metric])
|
|
50
|
+
# A list of contexts can also be passed as shown below
|
|
51
|
+
result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
|
|
52
|
+
metrics=[metric])
|
|
53
|
+
|
|
54
|
+
2. Create NDCG metric with a custom threshold.
|
|
55
|
+
.. code-block:: python
|
|
56
|
+
|
|
57
|
+
threshold = MetricThreshold(type="lower_limit", value=0.5)
|
|
58
|
+
metric = NDCGMetric(method=method, threshold=threshold)
|
|
59
|
+
|
|
60
|
+
3. Create NDCG metric with llm_as_judge method.
|
|
61
|
+
.. code-block:: python
|
|
62
|
+
|
|
63
|
+
# Define LLM Judge using watsonx.ai
|
|
64
|
+
# To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
|
|
65
|
+
llm_judge = LLMJudge(model=WxAIFoundationModel(
|
|
66
|
+
model_id="ibm/granite-3-3-8b-instruct",
|
|
67
|
+
project_id="<PROJECT_ID>"
|
|
68
|
+
))
|
|
69
|
+
cr_metric = ContextRelevanceMetric(llm_judge=llm_judge)
|
|
70
|
+
ap_metric = NDCGMetric()
|
|
71
|
+
result = MetricsEvaluator().evaluate(data={"input_text": "...", "context": ["...", "..."]},
|
|
72
|
+
metrics=[cr_metric, ap_metric])
|
|
73
|
+
"""
|
|
74
|
+
name: Annotated[Literal["ndcg"],
|
|
75
|
+
Field(title="Name",
|
|
76
|
+
description="The ndcg metric name.",
|
|
77
|
+
default=NDCG, frozen=True)]
|
|
78
|
+
display_name: Annotated[Literal["Normalized Discounted Cumulative Gain"],
|
|
79
|
+
Field(title="Display Name",
|
|
80
|
+
description="The ndcg metric display name.",
|
|
81
|
+
default=NDCG_DISPLAY_NAME, frozen=True)]
|
|
82
|
+
tasks: Annotated[list[TaskType],
|
|
83
|
+
Field(title="Tasks",
|
|
84
|
+
description="The list of supported tasks.",
|
|
85
|
+
default=[TaskType.RAG])]
|
|
86
|
+
thresholds: Annotated[list[MetricThreshold],
|
|
87
|
+
Field(title="Thresholds",
|
|
88
|
+
description="The metric thresholds.",
|
|
89
|
+
default=[MetricThreshold(type="lower_limit", value=0.7)])]
|
|
90
|
+
metric_dependencies: Annotated[list[GenAIMetric],
|
|
91
|
+
Field(title="Metric dependencies",
|
|
92
|
+
description="The list of metric dependencies",
|
|
93
|
+
default=[ContextRelevanceMetric()])]
|
|
94
|
+
group: Annotated[MetricGroup,
|
|
95
|
+
Field(title="Group",
|
|
96
|
+
description="The metric group.",
|
|
97
|
+
default=MetricGroup.RETRIEVAL_QUALITY, frozen=True)]
|
|
98
|
+
|
|
99
|
+
@field_validator("metric_dependencies", mode="before")
|
|
100
|
+
@classmethod
|
|
101
|
+
def metric_dependencies_validator(cls, value: Any):
|
|
102
|
+
if value:
|
|
103
|
+
value = [TypeAdapter(Annotated[ContextRelevanceMetric, Field(
|
|
104
|
+
discriminator="name")]).validate_python(
|
|
105
|
+
m) for m in value]
|
|
106
|
+
return value
|
|
107
|
+
|
|
108
|
+
def evaluate(
|
|
109
|
+
self,
|
|
110
|
+
data: pd.DataFrame,
|
|
111
|
+
configuration: GenAIConfiguration | AgenticAIConfiguration,
|
|
112
|
+
metrics_result: list[AggregateMetricResult],
|
|
113
|
+
**kwargs,
|
|
114
|
+
) -> AggregateMetricResult:
|
|
115
|
+
record_level_metrics = []
|
|
116
|
+
scores = []
|
|
117
|
+
|
|
118
|
+
context_relevance_result: list[ContextRelevanceResult] = next(
|
|
119
|
+
(metric_result.record_level_metrics for metric_result in metrics_result if metric_result.name == CONTEXT_RELEVANCE), None)
|
|
120
|
+
|
|
121
|
+
if context_relevance_result is None:
|
|
122
|
+
raise Exception(
|
|
123
|
+
f"Failed to evaluate {self.name} metric. Missing context relevance metric result")
|
|
124
|
+
|
|
125
|
+
for relevance_result in context_relevance_result:
|
|
126
|
+
score = self.__compute(
|
|
127
|
+
relevance_result.additional_info.get(
|
|
128
|
+
"contexts_values", []),)
|
|
129
|
+
scores.append(score)
|
|
130
|
+
record_level_metrics.append(
|
|
131
|
+
NDCGResult(
|
|
132
|
+
method="",
|
|
133
|
+
provider="",
|
|
134
|
+
record_id=relevance_result.record_id,
|
|
135
|
+
value=score,
|
|
136
|
+
thresholds=self.thresholds
|
|
137
|
+
)
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
mean = sum(scores) / len(scores)
|
|
141
|
+
aggregate_metric_score = AggregateMetricResult(
|
|
142
|
+
name=self.name,
|
|
143
|
+
display_name=self.display_name,
|
|
144
|
+
method="",
|
|
145
|
+
provider="",
|
|
146
|
+
group=self.group,
|
|
147
|
+
min=min(scores),
|
|
148
|
+
max=max(scores),
|
|
149
|
+
mean=mean,
|
|
150
|
+
value=mean,
|
|
151
|
+
total_records=len(record_level_metrics),
|
|
152
|
+
record_level_metrics=record_level_metrics,
|
|
153
|
+
thresholds=self.thresholds
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
return aggregate_metric_score
|
|
157
|
+
|
|
158
|
+
def __compute(self, relevance_scores: list[float]) -> float:
|
|
159
|
+
if len(relevance_scores) < 2:
|
|
160
|
+
return 1.0
|
|
161
|
+
|
|
162
|
+
true_relevance = np.sort(relevance_scores)[::-1]
|
|
163
|
+
|
|
164
|
+
ndcg_value = ndcg_score([true_relevance], [relevance_scores])
|
|
165
|
+
|
|
166
|
+
return round(ndcg_value, 4)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import Callable, Optional
|
|
11
|
+
|
|
12
|
+
from wrapt import decorator
|
|
13
|
+
|
|
14
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
15
|
+
AgenticAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import EvaluatorFields
|
|
17
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
18
|
+
from ibm_watsonx_gov.metrics.base_metric_decorator import BaseMetricDecorator
|
|
19
|
+
from ibm_watsonx_gov.metrics.output_token_count.output_token_count_metric import \
|
|
20
|
+
OutputTokenCountMetric
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class OutputTokenCountDecorator(BaseMetricDecorator):
    """Decorator helper that computes the output token count metric around a function."""

    def evaluate_output_token_count(self,
                                    func: Optional[Callable] = None,
                                    *,
                                    configuration: Optional[AgenticAIConfiguration] = None,
                                    metrics: Optional[list[GenAIMetric]] = None
                                    ) -> dict:
        """
        An evaluation decorator for computing total output token count.

        Args:
            func: The function to wrap. When omitted, a partial is returned so
                the decorator can be applied with keyword arguments only.
            configuration: Optional agentic AI configuration.
            metrics: Metrics to compute; defaults to a single OutputTokenCountMetric.

        Note: ``metrics`` defaults to None instead of ``[]`` to avoid the
        shared-mutable-default pitfall; behavior is unchanged because the
        empty/None case is normalized below.
        """
        if func is None:
            return partial(self.evaluate_output_token_count, configuration=configuration, metrics=metrics)

        # Normalize the default: a fresh list is built per call.
        if not metrics:
            metrics = [OutputTokenCountMetric()]

        @decorator
        def wrapper(func, instance, args, kwargs):

            try:
                # Only OutputTokenCountMetric instances are valid here.
                self.validate(func=func, metrics=metrics,
                              valid_metric_types=(OutputTokenCountMetric,))

                metric_inputs = [EvaluatorFields.OUTPUT_TOKEN_COUNT_FIELDS]

                original_result = self.compute_helper(func=func, args=args, kwargs=kwargs,
                                                      configuration=configuration,
                                                      metrics=metrics,
                                                      metric_inputs=metric_inputs)

                return original_result
            except Exception as ex:
                raise Exception(
                    f"There was an error while tracking total output token count on {func.__name__},") from ex

        return wrapper(func)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import re
|
|
14
|
+
from pydantic import Field
|
|
15
|
+
|
|
16
|
+
from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
|
|
17
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
18
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
|
|
19
|
+
RecordMetricResult)
|
|
20
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
21
|
+
from ibm_watsonx_gov.utils.validation_util import validate_field
|
|
22
|
+
|
|
23
|
+
# Canonical metric identifier and its human-readable display name.
OUTPUT_TOKEN_COUNT = "output_token_count"
OUTPUT_TOKEN_COUNT_DISPLAY_NAME = "Output Token Count"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class OutputTokenCountResult(RecordMetricResult):
    """Record-level result entry for the output token count metric."""
    # Fixed metric identifier and display name for every record result.
    name: str = OUTPUT_TOKEN_COUNT
    display_name: str = OUTPUT_TOKEN_COUNT_DISPLAY_NAME
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class OutputTokenCountMetric(GenAIMetric):
    """
    Defines the output token count metric class.

    The output token count metric keeps track of the LLM output token count.

    Examples:
        1. Create output token count metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = OutputTokenCountMetric()
                result = MetricsEvaluator().evaluate(data={"completion_tokens": "..."},
                                                     metrics=[metric])
    """
    name: Annotated[Literal["output_token_count"],
                    Field(title="name",
                          description="The output token count metric name.",
                          default=OUTPUT_TOKEN_COUNT, frozen=True)]
    display_name: Annotated[Literal["Output Token Count"],
                            Field(title="Display Name",
                                  description="The output token count metric display name.",
                                  default=OUTPUT_TOKEN_COUNT_DISPLAY_NAME, frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.USAGE, frozen=True)]

    def evaluate(
        self,
        data: pd.DataFrame,
        configuration: GenAIConfiguration | AgenticAIConfiguration,
        **kwargs,
    ) -> AggregateMetricResult:
        """Compute record-level and aggregate output token counts.

        Note: the return annotation was corrected from
        ``list[AggregateMetricResult]`` — the method builds and returns a
        single AggregateMetricResult.
        """
        from ibm_watsonx_gov.utils.aggregation_util import get_summaries

        validate_field("output_token_count_fields", configuration)
        record_ids = data[configuration.record_id_field].to_list()
        # Missing token counts are treated as zero.
        data = data.fillna(0)
        output_tokens = self._evaluate(data, configuration)
        record_level_metrics = [
            OutputTokenCountResult(record_id=record_id,
                                   value=token, group=MetricGroup.USAGE.value)
            for token, record_id in zip(output_tokens, record_ids)
        ]
        summary = get_summaries(output_tokens)
        aggregate_metric_scores = AggregateMetricResult(
            name=self.name,
            display_name=self.display_name,
            group=self.group,
            min=summary.get("min"),
            max=summary.get("max"),
            mean=summary.get("mean"),
            value=summary.get("mean"),
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
        )

        return aggregate_metric_scores

    def _evaluate(self, data: pd.DataFrame, config: GenAIConfiguration | AgenticAIConfiguration) -> list:
        """
        Sum the output token counts row-wise across all configured columns.

        Each entry in ``config.output_token_count_fields`` is treated as a
        regex that must fully match a column name.
        """
        matched_cols = []

        for pattern in config.output_token_count_fields:
            # Compile regex pattern for safety and performance
            regex = re.compile(pattern)
            # Filter columns matching this pattern
            matched = [col for col in data.columns if regex.fullmatch(col)]
            matched_cols.extend(matched)

        # Remove duplicates in case multiple patterns match the same column
        # (set order is irrelevant: row-wise summation is order-independent).
        matched_cols = list(set(matched_cols))

        # Sum across these columns row-wise
        return data[matched_cols].sum(axis=1).tolist()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from ibm_watsonx_gov.metrics.pii.input_pii_metric import InputPIIMetric
|
|
11
|
+
from ibm_watsonx_gov.metrics.pii.output_pii_metric import OutputPIIMetric
|
|
12
|
+
from ibm_watsonx_gov.metrics.pii.pii_metric import PIIMetric
|
|
13
|
+
|
|
14
|
+
__all__ = ["PIIMetric", "InputPIIMetric", "OutputPIIMetric"]
|
|
15
|
+
|
|
16
|
+
# Made with Bob
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
|
|
23
|
+
INPUT_PII = "input_pii"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class InputPIIMetric(GenAIMetric):
    """
    Defines the Input PII metric class.

    The Input PII metric measures if your model input data contains any personally identifiable information.
    It is computed using the Watson Natural Language Processing entity extraction model on the input data.

    Examples:
        1. Create Input PII metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = InputPIIMetric()
                result = MetricsEvaluator().evaluate(data={"input_text": "..."},
                                                     metrics=[metric])

        2. Create Input PII metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.5)
                metric = InputPIIMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["input_pii"],
                    Field(title="Name",
                          description="The input pii metric name.",
                          default=INPUT_PII, frozen=True)]
    display_name: Annotated[Literal["Input PII"],
                            Field(title="Display Name",
                                  description="The input pii metric display name.",
                                  default="Input PII", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.1)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> AggregateMetricResult:
        """Asynchronously evaluate input PII via the detectors provider."""
        # NOTE(review): self.method is presumably declared on the GenAIMetric
        # base class — confirm.
        provider = DetectorsProvider(configuration=configuration,
                                     metric_name="pii",
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        # The provider reports under the generic "pii" name; relabel the
        # aggregate and every record result as input_pii.
        aggregated_metric_result.name = self.name
        for record in aggregated_metric_result.record_level_metrics:
            record.name = self.name
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
101
|
+
|
|
102
|
+
# Made with Bob
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from ibm_watsonx_gov.config.gen_ai_configuration import GenAIConfiguration
|
|
16
|
+
from ibm_watsonx_gov.entities.enums import MetricGroup, TaskType
|
|
17
|
+
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
|
|
18
|
+
from ibm_watsonx_gov.entities.metric import GenAIMetric
|
|
19
|
+
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
|
|
20
|
+
from ibm_watsonx_gov.providers.detectors_provider import DetectorsProvider
|
|
21
|
+
from ibm_watsonx_gov.utils.async_util import run_in_event_loop
|
|
22
|
+
|
|
23
|
+
OUTPUT_PII = "output_pii"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class OutputPIIMetric(GenAIMetric):
    """
    Defines the Output PII metric class.

    The Output PII metric measures if your model output data contains any personally identifiable information.
    It is computed using the Watson Natural Language Processing entity extraction model on the output data.

    Examples:
        1. Create Output PII metric with default parameters and compute using metrics evaluator.
            .. code-block:: python

                metric = OutputPIIMetric()
                result = MetricsEvaluator().evaluate(data={"generated_text": "..."},
                                                     metrics=[metric])

        2. Create Output PII metric with a custom threshold.
            .. code-block:: python

                threshold = MetricThreshold(type="upper_limit", value=0.5)
                metric = OutputPIIMetric(thresholds=[threshold])
    """
    name: Annotated[Literal["output_pii"],
                    Field(title="Name",
                          description="The output pii metric name.",
                          default=OUTPUT_PII, frozen=True)]
    display_name: Annotated[Literal["Output PII"],
                            Field(title="Display Name",
                                  description="The output pii metric display name.",
                                  default="Output PII", frozen=True)]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=TaskType.values(), frozen=True)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="upper_limit", value=0.1)])]
    group: Annotated[MetricGroup,
                     Field(title="Group",
                           description="The metric group.",
                           default=MetricGroup.CONTENT_SAFETY, frozen=True)]

    async def evaluate_async(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs
    ) -> AggregateMetricResult:
        """Asynchronously evaluate output PII via the detectors provider."""
        # Create a modified configuration that uses output_fields as input_fields
        # This allows DetectorsProvider to process output data
        modified_config = configuration.model_copy(deep=True)
        modified_config.input_fields = configuration.output_fields

        # NOTE(review): self.method is presumably declared on the GenAIMetric
        # base class — confirm.
        provider = DetectorsProvider(configuration=modified_config,
                                     metric_name="pii",
                                     metric_display_name=self.display_name,
                                     metric_method=self.method,
                                     metric_group=self.group,
                                     thresholds=self.thresholds,
                                     **kwargs)
        aggregated_metric_result = await provider.evaluate_async(data=data)
        # The provider reports under the generic "pii" name; relabel the
        # aggregate and every record result as output_pii.
        aggregated_metric_result.name = self.name
        for record in aggregated_metric_result.record_level_metrics:
            record.name = self.name
        return aggregated_metric_result

    def evaluate(
        self,
        data: pd.DataFrame | dict,
        configuration: GenAIConfiguration,
        **kwargs,
    ):
        # If ran in sync mode, block until it is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )
|
|
106
|
+
|
|
107
|
+
# Made with Bob
|