azure-ai-evaluation 1.2.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/CHANGELOG.md +61 -0
- {azure_ai_evaluation-1.2.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.4.0}/PKG-INFO +68 -8
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/README.md +4 -8
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/TROUBLESHOOTING.md +0 -5
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/__init__.py +42 -14
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_azure/_models.py +6 -6
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/constants.py +6 -2
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/rai_service.py +38 -4
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/utils.py +30 -10
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_constants.py +10 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_converters/_models.py +302 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +228 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_exceptions.py +5 -1
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/__init__.py +3 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_default_converter.py +21 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_red_team.py +1858 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/_vendor/__init__.py +3 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_version.py +2 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure_ai_evaluation-1.4.0/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0/azure_ai_evaluation.egg-info}/PKG-INFO +68 -8
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure_ai_evaluation.egg-info/SOURCES.txt +112 -9
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure_ai_evaluation.egg-info/requires.txt +3 -0
- azure_ai_evaluation-1.4.0/migration_guide.md +243 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/agent_evaluation.ipynb +329 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/instructions.md +40 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/intent_resolution.ipynb +452 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/response_completeness.ipynb +209 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/sample_synthetic_conversations.jsonl +90 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/task_adherence.ipynb +245 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/tool_call_accuracy.ipynb +365 -0
- azure_ai_evaluation-1.4.0/samples/agent_evaluators/user_functions.py +268 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/samples/evaluation_samples_evaluate.py +110 -4
- azure_ai_evaluation-1.4.0/samples/evaluation_samples_safety_evaluation.py +299 -0
- azure_ai_evaluation-1.4.0/samples/evaluation_samples_threshold.py +367 -0
- azure_ai_evaluation-1.4.0/samples/red_team_samples.py +567 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/setup.py +4 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/conftest.py +82 -92
- azure_ai_evaluation-1.4.0/tests/converters/ai_agent_converter/serialization_helper.py +110 -0
- azure_ai_evaluation-1.4.0/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py +189 -0
- azure_ai_evaluation-1.4.0/tests/converters/ai_agent_converter/test_run_ids_from_conversation.py +35 -0
- azure_ai_evaluation-1.4.0/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/test_adv_simulator.py +159 -3
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/test_builtin_evaluators.py +65 -16
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/test_evaluate.py +1 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/test_lite_management_client.py +7 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/test_mass_evaluate.py +196 -97
- azure_ai_evaluation-1.4.0/tests/e2etests/test_prompty_async.py +187 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/test_sim_and_eval.py +233 -12
- azure_ai_evaluation-1.4.0/tests/unittests/test_agent_evaluators.py +117 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_built_in_evaluator.py +4 -2
- azure_ai_evaluation-1.4.0/tests/unittests/test_completeness_evaluator.py +92 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_eval_run.py +1 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_evaluate.py +17 -2
- azure_ai_evaluation-1.4.0/tests/unittests/test_evaluators/test_conversation_thresholds.py +215 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_evaluators/test_service_evaluator_thresholds.py +221 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_evaluators/test_threshold_behavior.py +221 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/__init__.py +3 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_attack_objective_generator.py +199 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_attack_strategy.py +82 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_callback_chat_target.py +143 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_constants.py +67 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_formatting_utils.py +255 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_red_team.py +1174 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_red_team_result.py +267 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_redteam/test_strategy_utils.py +219 -0
- azure_ai_evaluation-1.4.0/tests/unittests/test_safety_evaluation.py +245 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_save_eval.py +1 -1
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_synthetic_callback_conv_bot.py +5 -4
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +0 -119
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_azure/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_azure/_clients.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_azure/_token_manager.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/_experimental.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/math.py +0 -0
- {azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluate → azure_ai_evaluation-1.4.0/azure/ai/evaluation/_converters}/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators → azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluate}/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluate/_utils.py +0 -0
- {azure_ai_evaluation-1.2.0/azure/ai/evaluation/_vendor → azure_ai_evaluation-1.4.0/azure/ai/evaluation/_evaluators}/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_http_utils.py +0 -0
- {azure_ai_evaluation-1.2.0/azure/ai/evaluation/simulator/_data_sources → azure_ai_evaluation-1.4.0/azure/ai/evaluation/_legacy}/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_model_configurations.py +0 -0
- /azure_ai_evaluation-1.2.0/azure/ai/evaluation/py.typed → /azure_ai_evaluation-1.4.0/azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
- /azure_ai_evaluation-1.2.0/azure/ai/evaluation/simulator/_prompty/__init__.py → /azure_ai_evaluation-1.4.0/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
- {azure_ai_evaluation-1.2.0/tests → azure_ai_evaluation-1.4.0/azure/ai/evaluation/simulator/_prompty}/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_simulator.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_tracing.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/simulator/_utils.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/pyproject.toml +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/samples/README.md +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/samples/data/evaluate_test_data.jsonl +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/samples/evaluation_samples_common.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/samples/evaluation_samples_simulate.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/setup.cfg +0 -0
- {azure_ai_evaluation-1.2.0/tests/e2etests → azure_ai_evaluation-1.4.0/tests}/__init__.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/e2etests/test_metrics_upload.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_batch_run_context.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_content_safety_rai_script.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_evaluate_performance.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_evaluate_telemetry.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_evaluators/slow_eval.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_non_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_simulator.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
- {azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/tests/unittests/test_utils.py +0 -0
|
@@ -1,5 +1,66 @@
|
|
|
1
1
|
# Release History
|
|
2
2
|
|
|
3
|
+
## 1.4.0 (2025-03-27)
|
|
4
|
+
|
|
5
|
+
### Features Added
|
|
6
|
+
- Enhanced binary evaluation results with customizable thresholds
|
|
7
|
+
- Added threshold support for QA and ContentSafety evaluators
|
|
8
|
+
- Evaluation results now include both the score and threshold values
|
|
9
|
+
- Configurable threshold parameter allows custom binary classification boundaries
|
|
10
|
+
- Default thresholds provided for backward compatibility
|
|
11
|
+
- Quality evaluators use "higher is better" scoring (score ≥ threshold is positive)
|
|
12
|
+
- Content safety evaluators use "lower is better" scoring (score ≤ threshold is positive)
|
|
13
|
+
- New Built-in evaluator called CodeVulnerabilityEvaluator is added.
|
|
14
|
+
- It provides capabilities to identify the following code vulnerabilities.
|
|
15
|
+
- path-injection
|
|
16
|
+
- sql-injection
|
|
17
|
+
- code-injection
|
|
18
|
+
- stack-trace-exposure
|
|
19
|
+
- incomplete-url-substring-sanitization
|
|
20
|
+
- flask-debug
|
|
21
|
+
- clear-text-logging-sensitive-data
|
|
22
|
+
- incomplete-hostname-regexp
|
|
23
|
+
- server-side-unvalidated-url-redirection
|
|
24
|
+
- weak-cryptographic-algorithm
|
|
25
|
+
- full-ssrf
|
|
26
|
+
- bind-socket-all-network-interfaces
|
|
27
|
+
- client-side-unvalidated-url-redirection
|
|
28
|
+
- likely-bugs
|
|
29
|
+
- reflected-xss
|
|
30
|
+
- clear-text-storage-sensitive-data
|
|
31
|
+
- tarslip
|
|
32
|
+
- hardcoded-credentials
|
|
33
|
+
- insecure-randomness
|
|
34
|
+
- It also supports multiple coding languages, such as Python, Java, C++, C#, Go, JavaScript, and SQL.
|
|
35
|
+
|
|
36
|
+
- New Built-in evaluator called UngroundedAttributesEvaluator is added.
|
|
37
|
+
- It evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
|
|
38
|
+
- where query represents the user query and response represents the AI system response given the provided context.
|
|
39
|
+
|
|
40
|
+
- Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class
|
|
41
|
+
- or emotional state of a person.
|
|
42
|
+
|
|
43
|
+
- It identifies the following attributes:
|
|
44
|
+
|
|
45
|
+
- emotional_state
|
|
46
|
+
- protected_class
|
|
47
|
+
- groundedness
|
|
48
|
+
- New Built-in evaluators for Agent Evaluation (Preview)
|
|
49
|
+
- IntentResolutionEvaluator - Evaluates the intent resolution of an agent's response to a user query.
|
|
50
|
+
- ResponseCompletenessEvaluator - Evaluates the response completeness of an agent's response to a user query.
|
|
51
|
+
- TaskAdherenceEvaluator - Evaluates the task adherence of an agent's response to a user query.
|
|
52
|
+
- ToolCallAccuracyEvaluator - Evaluates the accuracy of tool calls made by an agent in response to a user query.
|
|
53
|
+
|
|
54
|
+
### Bugs Fixed
|
|
55
|
+
- Fixed error in `GroundednessProEvaluator` when handling non-numeric values like "n/a" returned from the service.
|
|
56
|
+
- Uploading local evaluation results from `evaluate` with the same run name will no longer result in each online run sharing (and bashing) result files.
|
|
57
|
+
|
|
58
|
+
## 1.3.0 (2025-02-28)
|
|
59
|
+
|
|
60
|
+
### Breaking Changes
|
|
61
|
+
- Multimodal-specific evaluators `ContentSafetyMultimodalEvaluator`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` have been removed. Please use `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator` instead.
|
|
62
|
+
- Metric name in ProtectedMaterialEvaluator's output is changed from `protected_material.fictional_characters_label` to `protected_material.fictional_characters_defect_rate`. It's now consistent with other evaluators' metric names (ending with `_defect_rate`).
|
|
63
|
+
|
|
3
64
|
## 1.2.0 (2025-01-27)
|
|
4
65
|
|
|
5
66
|
### Features Added
|
{azure_ai_evaluation-1.2.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.4.0}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: azure-ai-evaluation
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: Microsoft Azure Evaluation Library for Python
|
|
5
5
|
Home-page: https://github.com/Azure/azure-sdk-for-python
|
|
6
6
|
Author: Microsoft Corporation
|
|
@@ -28,6 +28,8 @@ Requires-Dist: azure-identity>=1.16.0
|
|
|
28
28
|
Requires-Dist: azure-core>=1.30.2
|
|
29
29
|
Requires-Dist: nltk>=3.9.1
|
|
30
30
|
Requires-Dist: azure-storage-blob>=12.10.0
|
|
31
|
+
Provides-Extra: redteam
|
|
32
|
+
Requires-Dist: pyrit>=0.8.0; extra == "redteam"
|
|
31
33
|
|
|
32
34
|
# Azure AI Evaluation client library for Python
|
|
33
35
|
|
|
@@ -54,7 +56,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
|
|
|
54
56
|
### Prerequisites
|
|
55
57
|
|
|
56
58
|
- Python 3.9 or later is required to use this package.
|
|
57
|
-
- [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
|
|
59
|
+
- [Optional] You must have [Azure AI Foundry Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
|
|
58
60
|
|
|
59
61
|
### Install the package
|
|
60
62
|
|
|
@@ -63,10 +65,6 @@ Install the Azure AI Evaluation SDK for Python with [pip][pip_link]:
|
|
|
63
65
|
```bash
|
|
64
66
|
pip install azure-ai-evaluation
|
|
65
67
|
```
|
|
66
|
-
If you want to track results in [AI Studio][ai_studio], install `remote` extra:
|
|
67
|
-
```python
|
|
68
|
-
pip install azure-ai-evaluation[remote]
|
|
69
|
-
```
|
|
70
68
|
|
|
71
69
|
## Key concepts
|
|
72
70
|
|
|
@@ -175,9 +173,9 @@ result = evaluate(
|
|
|
175
173
|
}
|
|
176
174
|
}
|
|
177
175
|
}
|
|
178
|
-
# Optionally provide your AI
|
|
176
|
+
# Optionally provide your AI Foundry project information to track your evaluation results in your Azure AI Foundry project
|
|
179
177
|
azure_ai_project = azure_ai_project,
|
|
180
|
-
# Optionally provide an output path to dump a json of metric summary, row level data and metric and
|
|
178
|
+
# Optionally provide an output path to dump a json of metric summary, row level data and metric and AI Foundry URL
|
|
181
179
|
output_path="./evaluation_results.json"
|
|
182
180
|
)
|
|
183
181
|
```
|
|
@@ -375,8 +373,70 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
|
|
|
375
373
|
[simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Context-Relevant_Data/Simulate_From_Conversation_Starter
|
|
376
374
|
[adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks
|
|
377
375
|
|
|
376
|
+
|
|
378
377
|
# Release History
|
|
379
378
|
|
|
379
|
+
## 1.4.0 (2025-03-27)
|
|
380
|
+
|
|
381
|
+
### Features Added
|
|
382
|
+
- Enhanced binary evaluation results with customizable thresholds
|
|
383
|
+
- Added threshold support for QA and ContentSafety evaluators
|
|
384
|
+
- Evaluation results now include both the score and threshold values
|
|
385
|
+
- Configurable threshold parameter allows custom binary classification boundaries
|
|
386
|
+
- Default thresholds provided for backward compatibility
|
|
387
|
+
- Quality evaluators use "higher is better" scoring (score ≥ threshold is positive)
|
|
388
|
+
- Content safety evaluators use "lower is better" scoring (score ≤ threshold is positive)
|
|
389
|
+
- New Built-in evaluator called CodeVulnerabilityEvaluator is added.
|
|
390
|
+
- It provides capabilities to identify the following code vulnerabilities.
|
|
391
|
+
- path-injection
|
|
392
|
+
- sql-injection
|
|
393
|
+
- code-injection
|
|
394
|
+
- stack-trace-exposure
|
|
395
|
+
- incomplete-url-substring-sanitization
|
|
396
|
+
- flask-debug
|
|
397
|
+
- clear-text-logging-sensitive-data
|
|
398
|
+
- incomplete-hostname-regexp
|
|
399
|
+
- server-side-unvalidated-url-redirection
|
|
400
|
+
- weak-cryptographic-algorithm
|
|
401
|
+
- full-ssrf
|
|
402
|
+
- bind-socket-all-network-interfaces
|
|
403
|
+
- client-side-unvalidated-url-redirection
|
|
404
|
+
- likely-bugs
|
|
405
|
+
- reflected-xss
|
|
406
|
+
- clear-text-storage-sensitive-data
|
|
407
|
+
- tarslip
|
|
408
|
+
- hardcoded-credentials
|
|
409
|
+
- insecure-randomness
|
|
410
|
+
- It also supports multiple coding languages such as (Python, Java, C++, C#, Go, Javascript, SQL)
|
|
411
|
+
|
|
412
|
+
- New Built-in evaluator called UngroundedAttributesEvaluator is added.
|
|
413
|
+
- It evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
|
|
414
|
+
- where query represents the user query and response represents the AI system response given the provided context.
|
|
415
|
+
|
|
416
|
+
- Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class
|
|
417
|
+
- or emotional state of a person.
|
|
418
|
+
|
|
419
|
+
- It identifies the following attributes:
|
|
420
|
+
|
|
421
|
+
- emotional_state
|
|
422
|
+
- protected_class
|
|
423
|
+
- groundedness
|
|
424
|
+
- New Built-in evaluators for Agent Evaluation (Preview)
|
|
425
|
+
- IntentResolutionEvaluator - Evaluates the intent resolution of an agent's response to a user query.
|
|
426
|
+
- ResponseCompletenessEvaluator - Evaluates the response completeness of an agent's response to a user query.
|
|
427
|
+
- TaskAdherenceEvaluator - Evaluates the task adherence of an agent's response to a user query.
|
|
428
|
+
- ToolCallAccuracyEvaluator - Evaluates the accuracy of tool calls made by an agent in response to a user query.
|
|
429
|
+
|
|
430
|
+
### Bugs Fixed
|
|
431
|
+
- Fixed error in `GroundednessProEvaluator` when handling non-numeric values like "n/a" returned from the service.
|
|
432
|
+
- Uploading local evaluation results from `evaluate` with the same run name will no longer result in each online run sharing (and bashing) result files.
|
|
433
|
+
|
|
434
|
+
## 1.3.0 (2025-02-28)
|
|
435
|
+
|
|
436
|
+
### Breaking Changes
|
|
437
|
+
- Multimodal specific evaluators `ContentSafetyMultimodalEvaluator`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` has been removed. Please use `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator` instead.
|
|
438
|
+
- Metric name in ProtectedMaterialEvaluator's output is changed from `protected_material.fictional_characters_label` to `protected_material.fictional_characters_defect_rate`. It's now consistent with other evaluator's metric names (ending with `_defect_rate`).
|
|
439
|
+
|
|
380
440
|
## 1.2.0 (2025-01-27)
|
|
381
441
|
|
|
382
442
|
### Features Added
|
|
@@ -23,7 +23,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
|
|
|
23
23
|
### Prerequisites
|
|
24
24
|
|
|
25
25
|
- Python 3.9 or later is required to use this package.
|
|
26
|
-
- [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
|
|
26
|
+
- [Optional] You must have [Azure AI Foundry Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
|
|
27
27
|
|
|
28
28
|
### Install the package
|
|
29
29
|
|
|
@@ -32,10 +32,6 @@ Install the Azure AI Evaluation SDK for Python with [pip][pip_link]:
|
|
|
32
32
|
```bash
|
|
33
33
|
pip install azure-ai-evaluation
|
|
34
34
|
```
|
|
35
|
-
If you want to track results in [AI Studio][ai_studio], install `remote` extra:
|
|
36
|
-
```python
|
|
37
|
-
pip install azure-ai-evaluation[remote]
|
|
38
|
-
```
|
|
39
35
|
|
|
40
36
|
## Key concepts
|
|
41
37
|
|
|
@@ -144,9 +140,9 @@ result = evaluate(
|
|
|
144
140
|
}
|
|
145
141
|
}
|
|
146
142
|
}
|
|
147
|
-
# Optionally provide your AI
|
|
143
|
+
# Optionally provide your AI Foundry project information to track your evaluation results in your Azure AI Foundry project
|
|
148
144
|
azure_ai_project = azure_ai_project,
|
|
149
|
-
# Optionally provide an output path to dump a json of metric summary, row level data and metric and
|
|
145
|
+
# Optionally provide an output path to dump a json of metric summary, row level data and metric and AI Foundry URL
|
|
150
146
|
output_path="./evaluation_results.json"
|
|
151
147
|
)
|
|
152
148
|
```
|
|
@@ -342,4 +338,4 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
|
|
|
342
338
|
[adversarial_simulation_scenarios]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#supported-adversarial-simulation-scenarios
|
|
343
339
|
[adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Adversarial_Data
|
|
344
340
|
[simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Context-Relevant_Data/Simulate_From_Conversation_Starter
|
|
345
|
-
[adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks
|
|
341
|
+
[adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks
|
|
@@ -17,11 +17,6 @@ This guide walks you through how to investigate failures, common errors in the `
|
|
|
17
17
|
### Troubleshoot Remote Tracking Issues
|
|
18
18
|
|
|
19
19
|
- Before running `evaluate()`, to ensure that you can enable logging and tracing to your Azure AI project, make sure you are first logged in by running `az login`.
|
|
20
|
-
- Then install the following sub-package:
|
|
21
|
-
|
|
22
|
-
```Shell
|
|
23
|
-
pip install azure-ai-evaluation[remote]
|
|
24
|
-
```
|
|
25
20
|
|
|
26
21
|
- Ensure that you assign the proper permissions to the storage account linked to your Azure AI Studio hub. This can be done with the following command. More information can be found [here](https://aka.ms/credentialleshub).
|
|
27
22
|
|
|
@@ -12,27 +12,25 @@ from ._evaluators._content_safety import (
|
|
|
12
12
|
SexualEvaluator,
|
|
13
13
|
ViolenceEvaluator,
|
|
14
14
|
)
|
|
15
|
-
from ._evaluators._multimodal._content_safety_multimodal import (
|
|
16
|
-
ContentSafetyMultimodalEvaluator,
|
|
17
|
-
HateUnfairnessMultimodalEvaluator,
|
|
18
|
-
SelfHarmMultimodalEvaluator,
|
|
19
|
-
SexualMultimodalEvaluator,
|
|
20
|
-
ViolenceMultimodalEvaluator,
|
|
21
|
-
)
|
|
22
|
-
from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
|
|
23
15
|
from ._evaluators._f1_score import F1ScoreEvaluator
|
|
24
16
|
from ._evaluators._fluency import FluencyEvaluator
|
|
25
17
|
from ._evaluators._gleu import GleuScoreEvaluator
|
|
26
18
|
from ._evaluators._groundedness import GroundednessEvaluator
|
|
27
19
|
from ._evaluators._service_groundedness import GroundednessProEvaluator
|
|
20
|
+
from ._evaluators._intent_resolution import IntentResolutionEvaluator
|
|
28
21
|
from ._evaluators._meteor import MeteorScoreEvaluator
|
|
29
22
|
from ._evaluators._protected_material import ProtectedMaterialEvaluator
|
|
30
23
|
from ._evaluators._qa import QAEvaluator
|
|
24
|
+
from ._evaluators._response_completeness import ResponseCompletenessEvaluator
|
|
25
|
+
from ._evaluators._task_adherence import TaskAdherenceEvaluator
|
|
31
26
|
from ._evaluators._relevance import RelevanceEvaluator
|
|
32
27
|
from ._evaluators._retrieval import RetrievalEvaluator
|
|
33
28
|
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
|
|
34
29
|
from ._evaluators._similarity import SimilarityEvaluator
|
|
35
30
|
from ._evaluators._xpia import IndirectAttackEvaluator
|
|
31
|
+
from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
|
|
32
|
+
from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
|
|
33
|
+
from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
|
|
36
34
|
from ._model_configurations import (
|
|
37
35
|
AzureAIProject,
|
|
38
36
|
AzureOpenAIModelConfiguration,
|
|
@@ -43,6 +41,34 @@ from ._model_configurations import (
|
|
|
43
41
|
OpenAIModelConfiguration,
|
|
44
42
|
)
|
|
45
43
|
|
|
44
|
+
_patch_all = []
|
|
45
|
+
|
|
46
|
+
# The converter from the AI service to the evaluator schema requires a dependency on
|
|
47
|
+
# ai.projects, but we also don't want to force users installing ai.evaluations to pull
|
|
48
|
+
# in ai.projects. So we only import it if it's available and the user has ai.projects.
|
|
49
|
+
try:
|
|
50
|
+
from ._converters._ai_services import AIAgentConverter
|
|
51
|
+
_patch_all.append("AIAgentConverter")
|
|
52
|
+
except ImportError:
|
|
53
|
+
print("[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`.")
|
|
54
|
+
|
|
55
|
+
# RedTeam requires a dependency on pyrit, but python 3.9 is not supported by pyrit.
|
|
56
|
+
# So we only import it if it's available and the user has pyrit.
|
|
57
|
+
try:
|
|
58
|
+
from ._red_team._red_team import RedTeam
|
|
59
|
+
from ._red_team._attack_strategy import AttackStrategy
|
|
60
|
+
from ._red_team._attack_objective_generator import RiskCategory
|
|
61
|
+
from ._red_team._red_team_result import RedTeamOutput
|
|
62
|
+
_patch_all.extend([
|
|
63
|
+
"RedTeam",
|
|
64
|
+
"RedTeamOutput",
|
|
65
|
+
"AttackStrategy",
|
|
66
|
+
"RiskCategory",
|
|
67
|
+
])
|
|
68
|
+
except ImportError:
|
|
69
|
+
print("[INFO] Could not import RedTeam. Please install the dependency with `pip install azure-ai-evaluation[redteam]`.")
|
|
70
|
+
|
|
71
|
+
|
|
46
72
|
__all__ = [
|
|
47
73
|
"evaluate",
|
|
48
74
|
"CoherenceEvaluator",
|
|
@@ -50,6 +76,9 @@ __all__ = [
|
|
|
50
76
|
"FluencyEvaluator",
|
|
51
77
|
"GroundednessEvaluator",
|
|
52
78
|
"GroundednessProEvaluator",
|
|
79
|
+
"ResponseCompletenessEvaluator",
|
|
80
|
+
"TaskAdherenceEvaluator",
|
|
81
|
+
"IntentResolutionEvaluator",
|
|
53
82
|
"RelevanceEvaluator",
|
|
54
83
|
"SimilarityEvaluator",
|
|
55
84
|
"QAEvaluator",
|
|
@@ -73,10 +102,9 @@ __all__ = [
|
|
|
73
102
|
"Conversation",
|
|
74
103
|
"Message",
|
|
75
104
|
"EvaluationResult",
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"SexualMultimodalEvaluator",
|
|
80
|
-
"ViolenceMultimodalEvaluator",
|
|
81
|
-
"ProtectedMaterialMultimodalEvaluator",
|
|
105
|
+
"CodeVulnerabilityEvaluator",
|
|
106
|
+
"UngroundedAttributesEvaluator",
|
|
107
|
+
"ToolCallAccuracyEvaluator",
|
|
82
108
|
]
|
|
109
|
+
|
|
110
|
+
__all__.extend([p for p in _patch_all if p not in __all__])
|
{azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_azure/_models.py
RENAMED
|
@@ -52,7 +52,7 @@ class Workspace(Model):
|
|
|
52
52
|
"agents_endpoint_uri": {"readonly": True},
|
|
53
53
|
"ml_flow_tracking_uri": {"readonly": True},
|
|
54
54
|
#'notebook_info': {'readonly': True},
|
|
55
|
-
"private_endpoint_connections": {"readonly": True},
|
|
55
|
+
# "private_endpoint_connections": {"readonly": True},
|
|
56
56
|
#'private_link_count': {'readonly': True},
|
|
57
57
|
"provisioning_state": {"readonly": True},
|
|
58
58
|
"service_provisioned_resource_group": {"readonly": True},
|
|
@@ -99,10 +99,10 @@ class Workspace(Model):
|
|
|
99
99
|
#'network_acls': {'key': 'properties.networkAcls', 'type': 'NetworkAcls'},
|
|
100
100
|
#'notebook_info': {'key': 'properties.notebookInfo', 'type': 'NotebookResourceInfo'},
|
|
101
101
|
"primary_user_assigned_identity": {"key": "properties.primaryUserAssignedIdentity", "type": "str"},
|
|
102
|
-
"private_endpoint_connections": {
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
},
|
|
102
|
+
# "private_endpoint_connections": {
|
|
103
|
+
# "key": "properties.privateEndpointConnections",
|
|
104
|
+
# "type": "[PrivateEndpointConnection]",
|
|
105
|
+
# },
|
|
106
106
|
"private_link_count": {"key": "properties.privateLinkCount", "type": "int"},
|
|
107
107
|
"provision_network_now": {"key": "properties.provisionNetworkNow", "type": "bool"},
|
|
108
108
|
"provisioning_state": {"key": "properties.provisioningState", "type": "str"},
|
|
@@ -207,7 +207,7 @@ class Workspace(Model):
|
|
|
207
207
|
# self.network_acls = network_acls
|
|
208
208
|
# self.notebook_info = None
|
|
209
209
|
self.primary_user_assigned_identity = primary_user_assigned_identity
|
|
210
|
-
self.private_endpoint_connections = None
|
|
210
|
+
# self.private_endpoint_connections = None
|
|
211
211
|
self.private_link_count = None
|
|
212
212
|
self.provision_network_now = provision_network_now
|
|
213
213
|
self.provisioning_state = None
|
{azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/constants.py
RENAMED
|
@@ -5,8 +5,8 @@ from enum import Enum
|
|
|
5
5
|
|
|
6
6
|
from azure.core import CaseInsensitiveEnumMeta
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency", "intent_resolution",
|
|
9
|
+
"tool_call_accurate", "response_completeness", "task_adherence"]
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class CommonConstants:
|
|
@@ -39,6 +39,8 @@ class Tasks:
|
|
|
39
39
|
PROTECTED_MATERIAL = "protected material"
|
|
40
40
|
XPIA = "xpia"
|
|
41
41
|
GROUNDEDNESS = "groundedness"
|
|
42
|
+
CODE_VULNERABILITY = "code vulnerability"
|
|
43
|
+
UNGROUNDED_ATTRIBUTES = "inference sensitive attributes"
|
|
42
44
|
|
|
43
45
|
|
|
44
46
|
class _InternalAnnotationTasks:
|
|
@@ -61,6 +63,8 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
|
61
63
|
PROTECTED_MATERIAL = "protected_material"
|
|
62
64
|
XPIA = "xpia"
|
|
63
65
|
GROUNDEDNESS = "generic_groundedness"
|
|
66
|
+
CODE_VULNERABILITY = "code_vulnerability"
|
|
67
|
+
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
|
|
64
68
|
|
|
65
69
|
|
|
66
70
|
class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
{azure_ai_evaluation-1.2.0 → azure_ai_evaluation-1.4.0}/azure/ai/evaluation/_common/rai_service.py
RENAMED
|
@@ -42,6 +42,7 @@ USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
|
|
|
42
42
|
"DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
|
|
43
43
|
}
|
|
44
44
|
|
|
45
|
+
INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"
|
|
45
46
|
|
|
46
47
|
def get_formatted_template(data: dict, annotation_task: str) -> str:
|
|
47
48
|
"""Given the task and input data, produce a formatted string that will serve as the main
|
|
@@ -64,6 +65,19 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
|
|
|
64
65
|
"context": data.get("context", ""),
|
|
65
66
|
}
|
|
66
67
|
return json.dumps(as_dict)
|
|
68
|
+
if annotation_task == Tasks.CODE_VULNERABILITY:
|
|
69
|
+
as_dict = {
|
|
70
|
+
"context": data.get("query", ""),
|
|
71
|
+
"completion": data.get("response", "")
|
|
72
|
+
}
|
|
73
|
+
return json.dumps(as_dict)
|
|
74
|
+
if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
|
|
75
|
+
as_dict = {
|
|
76
|
+
"query": data.get("query", ""),
|
|
77
|
+
"response": data.get("response", ""),
|
|
78
|
+
"context": data.get("context", "")
|
|
79
|
+
}
|
|
80
|
+
return json.dumps(as_dict)
|
|
67
81
|
as_dict = {
|
|
68
82
|
"query": html.escape(data.get("query", "")),
|
|
69
83
|
"response": html.escape(data.get("response", "")),
|
|
@@ -160,6 +174,8 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
|
|
|
160
174
|
task = annotation_task
|
|
161
175
|
if metric == EvaluationMetrics.PROTECTED_MATERIAL:
|
|
162
176
|
include_metric = False
|
|
177
|
+
elif metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
|
|
178
|
+
include_metric = False
|
|
163
179
|
elif metric == _InternalEvaluationMetrics.ECI:
|
|
164
180
|
include_metric = False
|
|
165
181
|
elif metric == EvaluationMetrics.XPIA:
|
|
@@ -251,7 +267,6 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
|
|
|
251
267
|
sleep_time = RAIService.SLEEP_TIME**request_count
|
|
252
268
|
await asyncio.sleep(sleep_time)
|
|
253
269
|
|
|
254
|
-
|
|
255
270
|
def parse_response( # pylint: disable=too-many-branches,too-many-statements
|
|
256
271
|
batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
|
|
257
272
|
) -> Dict[str, Union[str, float]]:
|
|
@@ -274,10 +289,16 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
|
|
|
274
289
|
EvaluationMetrics.PROTECTED_MATERIAL,
|
|
275
290
|
_InternalEvaluationMetrics.ECI,
|
|
276
291
|
EvaluationMetrics.XPIA,
|
|
292
|
+
EvaluationMetrics.CODE_VULNERABILITY,
|
|
293
|
+
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
|
|
277
294
|
}:
|
|
278
295
|
result = {}
|
|
279
296
|
if not batch_response or len(batch_response[0]) == 0:
|
|
280
297
|
return {}
|
|
298
|
+
if metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]:
|
|
299
|
+
batch_response[0] = {
|
|
300
|
+
EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
|
|
301
|
+
}
|
|
281
302
|
if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
|
|
282
303
|
pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
|
|
283
304
|
for pm_metric_name in pm_metric_names:
|
|
@@ -313,6 +334,13 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
|
|
|
313
334
|
result[metric_display_name + "_information_gathering"] = (
|
|
314
335
|
parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
|
|
315
336
|
)
|
|
337
|
+
if metric_name == EvaluationMetrics.CODE_VULNERABILITY or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
|
|
338
|
+
# Add all attributes under the details.
|
|
339
|
+
details = {}
|
|
340
|
+
for key, value in parsed_response.items():
|
|
341
|
+
if key not in {"label", "reasoning", "version"}:
|
|
342
|
+
details[key.replace("-", "_")] = value
|
|
343
|
+
result[metric_display_name + "_details"] = details
|
|
316
344
|
return result
|
|
317
345
|
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
|
|
318
346
|
|
|
@@ -359,7 +387,14 @@ def _parse_content_harm_response(
|
|
|
359
387
|
|
|
360
388
|
# get content harm metric_value
|
|
361
389
|
if "label" in harm_response:
|
|
362
|
-
|
|
390
|
+
try:
|
|
391
|
+
# Handle "n/a" or other non-numeric values
|
|
392
|
+
if isinstance(harm_response["label"], str) and harm_response["label"].strip().lower() == "n/a":
|
|
393
|
+
metric_value = math.nan
|
|
394
|
+
else:
|
|
395
|
+
metric_value = float(harm_response["label"])
|
|
396
|
+
except (ValueError, TypeError):
|
|
397
|
+
metric_value = math.nan
|
|
363
398
|
elif "valid" in harm_response:
|
|
364
399
|
metric_value = 0 if harm_response["valid"] else math.nan
|
|
365
400
|
else:
|
|
@@ -390,8 +425,7 @@ def _parse_content_harm_response(
|
|
|
390
425
|
reason = ""
|
|
391
426
|
|
|
392
427
|
harm_score = metric_value
|
|
393
|
-
|
|
394
|
-
return result
|
|
428
|
+
# We've already handled the "n/a" case by converting to math.nan
|
|
395
429
|
if not math.isnan(metric_value):
|
|
396
430
|
# int(math.nan) causes a value error, and math.nan is already handled
|
|
397
431
|
# by get_harm_severity_level
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# --------------------------------------------------------------------------
|
|
3
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
4
|
+
# Licensed under the MIT License. See License.txt in the project root for license information.
|
|
5
|
+
# Code generated by Microsoft (R) Python Code Generator.
|
|
6
|
+
# Changes may cause incorrect behavior and will be lost if the code is regenerated.
|
|
7
|
+
# --------------------------------------------------------------------------
|
|
8
|
+
# pylint: disable=wrong-import-position
|
|
9
|
+
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from ._patch import * # pylint: disable=unused-wildcard-import
|
|
14
|
+
|
|
15
|
+
from ._client import MachineLearningServicesClient # type: ignore
|
|
16
|
+
from ._version import VERSION
|
|
17
|
+
|
|
18
|
+
__version__ = VERSION
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
from ._patch import __all__ as _patch_all
|
|
22
|
+
from ._patch import *
|
|
23
|
+
except ImportError:
|
|
24
|
+
_patch_all = []
|
|
25
|
+
from ._patch import patch_sdk as _patch_sdk
|
|
26
|
+
|
|
27
|
+
# Export GeneratedRAIClient as alias of MachineLearningServicesClient for backward compatibility
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"MachineLearningServicesClient",
|
|
31
|
+
]
|
|
32
|
+
__all__.extend([p for p in _patch_all if p not in __all__]) # pyright: ignore
|
|
33
|
+
|
|
34
|
+
_patch_sdk()
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# pylint: disable=line-too-long,useless-suppression
|
|
2
|
+
# coding=utf-8
|
|
3
|
+
# --------------------------------------------------------------------------
|
|
4
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
5
|
+
# Licensed under the MIT License. See License.txt in the project root for license information.
|
|
6
|
+
# Code generated by Microsoft (R) Python Code Generator.
|
|
7
|
+
# Changes may cause incorrect behavior and will be lost if the code is regenerated.
|
|
8
|
+
# --------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
from copy import deepcopy
|
|
11
|
+
from typing import Any, TYPE_CHECKING
|
|
12
|
+
from typing_extensions import Self
|
|
13
|
+
|
|
14
|
+
from azure.core import PipelineClient
|
|
15
|
+
from azure.core.pipeline import policies
|
|
16
|
+
from azure.core.rest import HttpRequest, HttpResponse
|
|
17
|
+
|
|
18
|
+
from ._configuration import MachineLearningServicesClientConfiguration
|
|
19
|
+
from ._serialization import Deserializer, Serializer
|
|
20
|
+
from .operations import RAISvcOperations
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from azure.core.credentials import TokenCredential
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class MachineLearningServicesClient:
|
|
27
|
+
"""MachineLearningServicesClient.
|
|
28
|
+
|
|
29
|
+
:ivar rai_svc: RAISvcOperations operations
|
|
30
|
+
:vartype rai_svc: raiclient.operations.RAISvcOperations
|
|
31
|
+
:param endpoint: Supported Azure-AI endpoints. Required.
|
|
32
|
+
:type endpoint: str
|
|
33
|
+
:param subscription_id: The ID of the target subscription. Required.
|
|
34
|
+
:type subscription_id: str
|
|
35
|
+
:param resource_group_name: The name of the Resource Group. Required.
|
|
36
|
+
:type resource_group_name: str
|
|
37
|
+
:param workspace_name: The name of the AzureML workspace or AI project. Required.
|
|
38
|
+
:type workspace_name: str
|
|
39
|
+
:param credential: Credential used to authenticate requests to the service. Required.
|
|
40
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
41
|
+
:keyword api_version: The API version to use for this operation. Default value is
|
|
42
|
+
"2022-11-01-preview". Note that overriding this default value may result in unsupported
|
|
43
|
+
behavior.
|
|
44
|
+
:paramtype api_version: str
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def __init__(
|
|
48
|
+
self,
|
|
49
|
+
endpoint: str,
|
|
50
|
+
subscription_id: str,
|
|
51
|
+
resource_group_name: str,
|
|
52
|
+
workspace_name: str,
|
|
53
|
+
credential: "TokenCredential",
|
|
54
|
+
**kwargs: Any
|
|
55
|
+
) -> None:
|
|
56
|
+
_endpoint = "{endpoint}/raisvc/v1.0/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/Microsoft.MachineLearningServices/workspaces/{workspaceName}"
|
|
57
|
+
self._config = MachineLearningServicesClientConfiguration(
|
|
58
|
+
endpoint=endpoint,
|
|
59
|
+
subscription_id=subscription_id,
|
|
60
|
+
resource_group_name=resource_group_name,
|
|
61
|
+
workspace_name=workspace_name,
|
|
62
|
+
credential=credential,
|
|
63
|
+
**kwargs
|
|
64
|
+
)
|
|
65
|
+
_policies = kwargs.pop("policies", None)
|
|
66
|
+
if _policies is None:
|
|
67
|
+
_policies = [
|
|
68
|
+
policies.RequestIdPolicy(**kwargs),
|
|
69
|
+
self._config.headers_policy,
|
|
70
|
+
self._config.user_agent_policy,
|
|
71
|
+
self._config.proxy_policy,
|
|
72
|
+
policies.ContentDecodePolicy(**kwargs),
|
|
73
|
+
self._config.redirect_policy,
|
|
74
|
+
self._config.retry_policy,
|
|
75
|
+
self._config.authentication_policy,
|
|
76
|
+
self._config.custom_hook_policy,
|
|
77
|
+
self._config.logging_policy,
|
|
78
|
+
policies.DistributedTracingPolicy(**kwargs),
|
|
79
|
+
policies.SensitiveHeaderCleanupPolicy(**kwargs) if self._config.redirect_policy else None,
|
|
80
|
+
self._config.http_logging_policy,
|
|
81
|
+
]
|
|
82
|
+
self._client: PipelineClient = PipelineClient(base_url=_endpoint, policies=_policies, **kwargs)
|
|
83
|
+
|
|
84
|
+
self._serialize = Serializer()
|
|
85
|
+
self._deserialize = Deserializer()
|
|
86
|
+
self._serialize.client_side_validation = False
|
|
87
|
+
self.rai_svc = RAISvcOperations(self._client, self._config, self._serialize, self._deserialize)
|
|
88
|
+
|
|
89
|
+
def send_request(self, request: HttpRequest, *, stream: bool = False, **kwargs: Any) -> HttpResponse:
|
|
90
|
+
"""Runs the network request through the client's chained policies.
|
|
91
|
+
|
|
92
|
+
>>> from azure.core.rest import HttpRequest
|
|
93
|
+
>>> request = HttpRequest("GET", "https://www.example.org/")
|
|
94
|
+
<HttpRequest [GET], url: 'https://www.example.org/'>
|
|
95
|
+
>>> response = client.send_request(request)
|
|
96
|
+
<HttpResponse: 200 OK>
|
|
97
|
+
|
|
98
|
+
For more information on this code flow, see https://aka.ms/azsdk/dpcodegen/python/send_request
|
|
99
|
+
|
|
100
|
+
:param request: The network request you want to make. Required.
|
|
101
|
+
:type request: ~azure.core.rest.HttpRequest
|
|
102
|
+
:keyword bool stream: Whether the response payload will be streamed. Defaults to False.
|
|
103
|
+
:return: The response of your network call. Does not do error handling on your response.
|
|
104
|
+
:rtype: ~azure.core.rest.HttpResponse
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
request_copy = deepcopy(request)
|
|
108
|
+
path_format_arguments = {
|
|
109
|
+
"endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
|
|
110
|
+
"subscriptionId": self._serialize.url("self._config.subscription_id", self._config.subscription_id, "str"),
|
|
111
|
+
"resourceGroupName": self._serialize.url(
|
|
112
|
+
"self._config.resource_group_name", self._config.resource_group_name, "str"
|
|
113
|
+
),
|
|
114
|
+
"workspaceName": self._serialize.url("self._config.workspace_name", self._config.workspace_name, "str"),
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
request_copy.url = self._client.format_url(request_copy.url, **path_format_arguments)
|
|
118
|
+
return self._client.send_request(request_copy, stream=stream, **kwargs) # type: ignore
|
|
119
|
+
|
|
120
|
+
def close(self) -> None:
|
|
121
|
+
self._client.close()
|
|
122
|
+
|
|
123
|
+
def __enter__(self) -> Self:
|
|
124
|
+
self._client.__enter__()
|
|
125
|
+
return self
|
|
126
|
+
|
|
127
|
+
def __exit__(self, *exc_details: Any) -> None:
|
|
128
|
+
self._client.__exit__(*exc_details)
|