azure-ai-evaluation 1.8.0.tar.gz → 1.10.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged for review; consult the registry's advisory page for details.
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/CHANGELOG.md +45 -0
- {azure_ai_evaluation-1.8.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.10.0}/PKG-INFO +46 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/TROUBLESHOOTING.md +0 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/__init__.py +51 -6
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_aoai/__init__.py +1 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_azure/_envs.py +9 -10
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_azure/_token_manager.py +7 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/constants.py +11 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_client.py +136 -139
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp/models → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp}/_patch.py +21 -21
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_version.py +9 -9
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp/aio/operations → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/aio}/_patch.py +21 -21
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/aio}/operations/__init__.py +37 -39
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/aio/operations}/_patch.py +21 -21
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp/aio → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/models}/_patch.py +21 -21
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp/aio → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp}/operations/__init__.py +37 -39
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/rai_service.py +88 -52
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/utils.py +188 -10
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_constants.py +2 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_converters/_ai_services.py +9 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_converters/_models.py +46 -0
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_converters/_sk_services.py +495 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_eval_mapping.py +2 -2
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +166 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_utils.py +25 -17
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +181 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +405 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +149 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_exceptions.py +10 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_http_utils.py +3 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_user_agent.py +37 -0
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/_vendor/__init__.py +3 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/__init__.py +3 -1
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_default_converter.py +1 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/red_team/_utils/metric_mapping.py +50 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure_ai_evaluation-1.10.0/azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0/azure_ai_evaluation.egg-info}/PKG-INFO +46 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure_ai_evaluation.egg-info/SOURCES.txt +14 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure_ai_evaluation.egg-info/requires.txt +0 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/pyproject.toml +2 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/tool_call_accuracy.ipynb +7 -4
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/user_functions.py +9 -2
- azure_ai_evaluation-1.10.0/samples/aoai_score_model_grader_sample.py +257 -0
- azure_ai_evaluation-1.10.0/samples/evaluation_samples_common.py +128 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/evaluation_samples_evaluate.py +62 -72
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/evaluation_samples_evaluate_fdp.py +99 -92
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/evaluation_samples_safety_evaluation.py +118 -85
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/evaluation_samples_threshold.py +35 -58
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/red_team_agent_tool_sample.py +16 -17
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/red_team_samples.py +106 -126
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/red_team_skip_upload.py +15 -9
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/semantic_kernel_red_team_agent_sample.py +13 -17
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/setup.py +1 -7
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/conftest.py +12 -1
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/converters/ai_agent_converter/serialization_helper.py +34 -54
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py +17 -6
- azure_ai_evaluation-1.10.0/tests/converters/ai_agent_converter/test_run_ids_from_conversation.py +67 -0
- azure_ai_evaluation-1.10.0/tests/converters/ai_agent_converter/test_sk_agent_converter_internals.py +128 -0
- azure_ai_evaluation-1.10.0/tests/converters/ai_agent_converter/test_sk_turn_idxs_from_conversation.py +112 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_adv_simulator.py +6 -6
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_aoai_graders.py +129 -38
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_builtin_evaluators.py +208 -125
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_evaluate.py +67 -7
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_mass_evaluate.py +9 -9
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_metrics_upload.py +6 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_remote_evaluation.py +3 -5
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_sim_and_eval.py +46 -41
- azure_ai_evaluation-1.10.0/tests/unittests/test_agent_evaluators.py +105 -0
- azure_ai_evaluation-1.10.0/tests/unittests/test_aoai_evaluation_pagination.py +244 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_aoai_integration_features.py +17 -26
- azure_ai_evaluation-1.10.0/tests/unittests/test_aoai_python_grader.py +54 -0
- azure_ai_evaluation-1.10.0/tests/unittests/test_aoai_score_model_grader.py +951 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_batch_run_context.py +2 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_completeness_evaluator.py +29 -16
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_document_retrieval_evaluator.py +106 -57
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_evaluate.py +36 -25
- azure_ai_evaluation-1.10.0/tests/unittests/test_evaluate_mismatch.py +488 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_evaluate_performance.py +2 -3
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_evaluators/test_conversation_thresholds.py +28 -106
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_evaluators/test_service_evaluator_thresholds.py +45 -68
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_evaluators/test_threshold_behavior.py +91 -63
- azure_ai_evaluation-1.10.0/tests/unittests/test_lazy_imports.py +135 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/__init__.py +3 -2
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_attack_objective_generator.py +34 -49
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_attack_strategy.py +4 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_callback_chat_target.py +22 -27
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_constants.py +7 -23
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_formatting_utils.py +36 -40
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py +33 -16
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_rai_service_target.py +108 -52
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_rai_service_true_false_scorer.py +17 -8
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_red_team.py +589 -458
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_red_team_result.py +32 -40
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_redteam/test_strategy_utils.py +41 -58
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_remote_evaluation_features.py +10 -5
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_safety_evaluation.py +57 -18
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_save_eval.py +6 -2
- azure_ai_evaluation-1.10.0/tests/unittests/test_tool_call_accuracy_evaluator.py +398 -0
- azure_ai_evaluation-1.10.0/tests/unittests/test_utils.py +847 -0
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_common/onedp/operations/_patch.py +0 -21
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_converters/__init__.py +0 -3
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +0 -118
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +0 -161
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -100
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +0 -117
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +0 -71
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_safety_evaluation/__init__.py +0 -3
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/_user_agent.py +0 -6
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/red_team/_agent/__init__.py +0 -3
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/red_team/_utils/__init__.py +0 -3
- azure_ai_evaluation-1.8.0/azure/ai/evaluation/red_team/_utils/metric_mapping.py +0 -23
- azure_ai_evaluation-1.8.0/samples/evaluation_samples_common.py +0 -60
- azure_ai_evaluation-1.8.0/tests/converters/ai_agent_converter/test_run_ids_from_conversation.py +0 -35
- azure_ai_evaluation-1.8.0/tests/unittests/test_agent_evaluators.py +0 -117
- azure_ai_evaluation-1.8.0/tests/unittests/test_tool_call_accuracy_evaluator.py +0 -446
- azure_ai_evaluation-1.8.0/tests/unittests/test_utils.py +0 -258
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/README.md +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_azure/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_azure/_clients.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_azure/_models.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/_experimental.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/math.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_model_base.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_serialization.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_types.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/_vendor.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/py.typed +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/_client.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/_configuration.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/_model_base.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/_serialization.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/_version.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/aio/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/aio/_client.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/aio/_configuration.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/aio/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/models/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/models/_enums.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/models/_models.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/models/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/raiclient/py.typed +0 -0
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_evaluate → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_converters}/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_evaluators → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_evaluate}/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluate/_eval_run.py +0 -0
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_legacy → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_evaluators}/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_legacy/_common → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_legacy}/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/_check.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/_configuration.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/_constants.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/_errors.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/_flows.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/_service.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/client.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/entities.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/tracing.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/types.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_adapters/utils.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_status.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_trace.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +0 -0
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/_vendor → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_legacy/_common}/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/_common/_logging.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/prompty/_connection.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/prompty/_exceptions.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/prompty/_utils.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_model_configurations.py +0 -0
- {azure_ai_evaluation-1.8.0/azure/ai/evaluation/simulator/_data_sources → azure_ai_evaluation-1.10.0/azure/ai/evaluation/_safety_evaluation}/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/simulator/_utils.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/migration_guide.md +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/README.md +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/agent_evaluation.ipynb +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/instructions.md +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/intent_resolution.ipynb +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/response_completeness.ipynb +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/sample_synthetic_conversations.jsonl +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/agent_evaluators/task_adherence.ipynb +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/data/evaluate_test_data.jsonl +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/samples/evaluation_samples_simulate.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/setup.cfg +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_lite_management_client.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/e2etests/test_prompty_async.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_built_in_evaluator.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_content_safety_rai_script.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_eval_run.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_evaluators/slow_eval.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_non_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_simulator.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
|
@@ -1,5 +1,50 @@
|
|
|
1
1
|
# Release History
|
|
2
2
|
|
|
3
|
+
## 1.10.0 (2025-07-31)
|
|
4
|
+
|
|
5
|
+
### Breaking Changes
|
|
6
|
+
|
|
7
|
+
- Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
|
|
8
|
+
|
|
9
|
+
### Features Added
|
|
10
|
+
|
|
11
|
+
- Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure Open AI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
|
|
12
|
+
- Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. higher threshold means higher
|
|
13
|
+
tolerance for harmful responses).
|
|
14
|
+
- Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
### Bugs Fixed
|
|
18
|
+
|
|
19
|
+
- Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
|
|
20
|
+
- Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
|
|
21
|
+
- Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
### Other Changes
|
|
25
|
+
|
|
26
|
+
- The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
|
|
27
|
+
- Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
|
|
28
|
+
This is due to be removed in a future release.
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
## 1.9.0 (2025-07-02)
|
|
32
|
+
|
|
33
|
+
### Features Added
|
|
34
|
+
|
|
35
|
+
- Added support for Azure Open AI evaluation via `AzureOpenAIScoreModelGrader` class, which serves as a wrapper around Azure Open AI score model configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
|
|
36
|
+
- Added new experimental risk categories ProtectedMaterial and CodeVulnerability for redteam agent scan.
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
### Bugs Fixed
|
|
40
|
+
|
|
41
|
+
- Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
|
|
42
|
+
|
|
43
|
+
- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance. and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
|
|
44
|
+
- Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
|
|
45
|
+
- Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
|
|
46
|
+
- `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
|
|
47
|
+
|
|
3
48
|
## 1.8.0 (2025-05-29)
|
|
4
49
|
|
|
5
50
|
### Features Added
|
{azure_ai_evaluation-1.8.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.10.0}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: azure-ai-evaluation
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.10.0
|
|
4
4
|
Summary: Microsoft Azure Evaluation Library for Python
|
|
5
5
|
Home-page: https://github.com/Azure/azure-sdk-for-python
|
|
6
6
|
Author: Microsoft Corporation
|
|
@@ -21,8 +21,6 @@ Classifier: Operating System :: OS Independent
|
|
|
21
21
|
Requires-Python: >=3.9
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
23
|
License-File: NOTICE.txt
|
|
24
|
-
Requires-Dist: promptflow-devkit>=1.17.1
|
|
25
|
-
Requires-Dist: promptflow-core>=1.17.1
|
|
26
24
|
Requires-Dist: pyjwt>=2.8.0
|
|
27
25
|
Requires-Dist: azure-identity>=1.16.0
|
|
28
26
|
Requires-Dist: azure-core>=1.30.2
|
|
@@ -400,6 +398,51 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
|
|
|
400
398
|
|
|
401
399
|
# Release History
|
|
402
400
|
|
|
401
|
+
## 1.10.0 (2025-07-31)
|
|
402
|
+
|
|
403
|
+
### Breaking Changes
|
|
404
|
+
|
|
405
|
+
- Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
|
|
406
|
+
|
|
407
|
+
### Features Added
|
|
408
|
+
|
|
409
|
+
- Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure Open AI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
|
|
410
|
+
- Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. higher threshold means higher
|
|
411
|
+
tolerance for harmful responses).
|
|
412
|
+
- Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
### Bugs Fixed
|
|
416
|
+
|
|
417
|
+
- Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
|
|
418
|
+
- Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
|
|
419
|
+
- Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
### Other Changes
|
|
423
|
+
|
|
424
|
+
- The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
|
|
425
|
+
- Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
|
|
426
|
+
This is due to be removed in a future release.
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
## 1.9.0 (2025-07-02)
|
|
430
|
+
|
|
431
|
+
### Features Added
|
|
432
|
+
|
|
433
|
+
- Added support for Azure Open AI evaluation via `AzureOpenAIScoreModelGrader` class, which serves as a wrapper around Azure Open AI score model configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
|
|
434
|
+
- Added new experimental risk categories ProtectedMaterial and CodeVulnerability for redteam agent scan.
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
### Bugs Fixed
|
|
438
|
+
|
|
439
|
+
- Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
|
|
440
|
+
|
|
441
|
+
- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance. and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
|
|
442
|
+
- Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
|
|
443
|
+
- Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
|
|
444
|
+
- `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
|
|
445
|
+
|
|
403
446
|
## 1.8.0 (2025-05-29)
|
|
404
447
|
|
|
405
448
|
### Features Added
|
|
@@ -46,9 +46,6 @@ This guide walks you through how to investigate failures, common errors in the `
|
|
|
46
46
|
- Risk and safety evaluators depend on the Azure AI Studio safety evaluation backend service. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaisafetyeval-regionsupport).
|
|
47
47
|
- If you encounter a 403 Unauthorized error when using safety evaluators, verify that you have the `Contributor` role assigned to your Azure AI project. `Contributor` role is currently required to run safety evaluations.
|
|
48
48
|
|
|
49
|
-
### Troubleshoot Quality Evaluator Issues
|
|
50
|
-
- For `ToolCallAccuracyEvaluator`, if your input did not have a tool to evaluate, the current behavior is to output `null`.
|
|
51
|
-
|
|
52
49
|
## Handle Simulation Errors
|
|
53
50
|
|
|
54
51
|
### Adversarial Simulation Supported Regions
|
|
@@ -45,6 +45,8 @@ from ._aoai.aoai_grader import AzureOpenAIGrader
|
|
|
45
45
|
from ._aoai.label_grader import AzureOpenAILabelGrader
|
|
46
46
|
from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
|
|
47
47
|
from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
|
|
48
|
+
from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
|
|
49
|
+
from ._aoai.python_grader import AzureOpenAIPythonGrader
|
|
48
50
|
|
|
49
51
|
|
|
50
52
|
_patch_all = []
|
|
@@ -52,13 +54,47 @@ _patch_all = []
|
|
|
52
54
|
# The converter from the AI service to the evaluator schema requires a dependency on
|
|
53
55
|
# ai.projects, but we also don't want to force users installing ai.evaluations to pull
|
|
54
56
|
# in ai.projects. So we only import it if it's available and the user has ai.projects.
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
_patch_all.append("AIAgentConverter")
|
|
58
|
-
except ImportError:
|
|
59
|
-
print("[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`.")
|
|
57
|
+
# We use lazy loading to avoid printing messages during import unless the classes are actually used.
|
|
58
|
+
_lazy_imports = {}
|
|
60
59
|
|
|
61
60
|
|
|
61
|
+
def _create_lazy_import(class_name, module_path, dependency_name):
|
|
62
|
+
"""Create a lazy import function for optional dependencies.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
class_name: Name of the class to import
|
|
66
|
+
module_path: Module path to import from
|
|
67
|
+
dependency_name: Name of the dependency package for error message
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
A function that performs the lazy import when called
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
def lazy_import():
|
|
74
|
+
try:
|
|
75
|
+
module = __import__(module_path, fromlist=[class_name])
|
|
76
|
+
cls = getattr(module, class_name)
|
|
77
|
+
_patch_all.append(class_name)
|
|
78
|
+
return cls
|
|
79
|
+
except ImportError:
|
|
80
|
+
raise ImportError(
|
|
81
|
+
f"Could not import {class_name}. Please install the dependency with `pip install {dependency_name}`."
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
return lazy_import
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
_lazy_imports["AIAgentConverter"] = _create_lazy_import(
|
|
88
|
+
"AIAgentConverter",
|
|
89
|
+
"azure.ai.evaluation._converters._ai_services",
|
|
90
|
+
"azure-ai-projects",
|
|
91
|
+
)
|
|
92
|
+
_lazy_imports["SKAgentConverter"] = _create_lazy_import(
|
|
93
|
+
"SKAgentConverter",
|
|
94
|
+
"azure.ai.evaluation._converters._sk_services",
|
|
95
|
+
"semantic-kernel",
|
|
96
|
+
)
|
|
97
|
+
|
|
62
98
|
__all__ = [
|
|
63
99
|
"evaluate",
|
|
64
100
|
"CoherenceEvaluator",
|
|
@@ -99,6 +135,15 @@ __all__ = [
|
|
|
99
135
|
"AzureOpenAILabelGrader",
|
|
100
136
|
"AzureOpenAIStringCheckGrader",
|
|
101
137
|
"AzureOpenAITextSimilarityGrader",
|
|
138
|
+
"AzureOpenAIScoreModelGrader",
|
|
139
|
+
"AzureOpenAIPythonGrader",
|
|
102
140
|
]
|
|
103
141
|
|
|
104
|
-
__all__.extend([p for p in _patch_all if p not in __all__])
|
|
142
|
+
__all__.extend([p for p in _patch_all if p not in __all__])
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def __getattr__(name):
|
|
146
|
+
"""Handle lazy imports for optional dependencies."""
|
|
147
|
+
if name in _lazy_imports:
|
|
148
|
+
return _lazy_imports[name]()
|
|
149
|
+
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
|
{azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_aoai/aoai_grader.py
RENAMED
|
@@ -5,12 +5,13 @@ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfigurat
|
|
|
5
5
|
|
|
6
6
|
from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION
|
|
7
7
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
8
|
+
from azure.ai.evaluation._user_agent import UserAgentSingleton
|
|
8
9
|
from typing import Any, Dict, Union
|
|
9
10
|
from azure.ai.evaluation._common._experimental import experimental
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
@experimental
|
|
13
|
-
class AzureOpenAIGrader
|
|
14
|
+
class AzureOpenAIGrader:
|
|
14
15
|
"""
|
|
15
16
|
Base class for Azure OpenAI grader wrappers, recommended only for use by experienced OpenAI API users.
|
|
16
17
|
Combines a model configuration and any grader configuration
|
|
@@ -35,9 +36,15 @@ class AzureOpenAIGrader():
|
|
|
35
36
|
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
|
-
id = "
|
|
39
|
+
id = "azureai://built-in/evaluators/azure-openai/custom_grader"
|
|
39
40
|
|
|
40
|
-
def __init__(
|
|
41
|
+
def __init__(
|
|
42
|
+
self,
|
|
43
|
+
*,
|
|
44
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
45
|
+
grader_config: Dict[str, Any],
|
|
46
|
+
**kwargs: Any,
|
|
47
|
+
):
|
|
41
48
|
self._model_config = model_config
|
|
42
49
|
self._grader_config = grader_config
|
|
43
50
|
|
|
@@ -45,8 +52,6 @@ class AzureOpenAIGrader():
|
|
|
45
52
|
self._validate_model_config()
|
|
46
53
|
self._validate_grader_config()
|
|
47
54
|
|
|
48
|
-
|
|
49
|
-
|
|
50
55
|
def _validate_model_config(self) -> None:
|
|
51
56
|
"""Validate the model configuration that this grader wrapper is using."""
|
|
52
57
|
if "api_key" not in self._model_config or not self._model_config.get("api_key"):
|
|
@@ -57,7 +62,7 @@ class AzureOpenAIGrader():
|
|
|
57
62
|
category=ErrorCategory.INVALID_VALUE,
|
|
58
63
|
target=ErrorTarget.AOAI_GRADER,
|
|
59
64
|
)
|
|
60
|
-
|
|
65
|
+
|
|
61
66
|
def _validate_grader_config(self) -> None:
|
|
62
67
|
"""Validate the grader configuration that this grader wrapper is using."""
|
|
63
68
|
|
|
@@ -71,19 +76,24 @@ class AzureOpenAIGrader():
|
|
|
71
76
|
:return: The OpenAI client.
|
|
72
77
|
:rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
|
|
73
78
|
"""
|
|
79
|
+
default_headers = {"User-Agent": UserAgentSingleton().value}
|
|
74
80
|
if "azure_endpoint" in self._model_config:
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
81
|
+
from openai import AzureOpenAI
|
|
82
|
+
|
|
83
|
+
# TODO set default values?
|
|
84
|
+
return AzureOpenAI(
|
|
78
85
|
azure_endpoint=self._model_config["azure_endpoint"],
|
|
79
|
-
api_key=self._model_config.get("api_key", None),
|
|
80
|
-
api_version=DEFAULT_AOAI_API_VERSION,
|
|
86
|
+
api_key=self._model_config.get("api_key", None), # Default-style access to appease linters.
|
|
87
|
+
api_version=DEFAULT_AOAI_API_VERSION, # Force a known working version
|
|
81
88
|
azure_deployment=self._model_config.get("azure_deployment", ""),
|
|
89
|
+
default_headers=default_headers,
|
|
82
90
|
)
|
|
83
91
|
from openai import OpenAI
|
|
92
|
+
|
|
84
93
|
# TODO add default values for base_url and organization?
|
|
85
94
|
return OpenAI(
|
|
86
95
|
api_key=self._model_config["api_key"],
|
|
87
96
|
base_url=self._model_config.get("base_url", ""),
|
|
88
97
|
organization=self._model_config.get("organization", ""),
|
|
98
|
+
default_headers=default_headers,
|
|
89
99
|
)
|
{azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_aoai/label_grader.py
RENAMED
|
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
9
9
|
|
|
10
10
|
from .aoai_grader import AzureOpenAIGrader
|
|
11
11
|
|
|
12
|
+
|
|
12
13
|
@experimental
|
|
13
14
|
class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
14
15
|
"""
|
|
@@ -42,12 +43,12 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
|
42
43
|
|
|
43
44
|
"""
|
|
44
45
|
|
|
45
|
-
id = "
|
|
46
|
+
id = "azureai://built-in/evaluators/azure-openai/label_grader"
|
|
46
47
|
|
|
47
48
|
def __init__(
|
|
48
49
|
self,
|
|
49
50
|
*,
|
|
50
|
-
model_config
|
|
51
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
51
52
|
input: List[Dict[str, str]],
|
|
52
53
|
labels: List[str],
|
|
53
54
|
model: str,
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, Union, Optional
|
|
5
|
+
|
|
6
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
7
|
+
from openai.types.graders import PythonGrader
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
|
|
10
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@experimental
|
|
14
|
+
class AzureOpenAIPythonGrader(AzureOpenAIGrader):
|
|
15
|
+
"""
|
|
16
|
+
Wrapper class for OpenAI's Python code graders.
|
|
17
|
+
|
|
18
|
+
Enables custom Python-based evaluation logic with flexible scoring and
|
|
19
|
+
pass/fail thresholds. The grader executes user-provided Python code
|
|
20
|
+
to evaluate outputs against custom criteria.
|
|
21
|
+
|
|
22
|
+
Supplying a PythonGrader to the `evaluate` method will cause an
|
|
23
|
+
asynchronous request to evaluate the grader via the OpenAI API. The
|
|
24
|
+
results of the evaluation will then be merged into the standard
|
|
25
|
+
evaluation results.
|
|
26
|
+
|
|
27
|
+
:param model_config: The model configuration to use for the grader.
|
|
28
|
+
:type model_config: Union[
|
|
29
|
+
~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
30
|
+
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
31
|
+
]
|
|
32
|
+
:param name: The name of the grader.
|
|
33
|
+
:type name: str
|
|
34
|
+
:param image_tag: The image tag for the Python execution environment.
|
|
35
|
+
:type image_tag: str
|
|
36
|
+
:param pass_threshold: Score threshold for pass/fail classification.
|
|
37
|
+
Scores >= threshold are considered passing.
|
|
38
|
+
:type pass_threshold: float
|
|
39
|
+
:param source: Python source code containing the grade function.
|
|
40
|
+
Must define: def grade(sample: dict, item: dict) -> float
|
|
41
|
+
:type source: str
|
|
42
|
+
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
43
|
+
:type kwargs: Any
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
.. admonition:: Example:
|
|
47
|
+
|
|
48
|
+
.. literalinclude:: ../samples/evaluation_samples_common.py
|
|
49
|
+
:start-after: [START python_grader_example]
|
|
50
|
+
:end-before: [END python_grader_example]
|
|
51
|
+
:language: python
|
|
52
|
+
:dedent: 8
|
|
53
|
+
:caption: Using AzureOpenAIPythonGrader for custom evaluation logic.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
id = "azureai://built-in/evaluators/azure-openai/python_grader"
|
|
57
|
+
|
|
58
|
+
def __init__(
|
|
59
|
+
self,
|
|
60
|
+
*,
|
|
61
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
62
|
+
name: str,
|
|
63
|
+
image_tag: str,
|
|
64
|
+
pass_threshold: float,
|
|
65
|
+
source: str,
|
|
66
|
+
**kwargs: Any,
|
|
67
|
+
):
|
|
68
|
+
# Validate pass_threshold
|
|
69
|
+
if not 0.0 <= pass_threshold <= 1.0:
|
|
70
|
+
raise ValueError("pass_threshold must be between 0.0 and 1.0")
|
|
71
|
+
|
|
72
|
+
# Store pass_threshold as instance attribute for potential future use
|
|
73
|
+
self.pass_threshold = pass_threshold
|
|
74
|
+
|
|
75
|
+
# Create OpenAI PythonGrader instance
|
|
76
|
+
grader = PythonGrader(
|
|
77
|
+
name=name,
|
|
78
|
+
image_tag=image_tag,
|
|
79
|
+
pass_threshold=pass_threshold,
|
|
80
|
+
source=source,
|
|
81
|
+
type="python",
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, Union, List, Optional
|
|
5
|
+
|
|
6
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
7
|
+
from openai.types.graders import ScoreModelGrader
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
|
|
10
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@experimental
|
|
14
|
+
class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
15
|
+
"""
|
|
16
|
+
Wrapper class for OpenAI's score model graders.
|
|
17
|
+
|
|
18
|
+
Enables continuous scoring evaluation with custom prompts and flexible
|
|
19
|
+
conversation-style inputs. Supports configurable score ranges and
|
|
20
|
+
pass thresholds for binary classification.
|
|
21
|
+
|
|
22
|
+
Supplying a ScoreModelGrader to the `evaluate` method will cause an
|
|
23
|
+
asynchronous request to evaluate the grader via the OpenAI API. The
|
|
24
|
+
results of the evaluation will then be merged into the standard
|
|
25
|
+
evaluation results.
|
|
26
|
+
|
|
27
|
+
:param model_config: The model configuration to use for the grader.
|
|
28
|
+
:type model_config: Union[
|
|
29
|
+
~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
30
|
+
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
31
|
+
]
|
|
32
|
+
:param input: The input messages for the grader. List of conversation
|
|
33
|
+
messages with role and content.
|
|
34
|
+
:type input: List[Dict[str, str]]
|
|
35
|
+
:param model: The model to use for the evaluation.
|
|
36
|
+
:type model: str
|
|
37
|
+
:param name: The name of the grader.
|
|
38
|
+
:type name: str
|
|
39
|
+
:param range: The range of the score. Defaults to [0, 1].
|
|
40
|
+
:type range: Optional[List[float]]
|
|
41
|
+
:param pass_threshold: Score threshold for pass/fail classification.
|
|
42
|
+
Defaults to midpoint of range.
|
|
43
|
+
:type pass_threshold: Optional[float]
|
|
44
|
+
:param sampling_params: The sampling parameters for the model.
|
|
45
|
+
:type sampling_params: Optional[Dict[str, Any]]
|
|
46
|
+
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
47
|
+
:type kwargs: Any
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
id = "azureai://built-in/evaluators/azure-openai/score_model_grader"
|
|
51
|
+
|
|
52
|
+
def __init__(
|
|
53
|
+
self,
|
|
54
|
+
*,
|
|
55
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
56
|
+
input: List[Dict[str, str]],
|
|
57
|
+
model: str,
|
|
58
|
+
name: str,
|
|
59
|
+
range: Optional[List[float]] = None,
|
|
60
|
+
pass_threshold: Optional[float] = None,
|
|
61
|
+
sampling_params: Optional[Dict[str, Any]] = None,
|
|
62
|
+
**kwargs: Any,
|
|
63
|
+
):
|
|
64
|
+
# Validate range and pass_threshold
|
|
65
|
+
if range is not None:
|
|
66
|
+
if len(range) != 2 or range[0] >= range[1]:
|
|
67
|
+
raise ValueError("range must be a list of two numbers [min, max] where min < max")
|
|
68
|
+
else:
|
|
69
|
+
range = [0.0, 1.0] # Default range
|
|
70
|
+
|
|
71
|
+
if pass_threshold is not None:
|
|
72
|
+
if range and (pass_threshold < range[0] or pass_threshold > range[1]):
|
|
73
|
+
raise ValueError(f"pass_threshold {pass_threshold} must be within range {range}")
|
|
74
|
+
else:
|
|
75
|
+
pass_threshold = (range[0] + range[1]) / 2 # Default to midpoint
|
|
76
|
+
|
|
77
|
+
# Store pass_threshold as instance attribute
|
|
78
|
+
self.pass_threshold = pass_threshold
|
|
79
|
+
|
|
80
|
+
# Create OpenAI ScoreModelGrader instance
|
|
81
|
+
grader_kwargs = {"input": input, "model": model, "name": name, "type": "score_model"}
|
|
82
|
+
|
|
83
|
+
if range is not None:
|
|
84
|
+
grader_kwargs["range"] = range
|
|
85
|
+
if sampling_params is not None:
|
|
86
|
+
grader_kwargs["sampling_params"] = sampling_params
|
|
87
|
+
grader_kwargs["pass_threshold"] = self.pass_threshold
|
|
88
|
+
|
|
89
|
+
grader = ScoreModelGrader(**grader_kwargs)
|
|
90
|
+
|
|
91
|
+
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
@@ -10,6 +10,7 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
10
10
|
|
|
11
11
|
from .aoai_grader import AzureOpenAIGrader
|
|
12
12
|
|
|
13
|
+
|
|
13
14
|
@experimental
|
|
14
15
|
class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
15
16
|
"""
|
|
@@ -38,12 +39,12 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
|
38
39
|
|
|
39
40
|
"""
|
|
40
41
|
|
|
41
|
-
id = "
|
|
42
|
+
id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
|
|
42
43
|
|
|
43
44
|
def __init__(
|
|
44
45
|
self,
|
|
45
46
|
*,
|
|
46
|
-
model_config
|
|
47
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
47
48
|
input: str,
|
|
48
49
|
name: str,
|
|
49
50
|
operation: Literal[
|
|
@@ -10,6 +10,7 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
10
10
|
|
|
11
11
|
from .aoai_grader import AzureOpenAIGrader
|
|
12
12
|
|
|
13
|
+
|
|
13
14
|
@experimental
|
|
14
15
|
class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
15
16
|
"""
|
|
@@ -52,12 +53,12 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
|
52
53
|
|
|
53
54
|
"""
|
|
54
55
|
|
|
55
|
-
id = "
|
|
56
|
+
id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
|
|
56
57
|
|
|
57
58
|
def __init__(
|
|
58
59
|
self,
|
|
59
60
|
*,
|
|
60
|
-
model_config
|
|
61
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
61
62
|
evaluation_metric: Literal[
|
|
62
63
|
"fuzzy_match",
|
|
63
64
|
"bleu",
|
{azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_azure/_envs.py
RENAMED
|
@@ -19,6 +19,7 @@ from azure.core.pipeline.policies import ProxyPolicy, AsyncRetryPolicy
|
|
|
19
19
|
|
|
20
20
|
class AzureEnvironmentMetadata(TypedDict):
|
|
21
21
|
"""Configuration for various Azure environments. All endpoints include a trailing slash."""
|
|
22
|
+
|
|
22
23
|
portal_endpoint: str
|
|
23
24
|
"""The management portal for the Azure environment (e.g. https://portal.azure.com/)"""
|
|
24
25
|
resource_manager_endpoint: str
|
|
@@ -107,15 +108,15 @@ class AzureEnvironmentClient:
|
|
|
107
108
|
|
|
108
109
|
def case_insensitive_match(d: Mapping[str, Any], key: str) -> Optional[Any]:
|
|
109
110
|
key = key.strip().lower()
|
|
110
|
-
return next((v for k,v in d.items() if k.strip().lower() == key), None)
|
|
111
|
+
return next((v for k, v in d.items() if k.strip().lower() == key), None)
|
|
111
112
|
|
|
112
113
|
async with _ASYNC_LOCK:
|
|
113
114
|
cloud = _KNOWN_AZURE_ENVIRONMENTS.get(name) or case_insensitive_match(_KNOWN_AZURE_ENVIRONMENTS, name)
|
|
114
115
|
if cloud:
|
|
115
116
|
return cloud
|
|
116
|
-
default_endpoint = (
|
|
117
|
-
|
|
118
|
-
|
|
117
|
+
default_endpoint = _KNOWN_AZURE_ENVIRONMENTS.get(_DEFAULT_AZURE_ENV_NAME, {}).get(
|
|
118
|
+
"resource_manager_endpoint"
|
|
119
|
+
)
|
|
119
120
|
|
|
120
121
|
metadata_url = self.get_default_metadata_url(default_endpoint)
|
|
121
122
|
clouds = await self.get_clouds_async(metadata_url=metadata_url, update_cached=update_cached)
|
|
@@ -124,10 +125,7 @@ class AzureEnvironmentClient:
|
|
|
124
125
|
return cloud_metadata
|
|
125
126
|
|
|
126
127
|
async def get_clouds_async(
|
|
127
|
-
self,
|
|
128
|
-
*,
|
|
129
|
-
metadata_url: Optional[str] = None,
|
|
130
|
-
update_cached: bool = True
|
|
128
|
+
self, *, metadata_url: Optional[str] = None, update_cached: bool = True
|
|
131
129
|
) -> Mapping[str, AzureEnvironmentMetadata]:
|
|
132
130
|
metadata_url = metadata_url or self.get_default_metadata_url()
|
|
133
131
|
|
|
@@ -149,7 +147,8 @@ class AzureEnvironmentClient:
|
|
|
149
147
|
default_endpoint = default_endpoint or "https://management.azure.com/"
|
|
150
148
|
metadata_url = os.getenv(
|
|
151
149
|
_ENV_ARM_CLOUD_METADATA_URL,
|
|
152
|
-
f"{default_endpoint}metadata/endpoints?api-version={AzureEnvironmentClient.DEFAULT_API_VERSION}"
|
|
150
|
+
f"{default_endpoint}metadata/endpoints?api-version={AzureEnvironmentClient.DEFAULT_API_VERSION}",
|
|
151
|
+
)
|
|
153
152
|
return metadata_url
|
|
154
153
|
|
|
155
154
|
@staticmethod
|
|
@@ -197,7 +196,7 @@ class AzureEnvironmentClient:
|
|
|
197
196
|
|
|
198
197
|
def recursive_update(d: Dict, u: Mapping) -> None:
|
|
199
198
|
"""Recursively update a dictionary.
|
|
200
|
-
|
|
199
|
+
|
|
201
200
|
:param Dict d: The dictionary to update.
|
|
202
201
|
:param Mapping u: The mapping to update from.
|
|
203
202
|
"""
|
|
@@ -73,7 +73,13 @@ class AzureMLTokenManager(APITokenManager):
|
|
|
73
73
|
return super().get_aad_credential()
|
|
74
74
|
|
|
75
75
|
def get_token(
|
|
76
|
-
|
|
76
|
+
self,
|
|
77
|
+
scopes=None,
|
|
78
|
+
claims: Union[str, None] = None,
|
|
79
|
+
tenant_id: Union[str, None] = None,
|
|
80
|
+
enable_cae: bool = False,
|
|
81
|
+
**kwargs: Any
|
|
82
|
+
) -> AccessToken:
|
|
77
83
|
"""Get the API token. If the token is not available or has expired, refresh the token.
|
|
78
84
|
|
|
79
85
|
:return: API token
|
{azure_ai_evaluation-1.8.0 → azure_ai_evaluation-1.10.0}/azure/ai/evaluation/_common/constants.py
RENAMED
|
@@ -5,8 +5,17 @@ from enum import Enum
|
|
|
5
5
|
|
|
6
6
|
from azure.core import CaseInsensitiveEnumMeta
|
|
7
7
|
|
|
8
|
-
PROMPT_BASED_REASON_EVALUATORS = [
|
|
9
|
-
|
|
8
|
+
PROMPT_BASED_REASON_EVALUATORS = [
|
|
9
|
+
"coherence",
|
|
10
|
+
"relevance",
|
|
11
|
+
"retrieval",
|
|
12
|
+
"groundedness",
|
|
13
|
+
"fluency",
|
|
14
|
+
"intent_resolution",
|
|
15
|
+
"tool_call_accurate",
|
|
16
|
+
"response_completeness",
|
|
17
|
+
"task_adherence",
|
|
18
|
+
]
|
|
10
19
|
|
|
11
20
|
|
|
12
21
|
class CommonConstants:
|