azure-ai-evaluation 1.9.0__tar.gz → 1.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/CHANGELOG.md +47 -0
- {azure_ai_evaluation-1.9.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.11.0}/PKG-INFO +63 -4
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/TROUBLESHOOTING.md +0 -3
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/__init__.py +46 -12
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/_aoai/python_grader.py +84 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/rai_service.py +3 -3
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/utils.py +74 -17
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_converters/_ai_services.py +60 -10
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_converters/_models.py +75 -26
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_utils.py +5 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +181 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +405 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_exceptions.py +1 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/__init__.py +4 -3
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_red_team.py +1164 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure_ai_evaluation-1.11.0/azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0/azure_ai_evaluation.egg-info}/PKG-INFO +63 -4
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure_ai_evaluation.egg-info/SOURCES.txt +16 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure_ai_evaluation.egg-info/requires.txt +0 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/tool_call_accuracy.ipynb +7 -4
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/aoai_score_model_grader_sample.py +61 -7
- azure_ai_evaluation-1.11.0/samples/data/custom_objectives_with_context_example.json +51 -0
- azure_ai_evaluation-1.11.0/samples/evaluation_samples_common.py +128 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/evaluation_samples_evaluate.py +40 -27
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/evaluation_samples_evaluate_fdp.py +7 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/evaluation_samples_threshold.py +16 -16
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/red_team_samples.py +56 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/setup.py +0 -2
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/conftest.py +59 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/converters/ai_agent_converter/serialization_helper.py +6 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py +4 -4
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_builtin_evaluators.py +54 -20
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_evaluate.py +7 -7
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_mass_evaluate.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_metrics_upload.py +4 -0
- azure_ai_evaluation-1.11.0/tests/e2etests/test_red_team.py +379 -0
- azure_ai_evaluation-1.11.0/tests/unittests/test_agent_evaluators.py +105 -0
- azure_ai_evaluation-1.11.0/tests/unittests/test_aoai_alignment_missing_rows.py +90 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_aoai_evaluation_pagination.py +13 -5
- azure_ai_evaluation-1.11.0/tests/unittests/test_aoai_python_grader.py +54 -0
- azure_ai_evaluation-1.11.0/tests/unittests/test_built_in_evaluator.py +254 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_eval_run.py +291 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_evaluate.py +331 -12
- azure_ai_evaluation-1.11.0/tests/unittests/test_evaluate_mismatch.py +488 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_evaluate_performance.py +2 -3
- azure_ai_evaluation-1.11.0/tests/unittests/test_lazy_imports.py +135 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_attack_objective_generator.py +4 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_callback_chat_target.py +77 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py +1 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_red_team.py +279 -171
- azure_ai_evaluation-1.11.0/tests/unittests/test_redteam/test_red_team_language_support.py +213 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_red_team_result.py +6 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_strategy_utils.py +61 -1
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_safety_evaluation.py +48 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_save_eval.py +1 -0
- azure_ai_evaluation-1.11.0/tests/unittests/test_tool_call_accuracy_evaluator.py +686 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_utils.py +212 -1
- azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -100
- azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +0 -117
- azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +0 -71
- azure_ai_evaluation-1.9.0/azure/ai/evaluation/red_team/_red_team.py +0 -3174
- azure_ai_evaluation-1.9.0/azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure_ai_evaluation-1.9.0/samples/evaluation_samples_common.py +0 -60
- azure_ai_evaluation-1.9.0/tests/unittests/test_agent_evaluators.py +0 -102
- azure_ai_evaluation-1.9.0/tests/unittests/test_built_in_evaluator.py +0 -130
- azure_ai_evaluation-1.9.0/tests/unittests/test_tool_call_accuracy_evaluator.py +0 -417
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/README.md +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_aoai/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_aoai/aoai_grader.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_aoai/label_grader.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_aoai/string_check_grader.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_aoai/text_similarity_grader.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_azure/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_azure/_clients.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_azure/_envs.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_azure/_models.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_azure/_token_manager.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/_experimental.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/constants.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/evaluation_onedp_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/math.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_configuration.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_model_base.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_serialization.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_types.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_utils/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_utils/model_base.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_utils/serialization.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_validation.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_vendor.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/_version.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/aio/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/aio/_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/aio/_configuration.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/aio/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/models/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/models/_enums.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/models/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/py.typed +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/_configuration.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/_model_base.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/_serialization.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/_version.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/aio/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/aio/_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/aio/_configuration.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/aio/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/models/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/models/_enums.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/models/_models.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/models/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/operations/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/operations/_operations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/operations/_patch.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/raiclient/py.typed +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_constants.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_converters/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_converters/_sk_services.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_eval_mapping.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_http_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/_check.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/_configuration.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/_constants.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/_errors.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/_flows.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/_service.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/entities.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/tracing.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/types.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_adapters/utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_status.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_trace.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_common/_async_token_provider.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_common/_logging.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/prompty/_connection.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/prompty/_exceptions.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/prompty/_prompty.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/prompty/_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_model_configurations.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_safety_evaluation/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_vendor/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_agent/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_agent/_agent_functions.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_agent/_agent_tools.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_agent/_agent_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_attack_strategy.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_default_converter.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/_rai_service_target.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/red_team/_utils/logging_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_conversation/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.9.0/azure/ai/evaluation/red_team/_utils → azure_ai_evaluation-1.11.0/azure/ai/evaluation/simulator/_data_sources}/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/simulator/_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/migration_guide.md +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/pyproject.toml +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/README.md +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/agent_evaluation.ipynb +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/instructions.md +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/intent_resolution.ipynb +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/response_completeness.ipynb +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/sample_synthetic_conversations.jsonl +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/task_adherence.ipynb +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/agent_evaluators/user_functions.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/data/evaluate_test_data.jsonl +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/evaluation_samples_safety_evaluation.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/evaluation_samples_simulate.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/red_team_agent_tool_sample.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/red_team_skip_upload.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/samples/semantic_kernel_red_team_agent_sample.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/setup.cfg +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/converters/ai_agent_converter/test_run_ids_from_conversation.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/converters/ai_agent_converter/test_sk_agent_converter_internals.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/converters/ai_agent_converter/test_sk_turn_idxs_from_conversation.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_aoai_graders.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_lite_management_client.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_prompty_async.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_remote_evaluation.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/e2etests/test_sim_and_eval.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_aoai_integration_features.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_aoai_score_model_grader.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_batch_run_context.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_completeness_evaluator.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_content_safety_rai_script.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_document_retrieval_evaluator.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_evaluators/slow_eval.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_evaluators/test_conversation_thresholds.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_evaluators/test_service_evaluator_thresholds.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_evaluators/test_threshold_behavior.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_non_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/__init__.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_attack_strategy.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_constants.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_formatting_utils.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_rai_service_target.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_redteam/test_rai_service_true_false_scorer.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_remote_evaluation_features.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_simulator.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
{azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/CHANGELOG.md
RENAMED
@@ -1,5 +1,49 @@
 # Release History
 
+## 1.11.0 (2025-09-02)
+
+### Features Added
+- Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
+- Added support for user-supplied TokenCredentials with LLM-based evaluators.
+- Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
+- Added `language` parameter to `RedTeam` class for multilingual red team scanning support. The parameter accepts values from `SupportedLanguages` enum including English, Spanish, French, German, Italian, Portuguese, Japanese, Korean, and Simplified Chinese, enabling red team attacks to be generated and conducted in multiple languages.
+- Added support for IndirectAttack and UngroundedAttributes risk categories in `RedTeam` scanning. These new risk categories expand red team capabilities to detect cross-platform indirect attacks and evaluate ungrounded inferences about human attributes including emotional state and protected class information.
+
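To make the `tags` entry above concrete, here is a minimal sketch; the data file, model deployment, and tag values are illustrative placeholders rather than anything from the release notes:

```python
from azure.ai.evaluation import CoherenceEvaluator, evaluate

# Illustrative Azure OpenAI config for the LLM-judged evaluator.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

result = evaluate(
    data="evaluate_test_data.jsonl",  # rows with "query"/"response" columns
    evaluators={"coherence": CoherenceEvaluator(model_config=model_config)},
    # New in 1.11.0: free-form key-value pairs recorded with the run,
    # useful for experiment tracking and later filtering.
    tags={"experiment": "prompt_v2", "owner": "eval-team"},
)
```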
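The TokenCredentials entry does not show the wiring. The sketch below assumes a `credential` keyword argument on the LLM-based evaluators together with a key-less model configuration; treat both as assumptions, not the documented surface:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import RelevanceEvaluator

# Assumption: omitting api_key and supplying a TokenCredential makes the
# evaluator authenticate to Azure OpenAI with Entra ID.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
}

relevance = RelevanceEvaluator(
    model_config=model_config,
    credential=DefaultAzureCredential(),  # assumed parameter name
)
print(relevance(query="What is the capital of France?", response="Paris."))
```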
+### Bugs Fixed
+- Fixed issue where evaluation results were not properly aligned with input data, leading to incorrect metrics being reported.
+
+### Other Changes
+- Deprecating `AdversarialSimulator` in favor of the [AI Red Teaming Agent](https://aka.ms/airedteamingagent-sample). `AdversarialSimulator` will be removed in the next minor release.
+- Moved retry configuration constants (`MAX_RETRY_ATTEMPTS`, `MAX_RETRY_WAIT_SECONDS`, `MIN_RETRY_WAIT_SECONDS`) from `RedTeam` class to new `RetryManager` class for better code organization and configurability.
+
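As a sketch of the new `language` parameter on `RedTeam`: the project details below are placeholders, and the `SupportedLanguages` import path is taken from the simulator package, where that enum already lives.

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.red_team import RedTeam, RiskCategory
from azure.ai.evaluation.simulator import SupportedLanguages

red_team = RedTeam(
    azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    },
    credential=DefaultAzureCredential(),
    risk_categories=[RiskCategory.Violence],
    num_objectives=1,
    # Attacks are generated and conducted in Spanish.
    language=SupportedLanguages.Spanish,
)
# A scan then proceeds as usual, e.g. `await red_team.scan(target=...)`.
```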
+## 1.10.0 (2025-07-31)
+
+### Breaking Changes
+
+- Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
+
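A minimal sketch of opting back in to the pre-1.10.0 behavior after this breaking change; the project values are placeholders and the evaluator choice is arbitrary:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

violence = ViolenceEvaluator(
    azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    },
    credential=DefaultAzureCredential(),
    evaluate_query=True,  # judge query + response, as before 1.10.0
)
result = violence(
    query="Describe the weather today.",
    response="It is sunny with a light breeze.",
)
```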
+### Features Added
+
+- Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure OpenAI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
+- Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. a higher threshold means a higher tolerance for harmful responses).
+- Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
+
+
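A sketch of wiring the new Python grader into `evaluate`. The field names follow the service's Python grader contract (`name`, `image_tag`, `pass_threshold`, and a `source` string defining `grade(sample, item)`); the grading logic itself is illustrative:

```python
from azure.ai.evaluation import AzureOpenAIPythonGrader, evaluate

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

python_grader = AzureOpenAIPythonGrader(
    model_config=model_config,
    name="custom_accuracy",
    image_tag="2025-05-08",
    pass_threshold=0.5,
    source='''
def grade(sample: dict, item: dict) -> float:
    """Illustrative: 1.0 when the expected answer appears in the response."""
    response = (item.get("response") or "").lower()
    expected = (item.get("ground_truth") or "").lower()
    return 1.0 if expected and expected in response else 0.0
''',
)

result = evaluate(
    data="evaluate_test_data.jsonl",
    evaluators={"accuracy": python_grader},  # used like a normal evaluator
)
```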
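And a sketch of per-category attack-success thresholds on `RedTeam`; the numeric values and project details are illustrative:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.red_team import RedTeam, RiskCategory

red_team = RedTeam(
    azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    },
    credential=DefaultAzureCredential(),
    risk_categories=[RiskCategory.Violence, RiskCategory.HateUnfairness],
    # Scores above a category's threshold count as successful attacks,
    # so a higher value tolerates more harmful responses.
    attack_success_thresholds={
        RiskCategory.Violence: 3,
        RiskCategory.HateUnfairness: 4,
    },
)
```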
+### Bugs Fixed
+
+- Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
+- Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
+- Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
+
+
+### Other Changes
+
+- The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient).
+- Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`. This is due to be removed in a future release.
+
+
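The escape hatch above is a single private keyword argument; a tiny sketch (the leading underscore marks it non-public and slated for removal):

```python
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

# Temporarily force the legacy promptflow batch engine instead of the
# in-SDK RunSubmitterClient.
result = evaluate(
    data="evaluate_test_data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
    _use_pf_client=True,
)
```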
 ## 1.9.0 (2025-07-02)
 
 ### Features Added
@@ -11,8 +55,11 @@
 ### Bugs Fixed
 
 - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
+
+- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance, and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
 - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
 - Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
+- `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
 
 ## 1.8.0 (2025-05-29)
 
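For the `pass_threshold` addition on `AzureOpenAIScoreModelGrader`, a sketch assuming the grader's existing `model`/`input`/`range` fields and an illustrative threshold:

```python
from azure.ai.evaluation import AzureOpenAIScoreModelGrader

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,
    name="answer_quality",
    model="gpt-4o",
    input=[
        {"role": "system",
         "content": "Rate the answer's quality from 0.0 to 1.0."},
        {"role": "user",
         "content": "Question: {{ item.query }}\nAnswer: {{ item.response }}"},
    ],
    range=[0.0, 1.0],
    pass_threshold=0.7,  # new: minimum score for a row to count as passing
)
```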
{azure_ai_evaluation-1.9.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.11.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: azure-ai-evaluation
-Version: 1.9.0
+Version: 1.11.0
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -21,8 +21,6 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: NOTICE.txt
-Requires-Dist: promptflow-devkit>=1.17.1
-Requires-Dist: promptflow-core>=1.17.1
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: azure-identity>=1.16.0
 Requires-Dist: azure-core>=1.30.2
@@ -37,6 +35,20 @@ Requires-Dist: Jinja2>=3.1.6
 Requires-Dist: aiohttp>=3.0
 Provides-Extra: redteam
 Requires-Dist: pyrit==0.8.1; extra == "redteam"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # Azure AI Evaluation client library for Python
 
@@ -400,6 +412,50 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
|
|
|
400
412
|
|
|
401
413
|
# Release History
|
|
402
414
|
|
|
415
|
+
## 1.11.0 (2025-09-02)
|
|
416
|
+
|
|
417
|
+
### Features Added
|
|
418
|
+
- Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
|
|
419
|
+
- Added support for user-supplied TokenCredentials with LLM based evaluators.
|
|
420
|
+
- Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
|
|
421
|
+
- Added `language` parameter to `RedTeam` class for multilingual red team scanning support. The parameter accepts values from `SupportedLanguages` enum including English, Spanish, French, German, Italian, Portuguese, Japanese, Korean, and Simplified Chinese, enabling red team attacks to be generated and conducted in multiple languages.
|
|
422
|
+
- Added support for IndirectAttack and UngroundedAttributes risk categories in `RedTeam` scanning. These new risk categories expand red team capabilities to detect cross-platform indirect attacks and evaluate ungrounded inferences about human attributes including emotional state and protected class information.
|
|
423
|
+
|
|
424
|
+
### Bugs Fixed
|
|
425
|
+
- Fixed issue where evaluation results were not properly aligned with input data, leading to incorrect metrics being reported.
|
|
426
|
+
|
|
427
|
+
### Other Changes
|
|
428
|
+
- Deprecating `AdversarialSimulator` in favor of the [AI Red Teaming Agent](https://aka.ms/airedteamingagent-sample). `AdversarialSimulator` will be removed in the next minor release.
|
|
429
|
+
- Moved retry configuration constants (`MAX_RETRY_ATTEMPTS`, `MAX_RETRY_WAIT_SECONDS`, `MIN_RETRY_WAIT_SECONDS`) from `RedTeam` class to new `RetryManager` class for better code organization and configurability.
|
|
430
|
+
|
|
+## 1.10.0 (2025-07-31)
+
+### Breaking Changes
+
+- Added an `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators, including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated needs to explicitly set `evaluate_query=True` to maintain the previous behavior (see the sketch below).
+
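To make the migration concrete, a minimal sketch of restoring the pre-1.10.0 behavior; the project scope and query/response values are placeholders:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

credential = DefaultAzureCredential()
project = "https://<resource>.services.ai.azure.com/api/projects/<project>"  # placeholder

# 1.10.0 default: only the response is sent for evaluation.
violence = ViolenceEvaluator(credential=credential, azure_ai_project=project)

# Opt back in to the pre-1.10.0 behavior of also evaluating the query.
violence_with_query = ViolenceEvaluator(
    credential=credential,
    azure_ai_project=project,
    evaluate_query=True,
)
violence_with_query(query="<user question>", response="<model answer>")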
+### Features Added
+
+- Added support for the Azure OpenAI Python grader via the `AzureOpenAIPythonGrader` class, which wraps Azure OpenAI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
+- Added an `attack_success_thresholds` parameter to the `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set a specific threshold value for each risk category; scores greater than the threshold are considered successful attacks (i.e. a higher threshold means a higher tolerance for harmful responses). A sketch follows this section.
+- Enhanced threshold reporting in `RedTeam` results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
+
+
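A hedged sketch of `attack_success_thresholds`; the project scope, risk categories, and threshold values are illustrative:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.red_team import RedTeam, RiskCategory

red_team = RedTeam(
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
    credential=DefaultAzureCredential(),
    risk_categories=[RiskCategory.Violence, RiskCategory.HateUnfairness],
    # Scores above these per-category values count as successful attacks;
    # a higher value tolerates more harmful responses before flagging.
    attack_success_thresholds={
        RiskCategory.Violence: 3,
        RiskCategory.HateUnfairness: 2,
    },
)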
+### Bugs Fixed
+
+- Fixed a red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files, while the user's `output_path` is reserved for the final aggregated results.
+- Significant improvements to the TaskAdherence evaluator. The new version has less variance, is much faster, and consumes fewer tokens.
+- Significant improvements to the Relevance evaluator. The new version has more concrete rubrics, has less variance, is much faster, and consumes fewer tokens.
+
+
+### Other Changes
+
+- The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient).
+- Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()` (see the sketch below). This is due to be removed in a future release.
+
+
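A short sketch of the temporary escape hatch, reusing the placeholder `model_config` from the earlier sketch:

from azure.ai.evaluation import evaluate, RelevanceEvaluator

result = evaluate(
    data="eval_data.jsonl",  # hypothetical dataset
    evaluators={"relevance": RelevanceEvaluator(model_config)},
    _use_pf_client=True,  # legacy promptflow engine; due to be removed
)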
 ## 1.9.0 (2025-07-02)
 
 ### Features Added
@@ -411,8 +467,11 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 ### Bugs Fixed
 
 - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
+
+- Fixes and improvements to the ToolCallAccuracy evaluator. The new version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently, without context on the other tool calls in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
 - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
 - Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
+- The `AzureOpenAIScoreModelGrader` evaluator now supports a `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
 
 ## 1.8.0 (2025-05-29)
 
{azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/TROUBLESHOOTING.md
RENAMED
@@ -46,9 +46,6 @@ This guide walks you through how to investigate failures, common errors in the `
 - Risk and safety evaluators depend on the Azure AI Studio safety evaluation backend service. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaisafetyeval-regionsupport).
 - If you encounter a 403 Unauthorized error when using safety evaluators, verify that you have the `Contributor` role assigned to your Azure AI project. `Contributor` role is currently required to run safety evaluations.
 
-### Troubleshoot Quality Evaluator Issues
-- For `ToolCallAccuracyEvaluator`, if your input did not have a tool to evaluate, the current behavior is to output `null`.
-
 ## Handle Simulation Errors
 
 ### Adversarial Simulation Supported Regions
{azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/__init__.py
RENAMED
@@ -46,6 +46,7 @@ from ._aoai.label_grader import AzureOpenAILabelGrader
 from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
 from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
 from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
+from ._aoai.python_grader import AzureOpenAIPythonGrader
 
 
 _patch_all = []
@@ -53,21 +54,46 @@ _patch_all = []
 # The converter from the AI service to the evaluator schema requires a dependency on
 # ai.projects, but we also don't want to force users installing ai.evaluations to pull
 # in ai.projects. So we only import it if it's available and the user has ai.projects.
-try:
-    from ._converters._ai_services import AIAgentConverter
+# We use lazy loading to avoid printing messages during import unless the classes are actually used.
+_lazy_imports = {}
 
-    _patch_all.append("AIAgentConverter")
-except ImportError:
-    print(
-        "[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`."
-    )
 
-
-
+def _create_lazy_import(class_name, module_path, dependency_name):
+    """Create a lazy import function for optional dependencies.
 
-
-
-
+    Args:
+        class_name: Name of the class to import
+        module_path: Module path to import from
+        dependency_name: Name of the dependency package for error message
+
+    Returns:
+        A function that performs the lazy import when called
+    """
+
+    def lazy_import():
+        try:
+            module = __import__(module_path, fromlist=[class_name])
+            cls = getattr(module, class_name)
+            _patch_all.append(class_name)
+            return cls
+        except ImportError:
+            raise ImportError(
+                f"Could not import {class_name}. Please install the dependency with `pip install {dependency_name}`."
+            )
+
+    return lazy_import
+
+
+_lazy_imports["AIAgentConverter"] = _create_lazy_import(
+    "AIAgentConverter",
+    "azure.ai.evaluation._converters._ai_services",
+    "azure-ai-projects",
+)
+_lazy_imports["SKAgentConverter"] = _create_lazy_import(
+    "SKAgentConverter",
+    "azure.ai.evaluation._converters._sk_services",
+    "semantic-kernel",
+)
 
 __all__ = [
     "evaluate",
@@ -110,6 +136,14 @@ __all__ = [
     "AzureOpenAIStringCheckGrader",
     "AzureOpenAITextSimilarityGrader",
    "AzureOpenAIScoreModelGrader",
+    "AzureOpenAIPythonGrader",
 ]
 
 __all__.extend([p for p in _patch_all if p not in __all__])
+
+
+def __getattr__(name):
+    """Handle lazy imports for optional dependencies."""
+    if name in _lazy_imports:
+        return _lazy_imports[name]()
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
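For readers unfamiliar with the pattern, a standalone sketch of module-level lazy imports via PEP 562 `__getattr__`; the module and attribute names are illustrative, not from the package:

# lazy_demo.py - illustrative module, not part of azure-ai-evaluation
_lazy_imports = {
    # Maps a public name to a zero-argument loader that imports on first use.
    "DataFrame": lambda: getattr(__import__("pandas"), "DataFrame"),
}


def __getattr__(name):
    # Called by Python only when `name` is not a regular module attribute.
    if name in _lazy_imports:
        return _lazy_imports[name]()
    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

With this in place, `from lazy_demo import DataFrame` triggers the heavy import at first access rather than at module import time, which is why the 1.11.0 `__init__` no longer prints an `[INFO]` message during `import azure.ai.evaluation`.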
azure_ai_evaluation-1.11.0/azure/ai/evaluation/_aoai/python_grader.py
ADDED
@@ -0,0 +1,84 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Any, Dict, Union, Optional
+
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from openai.types.graders import PythonGrader
+from azure.ai.evaluation._common._experimental import experimental
+
+from .aoai_grader import AzureOpenAIGrader
+
+
+@experimental
+class AzureOpenAIPythonGrader(AzureOpenAIGrader):
+    """
+    Wrapper class for OpenAI's Python code graders.
+
+    Enables custom Python-based evaluation logic with flexible scoring and
+    pass/fail thresholds. The grader executes user-provided Python code
+    to evaluate outputs against custom criteria.
+
+    Supplying a PythonGrader to the `evaluate` method will cause an
+    asynchronous request to evaluate the grader via the OpenAI API. The
+    results of the evaluation will then be merged into the standard
+    evaluation results.
+
+    :param model_config: The model configuration to use for the grader.
+    :type model_config: Union[
+        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration
+    ]
+    :param name: The name of the grader.
+    :type name: str
+    :param image_tag: The image tag for the Python execution environment.
+    :type image_tag: str
+    :param pass_threshold: Score threshold for pass/fail classification.
+        Scores >= threshold are considered passing.
+    :type pass_threshold: float
+    :param source: Python source code containing the grade function.
+        Must define: def grade(sample: dict, item: dict) -> float
+    :type source: str
+    :param kwargs: Additional keyword arguments to pass to the grader.
+    :type kwargs: Any
+
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_common.py
+            :start-after: [START python_grader_example]
+            :end-before: [END python_grader_example]
+            :language: python
+            :dedent: 8
+            :caption: Using AzureOpenAIPythonGrader for custom evaluation logic.
+    """
+
+    id = "azureai://built-in/evaluators/azure-openai/python_grader"
+
+    def __init__(
+        self,
+        *,
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        name: str,
+        image_tag: str,
+        pass_threshold: float,
+        source: str,
+        **kwargs: Any,
+    ):
+        # Validate pass_threshold
+        if not 0.0 <= pass_threshold <= 1.0:
+            raise ValueError("pass_threshold must be between 0.0 and 1.0")
+
+        # Store pass_threshold as instance attribute for potential future use
+        self.pass_threshold = pass_threshold
+
+        # Create OpenAI PythonGrader instance
+        grader = PythonGrader(
+            name=name,
+            image_tag=image_tag,
+            pass_threshold=pass_threshold,
+            source=source,
+            type="python",
+        )
+
+        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
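A hedged usage sketch of the class above; the model configuration, dataset path, and `image_tag` value are placeholders, and the `grade` function is just one example of the required signature:

from azure.ai.evaluation import AzureOpenAIPythonGrader, evaluate

grade_source = '''
def grade(sample: dict, item: dict) -> float:
    # Return 1.0 when the output exactly matches the expected label.
    return 1.0 if item.get("output") == item.get("label") else 0.0
'''

python_grader = AzureOpenAIPythonGrader(
    model_config={
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "azure_deployment": "<deployment>",
        "api_key": "<key>",
    },
    name="exact_match",
    image_tag="<image-tag>",  # execution-environment tag; placeholder
    pass_threshold=0.5,       # validated to lie in [0.0, 1.0]
    source=grade_source,
)

result = evaluate(
    data="eval_data.jsonl",   # hypothetical dataset
    evaluators={"exact_match": python_grader},
)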
{azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_aoai/score_model_grader.py
RENAMED
@@ -84,6 +84,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
             grader_kwargs["range"] = range
         if sampling_params is not None:
             grader_kwargs["sampling_params"] = sampling_params
+        grader_kwargs["pass_threshold"] = self.pass_threshold
 
         grader = ScoreModelGrader(**grader_kwargs)
 
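The one-line change above forwards the grader's `pass_threshold` into the service-side configuration. A hedged construction sketch; apart from `range` and `pass_threshold`, which appear in this diff, the parameter names are assumptions about the grader's signature:

from azure.ai.evaluation import AzureOpenAIScoreModelGrader

score_grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,   # same shape as in the earlier sketches
    name="helpfulness",          # assumed common grader parameter
    model="<judge-deployment>",  # assumed; the scoring model to use
    input=[                      # assumed message-template input format
        {"role": "user", "content": "Rate this answer: {{ item.response }}"}
    ],
    range=[0.0, 1.0],
    pass_threshold=0.7,          # now forwarded to ScoreModelGrader per this diff
)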
{azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/onedp/models/_models.py
RENAMED
@@ -1961,12 +1961,16 @@ class Message(_Model):
     :vartype role: str
     :ivar content: The content.
     :vartype content: str
+    :ivar context: The context.
+    :vartype context: str
     """
 
     role: Optional[str] = rest_field(name="Role", visibility=["read", "create", "update", "delete", "query"])
     """The role."""
     content: Optional[str] = rest_field(name="Content", visibility=["read", "create", "update", "delete", "query"])
     """The content."""
+    context: Optional[str] = rest_field(name="Context", visibility=["read", "create", "update", "delete", "query"])
+    """The context."""
 
     @overload
     def __init__(
@@ -1974,6 +1978,7 @@ class Message(_Model):
         *,
         role: Optional[str] = None,
         content: Optional[str] = None,
+        context: Optional[str] = None,
     ) -> None: ...
 
     @overload
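A minimal sketch of the new field from the caller's side; the message values are illustrative:

msg = Message(
    role="user",
    content="What is our refund policy?",
    # New optional field, serialized as "Context" on the wire.
    context="The customer is on the premium plan.",
)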
{azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/rai_service.py
RENAMED
@@ -290,7 +290,7 @@ async def submit_request_onedp(
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
     headers = get_common_headers(token, evaluator_name)
     if scan_session_id:
-        headers["
+        headers["x-ms-client-request-id"] = scan_session_id
     response = client.evaluations.submit_annotation(payload, headers=headers)
     result = json.loads(response)
     operation_id = result["location"].split("/")[-1]
@@ -319,8 +319,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     token = await fetch_or_reuse_token(credential, token)
     headers = get_common_headers(token)
 
-    async with
-        response = await client.get(url, headers=headers)
+    async with get_async_http_client() as client:
+        response = await client.get(url, headers=headers, timeout=RAIService.TIMEOUT)
 
     if response.status_code == 200:
         return response.json()
{azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.11.0}/azure/ai/evaluation/_common/utils.py
RENAMED
@@ -6,11 +6,11 @@ import posixpath
 import re
 import math
 import threading
-from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
+from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
 
 import nltk
 from azure.storage.blob import ContainerClient
-from typing_extensions import NotRequired, Required, TypeGuard
+from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
 from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -127,17 +127,15 @@ def construct_prompty_model_config(
     return prompty_model_config
 
 
-def is_onedp_project(azure_ai_project: AzureAIProject) ->
+def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
     """Check if the Azure AI project is an OneDP project.
 
     :param azure_ai_project: The scope of the Azure AI project.
-    :type azure_ai_project:
+    :type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
     :return: True if the Azure AI project is an OneDP project, False otherwise.
     :rtype: bool
     """
-
-    return True
-    return False
+    return isinstance(azure_ai_project, str)
 
 
 def validate_azure_ai_project(o: object) -> AzureAIProject:
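The switch to a `TypeIs[str]` return annotation (imported from `typing_extensions` above) lets static type checkers narrow the argument in both branches of a check. A minimal illustration, independent of the SDK's own types:

from typing import Optional, Union
from typing_extensions import TypeIs


def is_onedp_project(project: Optional[Union[str, dict]]) -> TypeIs[str]:
    # True narrows `project` to str; False narrows it to Optional[dict].
    return isinstance(project, str)


def describe(project: Optional[Union[str, dict]]) -> str:
    if is_onedp_project(project):
        return project.upper()  # checker knows this is a str
    if project is not None:
        return ",".join(project.keys())  # and here a dict
    return "no project"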
@@ -494,14 +492,17 @@ def _extract_text_from_content(content):
     return text
 
 
-def _get_conversation_history(query):
+def _get_conversation_history(query, include_system_messages=False):
     all_user_queries = []
     cur_user_query = []
     all_agent_responses = []
     cur_agent_response = []
+    system_message = None
     for msg in query:
         if not "role" in msg:
             continue
+        if include_system_messages and msg["role"] == "system" and "content" in msg:
+            system_message = msg.get("content", "")
         if msg["role"] == "user" and "content" in msg:
             if cur_agent_response != []:
                 all_agent_responses.append(cur_agent_response)
@@ -530,13 +531,18 @@ def _get_conversation_history(query):
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
-
-
+    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    if include_system_messages:
+        result["system_message"] = system_message
+    return result
 
 
 def _pretty_format_conversation_history(conversation_history):
     """Formats the conversation history for better readability."""
     formatted_history = ""
+    if "system_message" in conversation_history and conversation_history["system_message"] is not None:
+        formatted_history += "SYSTEM_PROMPT:\n"
+        formatted_history += " " + conversation_history["system_message"] + "\n\n"
     for i, (user_query, agent_response) in enumerate(
         zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
     ):
@@ -552,10 +558,10 @@ def _pretty_format_conversation_history(conversation_history):
     return formatted_history
 
 
-def reformat_conversation_history(query, logger=None):
+def reformat_conversation_history(query, logger=None, include_system_messages=False):
     """Reformats the conversation history to a more compact representation."""
     try:
-        conversation_history = _get_conversation_history(query)
+        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
         return _pretty_format_conversation_history(conversation_history)
     except:
         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
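Putting the two changes together, a hedged sketch of what the new `include_system_messages` flag does; the message list is invented, and the exact rendering of the turns is elided:

query = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Hi there"},
    {"role": "assistant", "content": "Hello."},
    {"role": "user", "content": "What can you do?"},
]

compact = reformat_conversation_history(query, include_system_messages=True)
# Expected to begin with:
# SYSTEM_PROMPT:
#  You are a terse assistant.
# ...followed by the formatted user/agent turns.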
@@ -570,22 +576,53 @@ def reformat_conversation_history(query, logger=None):
     return query
 
 
-def _get_agent_response(agent_response_msgs):
-    """Extracts
+def _get_agent_response(agent_response_msgs, include_tool_messages=False):
+    """Extracts formatted agent response including text, and optionally tool calls/results."""
     agent_response_text = []
+    tool_results = {}
+
+    # First pass: collect tool results
+    if include_tool_messages:
+        for msg in agent_response_msgs:
+            if msg.get("role") == "tool" and "tool_call_id" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_result":
+                        result = content.get("tool_result")
+                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+    # Second pass: parse assistant messages and tool calls
     for msg in agent_response_msgs:
-        if "role" in msg and msg
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
             text = _extract_text_from_content(msg["content"])
             if text:
                 agent_response_text.extend(text)
+            if include_tool_messages:
+                for content in msg.get("content", []):
+                    # Todo: Verify if this is the correct way to handle tool calls
+                    if content.get("type") == "tool_call":
+                        if "tool_call" in content and "function" in content.get("tool_call", {}):
+                            tc = content.get("tool_call", {})
+                            func_name = tc.get("function", {}).get("name", "")
+                            args = tc.get("function", {}).get("arguments", {})
+                            tool_call_id = tc.get("id")
+                        else:
+                            tool_call_id = content.get("tool_call_id")
+                            func_name = content.get("name", "")
+                            args = content.get("arguments", {})
+                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                        agent_response_text.append(call_line)
+                        if tool_call_id in tool_results:
+                            agent_response_text.append(tool_results[tool_call_id])
 
     return agent_response_text
 
 
-def reformat_agent_response(response, logger=None):
+def reformat_agent_response(response, logger=None, include_tool_messages=False):
     try:
         if response is None or response == []:
             return ""
-        agent_response = _get_agent_response(response)
+        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
         if agent_response == []:
             # If no message could be extracted, likely the format changed, fallback to the original response in that case
             if logger:
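A hedged sketch of the tool-aware formatting; the message structure follows the shapes handled above, but the concrete values are invented:

response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "lookup_weather",
                "arguments": {"city": "Paris"},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "18C, cloudy"}],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "It is 18C and cloudy in Paris."}],
    },
]

formatted = reformat_agent_response(response, include_tool_messages=True)
# Expected to interleave lines like:
# [TOOL_CALL] lookup_weather(city="Paris")
# [TOOL_RESULT] 18C, cloudy
# ...alongside the assistant's text.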
@@ -602,6 +639,26 @@ def reformat_agent_response(response, logger=None):
         return response
 
 
+def reformat_tool_definitions(tool_definitions, logger=None):
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception as e:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+            )
+        return tool_definitions
+
+
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.
 
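And a short usage sketch of the new helper; the tool definitions are invented for illustration:

tools = [
    {
        "name": "lookup_weather",
        "description": "Get current weather for a city.",
        "parameters": {"properties": {"city": {"type": "string"}}},
    },
    {"name": "ping", "description": "Liveness check."},
]

print(reformat_tool_definitions(tools))
# TOOL_DEFINITIONS:
# - lookup_weather: Get current weather for a city. (inputs: city)
# - ping: Liveness check. (inputs: no parameters)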