azure-ai-evaluation 1.0.1__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/CHANGELOG.md +73 -1
- {azure_ai_evaluation-1.0.1/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.2.0}/PKG-INFO +84 -15
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/README.md +7 -7
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/TROUBLESHOOTING.md +5 -1
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_azure/_clients.py +204 -0
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_azure/_models.py +227 -0
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_azure/_token_manager.py +118 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/rai_service.py +30 -21
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_constants.py +19 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_eval_run.py +16 -43
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_evaluate.py +76 -44
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_utils.py +93 -34
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +140 -5
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +40 -2
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +6 -43
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +42 -82
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_http_utils.py +6 -4
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_simulator.py +24 -13
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0/azure_ai_evaluation.egg-info}/PKG-INFO +84 -15
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/SOURCES.txt +10 -1
- azure_ai_evaluation-1.2.0/azure_ai_evaluation.egg-info/requires.txt +7 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/samples/README.md +1 -1
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_simulate.py +1 -1
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/setup.py +4 -9
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/conftest.py +99 -32
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_adv_simulator.py +222 -4
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_builtin_evaluators.py +103 -45
- azure_ai_evaluation-1.2.0/tests/e2etests/test_evaluate.py +501 -0
- azure_ai_evaluation-1.2.0/tests/e2etests/test_lite_management_client.py +81 -0
- azure_ai_evaluation-1.2.0/tests/e2etests/test_mass_evaluate.py +406 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_metrics_upload.py +40 -19
- azure_ai_evaluation-1.2.0/tests/e2etests/test_sim_and_eval.py +398 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_batch_run_context.py +2 -2
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_built_in_evaluator.py +1 -1
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_content_safety_rai_script.py +2 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_eval_run.py +23 -23
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate.py +189 -31
- azure_ai_evaluation-1.2.0/tests/unittests/test_evaluate_performance.py +78 -0
- azure_ai_evaluation-1.2.0/tests/unittests/test_evaluators/slow_eval.py +34 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/test_inputs_evaluators.py +42 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_save_eval.py +6 -4
- azure_ai_evaluation-1.0.1/azure_ai_evaluation.egg-info/requires.txt +0 -9
- azure_ai_evaluation-1.0.1/tests/__pf_service_isolation.py +0 -28
- azure_ai_evaluation-1.0.1/tests/e2etests/test_evaluate.py +0 -926
- azure_ai_evaluation-1.0.1/tests/e2etests/test_sim_and_eval.py +0 -129
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1/azure/ai/evaluation/_evaluate → azure_ai_evaluation-1.2.0/azure/ai/evaluation/_azure}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/_experimental.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/constants.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/math.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/utils.py +0 -0
- {azure_ai_evaluation-1.0.1/azure/ai/evaluation/_evaluators → azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluate}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1/azure/ai/evaluation/_vendor → azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_exceptions.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_model_configurations.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.0.1/azure/ai/evaluation/simulator/_data_sources → azure_ai_evaluation-1.2.0/azure/ai/evaluation/_vendor}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_tracing.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_utils.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/pyproject.toml +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/samples/data/evaluate_test_data.jsonl +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_common.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_evaluate.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/setup.cfg +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate_telemetry.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_non_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_simulator.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
- {azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/tests/unittests/test_utils.py +0 -0
{azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/CHANGELOG.md
@@ -1,10 +1,81 @@
 # Release History
 
+## 1.2.0 (2025-01-27)
+
+### Features Added
+- CSV files are now supported as data file inputs with `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
+
+### Breaking Changes
+- `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in next release.
+
+### Bugs Fixed
+- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
+- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+- Fixed the non adversarial simulator to run in task-free mode
+- Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
+  main score when aggregating per-turn evaluations from a conversation into an overall
+  evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
+- Fixed bug in non adversarial simulator sample where `tasks` undefined
+
+### Other Changes
+- Changed minimum required python version to use this package from 3.8 to 3.9
+- Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
+- Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
+  environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
+
+## 1.1.0 (2024-12-12)
+
+### Features Added
+- Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
+
+```python
+evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+conversation = {
+    "messages": [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are an AI assistant that understands images."}
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Can you describe this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
+                    },
+                },
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
+                }
+            ],
+        },
+    ]
+}
+print("Calling Content Safety Evaluator for multi-modal")
+score = evaluator(conversation=conversation)
+```
+
+- Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
+
+### Bugs Fixed
+- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
+- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+
 ## 1.0.1 (2024-11-15)
 
 ### Bugs Fixed
-- Fixed `[remote]` extra to be needed only when tracking results in Azure AI Studio.
 - Removing `azure-ai-inference` as dependency.
+- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
 
 ## 1.0.0 (2024-11-13)
 
@@ -16,6 +87,7 @@
 - Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
 - Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+- Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the '_parallel' boolean keyword argument during class initialization.
 - Fix `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or
   otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]`
   would be 2, not 1.5.
{azure_ai_evaluation-1.0.1/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.2.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.0.1
+Version: 1.2.0
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -13,23 +13,21 @@ Classifier: Development Status :: 5 - Production/Stable
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.8
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: NOTICE.txt
-Requires-Dist: promptflow-devkit>=1.
-Requires-Dist: promptflow-core>=1.
+Requires-Dist: promptflow-devkit>=1.17.1
+Requires-Dist: promptflow-core>=1.17.1
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: azure-identity>=1.16.0
 Requires-Dist: azure-core>=1.30.2
 Requires-Dist: nltk>=3.9.1
-
-Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "remote"
+Requires-Dist: azure-storage-blob>=12.10.0
 
 # Azure AI Evaluation client library for Python
 
@@ -55,7 +53,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
 
 ### Prerequisites
 
-- Python 3.8 or later is required to use this package.
+- Python 3.9 or later is required to use this package.
 - [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
 
 ### Install the package
@@ -359,13 +357,13 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 [evaluate_dataset]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#evaluate-on-test-dataset-using-evaluate
 [evaluators]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
 [evaluate_api]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview#azure-ai-evaluation-evaluate
-[evaluate_app]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
+[evaluate_app]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Targets/Evaluate_App_Endpoint
 [evaluation_tsg]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md
 [ai_studio]: https://learn.microsoft.com/azure/ai-studio/what-is-ai-studio
 [ai_project]: https://learn.microsoft.com/azure/ai-studio/how-to/create-projects?tabs=ai-studio
 [azure_openai]: https://learn.microsoft.com/azure/ai-services/openai/
-[evaluate_models]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
-[custom_evaluators]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
+[evaluate_models]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Targets/Evaluate_Base_Model_Endpoint
+[custom_evaluators]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Metrics/Custom_Evaluators
 [evaluate_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate
 [evaluation_metrics]: https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in
 [performance_and_quality_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#performance-and-quality-evaluators
@@ -373,18 +371,88 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 [composite_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#composite-evaluators
 [adversarial_simulation_docs]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#generate-adversarial-simulations-for-safety-evaluation
 [adversarial_simulation_scenarios]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#supported-adversarial-simulation-scenarios
-[adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
-[simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
+[adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Adversarial_Data
+[simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Context-Relevant_Data/Simulate_From_Conversation_Starter
 [adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks
 
-
 # Release History
 
+## 1.2.0 (2025-01-27)
+
+### Features Added
+- CSV files are now supported as data file inputs with `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
+
+### Breaking Changes
+- `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in next release.
+
+### Bugs Fixed
+- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
+- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+- Fixed the non adversarial simulator to run in task-free mode
+- Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
+  main score when aggregating per-turn evaluations from a conversation into an overall
+  evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
+- Fixed bug in non adversarial simulator sample where `tasks` undefined
+
+### Other Changes
+- Changed minimum required python version to use this package from 3.8 to 3.9
+- Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
+- Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
+  environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
+
+## 1.1.0 (2024-12-12)
+
+### Features Added
+- Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
+
+```python
+evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+conversation = {
+    "messages": [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are an AI assistant that understands images."}
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Can you describe this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
+                    },
+                },
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
+                }
+            ],
+        },
+    ]
+}
+print("Calling Content Safety Evaluator for multi-modal")
+score = evaluator(conversation=conversation)
+```
+
+- Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
+
+### Bugs Fixed
+- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
+- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+
 ## 1.0.1 (2024-11-15)
 
 ### Bugs Fixed
-- Fixed `[remote]` extra to be needed only when tracking results in Azure AI Studio.
 - Removing `azure-ai-inference` as dependency.
+- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
 
 ## 1.0.0 (2024-11-13)
 
@@ -396,6 +464,7 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 - Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
 - Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+- Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the '_parallel' boolean keyword argument during class initialization.
 - Fix `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or
   otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]`
   would be 2, not 1.5.
{azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/README.md
@@ -22,7 +22,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
 
 ### Prerequisites
 
-- Python 3.8 or later is required to use this package.
+- Python 3.9 or later is required to use this package.
 - [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
 
 ### Install the package
@@ -326,13 +326,13 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 [evaluate_dataset]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#evaluate-on-test-dataset-using-evaluate
 [evaluators]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
 [evaluate_api]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview#azure-ai-evaluation-evaluate
-[evaluate_app]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
+[evaluate_app]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Targets/Evaluate_App_Endpoint
 [evaluation_tsg]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md
 [ai_studio]: https://learn.microsoft.com/azure/ai-studio/what-is-ai-studio
 [ai_project]: https://learn.microsoft.com/azure/ai-studio/how-to/create-projects?tabs=ai-studio
 [azure_openai]: https://learn.microsoft.com/azure/ai-services/openai/
-[evaluate_models]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
-[custom_evaluators]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
+[evaluate_models]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Targets/Evaluate_Base_Model_Endpoint
+[custom_evaluators]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Metrics/Custom_Evaluators
 [evaluate_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate
 [evaluation_metrics]: https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in
 [performance_and_quality_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#performance-and-quality-evaluators
@@ -340,6 +340,6 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 [composite_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#composite-evaluators
 [adversarial_simulation_docs]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#generate-adversarial-simulations-for-safety-evaluation
 [adversarial_simulation_scenarios]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#supported-adversarial-simulation-scenarios
-[adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
-[simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/
-[adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks
+[adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Adversarial_Data
+[simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Context-Relevant_Data/Simulate_From_Conversation_Starter
+[adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks
{azure_ai_evaluation-1.0.1 → azure_ai_evaluation-1.2.0}/TROUBLESHOOTING.md
@@ -26,6 +26,10 @@ This guide walks you through how to investigate failures, common errors in the `
 - Ensure that you assign the proper permissions to the storage account linked to your Azure AI Studio hub. This can be done with the following command. More information can be found [here](https://aka.ms/credentialleshub).
 
 ```Shell
+# <mySubscriptionID>: Subscription ID of the Azure AI Studio hub's linked storage account (available in Azure AI hub resource view in Azure Portal).
+# <myResourceGroupName>: Resource group of the Azure AI Studio hub's linked storage account.
+# <user-id>: User object ID for role assignment (retrieve with "az ad user show" command).
+
 az role assignment create --role "Storage Blob Data Contributor" --scope /subscriptions/<mySubscriptionID>/resourceGroups/<myResourceGroupName> --assignee-principal-type User --assignee-object-id "<user-id>"
 ```
 
@@ -50,7 +54,7 @@ The Adversarial simulator does not support selecting individual harms, instead w
 ### Simulator is slow
 
 Identify the type of simulations being run (adversarial or non-adversarial).
-Adjust parameters such as `api_call_retry_sleep_sec`, `api_call_delay_sec`, and `concurrent_async_task`. Please note that rate limits to llm calls can be both tokens per minute and requests per minute.
+Adjust parameters such as `api_call_retry_sleep_sec`, `api_call_delay_sec`, and `concurrent_async_task`. Please note that rate limits to llm calls can be both tokens per minute and requests per minute.
 
 ## Logging
 
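
Those throughput parameters are keyword arguments on the simulator call itself. A rough sketch, with placeholder project scope, credential, and target callback (tune the values against your model's tokens-per-minute and requests-per-minute limits):

```python
# Sketch only: passing the throughput knobs named above to an adversarial
# simulator run. The project scope and callback below are placeholders.
import asyncio
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

async def callback(messages, stream=False, session_state=None, context=None):
    # Placeholder target: append a canned reply for each simulated query.
    messages["messages"].append({"role": "assistant", "content": "I cannot help with that."})
    return {"messages": messages["messages"], "stream": stream,
            "session_state": session_state, "context": context}

simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
outputs = asyncio.run(simulator(
    scenario=AdversarialScenario.ADVERSARIAL_QA,
    target=callback,
    max_simulation_results=50,
    api_call_retry_sleep_sec=2,   # back off longer between retries
    api_call_delay_sec=0.5,       # space out calls to respect requests-per-minute limits
    concurrent_async_task=3,      # lower concurrency if the endpoint throttles
))
```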
azure_ai_evaluation-1.2.0/azure/ai/evaluation/_azure/_clients.py
@@ -0,0 +1,204 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from logging import Logger
+from typing import Any, Dict, Final, Optional, Set, Union, cast
+from threading import Lock
+from urllib.parse import quote
+from json.decoder import JSONDecodeError
+
+from azure.core.credentials import TokenCredential, AzureSasCredential
+from azure.core.rest import HttpResponse
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._http_utils import HttpPipeline, get_http_client
+from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenScope
+from ._models import BlobStoreInfo, Workspace
+
+
+API_VERSION: Final[str] = "2024-07-01-preview"
+QUERY_KEY_API_VERSION: Final[str] = "api-version"
+PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")
+
+
+class LiteMLClient:
+    """A lightweight Azure ML API client.
+
+    :param subscription_id: Azure subscription ID
+    :type subscription_id: str
+    :param resource_group: Azure resource group name
+    :type resource_group: str
+    :param logger: Logger object
+    :type logger: logging.Logger
+    :keyword credential: Azure credentials
+    :paramtype credential: TokenCredential
+    :keyword kwargs: Additional keyword arguments
+    :paramtype kwargs: Dict
+    :keyword str api_version: The API version. Default is 2024-10-01
+    """
+
+    def __init__(
+        self,
+        subscription_id: str,
+        resource_group: str,
+        logger: Logger,
+        credential: Optional[TokenCredential] = None,
+        **kwargs: Any,
+    ) -> None:
+        subscription_id = quote(subscription_id, safe="")
+        resource_group = quote(resource_group, safe="")
+
+        self._base_url: Final[str] = (
+            f"https://management.azure.com/subscriptions/{subscription_id}/resourceGroups/{resource_group}"
+        )
+        self._logger: Final[Logger] = logger
+        self._api_version: Final[str] = kwargs.get("api_version", API_VERSION)
+        self._http_client: Final[HttpPipeline] = get_http_client(**kwargs)
+        self._lock: Final[Lock] = Lock()
+
+        # things that can change under lock
+        self._token_manager: Optional[AzureMLTokenManager] = None
+        self._credential: Optional[TokenCredential] = credential
+
+    def get_token(self) -> str:
+        return self._get_token_manager().get_token()
+
+    def get_credential(self) -> TokenCredential:
+        # load the token manager to get the credential if needed
+        self._get_token_manager()
+        return cast(TokenCredential, self._credential)
+
+    def workspace_get_default_datastore(
+        self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
+    ) -> BlobStoreInfo:
+        # 1. Get the default blob store
+        # REST API documentation:
+        # https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
+        url = self._generate_path(  # pylint: disable=specify-parameter-names-in-call
+            *PATH_ML_WORKSPACES, workspace_name, "datastores"
+        )
+        headers = self._get_headers()
+
+        stores_response = self._http_client.request(
+            method="GET",
+            url=url,
+            params={QUERY_KEY_API_VERSION: self._api_version, "isDefault": True, "count": 1, "orderByAsc": "false"},
+            headers=headers,
+        )
+        self._throw_on_http_error(stores_response, "list default workspace datastore")
+
+        json = stores_response.json()["value"][0]
+        props_json = json["properties"]
+        name = json["name"]
+        account_name = props_json["accountName"]
+        endpoint = props_json["endpoint"]
+        container_name = props_json["containerName"]
+        credential_type = props_json.get("credentials", {}).get("credentialsType")
+
+        # 2. Get the SAS token to use for accessing the blob store
+        # REST API documentation:
+        # https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
+        blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
+        if not include_credentials:
+            blob_store_credential = None
+        elif credential_type and credential_type.lower() == "none":
+            # If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
+            # the credentialsType will be "None" and we should not attempt to get the secrets.
+            blob_store_credential = self.get_credential()
+        else:
+            url = self._generate_path(
+                *PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
+            )
+            secrets_response = self._http_client.request(
+                method="POST",
+                url=url,
+                json={
+                    "expirableSecret": True,
+                    "expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
+                },
+                params={
+                    QUERY_KEY_API_VERSION: self._api_version,
+                },
+                headers=headers,
+            )
+            self._throw_on_http_error(secrets_response, "workspace datastore secrets")
+
+            secrets_json = secrets_response.json()
+            secrets_type = secrets_json["secretsType"].lower()
+
+            # As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
+            # stores:
+            # https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
+            if secrets_type == "sas":
+                blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
+            elif secrets_type == "accountkey":
+                # To support older versions of azure-storage-blob better, we return a string here instead of
+                # an AzureNamedKeyCredential
+                blob_store_credential = secrets_json["key"]
+            else:
+                raise EvaluationException(
+                    message=f"The '{account_name}' blob store does not use a recognized credential type.",
+                    internal_message=f"The credential type is '{secrets_type}'",
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.INVALID_VALUE,
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                )
+
+        return BlobStoreInfo(name, account_name, endpoint, container_name, blob_store_credential)
+
+    def workspace_get_info(self, workspace_name: str) -> Workspace:
+        # https://learn.microsoft.com/rest/api/azureml/workspaces/get?view=rest-azureml-2024-10-01
+        workspace_response = self._http_client.request(
+            "GET",
+            self._generate_path(*PATH_ML_WORKSPACES, workspace_name),
+            params={QUERY_KEY_API_VERSION: self._api_version},
+            headers=self._get_headers(),
+        )
+
+        self._throw_on_http_error(workspace_response, f"get '{workspace_name}' workspace")
+        workspace = Workspace.deserialize(workspace_response)
+        return workspace
+
+    def _get_token_manager(self) -> AzureMLTokenManager:
+        # Lazy init since getting credentials in the constructor can take a long time in some situations
+        if self._token_manager is None:
+            with self._lock:
+                if self._token_manager is None:
+                    self._token_manager = AzureMLTokenManager(
+                        TokenScope.DEFAULT_AZURE_MANAGEMENT.value, self._logger, credential=self._credential
+                    )
+                    self._credential = self._token_manager.credential
+
+        return self._token_manager
+
+    @staticmethod
+    def _throw_on_http_error(response: HttpResponse, description: str, valid_status: Optional[Set[int]] = None) -> None:
+        if valid_status and (response.status_code in valid_status):
+            return
+        if response.status_code >= 200 and response.status_code < 300:
+            # nothing to see here, move along
+            return
+
+        message = f"The {description} request failed with HTTP {response.status_code}"
+        try:
+            error_json = response.json()["error"]
+            additional_info = f"({error_json['code']}) {error_json['message']}"
+            message += f" - {additional_info}"
+        except (JSONDecodeError, ValueError, KeyError):
+            pass
+
+        raise EvaluationException(
+            message=message,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.SYSTEM_ERROR,
+        )
+
+    def _generate_path(self, *paths: str) -> str:
+        sanitized_paths = [quote(path, safe="") for path in paths]
+        url = self._base_url + "/" + str.join("/", sanitized_paths)
+        return url
+
+    def _get_headers(self) -> Dict[str, str]:
+        return {"Authorization": f"Bearer {self.get_token()}", "Content-Type": "application/json"}