azure-ai-evaluation 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/CHANGELOG.md +66 -1
- {azure_ai_evaluation-1.1.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.2.0}/PKG-INFO +71 -7
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/README.md +1 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_clients.py +24 -8
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_models.py +2 -2
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_constants.py +18 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_evaluate.py +69 -12
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_utils.py +27 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
- azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_simulator.py +21 -13
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0/azure_ai_evaluation.egg-info}/PKG-INFO +71 -7
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/SOURCES.txt +2 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/requires.txt +2 -2
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/README.md +1 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_simulate.py +1 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/setup.py +3 -4
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/conftest.py +23 -1
- azure_ai_evaluation-1.2.0/tests/e2etests/test_evaluate.py +501 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_lite_management_client.py +12 -3
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_mass_evaluate.py +111 -86
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_metrics_upload.py +11 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_built_in_evaluator.py +1 -1
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate.py +189 -31
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate_performance.py +9 -13
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/test_inputs_evaluators.py +42 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_save_eval.py +6 -4
- azure_ai_evaluation-1.1.0/tests/__pf_service_isolation.py +0 -28
- azure_ai_evaluation-1.1.0/tests/e2etests/test_evaluate.py +0 -953
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/TROUBLESHOOTING.md +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_token_manager.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/_experimental.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/constants.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/math.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/rai_service.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/utils.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_exceptions.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_http_utils.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_model_configurations.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_adversarial_simulator.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_tracing.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_utils.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/pyproject.toml +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/data/evaluate_test_data.jsonl +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_common.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_evaluate.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/setup.cfg +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_builtin_evaluators.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_sim_and_eval.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_batch_run_context.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_content_safety_rai_script.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_eval_run.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate_telemetry.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/slow_eval.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_non_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_simulator.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
- {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_utils.py +0 -0
|
@@ -1,10 +1,75 @@
|
|
|
1
1
|
# Release History
|
|
2
2
|
|
|
3
|
-
## 1.
|
|
3
|
+
## 1.2.0 (2025-01-27)
|
|
4
|
+
|
|
5
|
+
### Features Added
|
|
6
|
+
- CSV files are now supported as data file inputs with the `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
|
|
7
|
+
|
|
8
|
+
### Breaking Changes
|
|
9
|
+
- `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in the next release.
|
|
4
10
|
|
|
5
11
|
### Bugs Fixed
|
|
6
12
|
- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
|
|
7
13
|
- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
|
|
14
|
+
- Fixed the non adversarial simulator to run in task-free mode
|
|
15
|
+
- Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
|
|
16
|
+
main score when aggregating per-turn evaluations from a conversation into an overall
|
|
17
|
+
evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
|
|
18
|
+
- Fixed bug in non adversarial simulator sample where `tasks` was undefined
|
|
19
|
+
|
|
20
|
+
### Other Changes
|
|
21
|
+
- Changed minimum required Python version to use this package from 3.8 to 3.9
|
|
22
|
+
- Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
|
|
23
|
+
- Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
|
|
24
|
+
environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
|
|
25
|
+
|
|
26
|
+
## 1.1.0 (2024-12-12)
|
|
27
|
+
|
|
28
|
+
### Features Added
|
|
29
|
+
- Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
|
|
33
|
+
conversation = {
|
|
34
|
+
"messages": [
|
|
35
|
+
{
|
|
36
|
+
"role": "system",
|
|
37
|
+
"content": [
|
|
38
|
+
{"type": "text", "text": "You are an AI assistant that understands images."}
|
|
39
|
+
],
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"role": "user",
|
|
43
|
+
"content": [
|
|
44
|
+
{"type": "text", "text": "Can you describe this image?"},
|
|
45
|
+
{
|
|
46
|
+
"type": "image_url",
|
|
47
|
+
"image_url": {
|
|
48
|
+
"url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
|
|
49
|
+
},
|
|
50
|
+
},
|
|
51
|
+
],
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"role": "assistant",
|
|
55
|
+
"content": [
|
|
56
|
+
{
|
|
57
|
+
"type": "text",
|
|
58
|
+
"text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
|
|
59
|
+
}
|
|
60
|
+
],
|
|
61
|
+
},
|
|
62
|
+
]
|
|
63
|
+
}
|
|
64
|
+
print("Calling Content Safety Evaluator for multi-modal")
|
|
65
|
+
score = evaluator(conversation=conversation)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
- Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
|
|
69
|
+
|
|
70
|
+
### Bugs Fixed
|
|
71
|
+
- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
|
|
72
|
+
- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
|
|
8
73
|
|
|
9
74
|
## 1.0.1 (2024-11-15)
|
|
10
75
|
|
{azure_ai_evaluation-1.1.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.2.0}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: azure-ai-evaluation
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Microsoft Azure Evaluation Library for Python
|
|
5
5
|
Home-page: https://github.com/Azure/azure-sdk-for-python
|
|
6
6
|
Author: Microsoft Corporation
|
|
@@ -13,17 +13,16 @@ Classifier: Development Status :: 5 - Production/Stable
|
|
|
13
13
|
Classifier: Programming Language :: Python
|
|
14
14
|
Classifier: Programming Language :: Python :: 3
|
|
15
15
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.9
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
19
|
Classifier: License :: OSI Approved :: MIT License
|
|
21
20
|
Classifier: Operating System :: OS Independent
|
|
22
|
-
Requires-Python: >=3.8
|
|
21
|
+
Requires-Python: >=3.9
|
|
23
22
|
Description-Content-Type: text/markdown
|
|
24
23
|
License-File: NOTICE.txt
|
|
25
|
-
Requires-Dist: promptflow-devkit>=1.
|
|
26
|
-
Requires-Dist: promptflow-core>=1.
|
|
24
|
+
Requires-Dist: promptflow-devkit>=1.17.1
|
|
25
|
+
Requires-Dist: promptflow-core>=1.17.1
|
|
27
26
|
Requires-Dist: pyjwt>=2.8.0
|
|
28
27
|
Requires-Dist: azure-identity>=1.16.0
|
|
29
28
|
Requires-Dist: azure-core>=1.30.2
|
|
@@ -54,7 +53,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
|
|
|
54
53
|
|
|
55
54
|
### Prerequisites
|
|
56
55
|
|
|
57
|
-
- Python 3.
|
|
56
|
+
- Python 3.9 or later is required to use this package.
|
|
58
57
|
- [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
|
|
59
58
|
|
|
60
59
|
### Install the package
|
|
@@ -378,11 +377,76 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
|
|
|
378
377
|
|
|
379
378
|
# Release History
|
|
380
379
|
|
|
381
|
-
## 1.
|
|
380
|
+
## 1.2.0 (2025-01-27)
|
|
381
|
+
|
|
382
|
+
### Features Added
|
|
383
|
+
- CSV files are now supported as data file inputs with the `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
|
|
384
|
+
|
|
385
|
+
### Breaking Changes
|
|
386
|
+
- `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in the next release.
|
|
382
387
|
|
|
383
388
|
### Bugs Fixed
|
|
384
389
|
- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
|
|
385
390
|
- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
|
|
391
|
+
- Fixed the non adversarial simulator to run in task-free mode
|
|
392
|
+
- Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
|
|
393
|
+
main score when aggregating per-turn evaluations from a conversation into an overall
|
|
394
|
+
evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
|
|
395
|
+
- Fixed bug in non adversarial simulator sample where `tasks` was undefined
|
|
396
|
+
|
|
397
|
+
### Other Changes
|
|
398
|
+
- Changed minimum required Python version to use this package from 3.8 to 3.9
|
|
399
|
+
- Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
|
|
400
|
+
- Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
|
|
401
|
+
environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
|
|
402
|
+
|
|
403
|
+
## 1.1.0 (2024-12-12)
|
|
404
|
+
|
|
405
|
+
### Features Added
|
|
406
|
+
- Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
|
|
407
|
+
|
|
408
|
+
```python
|
|
409
|
+
evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
|
|
410
|
+
conversation = {
|
|
411
|
+
"messages": [
|
|
412
|
+
{
|
|
413
|
+
"role": "system",
|
|
414
|
+
"content": [
|
|
415
|
+
{"type": "text", "text": "You are an AI assistant that understands images."}
|
|
416
|
+
],
|
|
417
|
+
},
|
|
418
|
+
{
|
|
419
|
+
"role": "user",
|
|
420
|
+
"content": [
|
|
421
|
+
{"type": "text", "text": "Can you describe this image?"},
|
|
422
|
+
{
|
|
423
|
+
"type": "image_url",
|
|
424
|
+
"image_url": {
|
|
425
|
+
"url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
|
|
426
|
+
},
|
|
427
|
+
},
|
|
428
|
+
],
|
|
429
|
+
},
|
|
430
|
+
{
|
|
431
|
+
"role": "assistant",
|
|
432
|
+
"content": [
|
|
433
|
+
{
|
|
434
|
+
"type": "text",
|
|
435
|
+
"text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
|
|
436
|
+
}
|
|
437
|
+
],
|
|
438
|
+
},
|
|
439
|
+
]
|
|
440
|
+
}
|
|
441
|
+
print("Calling Content Safety Evaluator for multi-modal")
|
|
442
|
+
score = evaluator(conversation=conversation)
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
- Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
|
|
446
|
+
|
|
447
|
+
### Bugs Fixed
|
|
448
|
+
- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
|
|
449
|
+
- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
|
|
386
450
|
|
|
387
451
|
## 1.0.1 (2024-11-15)
|
|
388
452
|
|
|
@@ -22,7 +22,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
|
|
|
22
22
|
|
|
23
23
|
### Prerequisites
|
|
24
24
|
|
|
25
|
-
- Python 3.
|
|
25
|
+
- Python 3.9 or later is required to use this package.
|
|
26
26
|
- [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
|
|
27
27
|
|
|
28
28
|
### Install the package
|
{azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_clients.py
RENAMED
|
@@ -17,7 +17,7 @@ from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenSc
|
|
|
17
17
|
from ._models import BlobStoreInfo, Workspace
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
API_VERSION: Final[str] = "2024-
|
|
20
|
+
API_VERSION: Final[str] = "2024-07-01-preview"
|
|
21
21
|
QUERY_KEY_API_VERSION: Final[str] = "api-version"
|
|
22
22
|
PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")
|
|
23
23
|
|
|
@@ -69,7 +69,9 @@ class LiteMLClient:
|
|
|
69
69
|
self._get_token_manager()
|
|
70
70
|
return cast(TokenCredential, self._credential)
|
|
71
71
|
|
|
72
|
-
def workspace_get_default_datastore(
|
|
72
|
+
def workspace_get_default_datastore(
|
|
73
|
+
self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
|
|
74
|
+
) -> BlobStoreInfo:
|
|
73
75
|
# 1. Get the default blob store
|
|
74
76
|
# REST API documentation:
|
|
75
77
|
# https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
|
|
@@ -92,18 +94,29 @@ class LiteMLClient:
|
|
|
92
94
|
account_name = props_json["accountName"]
|
|
93
95
|
endpoint = props_json["endpoint"]
|
|
94
96
|
container_name = props_json["containerName"]
|
|
97
|
+
credential_type = props_json.get("credentials", {}).get("credentialsType")
|
|
95
98
|
|
|
96
99
|
# 2. Get the SAS token to use for accessing the blob store
|
|
97
100
|
# REST API documentation:
|
|
98
101
|
# https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
|
|
99
|
-
blob_store_credential: Optional[Union[AzureSasCredential, str]]
|
|
100
|
-
if include_credentials:
|
|
102
|
+
blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
|
|
103
|
+
if not include_credentials:
|
|
104
|
+
blob_store_credential = None
|
|
105
|
+
elif credential_type and credential_type.lower() == "none":
|
|
106
|
+
# If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
|
|
107
|
+
# the credentialsType will be "None" and we should not attempt to get the secrets.
|
|
108
|
+
blob_store_credential = self.get_credential()
|
|
109
|
+
else:
|
|
101
110
|
url = self._generate_path(
|
|
102
111
|
*PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
|
|
103
112
|
)
|
|
104
113
|
secrets_response = self._http_client.request(
|
|
105
114
|
method="POST",
|
|
106
115
|
url=url,
|
|
116
|
+
json={
|
|
117
|
+
"expirableSecret": True,
|
|
118
|
+
"expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
|
|
119
|
+
},
|
|
107
120
|
params={
|
|
108
121
|
QUERY_KEY_API_VERSION: self._api_version,
|
|
109
122
|
},
|
|
@@ -114,10 +127,13 @@ class LiteMLClient:
|
|
|
114
127
|
secrets_json = secrets_response.json()
|
|
115
128
|
secrets_type = secrets_json["secretsType"].lower()
|
|
116
129
|
|
|
130
|
+
# As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
|
|
131
|
+
# stores:
|
|
132
|
+
# https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
|
|
117
133
|
if secrets_type == "sas":
|
|
118
134
|
blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
|
|
119
135
|
elif secrets_type == "accountkey":
|
|
120
|
-
# To support
|
|
136
|
+
# To support older versions of azure-storage-blob better, we return a string here instead of
|
|
121
137
|
# an AzureNamedKeyCredential
|
|
122
138
|
blob_store_credential = secrets_json["key"]
|
|
123
139
|
else:
|
|
@@ -164,19 +180,19 @@ class LiteMLClient:
|
|
|
164
180
|
# nothing to see here, move along
|
|
165
181
|
return
|
|
166
182
|
|
|
167
|
-
|
|
183
|
+
message = f"The {description} request failed with HTTP {response.status_code}"
|
|
168
184
|
try:
|
|
169
185
|
error_json = response.json()["error"]
|
|
170
186
|
additional_info = f"({error_json['code']}) {error_json['message']}"
|
|
187
|
+
message += f" - {additional_info}"
|
|
171
188
|
except (JSONDecodeError, ValueError, KeyError):
|
|
172
189
|
pass
|
|
173
190
|
|
|
174
191
|
raise EvaluationException(
|
|
175
|
-
message=
|
|
192
|
+
message=message,
|
|
176
193
|
target=ErrorTarget.EVALUATE,
|
|
177
194
|
category=ErrorCategory.FAILED_EXECUTION,
|
|
178
195
|
blame=ErrorBlame.SYSTEM_ERROR,
|
|
179
|
-
internal_message=additional_info,
|
|
180
196
|
)
|
|
181
197
|
|
|
182
198
|
def _generate_path(self, *paths: str) -> str:
|
{azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_models.py
RENAMED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
from typing import Dict, List, NamedTuple, Optional, Union
|
|
10
10
|
from msrest.serialization import Model
|
|
11
|
-
from azure.core.credentials import AzureSasCredential
|
|
11
|
+
from azure.core.credentials import AzureSasCredential, TokenCredential
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class BlobStoreInfo(NamedTuple):
|
|
@@ -16,7 +16,7 @@ class BlobStoreInfo(NamedTuple):
|
|
|
16
16
|
account_name: str
|
|
17
17
|
endpoint: str
|
|
18
18
|
container_name: str
|
|
19
|
-
credential: Optional[Union[AzureSasCredential, str]]
|
|
19
|
+
credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class WorkspaceHubConfig(Model):
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
+
import enum
|
|
4
5
|
from typing import Literal
|
|
6
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
5
7
|
|
|
6
8
|
|
|
7
9
|
class EvaluationMetrics:
|
|
@@ -57,6 +59,22 @@ class EvaluationRunProperties:
|
|
|
57
59
|
EVALUATION_SDK = "_azureml.evaluation_sdk_name"
|
|
58
60
|
|
|
59
61
|
|
|
62
|
+
@experimental
|
|
63
|
+
class _AggregationType(enum.Enum):
|
|
64
|
+
"""Defines how numeric evaluation results should be aggregated
|
|
65
|
+
to produce a single value. Used by individual evaluators to combine per-turn results for
|
|
66
|
+
a conversation-based input. In general, wherever this enum is used, it is also possible
|
|
67
|
+
to directly assign the underlying aggregation function for more complex use cases.
|
|
68
|
+
The 'custom' value is generally not an acceptable input, and should only be used as an output
|
|
69
|
+
to indicate that a custom aggregation function has been injected."""
|
|
70
|
+
|
|
71
|
+
MEAN = "mean"
|
|
72
|
+
MAX = "max"
|
|
73
|
+
MIN = "min"
|
|
74
|
+
SUM = "sum"
|
|
75
|
+
CUSTOM = "custom"
|
|
76
|
+
|
|
77
|
+
|
|
60
78
|
DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
|
|
61
79
|
|
|
62
80
|
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
|
|
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
|
|
|
5
5
|
from .code_client import CodeClient
|
|
6
6
|
from .proxy_client import ProxyClient
|
|
7
7
|
from .target_run_context import TargetRunContext
|
|
8
|
+
from .proxy_client import ProxyRun
|
|
8
9
|
|
|
9
|
-
__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
|
|
10
|
+
__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]
|
{azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_eval_run.py
RENAMED
|
@@ -421,7 +421,9 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
|
|
|
421
421
|
local_paths.append(local_file_path)
|
|
422
422
|
|
|
423
423
|
# We will write the artifacts to the workspaceblobstore
|
|
424
|
-
datastore = self._management_client.workspace_get_default_datastore(
|
|
424
|
+
datastore = self._management_client.workspace_get_default_datastore(
|
|
425
|
+
self._workspace_name, include_credentials=True
|
|
426
|
+
)
|
|
425
427
|
account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
|
|
426
428
|
|
|
427
429
|
svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
|
{azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_evaluate.py
RENAMED
|
@@ -12,6 +12,7 @@ import pandas as pd
|
|
|
12
12
|
from promptflow._sdk._constants import LINE_NUMBER
|
|
13
13
|
from promptflow.client import PFClient
|
|
14
14
|
from promptflow.entities import Run
|
|
15
|
+
from promptflow._sdk._configuration import Configuration
|
|
15
16
|
|
|
16
17
|
from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
|
|
17
18
|
from azure.ai.evaluation._common.utils import validate_azure_ai_project
|
|
@@ -20,17 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
|
|
|
20
21
|
from .._constants import (
|
|
21
22
|
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
|
|
22
23
|
EvaluationMetrics,
|
|
24
|
+
DefaultOpenEncoding,
|
|
23
25
|
Prefixes,
|
|
24
26
|
_InternalEvaluationMetrics,
|
|
25
27
|
)
|
|
26
28
|
from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
|
|
27
29
|
from .._user_agent import USER_AGENT
|
|
28
|
-
from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
|
|
30
|
+
from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
|
|
29
31
|
from ._utils import (
|
|
30
32
|
_apply_column_mapping,
|
|
31
33
|
_log_metrics_and_instance_results,
|
|
32
34
|
_trace_destination_from_project_scope,
|
|
33
35
|
_write_output,
|
|
36
|
+
DataLoaderFactory,
|
|
34
37
|
)
|
|
35
38
|
|
|
36
39
|
TClient = TypeVar("TClient", ProxyClient, CodeClient)
|
|
@@ -429,10 +432,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
|
|
|
429
432
|
)
|
|
430
433
|
|
|
431
434
|
try:
|
|
432
|
-
|
|
435
|
+
data_loader = DataLoaderFactory.get_loader(data)
|
|
436
|
+
initial_data_df = data_loader.load()
|
|
433
437
|
except Exception as e:
|
|
434
438
|
raise EvaluationException(
|
|
435
|
-
message=f"Unable to load data from '{data}'.
|
|
439
|
+
message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
|
|
436
440
|
target=ErrorTarget.EVALUATE,
|
|
437
441
|
category=ErrorCategory.INVALID_VALUE,
|
|
438
442
|
blame=ErrorBlame.USER_ERROR,
|
|
@@ -444,7 +448,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
|
|
|
444
448
|
def _apply_target_to_data(
|
|
445
449
|
target: Callable,
|
|
446
450
|
data: Union[str, os.PathLike],
|
|
447
|
-
|
|
451
|
+
batch_client: TClient,
|
|
448
452
|
initial_data: pd.DataFrame,
|
|
449
453
|
evaluation_name: Optional[str] = None,
|
|
450
454
|
**kwargs,
|
|
@@ -454,10 +458,10 @@ def _apply_target_to_data(
|
|
|
454
458
|
|
|
455
459
|
:param target: The function to be applied to data.
|
|
456
460
|
:type target: Callable
|
|
457
|
-
:param data: The path to input jsonl file.
|
|
461
|
+
:param data: The path to input jsonl or csv file.
|
|
458
462
|
:type data: Union[str, os.PathLike]
|
|
459
|
-
:param
|
|
460
|
-
:type
|
|
463
|
+
:param batch_client: The promptflow client to be used.
|
|
464
|
+
:type batch_client: PFClient
|
|
461
465
|
:param initial_data: The data frame with the loaded data.
|
|
462
466
|
:type initial_data: pd.DataFrame
|
|
463
467
|
:param evaluation_name: The name of the evaluation.
|
|
@@ -467,7 +471,7 @@ def _apply_target_to_data(
|
|
|
467
471
|
"""
|
|
468
472
|
_run_name = kwargs.get("_run_name")
|
|
469
473
|
with TargetRunContext():
|
|
470
|
-
run:
|
|
474
|
+
run: ProxyRun = batch_client.run(
|
|
471
475
|
flow=target,
|
|
472
476
|
display_name=evaluation_name,
|
|
473
477
|
data=data,
|
|
@@ -475,7 +479,18 @@ def _apply_target_to_data(
|
|
|
475
479
|
name=_run_name,
|
|
476
480
|
)
|
|
477
481
|
|
|
478
|
-
target_output: pd.DataFrame =
|
|
482
|
+
target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
|
|
483
|
+
run_summary = batch_client.get_run_summary(run)
|
|
484
|
+
|
|
485
|
+
if run_summary["completed_lines"] == 0:
|
|
486
|
+
msg = (f"Evaluation target failed to produce any results."
|
|
487
|
+
f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
|
|
488
|
+
raise EvaluationException(
|
|
489
|
+
message=msg,
|
|
490
|
+
target=ErrorTarget.EVALUATE,
|
|
491
|
+
category=ErrorCategory.FAILED_EXECUTION,
|
|
492
|
+
blame=ErrorBlame.USER_ERROR,
|
|
493
|
+
)
|
|
479
494
|
# Remove input and output prefix
|
|
480
495
|
generated_columns = {
|
|
481
496
|
col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
|
|
@@ -494,7 +509,7 @@ def _apply_target_to_data(
|
|
|
494
509
|
# Concatenate output to input
|
|
495
510
|
target_output = pd.concat([target_output, initial_data], axis=1)
|
|
496
511
|
|
|
497
|
-
return target_output, generated_columns, run
|
|
512
|
+
return target_output, generated_columns, run.run.result()
|
|
498
513
|
|
|
499
514
|
|
|
500
515
|
def _process_column_mappings(
|
|
@@ -569,13 +584,14 @@ def evaluate(
|
|
|
569
584
|
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
570
585
|
azure_ai_project: Optional[AzureAIProject] = None,
|
|
571
586
|
output_path: Optional[Union[str, os.PathLike]] = None,
|
|
587
|
+
fail_on_evaluator_errors: bool = False,
|
|
572
588
|
**kwargs,
|
|
573
589
|
) -> EvaluationResult:
|
|
574
590
|
"""Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
|
|
575
591
|
data will be run through target function and then results will be evaluated.
|
|
576
592
|
|
|
577
593
|
:keyword data: Path to the data to be evaluated or passed to target if target is set.
|
|
578
|
-
|
|
594
|
+
JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
|
|
579
595
|
:paramtype data: str
|
|
580
596
|
:keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
|
|
581
597
|
and value as the evaluator function. Required.
|
|
@@ -594,6 +610,11 @@ def evaluate(
|
|
|
594
610
|
:paramtype output_path: Optional[str]
|
|
595
611
|
:keyword azure_ai_project: Logs evaluation results to AI Studio if set.
|
|
596
612
|
:paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
|
|
613
|
+
:keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
|
|
614
|
+
if ANY evaluator fails during their evaluation.
|
|
615
|
+
Defaults to false, which means that evaluations will continue regardless of failures.
|
|
616
|
+
If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
|
|
617
|
+
:paramtype fail_on_evaluator_errors: bool
|
|
597
618
|
:return: Evaluation results.
|
|
598
619
|
:rtype: ~azure.ai.evaluation.EvaluationResult
|
|
599
620
|
|
|
@@ -615,6 +636,7 @@ def evaluate(
|
|
|
615
636
|
evaluator_config=evaluator_config,
|
|
616
637
|
azure_ai_project=azure_ai_project,
|
|
617
638
|
output_path=output_path,
|
|
639
|
+
fail_on_evaluator_errors=fail_on_evaluator_errors,
|
|
618
640
|
**kwargs,
|
|
619
641
|
)
|
|
620
642
|
except Exception as e:
|
|
@@ -663,6 +685,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
|
|
|
663
685
|
print("\n====================================================\n")
|
|
664
686
|
|
|
665
687
|
|
|
688
|
+
def _print_fail_flag_warning() -> None:
|
|
689
|
+
print(
|
|
690
|
+
"Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
|
|
691
|
+
+ "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
|
|
692
|
+
+ "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
|
|
693
|
+
+ "without producing any outputs, since a single failure will cancel the entire run "
|
|
694
|
+
"when fail_on_evaluator_errors is enabled."
|
|
695
|
+
)
|
|
696
|
+
|
|
697
|
+
|
|
666
698
|
def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
667
699
|
*,
|
|
668
700
|
evaluators: Dict[str, Callable],
|
|
@@ -672,8 +704,11 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
672
704
|
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
673
705
|
azure_ai_project: Optional[AzureAIProject] = None,
|
|
674
706
|
output_path: Optional[Union[str, os.PathLike]] = None,
|
|
707
|
+
fail_on_evaluator_errors: bool = False,
|
|
675
708
|
**kwargs,
|
|
676
709
|
) -> EvaluationResult:
|
|
710
|
+
if fail_on_evaluator_errors:
|
|
711
|
+
_print_fail_flag_warning()
|
|
677
712
|
input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
|
|
678
713
|
|
|
679
714
|
# Process evaluator config to replace ${target.} with ${data.}
|
|
@@ -690,6 +725,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
690
725
|
if target is not None:
|
|
691
726
|
_validate_columns_for_target(input_data_df, target)
|
|
692
727
|
|
|
728
|
+
Configuration.get_instance().set_config("trace.destination", "none")
|
|
693
729
|
pf_client = PFClient(user_agent=USER_AGENT)
|
|
694
730
|
target_run: Optional[Run] = None
|
|
695
731
|
|
|
@@ -702,7 +738,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
702
738
|
target_generated_columns: Set[str] = set()
|
|
703
739
|
if data is not None and target is not None:
|
|
704
740
|
input_data_df, target_generated_columns, target_run = _apply_target_to_data(
|
|
705
|
-
target, data, pf_client, input_data_df, evaluation_name, **kwargs
|
|
741
|
+
target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
|
|
706
742
|
)
|
|
707
743
|
|
|
708
744
|
for evaluator_name, mapping in column_mapping.items():
|
|
@@ -773,6 +809,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
773
809
|
evaluators_result_df = None
|
|
774
810
|
evaluators_metric = {}
|
|
775
811
|
for evaluator_name, evaluator_result in per_evaluator_results.items():
|
|
812
|
+
if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
|
|
813
|
+
_print_summary(per_evaluator_results)
|
|
814
|
+
_turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
|
|
815
|
+
|
|
776
816
|
evaluator_result_df = evaluator_result["result"]
|
|
777
817
|
|
|
778
818
|
# drop input columns
|
|
@@ -825,3 +865,20 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
825
865
|
_write_output(output_path, result)
|
|
826
866
|
|
|
827
867
|
return result
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
def _turn_error_logs_into_exception(log_path: str) -> None:
|
|
871
|
+
"""Produce an EvaluationException using the contents of the inputted
|
|
872
|
+
file as the error message.
|
|
873
|
+
|
|
874
|
+
:param log_path: The path to the error log file.
|
|
875
|
+
:type log_path: str
|
|
876
|
+
"""
|
|
877
|
+
with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
|
|
878
|
+
error_message = file.read()
|
|
879
|
+
raise EvaluationException(
|
|
880
|
+
message=error_message,
|
|
881
|
+
target=ErrorTarget.EVALUATE,
|
|
882
|
+
category=ErrorCategory.FAILED_EXECUTION,
|
|
883
|
+
blame=ErrorBlame.UNKNOWN,
|
|
884
|
+
)
|
{azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_utils.py
RENAMED
|
@@ -328,3 +328,30 @@ def set_event_loop_policy() -> None:
|
|
|
328
328
|
# Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
|
|
329
329
|
# On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
|
|
330
330
|
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class JSONLDataFileLoader:
|
|
334
|
+
def __init__(self, filename: Union[os.PathLike, str]):
|
|
335
|
+
self.filename = filename
|
|
336
|
+
|
|
337
|
+
def load(self) -> pd.DataFrame:
|
|
338
|
+
return pd.read_json(self.filename, lines=True)
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class CSVDataFileLoader:
|
|
342
|
+
def __init__(self, filename: Union[os.PathLike, str]):
|
|
343
|
+
self.filename = filename
|
|
344
|
+
|
|
345
|
+
def load(self) -> pd.DataFrame:
|
|
346
|
+
return pd.read_csv(self.filename)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
class DataLoaderFactory:
|
|
350
|
+
@staticmethod
|
|
351
|
+
def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
|
|
352
|
+
filename_str = str(filename).lower()
|
|
353
|
+
if filename_str.endswith(".csv"):
|
|
354
|
+
return CSVDataFileLoader(filename)
|
|
355
|
+
|
|
356
|
+
# fallback to JSONL to maintain backward compatibility
|
|
357
|
+
return JSONLDataFileLoader(filename)
|