azure-ai-evaluation 1.0.0b4__tar.gz → 1.0.0b5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/CHANGELOG.md +68 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/MANIFEST.in +1 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/NOTICE.txt +20 -0
- {azure_ai_evaluation-1.0.0b4/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b5}/PKG-INFO +166 -9
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/README.md +96 -8
- azure_ai_evaluation-1.0.0b5/TROUBLESHOOTING.md +50 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/__init__.py +22 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/constants.py +5 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/math.py +11 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/rai_service.py +172 -35
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/utils.py +162 -23
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_constants.py +6 -6
- {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run}/__init__.py +3 -2
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +4 -4
- {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run}/proxy_client.py +6 -3
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_utils.py +40 -7
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +90 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +197 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_exceptions.py +17 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_model_configurations.py +18 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/__init__.py +2 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_simulator.py +115 -61
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5/azure_ai_evaluation.egg-info}/PKG-INFO +166 -9
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/SOURCES.txt +22 -6
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/requires.txt +1 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/setup.py +2 -0
- azure_ai_evaluation-1.0.0b5/tests/__pf_service_isolation.py +28 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/conftest.py +27 -8
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/target_fn.py +18 -0
- azure_ai_evaluation-1.0.0b5/tests/e2etests/test_builtin_evaluators.py +1021 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_evaluate.py +217 -21
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_sim_and_eval.py +5 -9
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_batch_run_context.py +8 -8
- azure_ai_evaluation-1.0.0b5/tests/unittests/test_built_in_evaluator.py +138 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_content_safety_rai_script.py +17 -12
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_eval_run.py +28 -2
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluate.py +59 -22
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_non_adv_simulator.py +7 -4
- azure_ai_evaluation-1.0.0b5/tests/unittests/test_utils.py +56 -0
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -57
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -56
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -72
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -57
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -64
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -154
- azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -43
- azure_ai_evaluation-1.0.0b4/tests/e2etests/test_builtin_evaluators.py +0 -474
- azure_ai_evaluation-1.0.0b4/tests/unittests/test_built_in_evaluator.py +0 -41
- azure_ai_evaluation-1.0.0b4/tests/unittests/test_utils.py +0 -20
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/simulator/_helpers → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_common}/_experimental.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_http_utils.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_conversation/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_tracing.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/pyproject.toml +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/setup.cfg +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_metrics_upload.py +1 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_content_safety_defect_rate.py +1 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluate_telemetry.py +1 -1
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_save_eval.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_synthetic_conversation_bot.py +1 -1

{azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/CHANGELOG.md
RENAMED

@@ -1,5 +1,71 @@
 # Release History
 
+## 1.0.0b5 (2024-10-28)
+
+### Features Added
+- Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness.
+- Groundedness detection in the Non-Adversarial Simulator via query/context pairs
+  ```python
+  import importlib.resources as pkg_resources
+  package = "azure.ai.evaluation.simulator._data_sources"
+  resource_name = "grounding.json"
+  custom_simulator = Simulator(model_config=model_config)
+  conversation_turns = []
+  with pkg_resources.path(package, resource_name) as grounding_file:
+      with open(grounding_file, "r") as file:
+          data = json.load(file)
+  for item in data:
+      conversation_turns.append([item])
+  outputs = asyncio.run(custom_simulator(
+      target=callback,
+      conversation_turns=conversation_turns,
+      max_conversation_turns=1,
+  ))
+  ```
+- Added evaluators for multimodal use cases
+
+### Breaking Changes
+- Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
+- `RetrievalEvaluator` now requires a `context` input in addition to `query` in single-turn evaluation.
+- `RelevanceEvaluator` no longer takes `context` as an input. It now takes only `query` and `response` in single-turn evaluation.
+- `FluencyEvaluator` no longer takes `query` as an input. It now takes only `response` in single-turn evaluation.
+- The `AdversarialScenario` enum no longer includes `ADVERSARIAL_INDIRECT_JAILBREAK`; indirect jailbreak (XPIA) simulations should be invoked with `IndirectAttackSimulator` instead.
+- Outputs of `Simulator` and `AdversarialSimulator` previously had `to_eval_qa_json_lines` and now have `to_eval_qr_json_lines`. Where `to_eval_qa_json_lines` had:
+  ```json
+  {"question": <user_message>, "answer": <assistant_message>}
+  ```
+  `to_eval_qr_json_lines` now has:
+  ```json
+  {"query": <user_message>, "response": <assistant_message>}
+  ```
+
+### Bugs Fixed
+- The non-adversarial simulator works with `gpt-4o` models using the `json_schema` response format.
+- Fixed an issue where the `evaluate` API would fail with "[WinError 32] The process cannot access the file because it is being used by another process" when the venv folder and the target function file are in the same directory.
+- Fixed an `evaluate` API failure when `trace.destination` is set to `none`.
+- The non-adversarial simulator now accepts context from the callback.
+
+### Other Changes
+- Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
+- `GroundednessEvaluator` now supports `query` as an optional input in single-turn evaluation. If `query` is provided, a different prompt template is used for the evaluation.
+- To align with our support of a diverse set of models, the following evaluators now emit a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output; however, it is recommended to use the new key moving forward, as the old key will be deprecated in the future.
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+  - `SimilarityEvaluator`
+  - `RetrievalEvaluator`
+- The following evaluators now emit a new key in their result output that includes the LLM reasoning behind the score. The new key follows the pattern `<metric_name>_reason`. The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
+
+  | Evaluator | New Token Limit |
+  | --- | --- |
+  | `CoherenceEvaluator` | 800 |
+  | `RelevanceEvaluator` | 800 |
+  | `FluencyEvaluator` | 800 |
+  | `GroundednessEvaluator` | 800 |
+  | `RetrievalEvaluator` | 1600 |
+- Improved the error message for storage access permission issues to provide clearer guidance for users.
+
 ## 1.0.0b4 (2024-10-16)
 
 ### Breaking Changes

@@ -10,9 +76,11 @@
 
 ### Bugs Fixed
 - Adversarial Conversation simulations would fail with `Forbidden`. Added logic to re-fetch token in the exponential retry logic to retrieve the RAI Service response.
+- Fixed an issue where the Evaluate API did not fail due to missing inputs when the target did not return columns required by the evaluators.
 
 ### Other Changes
 - Enhance the error message to provide clearer instruction when required packages for the remote tracking feature are missing.
+- Print the per-evaluator run summary at the end of the Evaluate API call to make troubleshooting row-level failures easier.
 
 ## 1.0.0b3 (2024-10-01)
 
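To make the new result keys concrete, here is a minimal, hedged sketch (not part of the package changelog) of reading a prompt-based evaluator's output in 1.0.0b5. The `model_config` value is assumed to be the Azure OpenAI model configuration shown in the README content later in this diff.

```python
from azure.ai.evaluation import CoherenceEvaluator

# model_config is assumed to be the AzureOpenAIModelConfiguration dict from the README examples.
coherence_eval = CoherenceEvaluator(model_config=model_config)
result = coherence_eval(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
)

print(result["coherence"])         # new key without the gpt_ prefix
print(result["coherence_reason"])  # new key carrying the LLM reasoning behind the score
print(result["gpt_coherence"])     # legacy key, still emitted for backwards compatibility
```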

{azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/NOTICE.txt
RENAMED

@@ -48,3 +48,23 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
+
+
+License notice for [Is GPT-4 a reliable rater? Evaluating consistency in GPT-4's text ratings](https://www.frontiersin.org/journals/education/articles/10.3389/feduc.2023.1272229/full)
+------------------------------------------------------------------------------------------------------------------
+Copyright © 2023 Hackl, Müller, Granitzer and Sailer. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+License notice for [Is ChatGPT a Good NLG Evaluator? A Preliminary Study](https://aclanthology.org/2023.newsum-1.1) (Wang et al., NewSum 2023)
+------------------------------------------------------------------------------------------------------------------
+Copyright © 2023. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+License notice for [SummEval: Re-evaluating Summarization Evaluation.](https://doi.org/10.1162/tacl_a_00373) (Fabbri et al.)
+------------------------------------------------------------------------------------------------------------------
+© 2021 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+License notice for [Evaluation Metrics in the Era of GPT-4: Reliably Evaluating Large Language Models on Sequence to Sequence Tasks](https://aclanthology.org/2023.emnlp-main.543) (Sottana et al., EMNLP 2023)
+------------------------------------------------------------------------------------------------------------------
+© 2023 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
{azure_ai_evaluation-1.0.0b4/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b5}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.0.0b4
+Version: 1.0.0b5
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation

@@ -30,6 +30,7 @@ Requires-Dist: azure-core>=1.30.2
 Requires-Dist: nltk>=3.9.1
 Provides-Extra: remote
 Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "remote"
+Requires-Dist: azure-ai-inference>=1.0.0b4; extra == "remote"
 
 # Azure AI Evaluation client library for Python
 

@@ -95,9 +96,6 @@ if __name__ == "__main__":
     # Running Relevance Evaluator on single input row
     relevance_score = relevance_eval(
         response="The Alpine Explorer Tent is the most waterproof.",
-        context="From the our product list,"
-        " the alpine explorer tent is the most waterproof."
-        " The Adventure Dining Table has higher weight.",
         query="Which tent is the most waterproof?",
     )
 

@@ -172,6 +170,95 @@ Output with a string that continues the conversation, responding to the latest m
 {{ conversation_history }}
 
 ```
+
+Query/response generating prompty for gpt-4o with `json_schema` support.
+Use this file as an override.
+```yaml
+---
+name: TaskSimulatorQueryResponseGPT4o
+description: Gets queries and responses from a blob of text
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_schema
+      json_schema:
+        name: QRJsonSchema
+        schema:
+          type: object
+          properties:
+            items:
+              type: array
+              items:
+                type: object
+                properties:
+                  q:
+                    type: string
+                  r:
+                    type: string
+                required:
+                  - q
+                  - r
+
+inputs:
+  text:
+    type: string
+  num_queries:
+    type: integer
+
+
+---
+system:
+You're an AI that helps in preparing a Question/Answer quiz from Text for "Who wants to be a millionaire" tv show
+Both Questions and Answers MUST BE extracted from given Text
+Frame Question in a way so that Answer is RELEVANT SHORT BITE-SIZED info from Text
+RELEVANT info could be: NUMBER, DATE, STATISTIC, MONEY, NAME
+A sentence should contribute multiple QnAs if it has more info in it
+Answer must not be more than 5 words
+Answer must be picked from Text as is
+Question should be as descriptive as possible and must include as much context as possible from Text
+Output must always have the provided number of QnAs
+Output must be in JSON format.
+Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
+Text:
+<|text_start|>
+On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
+Some years later, research firms IDC and Gartner reported that Apple's market share in the U.S. had increased to about 6%.
+<|text_end|>
+Output with 5 QnAs:
+{
+    "qna": [{
+        "q": "When did the former Apple CEO Steve Jobs introduce the first Macintosh?",
+        "r": "January 24, 1984"
+    },
+    {
+        "q": "Who was the former Apple CEO that introduced the first Macintosh on January 24, 1984?",
+        "r": "Steve Jobs"
+    },
+    {
+        "q": "What percent of the desktop share did Apple have in the United States in late 2003?",
+        "r": "2.06 percent"
+    },
+    {
+        "q": "What were the research firms that reported on Apple's market share in the U.S.?",
+        "r": "IDC and Gartner"
+    },
+    {
+        "q": "What was the percentage increase of Apple's market share in the U.S., as reported by research firms IDC and Gartner?",
+        "r": "6%"
+    }]
+}
+Text:
+<|text_start|>
+{{ text }}
+<|text_end|>
+Output with {{ num_queries }} QnAs:
+```
+
 Application code:
 
 ```python

@@ -189,6 +276,7 @@ model_config = {
     "azure_deployment": os.environ.get("AZURE_DEPLOYMENT"),
     # not providing key would make the SDK pick up `DefaultAzureCredential`
     # use "api_key": "<your API key>"
+    "api_version": "2024-08-01-preview"  # keep this for gpt-4o
 }
 
 # Use Wikipedia to get some text for the simulation

@@ -232,20 +320,21 @@ async def callback(
     formatted_response = {
         "content": response,
         "role": "assistant",
-        "context": {
-            "citations": None,
-        },
+        "context": "",
     }
     messages["messages"].append(formatted_response)
     return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}
 
 async def main():
     simulator = Simulator(model_config=model_config)
+    current_dir = os.path.dirname(__file__)
+    query_response_override_for_latest_gpt_4o = os.path.join(current_dir, "TaskSimulatorQueryResponseGPT4o.prompty")
     outputs = await simulator(
         target=callback,
         text=text,
+        query_response_generating_prompty=query_response_override_for_latest_gpt_4o,  # use this only with latest gpt-4o
         num_queries=2,
-        max_conversation_turns=
+        max_conversation_turns=1,
         user_persona=[
             f"I am a student and I want to learn more about {wiki_search_term}",
             f"I am a teacher and I want to teach my students about {wiki_search_term}"

@@ -267,7 +356,7 @@ if __name__ == "__main__":
 #### Adversarial Simulator
 
 ```python
-from
+from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
 from azure.identity import DefaultAzureCredential
 from typing import Any, Dict, List, Optional
 import asyncio

@@ -420,6 +509,72 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 # Release History
 
+## 1.0.0b5 (2024-10-28)
+
+### Features Added
+- Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness.
+- Groundedness detection in the Non-Adversarial Simulator via query/context pairs
+  ```python
+  import importlib.resources as pkg_resources
+  package = "azure.ai.evaluation.simulator._data_sources"
+  resource_name = "grounding.json"
+  custom_simulator = Simulator(model_config=model_config)
+  conversation_turns = []
+  with pkg_resources.path(package, resource_name) as grounding_file:
+      with open(grounding_file, "r") as file:
+          data = json.load(file)
+  for item in data:
+      conversation_turns.append([item])
+  outputs = asyncio.run(custom_simulator(
+      target=callback,
+      conversation_turns=conversation_turns,
+      max_conversation_turns=1,
+  ))
+  ```
+- Added evaluators for multimodal use cases
+
+### Breaking Changes
+- Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
+- `RetrievalEvaluator` now requires a `context` input in addition to `query` in single-turn evaluation.
+- `RelevanceEvaluator` no longer takes `context` as an input. It now takes only `query` and `response` in single-turn evaluation.
+- `FluencyEvaluator` no longer takes `query` as an input. It now takes only `response` in single-turn evaluation.
+- The `AdversarialScenario` enum no longer includes `ADVERSARIAL_INDIRECT_JAILBREAK`; indirect jailbreak (XPIA) simulations should be invoked with `IndirectAttackSimulator` instead.
+- Outputs of `Simulator` and `AdversarialSimulator` previously had `to_eval_qa_json_lines` and now have `to_eval_qr_json_lines`. Where `to_eval_qa_json_lines` had:
+  ```json
+  {"question": <user_message>, "answer": <assistant_message>}
+  ```
+  `to_eval_qr_json_lines` now has:
+  ```json
+  {"query": <user_message>, "response": <assistant_message>}
+  ```
+
+### Bugs Fixed
+- The non-adversarial simulator works with `gpt-4o` models using the `json_schema` response format.
+- Fixed an issue where the `evaluate` API would fail with "[WinError 32] The process cannot access the file because it is being used by another process" when the venv folder and the target function file are in the same directory.
+- Fixed an `evaluate` API failure when `trace.destination` is set to `none`.
+- The non-adversarial simulator now accepts context from the callback.
+
+### Other Changes
+- Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
+- `GroundednessEvaluator` now supports `query` as an optional input in single-turn evaluation. If `query` is provided, a different prompt template is used for the evaluation.
+- To align with our support of a diverse set of models, the following evaluators now emit a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output; however, it is recommended to use the new key moving forward, as the old key will be deprecated in the future.
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+  - `SimilarityEvaluator`
+  - `RetrievalEvaluator`
+- The following evaluators now emit a new key in their result output that includes the LLM reasoning behind the score. The new key follows the pattern `<metric_name>_reason`. The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
+
+  | Evaluator | New Token Limit |
+  | --- | --- |
+  | `CoherenceEvaluator` | 800 |
+  | `RelevanceEvaluator` | 800 |
+  | `FluencyEvaluator` | 800 |
+  | `GroundednessEvaluator` | 800 |
+  | `RetrievalEvaluator` | 1600 |
+- Improved the error message for storage access permission issues to provide clearer guidance for users.
+
 ## 1.0.0b4 (2024-10-16)
 
 ### Breaking Changes

@@ -430,9 +585,11 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 ### Bugs Fixed
 - Adversarial Conversation simulations would fail with `Forbidden`. Added logic to re-fetch token in the exponential retry logic to retrieve the RAI Service response.
+- Fixed an issue where the Evaluate API did not fail due to missing inputs when the target did not return columns required by the evaluators.
 
 ### Other Changes
 - Enhance the error message to provide clearer instruction when required packages for the remote tracking feature are missing.
+- Print the per-evaluator run summary at the end of the Evaluate API call to make troubleshooting row-level failures easier.
 
 ## 1.0.0b3 (2024-10-01)
 
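As a small, hedged illustration (not taken from the package) of the query/response output format referenced above: assuming `outputs` is the object returned by one of the simulator calls shown in these examples, and that it exposes the `to_eval_qr_json_lines` helper described in the 1.0.0b5 changelog, the results could be persisted for later evaluation roughly like this:

```python
# Illustrative sketch only: `outputs` and to_eval_qr_json_lines() are assumed from the
# simulator examples and the 1.0.0b5 changelog; each line is {"query": ..., "response": ...}.
output_path = "simulator_output.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(outputs.to_eval_qr_json_lines())
```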
{azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/README.md
RENAMED

@@ -62,9 +62,6 @@ if __name__ == "__main__":
     # Running Relevance Evaluator on single input row
     relevance_score = relevance_eval(
         response="The Alpine Explorer Tent is the most waterproof.",
-        context="From the our product list,"
-        " the alpine explorer tent is the most waterproof."
-        " The Adventure Dining Table has higher weight.",
         query="Which tent is the most waterproof?",
     )
 

@@ -139,6 +136,95 @@ Output with a string that continues the conversation, responding to the latest m
 {{ conversation_history }}
 
 ```
+
+Query/response generating prompty for gpt-4o with `json_schema` support.
+Use this file as an override.
+```yaml
+---
+name: TaskSimulatorQueryResponseGPT4o
+description: Gets queries and responses from a blob of text
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_schema
+      json_schema:
+        name: QRJsonSchema
+        schema:
+          type: object
+          properties:
+            items:
+              type: array
+              items:
+                type: object
+                properties:
+                  q:
+                    type: string
+                  r:
+                    type: string
+                required:
+                  - q
+                  - r
+
+inputs:
+  text:
+    type: string
+  num_queries:
+    type: integer
+
+
+---
+system:
+You're an AI that helps in preparing a Question/Answer quiz from Text for "Who wants to be a millionaire" tv show
+Both Questions and Answers MUST BE extracted from given Text
+Frame Question in a way so that Answer is RELEVANT SHORT BITE-SIZED info from Text
+RELEVANT info could be: NUMBER, DATE, STATISTIC, MONEY, NAME
+A sentence should contribute multiple QnAs if it has more info in it
+Answer must not be more than 5 words
+Answer must be picked from Text as is
+Question should be as descriptive as possible and must include as much context as possible from Text
+Output must always have the provided number of QnAs
+Output must be in JSON format.
+Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
+Text:
+<|text_start|>
+On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
+Some years later, research firms IDC and Gartner reported that Apple's market share in the U.S. had increased to about 6%.
+<|text_end|>
+Output with 5 QnAs:
+{
+    "qna": [{
+        "q": "When did the former Apple CEO Steve Jobs introduce the first Macintosh?",
+        "r": "January 24, 1984"
+    },
+    {
+        "q": "Who was the former Apple CEO that introduced the first Macintosh on January 24, 1984?",
+        "r": "Steve Jobs"
+    },
+    {
+        "q": "What percent of the desktop share did Apple have in the United States in late 2003?",
+        "r": "2.06 percent"
+    },
+    {
+        "q": "What were the research firms that reported on Apple's market share in the U.S.?",
+        "r": "IDC and Gartner"
+    },
+    {
+        "q": "What was the percentage increase of Apple's market share in the U.S., as reported by research firms IDC and Gartner?",
+        "r": "6%"
+    }]
+}
+Text:
+<|text_start|>
+{{ text }}
+<|text_end|>
+Output with {{ num_queries }} QnAs:
+```
+
 Application code:
 
 ```python

@@ -156,6 +242,7 @@ model_config = {
     "azure_deployment": os.environ.get("AZURE_DEPLOYMENT"),
     # not providing key would make the SDK pick up `DefaultAzureCredential`
     # use "api_key": "<your API key>"
+    "api_version": "2024-08-01-preview"  # keep this for gpt-4o
 }
 
 # Use Wikipedia to get some text for the simulation

@@ -199,20 +286,21 @@ async def callback(
     formatted_response = {
         "content": response,
         "role": "assistant",
-        "context": {
-            "citations": None,
-        },
+        "context": "",
     }
     messages["messages"].append(formatted_response)
     return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}
 
 async def main():
     simulator = Simulator(model_config=model_config)
+    current_dir = os.path.dirname(__file__)
+    query_response_override_for_latest_gpt_4o = os.path.join(current_dir, "TaskSimulatorQueryResponseGPT4o.prompty")
     outputs = await simulator(
         target=callback,
         text=text,
+        query_response_generating_prompty=query_response_override_for_latest_gpt_4o,  # use this only with latest gpt-4o
         num_queries=2,
-        max_conversation_turns=
+        max_conversation_turns=1,
         user_persona=[
             f"I am a student and I want to learn more about {wiki_search_term}",
             f"I am a teacher and I want to teach my students about {wiki_search_term}"

@@ -234,7 +322,7 @@ if __name__ == "__main__":
 #### Adversarial Simulator
 
 ```python
-from
+from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
 from azure.identity import DefaultAzureCredential
 from typing import Any, Dict, List, Optional
 import asyncio
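To summarize the single-turn signature changes the README diff above implies, here is a minimal sketch; it is not part of the README, the `model_config` dict is assumed to be the Azure OpenAI configuration shown earlier, and the sample strings are placeholders.

```python
from azure.ai.evaluation import FluencyEvaluator, GroundednessEvaluator, RelevanceEvaluator

relevance_eval = RelevanceEvaluator(model_config=model_config)
fluency_eval = FluencyEvaluator(model_config=model_config)
groundedness_eval = GroundednessEvaluator(model_config=model_config)

# RelevanceEvaluator no longer takes `context`; FluencyEvaluator no longer takes `query`.
relevance_eval(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
)
fluency_eval(response="The Alpine Explorer Tent is the most waterproof.")

# GroundednessEvaluator accepts an optional `query` in 1.0.0b5 (placeholder context string).
groundedness_eval(
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From our product list, the Alpine Explorer Tent is the most waterproof.",
    query="Which tent is the most waterproof?",
)
```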
azure_ai_evaluation-1.0.0b5/TROUBLESHOOTING.md
ADDED

@@ -0,0 +1,50 @@
+# Troubleshoot AI Evaluation SDK Issues
+
+This guide walks you through how to investigate failures and common errors in the `azure-ai-evaluation` SDK, and the steps to mitigate these issues.
+
+## Table of Contents
+
+- [Handle Evaluate API Errors](#handle-evaluate-api-errors)
+  - [Troubleshoot Remote Tracking Issues](#troubleshoot-remote-tracking-issues)
+  - [Safety Metric Supported Regions](#safety-metric-supported-regions)
+- [Handle Simulation Errors](#handle-simulation-errors)
+  - [Adversarial Simulation Supported Regions](#adversarial-simulation-supported-regions)
+- [Logging](#logging)
+- [Get Additional Help](#get-additional-help)
+
+## Handle Evaluate API Errors
+
+### Troubleshoot Remote Tracking Issues
+
+- Before running `evaluate()`, make sure you are logged in by running `az login` so that logging and tracing to your Azure AI project can be enabled.
+- Then install the following sub-package:
+
+  ```Shell
+  pip install azure-ai-evaluation[remote]
+  ```
+
+- Ensure that you assign the proper permissions to the storage account linked to your Azure AI Studio hub. This can be done with the following command. More information can be found [here](https://review.learn.microsoft.com/azure/ai-studio/how-to/disable-local-auth).
+
+  ```Shell
+  az role assignment create --role "Storage Blob Data Contributor" --scope /subscriptions/<mySubscriptionID>/resourceGroups/<myResourceGroupName> --assignee-principal-type User --assignee-object-id "<user-id>"
+  ```
+
+- Additionally, if your evaluation run upload fails because you're using a virtual network or private link, check out this [guide](https://docs.microsoft.com/azure/machine-learning/how-to-enable-studio-virtual-network#access-data-using-the-studio).
+
+### Safety Metric Supported Regions
+
+Risk and safety evaluators depend on the Azure AI Studio safety evaluation backend service. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaisafetyeval-regionsupport).
+
+## Handle Simulation Errors
+
+### Adversarial Simulation Supported Regions
+
+Adversarial simulators use the Azure AI Studio safety evaluation backend service to generate an adversarial dataset against your application. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaiadvsimulator-regionsupport).
+
+## Logging
+
+You can set the logging level via the environment variable `PF_LOGGING_LEVEL`. Valid values include `CRITICAL`, `ERROR`, `WARNING`, `INFO`, and `DEBUG`; the default is `INFO`.
+
+## Get Additional Help
+
+Additional information on ways to reach out for support can be found in the [SUPPORT.md](https://github.com/Azure/azure-sdk-for-python/blob/main/SUPPORT.md) at the root of the repo.
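For the logging knob described above, here is a minimal sketch of turning on verbose output from a Python entry point. The variable name and its values come straight from the troubleshooting guide; everything else is illustrative.

```python
import os

# Set this before invoking evaluate() or a simulator so the SDK picks it up.
os.environ["PF_LOGGING_LEVEL"] = "DEBUG"  # CRITICAL, ERROR, WARNING, INFO (default), DEBUG
```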
{azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/__init__.py
RENAMED

@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
     SexualEvaluator,
     ViolenceEvaluator,
 )
+from ._evaluators._multimodal._content_safety_multimodal import (
+    ContentSafetyMultimodalEvaluator,
+    HateUnfairnessMultimodalEvaluator,
+    SelfHarmMultimodalEvaluator,
+    SexualMultimodalEvaluator,
+    ViolenceMultimodalEvaluator,
+)
+from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
 from ._evaluators._groundedness import GroundednessEvaluator
+from ._evaluators._service_groundedness import GroundednessProEvaluator
 from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator

@@ -27,7 +36,10 @@ from ._evaluators._xpia import IndirectAttackEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
+    Conversation,
+    EvaluationResult,
     EvaluatorConfig,
+    Message,
     OpenAIModelConfiguration,
 )
 

@@ -37,6 +49,7 @@ __all__ = [
     "F1ScoreEvaluator",
     "FluencyEvaluator",
     "GroundednessEvaluator",
+    "GroundednessProEvaluator",
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",

@@ -57,4 +70,13 @@ __all__ = [
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
     "EvaluatorConfig",
+    "Conversation",
+    "Message",
+    "EvaluationResult",
+    "ContentSafetyMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "ViolenceMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator",
 ]
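A minimal sketch of importing the names newly exported here. The `azure_ai_project` shape and the constructor arguments are assumptions based on the package's other service-based evaluators, not something shown in this diff.

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import (
    ContentSafetyMultimodalEvaluator,
    GroundednessProEvaluator,
    ProtectedMaterialMultimodalEvaluator,
)

# Assumed AzureAIProject shape; replace the placeholders with your own project details.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
credential = DefaultAzureCredential()

# Assumed constructor signature, mirroring the existing service-based evaluators.
groundedness_pro_eval = GroundednessProEvaluator(
    azure_ai_project=azure_ai_project, credential=credential
)
```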
{azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/constants.py
RENAMED

@@ -6,6 +6,9 @@ from enum import Enum
 from azure.core import CaseInsensitiveEnumMeta
 
 
+PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
+
+
 class CommonConstants:
     """Define common constants."""
 

@@ -35,6 +38,7 @@ class Tasks:
     CONTENT_HARM = "content harm"
     PROTECTED_MATERIAL = "protected material"
     XPIA = "xpia"
+    GROUNDEDNESS = "groundedness"
 
 
 class _InternalAnnotationTasks:

@@ -56,6 +60,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
     XPIA = "xpia"
+    GROUNDEDNESS = "generic_groundedness"
 
 
 class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
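A small illustrative sketch (not from the package, and it imports a private module) of how the new constant lines up with the `<metric_name>_reason` convention noted in the changelog:

```python
from typing import Optional

from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS


def reason_key_for(metric: str) -> Optional[str]:
    # Prompt-based evaluators listed in the constant also emit a "<metric_name>_reason" key in 1.0.0b5.
    return f"{metric}_reason" if metric in PROMPT_BASED_REASON_EVALUATORS else None


print(reason_key_for("groundedness"))  # "groundedness_reason"
print(reason_key_for("bleu"))          # None
```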
{azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/math.py
RENAMED

@@ -5,6 +5,8 @@
 import math
 from typing import List
 
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
 
 def list_sum(lst: List[float]) -> float:
     return sum(lst)

@@ -15,4 +17,13 @@ def list_mean(lst: List[float]) -> float:
 
 
 def list_mean_nan_safe(lst: List[float]) -> float:
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
     return list_mean([l for l in lst if not math.isnan(l)])