azure-ai-evaluation 1.0.0b1__tar.gz → 1.0.0b3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure_ai_evaluation-1.0.0b3/CHANGELOG.md +81 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/PKG-INFO +144 -14
- azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/PKG-INFO → azure_ai_evaluation-1.0.0b3/README.md +74 -62
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/__init__.py +4 -4
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/rai_service.py +4 -4
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/utils.py +40 -25
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_constants.py +13 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_evaluate.py +88 -63
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_utils.py +29 -22
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +70 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_eci/_eci.py +62 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +72 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +71 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +77 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- {azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/__init__.py +2 -2
- {azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/_retrieval.py +17 -29
- {azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/retrieval.prompty +0 -5
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -18
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_xpia/xpia.py +65 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_exceptions.py +0 -1
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_model_configurations.py +55 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/__init__.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/simulator.py → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/simulator/_simulator.py +166 -88
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_tracing.py +21 -24
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_utils.py +4 -1
- azure_ai_evaluation-1.0.0b1/README.md → azure_ai_evaluation-1.0.0b3/azure_ai_evaluation.egg-info/PKG-INFO +197 -13
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/SOURCES.txt +12 -11
- azure_ai_evaluation-1.0.0b3/pyproject.toml +21 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/setup.py +1 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/conftest.py +22 -2
- azure_ai_evaluation-1.0.0b3/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_builtin_evaluators.py +146 -186
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_evaluate.py +18 -11
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_metrics_upload.py +9 -3
- azure_ai_evaluation-1.0.0b3/tests/e2etests/test_sim_and_eval.py +134 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_built_in_evaluator.py +4 -9
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluate.py +104 -8
- azure_ai_evaluation-1.0.0b3/tests/unittests/test_evaluators/test_inputs_evaluators.py +46 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_non_adv_simulator.py +13 -15
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_simulator.py +2 -2
- azure_ai_evaluation-1.0.0b1/CHANGELOG.md +0 -17
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -350
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +0 -122
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -66
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +0 -78
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +0 -76
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +0 -76
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +0 -76
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -99
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +0 -122
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -123
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -104
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +0 -131
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_xpia/xpia.py +0 -140
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_model_configurations.py +0 -27
- azure_ai_evaluation-1.0.0b1/pyproject.toml +0 -6
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_chat_evaluator.py +0 -109
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_content_safety_chat_evaluator.py +0 -82
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/constants.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_http_utils.py +3 -3
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
- {azure_ai_evaluation-1.0.0b1/tests → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/simulator/_prompty}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/requires.txt +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/setup.cfg +0 -0
- {azure_ai_evaluation-1.0.0b1/tests/e2etests → azure_ai_evaluation-1.0.0b3/tests}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_batch_run_context.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_content_safety_rai_script.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_eval_run.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluate_telemetry.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_save_eval.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_utils.py +0 -0
--- /dev/null
+++ azure_ai_evaluation-1.0.0b3/CHANGELOG.md
@@ -0,0 +1,81 @@
+# Release History
+
+## 1.0.0b3 (2024-10-01)
+
+### Features Added
+
+- Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+- The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+  - `ViolenceEvaluator`
+  - `SexualEvaluator`
+  - `SelfHarmEvaluator`
+  - `HateUnfairnessEvaluator`
+  - `ProtectedMaterialEvaluator`
+  - `IndirectAttackEvaluator`
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+- Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
+
+### Breaking Changes
+
+- Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+- The `evaluator_config` parameter of `evaluate` now maps an evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+`column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+Before:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "query": "${data.question}",
+            "response": "${data.answer}",
+        }
+    },
+    ...
+)
+```
+
+After:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "column_mapping": {
+                "query": "${data.question}",
+                "response": "${data.answer}",
+            }
+        }
+    },
+    ...
+)
+```
+
+### Bugs Fixed
+
+- Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
+## 1.0.0b2 (2024-09-24)
+
+### Breaking Changes
+
+- `data` and `evaluators` are now required keywords in `evaluate`.
+
+## 1.0.0b1 (2024-09-20)
+
+### Breaking Changes
+
+- The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
+- The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
+- The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
+- Model configuration classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
+- Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
+
+### Features Added
+
+- First preview
+- This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
+- Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information
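To make the two headline `1.0.0b3` features above concrete, here is a minimal sketch that is not part of the diff: it assumes the new `conversation` input is a dict holding a `messages` list of role/content turns, and that `"azure_openai"` is a valid value for the new `type` field; the endpoint and deployment values are placeholders.

```python
from azure.ai.evaluation import FluencyEvaluator

# Assumed shape of AzureOpenAIModelConfiguration with the new `type` field.
model_config = {
    "type": "azure_openai",                                   # new in 1.0.0b3
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
}

fluency = FluencyEvaluator(model_config)

# Assumed conversation shape: a dict with alternating role/content turns.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
    ]
}

# `conversation` replaces the usual single-turn query/response keywords.
result = fluency(conversation=conversation)
```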
--- azure_ai_evaluation-1.0.0b1/PKG-INFO
+++ azure_ai_evaluation-1.0.0b3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.0.0b1
+Version: 1.0.0b3
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -35,11 +35,27 @@ Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "pf-azure"
 
 # Azure AI Evaluation client library for Python
 
+We are excited to introduce the public preview of the Azure AI Evaluation SDK.
+
+[Source code][source_code]
+| [Package (PyPI)][evaluation_pypi]
+| [API reference documentation][evaluation_ref_docs]
+| [Product documentation][product_documentation]
+| [Samples][evaluation_samples]
+
+This package has been tested with Python 3.8, 3.9, 3.10, 3.11, and 3.12.
+
+For a more complete set of Azure libraries, see https://aka.ms/azsdk/python/all
+
 ## Getting started
 
+### Prerequisites
+
+- Python 3.8 or later is required to use this package.
+
 ### Install the package
 
-Install the Azure AI Evaluation library for Python with
+Install the Azure AI Evaluation library for Python with [pip][pip_link]:
 
 ```bash
 pip install azure-ai-evaluation
@@ -51,6 +67,8 @@ Evaluators are custom or prebuilt classes or functions that are designed to measure
 
 ## Examples
 
+### Evaluators
+
 Users can create evaluator runs on the local machine as shown in the example below:
 
 ```python
@@ -92,9 +110,9 @@ if __name__ == "__main__":
 
     # Initialize Project Scope
    azure_ai_project = {
-        "subscription_id":
-        "resource_group_name":
-        "project_name":
+        "subscription_id": <subscription_id>,
+        "resource_group_name": <resource_group_name>,
+        "project_name": <project_name>
    }
 
    violence_eval = ViolenceEvaluator(azure_ai_project)
@@ -122,9 +140,13 @@ if __name__ == "__main__":
 
    pprint(result)
 ```
-
+### Simulator
+
+
+Simulators allow users to generate synthetic data using their application. The simulator expects the user to have a callback method that invokes
+their AI application.
 
-
+#### Simulating with a Prompty
 
 ```yaml
 ---
@@ -163,7 +185,7 @@ Application code:
 import json
 import asyncio
 from typing import Any, Dict, List, Optional
-from azure.ai.evaluation.
+from azure.ai.evaluation.simulator import Simulator
 from promptflow.client import load_flow
 from azure.identity import DefaultAzureCredential
 import os
@@ -171,8 +193,7 @@ import os
 azure_ai_project = {
     "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
     "resource_group_name": os.environ.get("RESOURCE_GROUP"),
-    "project_name": os.environ.get("PROJECT_NAME")
-    "credential": DefaultAzureCredential(),
+    "project_name": os.environ.get("PROJECT_NAME")
 }
 
 import wikipedia
@@ -249,8 +270,7 @@ if __name__ == "__main__":
    print("done!")
 ```
 
-
-their AI application. Here's a sample of a callback which invokes AsyncAzureOpenAI:
+#### Adversarial Simulator
 
 ```python
 from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
@@ -318,7 +338,9 @@ async def callback(
 }
 
 ```
-
+
+#### Adversarial QA
+
 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
 simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
@@ -334,7 +356,7 @@ outputs = asyncio.run(
 
 print(outputs.to_eval_qa_json_lines())
 ```
-
+#### Direct Attack Simulator
 
 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
@@ -353,13 +375,121 @@ print(outputs)
 ```
 ## Troubleshooting
 
+### General
+
+Azure ML clients raise exceptions defined in [Azure Core][azure_core_readme].
+
+### Logging
+
+This library uses the standard
+[logging][python_logging] library for logging.
+Basic information about HTTP sessions (URLs, headers, etc.) is logged at INFO
+level.
+
+Detailed DEBUG level logging, including request/response bodies and unredacted
+headers, can be enabled on a client with the `logging_enable` argument.
+
+See full SDK logging documentation with examples [here][sdk_logging_docs].
+
 ## Next steps
 
+- View our [samples][evaluation_samples].
+- View our [documentation][product_documentation]
+
 ## Contributing
 
+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit [cla.microsoft.com][cla].
+
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct][code_of_conduct]. For more information see the [Code of Conduct FAQ][coc_faq] or contact [opencode@microsoft.com][coc_contact] with any additional questions or comments.
+
+<!-- LINKS -->
+
+[source_code]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/evaluation/azure-ai-evaluation
+[evaluation_pypi]: https://pypi.org/project/azure-ai-evaluation/
+[evaluation_ref_docs]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
+[evaluation_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios
+[product_documentation]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk
+[python_logging]: https://docs.python.org/3/library/logging.html
+[sdk_logging_docs]: https://docs.microsoft.com/azure/developer/python/azure-sdk-logging
+[azure_core_readme]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[pip_link]: https://pypi.org/project/pip/
+[azure_core_ref_docs]: https://aka.ms/azsdk-python-core-policies
+[azure_core]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[azure_identity]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/identity/azure-identity
+[cla]: https://cla.microsoft.com
+[code_of_conduct]: https://opensource.microsoft.com/codeofconduct/
+[coc_faq]: https://opensource.microsoft.com/codeofconduct/faq/
+[coc_contact]: mailto:opencode@microsoft.com
+
 
 # Release History
 
+## 1.0.0b3 (2024-10-01)
+
+### Features Added
+
+- Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+- The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+  - `ViolenceEvaluator`
+  - `SexualEvaluator`
+  - `SelfHarmEvaluator`
+  - `HateUnfairnessEvaluator`
+  - `ProtectedMaterialEvaluator`
+  - `IndirectAttackEvaluator`
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+- Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
+
+### Breaking Changes
+
+- Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+- The `evaluator_config` parameter of `evaluate` now maps an evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+`column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+Before:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "query": "${data.question}",
+            "response": "${data.answer}",
+        }
+    },
+    ...
+)
+```
+
+After:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "column_mapping": {
+                "query": "${data.question}",
+                "response": "${data.answer}",
+            }
+        }
+    },
+    ...
+)
+```
+
+### Bugs Fixed
+
+- Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
+## 1.0.0b2 (2024-09-24)
+
+### Breaking Changes
+
+- `data` and `evaluators` are now required keywords in `evaluate`.
+
 ## 1.0.0b1 (2024-09-20)
 
 ### Breaking Changes
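The before/after snippets embedded above elide the surrounding call. A fuller sketch of the b3-style `evaluator_config`, assuming a local `data.jsonl` with `answer` and `truth` columns (file and column names are illustrative, not from the diff):

```python
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

result = evaluate(
    data="data.jsonl",                      # required keyword since 1.0.0b2
    evaluators={"f1": F1ScoreEvaluator()},  # required keyword since 1.0.0b2
    evaluator_config={
        "f1": {
            # Mappings now live under "column_mapping" (an EvaluatorConfig).
            "column_mapping": {
                "response": "${data.answer}",
                "ground_truth": "${data.truth}",
            }
        }
    },
)
```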
--- azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/PKG-INFO
+++ azure_ai_evaluation-1.0.0b3/README.md
@@ -1,45 +1,26 @@
-Metadata-Version: 2.1
-Name: azure-ai-evaluation
-Version: 1.0.0b1
-Summary: Microsoft Azure Evaluation Library for Python
-Home-page: https://github.com/Azure/azure-sdk-for-python
-Author: Microsoft Corporation
-Author-email: azuresdkengsysadmins@microsoft.com
-License: MIT License
-Project-URL: Bug Reports, https://github.com/Azure/azure-sdk-for-python/issues
-Project-URL: Source, https://github.com/Azure/azure-sdk-for-python
-Keywords: azure,azure sdk
-Classifier: Development Status :: 4 - Beta
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-Requires-Dist: promptflow-devkit>=1.15.0
-Requires-Dist: promptflow-core>=1.15.0
-Requires-Dist: numpy>=1.23.2; python_version < "3.12"
-Requires-Dist: numpy>=1.26.4; python_version >= "3.12"
-Requires-Dist: pyjwt>=2.8.0
-Requires-Dist: azure-identity>=1.12.0
-Requires-Dist: azure-core>=1.30.2
-Requires-Dist: nltk>=3.9.1
-Requires-Dist: rouge-score>=0.1.2
-Provides-Extra: pf-azure
-Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "pf-azure"
-
 # Azure AI Evaluation client library for Python
 
+We are excited to introduce the public preview of the Azure AI Evaluation SDK.
+
+[Source code][source_code]
+| [Package (PyPI)][evaluation_pypi]
+| [API reference documentation][evaluation_ref_docs]
+| [Product documentation][product_documentation]
+| [Samples][evaluation_samples]
+
+This package has been tested with Python 3.8, 3.9, 3.10, 3.11, and 3.12.
+
+For a more complete set of Azure libraries, see https://aka.ms/azsdk/python/all
+
 ## Getting started
 
+### Prerequisites
+
+- Python 3.8 or later is required to use this package.
+
 ### Install the package
 
-Install the Azure AI Evaluation library for Python with
+Install the Azure AI Evaluation library for Python with [pip][pip_link]:
 
 ```bash
 pip install azure-ai-evaluation
@@ -51,6 +32,8 @@ Evaluators are custom or prebuilt classes or functions that are designed to measure
 
 ## Examples
 
+### Evaluators
+
 Users can create evaluator runs on the local machine as shown in the example below:
 
 ```python
@@ -92,9 +75,9 @@ if __name__ == "__main__":
 
    # Initialize Project Scope
    azure_ai_project = {
-        "subscription_id":
-        "resource_group_name":
-        "project_name":
+        "subscription_id": <subscription_id>,
+        "resource_group_name": <resource_group_name>,
+        "project_name": <project_name>
    }
 
    violence_eval = ViolenceEvaluator(azure_ai_project)
@@ -122,9 +105,13 @@ if __name__ == "__main__":
 
    pprint(result)
 ```
-
+### Simulator
 
-
+
+Simulators allow users to generate synthetic data using their application. The simulator expects the user to have a callback method that invokes
+their AI application.
+
+#### Simulating with a Prompty
 
 ```yaml
 ---
@@ -163,7 +150,7 @@ Application code:
 import json
 import asyncio
 from typing import Any, Dict, List, Optional
-from azure.ai.evaluation.
+from azure.ai.evaluation.simulator import Simulator
 from promptflow.client import load_flow
 from azure.identity import DefaultAzureCredential
 import os
@@ -171,8 +158,7 @@ import os
 azure_ai_project = {
     "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
     "resource_group_name": os.environ.get("RESOURCE_GROUP"),
-    "project_name": os.environ.get("PROJECT_NAME")
-    "credential": DefaultAzureCredential(),
+    "project_name": os.environ.get("PROJECT_NAME")
 }
 
 import wikipedia
@@ -249,8 +235,7 @@ if __name__ == "__main__":
    print("done!")
 ```
 
-
-their AI application. Here's a sample of a callback which invokes AsyncAzureOpenAI:
+#### Adversarial Simulator
 
 ```python
 from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
@@ -318,7 +303,9 @@ async def callback(
 }
 
 ```
-
+
+#### Adversarial QA
+
 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
 simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
@@ -334,7 +321,7 @@ outputs = asyncio.run(
 
 print(outputs.to_eval_qa_json_lines())
 ```
-
+#### Direct Attack Simulator
 
 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
@@ -353,25 +340,50 @@ print(outputs)
 ```
 ## Troubleshooting
 
-
+### General
 
-
+Azure ML clients raise exceptions defined in [Azure Core][azure_core_readme].
+
+### Logging
 
+This library uses the standard
+[logging][python_logging] library for logging.
+Basic information about HTTP sessions (URLs, headers, etc.) is logged at INFO
+level.
 
-
+Detailed DEBUG level logging, including request/response bodies and unredacted
+headers, can be enabled on a client with the `logging_enable` argument.
 
-
+See full SDK logging documentation with examples [here][sdk_logging_docs].
 
-
+## Next steps
 
-- The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
-- The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
-- The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
-- Model configuration classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
-- Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
+- View our [samples][evaluation_samples].
+- View our [documentation][product_documentation]
 
-
+## Contributing
 
-
-
-
+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit [cla.microsoft.com][cla].
+
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct][code_of_conduct]. For more information see the [Code of Conduct FAQ][coc_faq] or contact [opencode@microsoft.com][coc_contact] with any additional questions or comments.
+
+<!-- LINKS -->
+
+[source_code]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/evaluation/azure-ai-evaluation
+[evaluation_pypi]: https://pypi.org/project/azure-ai-evaluation/
+[evaluation_ref_docs]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
+[evaluation_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios
+[product_documentation]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk
+[python_logging]: https://docs.python.org/3/library/logging.html
+[sdk_logging_docs]: https://docs.microsoft.com/azure/developer/python/azure-sdk-logging
+[azure_core_readme]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[pip_link]: https://pypi.org/project/pip/
+[azure_core_ref_docs]: https://aka.ms/azsdk-python-core-policies
+[azure_core]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[azure_identity]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/identity/azure-identity
+[cla]: https://cla.microsoft.com
+[code_of_conduct]: https://opensource.microsoft.com/codeofconduct/
+[coc_faq]: https://opensource.microsoft.com/codeofconduct/faq/
+[coc_contact]: mailto:opencode@microsoft.com
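One practical consequence of the README changes above: `credential` no longer lives inside the `azure_ai_project` dict and is instead passed to the simulator itself. A condensed sketch of the b3 wiring shown in the README (environment variable names as in the diff):

```python
import os

from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
    "resource_group_name": os.environ.get("RESOURCE_GROUP"),
    "project_name": os.environ.get("PROJECT_NAME"),
    # "credential" no longer belongs in this dict as of 1.0.0b3.
}

simulator = AdversarialSimulator(
    azure_ai_project=azure_ai_project,
    credential=DefaultAzureCredential(),  # passed to the simulator instead
)
scenario = AdversarialScenario.ADVERSARIAL_QA
```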
--- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/__init__.py
+++ azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/__init__.py
@@ -4,10 +4,8 @@
 
 from ._evaluate._evaluate import evaluate
 from ._evaluators._bleu import BleuScoreEvaluator
-from ._evaluators._chat import ChatEvaluator
 from ._evaluators._coherence import CoherenceEvaluator
 from ._evaluators._content_safety import (
-    ContentSafetyChatEvaluator,
     ContentSafetyEvaluator,
     HateUnfairnessEvaluator,
     SelfHarmEvaluator,
@@ -22,6 +20,7 @@ from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
 from ._evaluators._relevance import RelevanceEvaluator
+from ._evaluators._retrieval import RetrievalEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
@@ -29,6 +28,7 @@ from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
     OpenAIModelConfiguration,
+    EvaluatorConfig,
 )
 
 __all__ = [
@@ -40,21 +40,21 @@ __all__ = [
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
-    "ChatEvaluator",
     "ViolenceEvaluator",
     "SexualEvaluator",
     "SelfHarmEvaluator",
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
-    "ContentSafetyChatEvaluator",
     "IndirectAttackEvaluator",
     "BleuScoreEvaluator",
     "GleuScoreEvaluator",
     "MeteorScoreEvaluator",
+    "RetrievalEvaluator",
     "RougeScoreEvaluator",
     "RougeType",
     "ProtectedMaterialEvaluator",
     "AzureAIProject",
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
+    "EvaluatorConfig",
 ]
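The `__init__.py` diff above is the authoritative list of root-namespace changes; in user code the migration looks roughly like this (the commented-out imports are the names removed in b3):

```python
# Removed in 1.0.0b3 -- these imports now raise ImportError:
# from azure.ai.evaluation import ChatEvaluator, ContentSafetyChatEvaluator

# Newly exported in 1.0.0b3, alongside the existing evaluators:
from azure.ai.evaluation import (
    ContentSafetyEvaluator,
    EvaluatorConfig,      # TypedDict for evaluator_config entries
    RetrievalEvaluator,   # formerly internal to ChatEvaluator
)
```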
--- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_common/rai_service.py
+++ azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_common/rai_service.py
@@ -11,12 +11,12 @@ from urllib.parse import urlparse
 
 import jwt
 import numpy as np
-from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential
 
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.core.credentials import TokenCredential
+from azure.identity import DefaultAzureCredential
 
 from .constants import (
     CommonConstants,
@@ -348,7 +348,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str
     )
 
     if response.status_code != 200:
-        msg =
+        msg = "Failed to retrieve the discovery service URL."
         raise EvaluationException(
             message=msg,
             internal_message=msg,