azure-ai-evaluation 1.0.0b2__tar.gz → 1.0.0b3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure_ai_evaluation-1.0.0b3/CHANGELOG.md +81 -0
- {azure_ai_evaluation-1.0.0b2/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b3}/PKG-INFO +59 -1
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/__init__.py +9 -5
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/utils.py +24 -9
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_constants.py +4 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +70 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +55 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_eci/_eci.py +62 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +72 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +71 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -0
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +77 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- {azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/__init__.py +2 -2
- {azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/retrieval → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/_retrieval.py +16 -22
- {azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/retrieval → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/retrieval.prompty +0 -5
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_xpia/xpia.py +65 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_exceptions.py +0 -1
- azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_model_configurations.py +55 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_simulator.py +19 -8
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3/azure_ai_evaluation.egg-info}/PKG-INFO +59 -1
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/SOURCES.txt +8 -10
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/conftest.py +22 -2
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_builtin_evaluators.py +146 -186
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_evaluate.py +18 -11
- azure_ai_evaluation-1.0.0b3/tests/e2etests/test_sim_and_eval.py +134 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_built_in_evaluator.py +4 -9
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluate.py +2 -2
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_non_adv_simulator.py +2 -3
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_simulator.py +2 -2
- azure_ai_evaluation-1.0.0b2/CHANGELOG.md +0 -23
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +0 -117
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +0 -78
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +0 -76
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +0 -76
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +0 -76
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -99
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +0 -117
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -118
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -104
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +0 -126
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_xpia/xpia.py +0 -139
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_model_configurations.py +0 -27
- azure_ai_evaluation-1.0.0b2/tests/unittests/test_chat_evaluator.py +0 -109
- azure_ai_evaluation-1.0.0b2/tests/unittests/test_content_safety_chat_evaluator.py +0 -82
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/README.md +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/constants.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/rai_service.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_eval_run.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_utils.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_qa/_qa.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_http_utils.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_adversarial_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_experimental.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_tracing.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_utils.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/requires.txt +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/pyproject.toml +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/setup.cfg +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/setup.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_metrics_upload.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_batch_run_context.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_content_safety_rai_script.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_eval_run.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluate_telemetry.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_save_eval.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
- {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_utils.py +0 -0

azure_ai_evaluation-1.0.0b3/CHANGELOG.md
ADDED
@@ -0,0 +1,81 @@
+# Release History
+
+## 1.0.0b3 (2024-10-01)
+
+### Features Added
+
+- Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+- The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+  - `ViolenceEvaluator`
+  - `SexualEvaluator`
+  - `SelfHarmEvaluator`
+  - `HateUnfairnessEvaluator`
+  - `ProtectedMaterialEvaluator`
+  - `IndirectAttackEvaluator`
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+- Surfaced `RetrievalScoreEvaluator`, formally an internal part of `ChatEvaluator` as a standalone conversation-only evaluator.
+
+### Breaking Changes
+
+- Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+- The `evaluator_config` parameter of `evaluate` now maps in evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+  `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+  Before:
+  ```python
+  evaluate(
+      ...,
+      evaluator_config={
+          "hate_unfairness": {
+              "query": "${data.question}",
+              "response": "${data.answer}",
+          }
+      },
+      ...
+  )
+  ```
+
+  After
+  ```python
+  evaluate(
+      ...,
+      evaluator_config={
+          "hate_unfairness": {
+              "column_mapping": {
+                  "query": "${data.question}",
+                  "response": "${data.answer}",
+              }
+          }
+      },
+      ...
+  )
+  ```
+
+### Bugs Fixed
+
+- Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
+## 1.0.0b2 (2024-09-24)
+
+### Breaking Changes
+
+- `data` and `evaluators` are now required keywords in `evaluate`.
+
+## 1.0.0b1 (2024-09-20)
+
+### Breaking Changes
+
+- The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
+- The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
+- The parameter name `project_scope` in content safety evaluators have been renamed to `azure_ai_project` for consistency with evaluate API and simulators.
+- Model configurations classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
+- Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
+
+### Features Added
+
+- First preview
+- This package is port of `promptflow-evals`. New features will be added only to this package moving forward.
+- Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information
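
The new `type` field can be set explicitly on a model configuration, or left out and inferred by the SDK (see `parse_model_config_type` in the `_common/utils.py` diff further down). A minimal sketch of both shapes; the `"azure_openai"` literal matches the value removed from the prompty files in this diff, while the `"openai"` literal and the OpenAI-side field names are assumptions, not taken from this diff:

```python
import os

# Hypothetical configs; endpoint, deployment, and model names are placeholders.
azure_config = {
    "type": "azure_openai",  # new in 1.0.0b3; inferred from the azure_* keys when omitted
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
}

openai_config = {
    "type": "openai",          # assumed literal for the non-Azure variant
    "api_key": os.environ.get("OPENAI_API_KEY"),
    "model": "gpt-4o",         # assumed field name; not shown in this diff
}
```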
{azure_ai_evaluation-1.0.0b2/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b3}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.0.0b2
+Version: 1.0.0b3
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -426,6 +426,64 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 # Release History
 
+## 1.0.0b3 (2024-10-01)
+
+### Features Added
+
+- Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+- The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+  - `ViolenceEvaluator`
+  - `SexualEvaluator`
+  - `SelfHarmEvaluator`
+  - `HateUnfairnessEvaluator`
+  - `ProtectedMaterialEvaluator`
+  - `IndirectAttackEvaluator`
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+- Surfaced `RetrievalScoreEvaluator`, formally an internal part of `ChatEvaluator` as a standalone conversation-only evaluator.
+
+### Breaking Changes
+
+- Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+- The `evaluator_config` parameter of `evaluate` now maps in evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+  `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+  Before:
+  ```python
+  evaluate(
+      ...,
+      evaluator_config={
+          "hate_unfairness": {
+              "query": "${data.question}",
+              "response": "${data.answer}",
+          }
+      },
+      ...
+  )
+  ```
+
+  After
+  ```python
+  evaluate(
+      ...,
+      evaluator_config={
+          "hate_unfairness": {
+              "column_mapping": {
+                  "query": "${data.question}",
+                  "response": "${data.answer}",
+              }
+          }
+      },
+      ...
+  )
+  ```
+
+### Bugs Fixed
+
+- Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
 ## 1.0.0b2 (2024-09-24)
 
 ### Breaking Changes

{azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/__init__.py
RENAMED
@@ -4,10 +4,8 @@
 
 from ._evaluate._evaluate import evaluate
 from ._evaluators._bleu import BleuScoreEvaluator
-from ._evaluators._chat import ChatEvaluator
 from ._evaluators._coherence import CoherenceEvaluator
 from ._evaluators._content_safety import (
-    ContentSafetyChatEvaluator,
     ContentSafetyEvaluator,
     HateUnfairnessEvaluator,
     SelfHarmEvaluator,
@@ -22,10 +20,16 @@ from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
 from ._evaluators._relevance import RelevanceEvaluator
+from ._evaluators._retrieval import RetrievalEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+    EvaluatorConfig,
+)
 
 __all__ = [
     "evaluate",
@@ -36,21 +40,21 @@ __all__ = [
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
-    "ChatEvaluator",
     "ViolenceEvaluator",
     "SexualEvaluator",
     "SelfHarmEvaluator",
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
-    "ContentSafetyChatEvaluator",
     "IndirectAttackEvaluator",
     "BleuScoreEvaluator",
     "GleuScoreEvaluator",
     "MeteorScoreEvaluator",
+    "RetrievalEvaluator",
     "RougeScoreEvaluator",
    "RougeType",
    "ProtectedMaterialEvaluator",
    "AzureAIProject",
    "AzureOpenAIModelConfiguration",
    "OpenAIModelConfiguration",
+   "EvaluatorConfig",
 ]
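
For reference, the names added to `__all__` above can be imported straight from the package root. A minimal sketch; nothing here talks to a service, and the mapping values simply mirror the changelog example:

```python
from azure.ai.evaluation import EvaluatorConfig, RetrievalEvaluator  # both newly exported in 1.0.0b3

# EvaluatorConfig is a TypedDict, so at runtime it is just an annotated dict.
config: EvaluatorConfig = {
    "column_mapping": {
        "query": "${data.question}",
        "response": "${data.answer}",
    }
}
```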
{azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/utils.py
RENAMED
@@ -3,12 +3,13 @@
 # ---------------------------------------------------------
 
 import threading
-from typing import List,
+from typing import List, Union
 
 import nltk
 import numpy as np
 
 from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 
 from . import constants
 
@@ -70,18 +71,32 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)
 
 
-def
+def parse_model_config_type(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-    default_api_version: str,
 ) -> None:
     if "azure_endpoint" in model_config or "azure_deployment" in model_config:
-        model_config["
+        model_config["type"] = AZURE_OPENAI_TYPE
+    else:
+        model_config["type"] = OPENAI_TYPE
 
 
-def
+def construct_prompty_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-
-    user_agent:
-) ->
-
+    default_api_version: str,
+    user_agent: str,
+) -> dict:
+    parse_model_config_type(model_config)
+
+    if model_config["type"] == AZURE_OPENAI_TYPE:
+        model_config["api_version"] = model_config.get("api_version", default_api_version)
+
+    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+    # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+    # https://github.com/encode/httpx/discussions/2959
+    prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+    if model_config["type"] == AZURE_OPENAI_TYPE and user_agent:
         prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+
+    return prompty_model_config
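
A rough illustration of what the reworked helper does with a user-supplied config. `construct_prompty_model_config` is an internal helper (private module path), shown here only to illustrate the new behavior; the API version and user agent strings are placeholders:

```python
from azure.ai.evaluation._common.utils import construct_prompty_model_config

model_config = {
    "azure_endpoint": "https://example.openai.azure.com",  # placeholder endpoint
    "azure_deployment": "gpt-4o",                          # placeholder deployment
    "api_key": "<key>",
}

# parse_model_config_type() runs internally: the azure_* keys mark this as an
# Azure OpenAI config, and a default api_version is filled in when missing.
prompty_config = construct_prompty_model_config(
    model_config,
    default_api_version="2024-02-15-preview",  # placeholder version string
    user_agent="my-app/0.1",                   # forwarded as the x-ms-useragent header
)
# prompty_config has the shape:
# {"configuration": {...model_config with "type" and "api_version"...},
#  "parameters": {"extra_headers": {"Connection": "close", "x-ms-useragent": "my-app/0.1"}}}
```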

{azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_evaluate.py
RENAMED
@@ -19,7 +19,7 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
@@ -158,6 +158,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
     ]
 
     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
@@ -273,7 +279,7 @@ def _validate_columns(
     df: pd.DataFrame,
     evaluators: Dict[str, Any],
     target: Optional[Callable],
-
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.
@@ -284,8 +290,8 @@
     :type evaluators: Dict[str, Any]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param
-    :type
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -306,7 +312,7 @@
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config =
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator
@@ -372,11 +378,11 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def
-    """Process
+def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param
-    :type
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
@@ -385,15 +391,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Di
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if
-        for evaluator, mapping_config in
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in '
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
@@ -439,7 +445,7 @@ def evaluate(
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -458,10 +464,10 @@
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a
-        keys as the column names in the evaluator input and values as the column names in the
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str,
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
@@ -482,7 +488,7 @@
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
         coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -497,15 +503,19 @@
             },
             evaluator_config={
                 "coherence": {
-                    "
-
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
                 },
                 "relevance": {
-                    "
-
-
-
-
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "context": "${data.context}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
         )
 
         """
@@ -544,13 +554,13 @@
         raise e
 
 
-def _evaluate( # pylint: disable=too-many-locals
+def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Optional[str] = None,
     evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
    azure_ai_project: Optional[AzureAIProject] = None,
    output_path: Optional[str] = None,
    **kwargs,
@@ -560,8 +570,13 @@
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-
-
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = {
+        evaluator_name: evaluator_configuration.get("column_mapping", None)
+        for evaluator_name, evaluator_configuration in evaluator_config.items()
+    }
+    column_mapping = _process_column_mappings(column_mapping)
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
     pf_client = PFClient(
@@ -577,8 +592,8 @@
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-
-
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
 
     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
@@ -586,21 +601,21 @@
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        for evaluator_name, mapping in
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-
+                    column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
 
     # After we have generated all columns we can check if we have
     # everything we need for evaluators.
-    _validate_columns(input_data_df, evaluators, target=None,
+    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
 
     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -610,13 +625,16 @@
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in
-
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
 
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
@@ -632,7 +650,7 @@
                 flow=evaluator,
                 run=target_run,
                 evaluator_name=evaluator_name,
-                column_mapping=
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                 data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
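
In plain terms, `_evaluate` now peels `column_mapping` out of each `EvaluatorConfig` before validation and falls back to a `"default"` mapping built from the input columns. A stripped-down sketch of that extraction step, not the actual implementation (the helper name here is hypothetical):

```python
def extract_column_mappings(evaluator_config: dict) -> dict:
    """Mirror the extraction: evaluator name -> its column_mapping dict (or None)."""
    column_mapping = {
        name: cfg.get("column_mapping", None)
        for name, cfg in (evaluator_config or {}).items()
    }
    column_mapping.setdefault("default", {})
    return column_mapping


# Example: only "coherence" overrides its inputs; everything else uses "default".
mappings = extract_column_mappings({
    "coherence": {"column_mapping": {"query": "${data.question}",
                                     "response": "${data.answer}"}},
})
assert mappings["coherence"]["query"] == "${data.question}"
assert mappings["default"] == {}
```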

azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
ADDED
@@ -0,0 +1,70 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+from typing import Optional
+from typing_extensions import override
+
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+
+
+class CoherenceEvaluator(PromptyEvaluatorBase):
+    """
+    Initialize a coherence evaluator configured for a specific Azure OpenAI model.
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    **Usage**
+
+    .. code-block:: python
+
+        eval_fn = CoherenceEvaluator(model_config)
+        result = eval_fn(
+            query="What is the capital of Japan?",
+            response="The capital of Japan is Tokyo.")
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "gpt_coherence": 1.0
+        }
+    """
+
+    PROMPTY_FILE = "coherence.prompty"
+    RESULT_KEY = "gpt_coherence"
+
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs
+    ):
+        """Evaluate coherence. Accepts either a query and response for a single evaluation,
+        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+        turns, the evaluator will aggregate the results of each turn.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected
+            to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[Dict]
+        :return: The relevance score.
+        :rtype: dict
+        """
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
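
Given the `__call__` signature above, conversation-mode usage presumably looks like the following; the message layout follows the docstring's "messages"/"content"/"role" description, the model config keys come from the `evaluate` docstring earlier in this diff, and the conversation text is made up:

```python
import os
from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

coherence = CoherenceEvaluator(model_config)

conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
        {"role": "user", "content": "Roughly how many people live there?"},
        {"role": "assistant", "content": "Tokyo has roughly 14 million residents."},
    ]
}

# With more than one query/response pair, per-turn scores are aggregated by the base class.
result = coherence(conversation=conversation)
```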

{azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty
RENAMED
@@ -3,11 +3,6 @@ name: Coherence
 description: Evaluates coherence score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1

azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/__init__.py
ADDED
@@ -0,0 +1,13 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._base_eval import EvaluatorBase
+from ._base_prompty_eval import PromptyEvaluatorBase
+from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+__all__ = [
+    "EvaluatorBase",
+    "PromptyEvaluatorBase",
+    "RaiServiceEvaluatorBase",
+]
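
The `CoherenceEvaluator` diff above shows the pattern these new base classes enable; a hypothetical custom prompty-based evaluator would presumably follow the same shape. The constructor keywords are taken from that diff; the class name, prompty file, and result key below are illustrative only:

```python
import os
from typing_extensions import override

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase


class ClarityEvaluator(PromptyEvaluatorBase):  # hypothetical evaluator, not part of the package
    PROMPTY_FILE = "clarity.prompty"  # hypothetical prompty asset placed next to this module
    RESULT_KEY = "gpt_clarity"

    @override
    def __init__(self, model_config: dict):
        prompty_path = os.path.join(os.path.dirname(__file__), self.PROMPTY_FILE)
        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
```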