azure-ai-evaluation 1.0.0b1__tar.gz → 1.0.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/CHANGELOG.md +6 -0
- {azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b2}/PKG-INFO +86 -14
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/README.md +79 -13
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/__init__.py +1 -5
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_common/rai_service.py +4 -4
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_common/utils.py +19 -19
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_constants.py +9 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_evaluate.py +35 -28
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_utils.py +29 -22
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_chat/_chat.py +16 -9
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +4 -10
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -10
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +1 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_eci/_eci.py +2 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +5 -10
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +5 -10
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +2 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +2 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +5 -10
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -10
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +1 -2
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_version.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/__init__.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
- azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/simulator.py → azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/simulator/_simulator.py +147 -80
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_tracing.py +21 -24
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_utils.py +4 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2/azure_ai_evaluation.egg-info}/PKG-INFO +86 -14
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure_ai_evaluation.egg-info/SOURCES.txt +4 -1
- azure_ai_evaluation-1.0.0b2/pyproject.toml +21 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/setup.py +1 -0
- azure_ai_evaluation-1.0.0b2/tests/e2etests/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/e2etests/test_metrics_upload.py +9 -3
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_evaluate.py +102 -6
- azure_ai_evaluation-1.0.0b2/tests/unittests/test_evaluators/test_inputs_evaluators.py +46 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_non_adv_simulator.py +11 -12
- azure_ai_evaluation-1.0.0b1/pyproject.toml +0 -6
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/MANIFEST.in +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_common/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_common/constants.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_chat/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_exceptions.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_http_utils.py +3 -3
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_model_configurations.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_user_agent.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/py.typed +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_constants.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
- {azure_ai_evaluation-1.0.0b1/tests → azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/simulator/_prompty}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure_ai_evaluation.egg-info/requires.txt +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/setup.cfg +0 -0
- {azure_ai_evaluation-1.0.0b1/tests/e2etests → azure_ai_evaluation-1.0.0b2/tests}/__init__.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/__openai_patcher.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/conftest.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/e2etests/target_fn.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/e2etests/test_adv_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/e2etests/test_builtin_evaluators.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/e2etests/test_evaluate.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_batch_run_context.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_built_in_evaluator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_chat_evaluator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_content_safety_chat_evaluator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_content_safety_defect_rate.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_content_safety_rai_script.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_eval_run.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_evaluate_telemetry.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_jailbreak_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_save_eval.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_simulator.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
- {azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/tests/unittests/test_utils.py +0 -0
{azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.0.0b1
+Version: 1.0.0b2
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -35,11 +35,27 @@ Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "pf-azure"

 # Azure AI Evaluation client library for Python

+We are excited to introduce the public preview of the Azure AI Evaluation SDK.
+
+[Source code][source_code]
+| [Package (PyPI)][evaluation_pypi]
+| [API reference documentation][evaluation_ref_docs]
+| [Product documentation][product_documentation]
+| [Samples][evaluation_samples]
+
+This package has been tested with Python 3.8, 3.9, 3.10, 3.11, and 3.12.
+
+For a more complete set of Azure libraries, see https://aka.ms/azsdk/python/all
+
 ## Getting started

+### Prerequisites
+
+- Python 3.8 or later is required to use this package.
+
 ### Install the package

-Install the Azure AI Evaluation library for Python with
+Install the Azure AI Evaluation library for Python with [pip][pip_link]::

 ```bash
 pip install azure-ai-evaluation
@@ -51,6 +67,8 @@ Evaluators are custom or prebuilt classes or functions that are designed to meas

 ## Examples

+### Evaluators
+
 Users can create evaluator runs on the local machine as shown in the example below:

 ```python
@@ -92,9 +110,9 @@ if __name__ == "__main__":

     # Initialize Project Scope
     azure_ai_project = {
-        "subscription_id":
-        "resource_group_name":
-        "project_name":
+        "subscription_id": <subscription_id>,
+        "resource_group_name": <resource_group_name>,
+        "project_name": <project_name>
     }

     violence_eval = ViolenceEvaluator(azure_ai_project)
@@ -122,9 +140,13 @@ if __name__ == "__main__":

     pprint(result)
 ```
-
+### Simulator

-
+
+Simulators allow users to generate synthentic data using their application. Simulator expects the user to have a callback method that invokes
+their AI application.
+
+#### Simulating with a Prompty

 ```yaml
 ---
@@ -163,7 +185,7 @@ Application code:
 import json
 import asyncio
 from typing import Any, Dict, List, Optional
-from azure.ai.evaluation.
+from azure.ai.evaluation.simulator import Simulator
 from promptflow.client import load_flow
 from azure.identity import DefaultAzureCredential
 import os
@@ -171,8 +193,7 @@ import os
 azure_ai_project = {
     "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
     "resource_group_name": os.environ.get("RESOURCE_GROUP"),
-    "project_name": os.environ.get("PROJECT_NAME")
-    "credential": DefaultAzureCredential(),
+    "project_name": os.environ.get("PROJECT_NAME")
 }

 import wikipedia
@@ -249,8 +270,7 @@ if __name__ == "__main__":
     print("done!")
 ```

-
-their AI application. Here's a sample of a callback which invokes AsyncAzureOpenAI:
+#### Adversarial Simulator

 ```python
 from from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
@@ -318,7 +338,9 @@ async def callback(
     }

 ```
-
+
+#### Adversarial QA
+
 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
 simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
@@ -334,7 +356,7 @@ outputs = asyncio.run(

 print(outputs.to_eval_qa_json_lines())
 ```
-
+#### Direct Attack Simulator

 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
@@ -353,13 +375,63 @@ print(outputs)
 ```
 ## Troubleshooting

+### General
+
+Azure ML clients raise exceptions defined in [Azure Core][azure_core_readme].
+
+### Logging
+
+This library uses the standard
+[logging][python_logging] library for logging.
+Basic information about HTTP sessions (URLs, headers, etc.) is logged at INFO
+level.
+
+Detailed DEBUG level logging, including request/response bodies and unredacted
+headers, can be enabled on a client with the `logging_enable` argument.
+
+See full SDK logging documentation with examples [here][sdk_logging_docs].
+
 ## Next steps

+- View our [samples][evaluation_samples].
+- View our [documentation][product_documentation]
+
 ## Contributing

+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit [cla.microsoft.com][cla].
+
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct][code_of_conduct]. For more information see the [Code of Conduct FAQ][coc_faq] or contact [opencode@microsoft.com][coc_contact] with any additional questions or comments.
+
+<!-- LINKS -->
+
+[source_code]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/evaluation/azure-ai-evaluation
+[evaluation_pypi]: https://pypi.org/project/azure-ai-evaluation/
+[evaluation_ref_docs]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
+[evaluation_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios
+[product_documentation]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk
+[python_logging]: https://docs.python.org/3/library/logging.html
+[sdk_logging_docs]: https://docs.microsoft.com/azure/developer/python/azure-sdk-logging
+[azure_core_readme]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[pip_link]: https://pypi.org/project/pip/
+[azure_core_ref_docs]: https://aka.ms/azsdk-python-core-policies
+[azure_core]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[azure_identity]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/identity/azure-identity
+[cla]: https://cla.microsoft.com
+[code_of_conduct]: https://opensource.microsoft.com/codeofconduct/
+[coc_faq]: https://opensource.microsoft.com/codeofconduct/faq/
+[coc_contact]: mailto:opencode@microsoft.com
+

 # Release History

+## 1.0.0b2 (2024-09-24)
+
+### Breaking Changes
+
+- `data` and `evaluators` are now required keywords in `evaluate`.
+
 ## 1.0.0b1 (2024-09-20)

 ### Breaking Changes
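The 1.0.0b2 changelog entry above notes that `data` and `evaluators` are now required keywords in `evaluate`. A minimal sketch of a call that satisfies the new requirement follows; the data file, evaluator choices, and environment variable names are illustrative placeholders rather than anything shipped in the package:

```python
# Sketch only: assumes a local data.jsonl whose columns match the evaluators' inputs
# and Azure OpenAI settings supplied through environment variables.
import os
from pprint import pprint

from azure.ai.evaluation import F1ScoreEvaluator, RelevanceEvaluator, evaluate

model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
}

result = evaluate(
    data="data.jsonl",  # must now be passed as a keyword
    evaluators={        # likewise required as a keyword
        "f1_score": F1ScoreEvaluator(),
        "relevance": RelevanceEvaluator(model_config),
    },
)
pprint(result)
```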
{azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/README.md
RENAMED
@@ -1,10 +1,26 @@
 # Azure AI Evaluation client library for Python

+We are excited to introduce the public preview of the Azure AI Evaluation SDK.
+
+[Source code][source_code]
+| [Package (PyPI)][evaluation_pypi]
+| [API reference documentation][evaluation_ref_docs]
+| [Product documentation][product_documentation]
+| [Samples][evaluation_samples]
+
+This package has been tested with Python 3.8, 3.9, 3.10, 3.11, and 3.12.
+
+For a more complete set of Azure libraries, see https://aka.ms/azsdk/python/all
+
 ## Getting started

+### Prerequisites
+
+- Python 3.8 or later is required to use this package.
+
 ### Install the package

-Install the Azure AI Evaluation library for Python with
+Install the Azure AI Evaluation library for Python with [pip][pip_link]::

 ```bash
 pip install azure-ai-evaluation
@@ -16,6 +32,8 @@ Evaluators are custom or prebuilt classes or functions that are designed to meas

 ## Examples

+### Evaluators
+
 Users can create evaluator runs on the local machine as shown in the example below:

 ```python
@@ -57,9 +75,9 @@ if __name__ == "__main__":

     # Initialize Project Scope
     azure_ai_project = {
-        "subscription_id":
-        "resource_group_name":
-        "project_name":
+        "subscription_id": <subscription_id>,
+        "resource_group_name": <resource_group_name>,
+        "project_name": <project_name>
     }

     violence_eval = ViolenceEvaluator(azure_ai_project)
@@ -87,9 +105,13 @@ if __name__ == "__main__":

     pprint(result)
 ```
-
+### Simulator

-
+
+Simulators allow users to generate synthentic data using their application. Simulator expects the user to have a callback method that invokes
+their AI application.
+
+#### Simulating with a Prompty

 ```yaml
 ---
@@ -128,7 +150,7 @@ Application code:
 import json
 import asyncio
 from typing import Any, Dict, List, Optional
-from azure.ai.evaluation.
+from azure.ai.evaluation.simulator import Simulator
 from promptflow.client import load_flow
 from azure.identity import DefaultAzureCredential
 import os
@@ -136,8 +158,7 @@ import os
 azure_ai_project = {
     "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
     "resource_group_name": os.environ.get("RESOURCE_GROUP"),
-    "project_name": os.environ.get("PROJECT_NAME")
-    "credential": DefaultAzureCredential(),
+    "project_name": os.environ.get("PROJECT_NAME")
 }

 import wikipedia
@@ -214,8 +235,7 @@ if __name__ == "__main__":
     print("done!")
 ```

-
-their AI application. Here's a sample of a callback which invokes AsyncAzureOpenAI:
+#### Adversarial Simulator

 ```python
 from from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
@@ -283,7 +303,9 @@ async def callback(
     }

 ```
-
+
+#### Adversarial QA
+
 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
 simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
@@ -299,7 +321,7 @@ outputs = asyncio.run(

 print(outputs.to_eval_qa_json_lines())
 ```
-
+#### Direct Attack Simulator

 ```python
 scenario = AdversarialScenario.ADVERSARIAL_QA
@@ -318,6 +340,50 @@ print(outputs)
 ```
 ## Troubleshooting

+### General
+
+Azure ML clients raise exceptions defined in [Azure Core][azure_core_readme].
+
+### Logging
+
+This library uses the standard
+[logging][python_logging] library for logging.
+Basic information about HTTP sessions (URLs, headers, etc.) is logged at INFO
+level.
+
+Detailed DEBUG level logging, including request/response bodies and unredacted
+headers, can be enabled on a client with the `logging_enable` argument.
+
+See full SDK logging documentation with examples [here][sdk_logging_docs].
+
 ## Next steps

+- View our [samples][evaluation_samples].
+- View our [documentation][product_documentation]
+
 ## Contributing
+
+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit [cla.microsoft.com][cla].
+
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct][code_of_conduct]. For more information see the [Code of Conduct FAQ][coc_faq] or contact [opencode@microsoft.com][coc_contact] with any additional questions or comments.
+
+<!-- LINKS -->
+
+[source_code]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/evaluation/azure-ai-evaluation
+[evaluation_pypi]: https://pypi.org/project/azure-ai-evaluation/
+[evaluation_ref_docs]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
+[evaluation_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios
+[product_documentation]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk
+[python_logging]: https://docs.python.org/3/library/logging.html
+[sdk_logging_docs]: https://docs.microsoft.com/azure/developer/python/azure-sdk-logging
+[azure_core_readme]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[pip_link]: https://pypi.org/project/pip/
+[azure_core_ref_docs]: https://aka.ms/azsdk-python-core-policies
+[azure_core]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+[azure_identity]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/identity/azure-identity
+[cla]: https://cla.microsoft.com
+[code_of_conduct]: https://opensource.microsoft.com/codeofconduct/
+[coc_faq]: https://opensource.microsoft.com/codeofconduct/faq/
+[coc_contact]: mailto:opencode@microsoft.com
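The README text above says the simulator drives a user-supplied callback that invokes the AI application, but only fragments of that callback survive in the hunks. A rough, self-contained sketch of the expected shape follows; the echo application is a stand-in for a real app, not the package's documented sample:

```python
# Illustrative callback for the Simulator / AdversarialSimulator chat protocol.
# my_application() is a placeholder; swap in whatever invokes your AI application.
import asyncio
from typing import Any, Dict, List, Optional


async def my_application(query: str) -> str:
    return f"Echo: {query}"  # stand-in for a model or flow invocation


async def callback(
    messages: Dict[str, List[Dict[str, Any]]],
    stream: bool = False,
    session_state: Optional[Any] = None,
    context: Optional[Dict[str, Any]] = None,
) -> dict:
    latest = messages["messages"][-1]                # newest simulated user turn
    reply = await my_application(latest["content"])  # call the application
    messages["messages"].append({"role": "assistant", "content": reply})
    return {
        "messages": messages["messages"],
        "stream": stream,
        "session_state": session_state,
        "context": context,
    }


if __name__ == "__main__":
    turn = {"messages": [{"role": "user", "content": "Tell me about Leonardo da Vinci"}]}
    print(asyncio.run(callback(turn)))
```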
{azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/__init__.py
RENAMED
@@ -25,11 +25,7 @@ from ._evaluators._relevance import RelevanceEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import (
-    AzureAIProject,
-    AzureOpenAIModelConfiguration,
-    OpenAIModelConfiguration,
-)
+from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration

 __all__ = [
     "evaluate",
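The flattened import above covers the public model-configuration types. For orientation, a sketch of how those TypedDicts are typically filled in; the endpoint, key, and project values are placeholders:

```python
# The TypedDicts re-exported from the package root; all values below are placeholders.
from azure.ai.evaluation import AzureAIProject, AzureOpenAIModelConfiguration

model_config: AzureOpenAIModelConfiguration = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

azure_ai_project: AzureAIProject = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

print(model_config["azure_deployment"], azure_ai_project["project_name"])
```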
{azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_common/rai_service.py
RENAMED
@@ -11,12 +11,12 @@ from urllib.parse import urlparse

 import jwt
 import numpy as np
-from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential

+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.core.credentials import TokenCredential
+from azure.identity import DefaultAzureCredential

 from .constants import (
     CommonConstants,
@@ -348,7 +348,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     )

     if response.status_code != 200:
-        msg =
+        msg = "Failed to retrieve the discovery service URL."
         raise EvaluationException(
             message=msg,
             internal_message=msg,
{azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_common/utils.py
RENAMED
@@ -2,20 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-
-
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+import threading
+from typing import List, Optional, Union

-
-
-except ImportError:
-    import constants
+import nltk
+import numpy as np

-from
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration

-import
-import numpy as np
-import nltk
+from . import constants

 _nltk_data_download_lock = threading.Lock()

@@ -46,7 +41,7 @@ def ensure_nltk_data_downloaded():
     """Download NLTK data packages if not already downloaded."""
     with _nltk_data_download_lock:
         try:
-            from nltk.tokenize.nist import NISTTokenizer
+            from nltk.tokenize.nist import NISTTokenizer  # pylint: disable=unused-import
         except LookupError:
             nltk.download("perluniprops")
             nltk.download("punkt")
@@ -54,12 +49,19 @@


 def nltk_tokenize(text: str) -> List[str]:
-    """Tokenize the input text using the NLTK tokenizer.
+    """Tokenize the input text using the NLTK tokenizer.
+
+    :param text: The text to tokenize
+    :type text: str
+    :return: A list of tokens
+    :rtype: list[str]
+    """
     ensure_nltk_data_downloaded()

     if not text.isascii():
         # Use NISTTokenizer for international tokenization
         from nltk.tokenize.nist import NISTTokenizer
+
         tokens = NISTTokenizer().international_tokenize(text)
     else:
         # By default, use NLTK word tokenizer
@@ -68,20 +70,18 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)


-def
+def ensure_api_version_in_aoai_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     default_api_version: str,
 ) -> None:
-    if (
-        "azure_endpoint" in model_config or "azure_deployment" in model_config
-    ):
+    if "azure_endpoint" in model_config or "azure_deployment" in model_config:
         model_config["api_version"] = model_config.get("api_version", default_api_version)


-def
+def ensure_user_agent_in_aoai_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     prompty_model_config: dict,
     user_agent: Optional[str] = None,
 ) -> None:
     if user_agent and ("azure_endpoint" in model_config or "azure_deployment" in model_config):
-        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
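The renamed helpers above are private to the package, so rather than importing them, here is a standalone sketch that mirrors the logic of `ensure_api_version_in_aoai_model_config` shown in the hunk; the function name and the default API version string are illustrative:

```python
# Mirrors the hunk above: only configs that look like Azure OpenAI configs
# (they carry azure_endpoint/azure_deployment) get a default api_version.
from typing import Dict


def ensure_api_version(model_config: Dict[str, str], default_api_version: str) -> None:
    if "azure_endpoint" in model_config or "azure_deployment" in model_config:
        model_config["api_version"] = model_config.get("api_version", default_api_version)


aoai = {"azure_endpoint": "https://example.openai.azure.com", "azure_deployment": "gpt-4o"}
oai = {"api_key": "<key>", "model": "gpt-4o"}

ensure_api_version(aoai, "2024-02-15-preview")
ensure_api_version(oai, "2024-02-15-preview")

print(aoai)  # gains an api_version entry
print(oai)   # left untouched
```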
{azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_constants.py
RENAMED
@@ -39,6 +39,15 @@ class Prefixes:
     TSG_OUTPUTS = "__outputs."


+class DefaultOpenEncoding:
+    """Enum that captures SDK's default values for the encoding param of open(...)"""
+
+    READ = "utf-8-sig"
+    """SDK Default Encoding when reading a file"""
+    WRITE = "utf-8"
+    """SDK Default Encoding when writing a file"""
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
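`DefaultOpenEncoding`, added above, just centralizes the encodings the SDK passes to `open(...)`. A small sketch of the intent, with the class redefined locally so the snippet runs without importing the private constants module; the file name is a placeholder:

```python
# Read with utf-8-sig so a leading BOM is tolerated; write plain utf-8.
import json


class DefaultOpenEncoding:
    READ = "utf-8-sig"   # default when reading a file
    WRITE = "utf-8"      # default when writing a file


rows = [{"question": "What is the capital of France?", "answer": "Paris"}]

with open("rows.jsonl", "w", encoding=DefaultOpenEncoding.WRITE) as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

with open("rows.jsonl", encoding=DefaultOpenEncoding.READ) as f:
    print([json.loads(line) for line in f])
```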
{azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
RENAMED
@@ -5,13 +5,14 @@ import os

 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
+from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
 )
-from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api

 from ..._user_agent import USER_AGENT
 from .._utils import set_event_loop_policy
{azure_ai_evaluation-1.0.0b1 → azure_ai_evaluation-1.0.0b2}/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py
RENAMED
@@ -4,13 +4,16 @@
 import inspect
 import json
 import logging
+import os
+from pathlib import Path
+from typing import Callable, Dict, Optional, Union

 import pandas as pd
-
 from promptflow.contracts.types import AttrDict
-from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-
+
+from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT

@@ -18,7 +21,9 @@ LOGGER = logging.getLogger(__name__)


 class CodeRun:
-    def __init__(
+    def __init__(
+        self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
+    ):
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
@@ -40,13 +45,13 @@ class CodeRun:
                 else None
             )
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", self.evaluator_name, ex)
             aggregated_metrics = None

         if not isinstance(aggregated_metrics, dict):
             LOGGER.warning(
-
-
+                "Aggregated metrics for evaluator %s is not a dictionary will not be logged as metrics",
+                self.evaluator_name,
             )

         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
@@ -54,11 +59,15 @@
         return aggregated_metrics


-class CodeClient:
-    def __init__(
+class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+        self,
+    ) -> None:
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")

-    def _calculate_metric(
+    def _calculate_metric(
+        self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
+    ) -> pd.DataFrame:
         row_metric_futures = []
         row_metric_results = []
         input_df = _apply_column_mapping(input_df, column_mapping)
@@ -110,18 +119,25 @@ class CodeClient:
             return aggregated_output
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.warning(
-
+                "Error calculating aggregations for evaluator %s, failed with error %s", run.evaluator_name, ex
             )
             return None

-    def run(
+    def run(
+        self,  # pylint: disable=unused-argument
+        flow: Callable,
+        data: Union[os.PathLike, Path, pd.DataFrame],
+        evaluator_name: Optional[str] = None,
+        column_mapping: Optional[Dict[str, str]] = None,
+        **kwargs,
+    ) -> CodeRun:
         input_df = data
         if not isinstance(input_df, pd.DataFrame):
             try:
                 json_data = load_jsonl(data)
             except json.JSONDecodeError as exc:
                 raise EvaluationException(
-                    message
+                    message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
                     internal_message="Failed to parse data as JSON",
                     target=ErrorTarget.CODE_CLIENT,
                     category=ErrorCategory.INVALID_VALUE,
@@ -129,22 +145,28 @@
                 ) from exc

             input_df = pd.DataFrame(json_data)
-        eval_future = self._thread_pool.submit(
+        eval_future = self._thread_pool.submit(
+            self._calculate_metric,
+            evaluator=flow,
+            input_df=input_df,
+            column_mapping=column_mapping,
+            evaluator_name=evaluator_name,
+        )
         run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
         aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
         run.aggregated_metrics = aggregation_future
         return run

-    def get_details(self, run, all_results=False):
+    def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df

-    def get_metrics(self, run):
+    def get_metrics(self, run: CodeRun) -> Optional[None]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
             return None
         return aggregated_metrics