azure-ai-evaluation 0.0.0b0__tar.gz → 1.0.0b1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure_ai_evaluation-1.0.0b1/CHANGELOG.md +17 -0
- azure_ai_evaluation-1.0.0b1/MANIFEST.in +6 -0
- azure_ai_evaluation-1.0.0b1/PKG-INFO +377 -0
- azure_ai_evaluation-1.0.0b1/README.md +323 -0
- azure_ai_evaluation-1.0.0b1/azure/__init__.py +5 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/__init__.py +5 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/__init__.py +60 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_common/__init__.py +16 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_common/constants.py +65 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_common/rai_service.py +452 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_common/utils.py +87 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_constants.py +50 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_utils.py +237 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_exceptions.py +107 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_http_utils.py +395 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_model_configurations.py +27 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_user_agent.py +6 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_version.py +5 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/py.typed +0 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/__init__.py +15 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_constants.py +17 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_tracing.py +92 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/_utils.py +111 -0
- azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/PKG-INFO +377 -0
- azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/SOURCES.txt +133 -0
- azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/requires.txt +16 -0
- azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/top_level.txt +1 -0
- azure_ai_evaluation-1.0.0b1/pyproject.toml +6 -0
- azure_ai_evaluation-1.0.0b1/setup.py +90 -0
- azure_ai_evaluation-1.0.0b1/tests/__init__.py +0 -0
- azure_ai_evaluation-1.0.0b1/tests/__openai_patcher.py +118 -0
- azure_ai_evaluation-1.0.0b1/tests/conftest.py +493 -0
- azure_ai_evaluation-1.0.0b1/tests/e2etests/__init__.py +0 -0
- azure_ai_evaluation-1.0.0b1/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +16 -0
- azure_ai_evaluation-1.0.0b1/tests/e2etests/target_fn.py +19 -0
- azure_ai_evaluation-1.0.0b1/tests/e2etests/test_adv_simulator.py +623 -0
- azure_ai_evaluation-1.0.0b1/tests/e2etests/test_builtin_evaluators.py +514 -0
- azure_ai_evaluation-1.0.0b1/tests/e2etests/test_evaluate.py +520 -0
- azure_ai_evaluation-1.0.0b1/tests/e2etests/test_metrics_upload.py +208 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_batch_run_context.py +78 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_built_in_evaluator.py +46 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_chat_evaluator.py +109 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_content_safety_chat_evaluator.py +82 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_content_safety_defect_rate.py +25 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_content_safety_rai_script.py +425 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_eval_run.py +474 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_evaluate.py +510 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_evaluate_telemetry.py +167 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_evaluators/apology_dag/apology.py +8 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_jailbreak_simulator.py +122 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_non_adv_simulator.py +131 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_save_eval.py +49 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_simulator.py +124 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_synthetic_callback_conv_bot.py +110 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_synthetic_conversation_bot.py +123 -0
- azure_ai_evaluation-1.0.0b1/tests/unittests/test_utils.py +20 -0
- azure_ai_evaluation-0.0.0b0/PKG-INFO +0 -6
- azure_ai_evaluation-0.0.0b0/azure_ai_evaluation.egg-info/PKG-INFO +0 -6
- azure_ai_evaluation-0.0.0b0/azure_ai_evaluation.egg-info/SOURCES.txt +0 -5
- azure_ai_evaluation-0.0.0b0/setup.py +0 -12
- {azure_ai_evaluation-0.0.0b0 → azure_ai_evaluation-1.0.0b1}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
- /azure_ai_evaluation-0.0.0b0/azure_ai_evaluation.egg-info/top_level.txt → /azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
- {azure_ai_evaluation-0.0.0b0 → azure_ai_evaluation-1.0.0b1}/setup.cfg +0 -0
@@ -0,0 +1,17 @@
# Release History

## 1.0.0b1 (2024-09-20)

### Breaking Changes

- The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
- The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
- The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
- Model configuration classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
- Updated the parameter names `question` and `answer` in built-in evaluators to the more generic terms `query` and `response`.

### Features Added

- First preview
- This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
- Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information
@@ -0,0 +1,377 @@
Metadata-Version: 2.1
Name: azure-ai-evaluation
Version: 1.0.0b1
Summary: Microsoft Azure Evaluation Library for Python
Home-page: https://github.com/Azure/azure-sdk-for-python
Author: Microsoft Corporation
Author-email: azuresdkengsysadmins@microsoft.com
License: MIT License
Project-URL: Bug Reports, https://github.com/Azure/azure-sdk-for-python/issues
Project-URL: Source, https://github.com/Azure/azure-sdk-for-python
Keywords: azure,azure sdk
Classifier: Development Status :: 4 - Beta
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: promptflow-devkit>=1.15.0
Requires-Dist: promptflow-core>=1.15.0
Requires-Dist: numpy>=1.23.2; python_version < "3.12"
Requires-Dist: numpy>=1.26.4; python_version >= "3.12"
Requires-Dist: pyjwt>=2.8.0
Requires-Dist: azure-identity>=1.12.0
Requires-Dist: azure-core>=1.30.2
Requires-Dist: nltk>=3.9.1
Requires-Dist: rouge-score>=0.1.2
Provides-Extra: pf-azure
Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "pf-azure"
# Azure AI Evaluation client library for Python

## Getting started

### Install the package

Install the Azure AI Evaluation library for Python with:

```bash
pip install azure-ai-evaluation
```
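
The metadata above also declares a `pf-azure` extra that pulls in `promptflow-azure`. If you need that optional integration, pip's standard extras syntax installs it:

```bash
pip install azure-ai-evaluation[pf-azure]
```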

## Key concepts

Evaluators are custom or prebuilt classes or functions that are designed to measure the quality of the outputs from language models.
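
A custom evaluator can be as simple as a callable that returns a dictionary of scores. A minimal sketch (the `BlocklistEvaluator` class and its word list are hypothetical, shown only to illustrate the callable shape):

```python
class BlocklistEvaluator:
    # Hypothetical evaluator: flags responses that contain blocked words.
    def __init__(self, blocklist):
        self._blocklist = blocklist

    def __call__(self, *, response: str, **kwargs):
        # An evaluator is just a callable: named inputs in, a dict of scores out.
        contains_blocked_word = any(word in response.lower() for word in self._blocklist)
        return {"score": contains_blocked_word}


blocklist_evaluator = BlocklistEvaluator(blocklist=["bad", "worst", "terrible"])
print(blocklist_evaluator(response="Paris is the capital of France."))
# {'score': False}
```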

## Examples

Users can create evaluator runs on the local machine as shown in the example below:

```python
import os
from pprint import pprint

from azure.ai.evaluation import evaluate, RelevanceEvaluator, ViolenceEvaluator


def response_length(response, **kwargs):
    return {"value": len(response)}


if __name__ == "__main__":
    # Built-in evaluators
    # Initialize Azure OpenAI model configuration
    model_config = {
        "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.environ.get("AZURE_OPENAI_KEY"),
        "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    }

    # Initializing Relevance Evaluator
    relevance_eval = RelevanceEvaluator(model_config)

    # Running Relevance Evaluator on a single input row
    relevance_score = relevance_eval(
        response="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the Alpine Explorer Tent is the most waterproof."
        " The Adventure Dining Table has higher weight.",
        query="Which tent is the most waterproof?",
    )

    pprint(relevance_score)
    # {'gpt_relevance': 5.0}

    # Content safety evaluator

    # Initialize project scope
    azure_ai_project = {
        "subscription_id": "e0fd569c-e34a-4249-8c24-e8d723c7f054",
        "resource_group_name": "rg-test",
        "project_name": "project-test",
    }

    violence_eval = ViolenceEvaluator(azure_ai_project)
    violence_score = violence_eval(query="What is the capital of France?", response="Paris.")
    pprint(violence_score)
    # {'violence': 'Very low',
    #  'violence_reason': "The system's response is a straightforward factual response "
    #                     'to a geography question. There is no violent content or '
    #                     'language present.',
    #  'violence_score': 0}

    # Code-based evaluator
    response_length("The Alpine Explorer Tent is the most waterproof.")
    # {'value': 48}

    # Using multiple evaluators together with the `evaluate` API
    result = evaluate(
        data="evaluate_test_data.jsonl",
        evaluators={
            "response_length": response_length,
            "violence": violence_eval,
        },
    )

    pprint(result)
```
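
The `data` argument in the `evaluate` call above points to a JSON Lines file, where each line is one row and the column names feed the same-named evaluator inputs. A minimal sketch that writes a compatible `evaluate_test_data.jsonl` (the rows are illustrative):

```python
import json

rows = [
    {"query": "Which tent is the most waterproof?",
     "response": "The Alpine Explorer Tent is the most waterproof."},
    {"query": "What is the capital of France?",
     "response": "Paris."},
]

# One JSON object per line, as expected by `evaluate(data=...)`.
with open("evaluate_test_data.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```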

## Simulator

Sample application prompty, saved as `application.prompty` next to the script below:

```yaml
---
name: ApplicationPrompty
description: Simulates an application
model:
  api: chat
  configuration:
    type: azure_openai
    azure_deployment: ${env:AZURE_DEPLOYMENT}
    api_key: ${env:AZURE_OPENAI_API_KEY}
    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
  parameters:
    temperature: 0.0
    top_p: 1.0
    presence_penalty: 0
    frequency_penalty: 0
    response_format:
      type: text

inputs:
  conversation_history:
    type: dict

---
system:
You are a helpful assistant and you're helping with the user's query. Keep the conversation engaging and interesting.

Output with a string that continues the conversation, responding to the latest message from the user, given the conversation history:
{{ conversation_history }}
```

Application code:
```python
import asyncio
import json
import os
from typing import Any, Dict, List, Optional

from azure.ai.evaluation.simulator import Simulator
from azure.identity import DefaultAzureCredential
from promptflow.client import load_flow

# Fill these in before running; azure_ai_project below is built from them
# at import time, so they must be set first.
os.environ["AZURE_SUBSCRIPTION_ID"] = ""
os.environ["RESOURCE_GROUP"] = ""
os.environ["PROJECT_NAME"] = ""
os.environ["AZURE_OPENAI_API_KEY"] = ""
os.environ["AZURE_OPENAI_ENDPOINT"] = ""
os.environ["AZURE_DEPLOYMENT"] = ""

azure_ai_project = {
    "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
    "resource_group_name": os.environ.get("RESOURCE_GROUP"),
    "project_name": os.environ.get("PROJECT_NAME"),
    "credential": DefaultAzureCredential(),
}

import wikipedia

wiki_search_term = "Leonardo da vinci"
wiki_title = wikipedia.search(wiki_search_term)[0]
wiki_page = wikipedia.page(wiki_title)
text = wiki_page.summary[:1000]


def method_to_invoke_application_prompty(query: str, messages_list: List[Dict]):
    try:
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, "application.prompty")
        # The prompty's ${env:...} references resolve its model configuration,
        # so no configuration override is passed to load_flow here.
        _flow = load_flow(source=prompty_path)
        # The conversation history already ends with the latest user query.
        response = _flow(conversation_history=messages_list)
        return response
    except Exception as e:
        print(f"Something went wrong invoking the prompty: {e}")
        return "something went wrong"


async def callback(
    messages: List[Dict],
    stream: bool = False,
    session_state: Any = None,  # noqa: ANN401
    context: Optional[Dict[str, Any]] = None,
) -> dict:
    messages_list = messages["messages"]
    # get the last message
    latest_message = messages_list[-1]
    query = latest_message["content"]
    context = None
    # call your endpoint or AI application here
    response = method_to_invoke_application_prompty(query, messages_list)
    # format the response to follow the OpenAI chat protocol
    formatted_response = {
        "content": response,
        "role": "assistant",
        "context": {
            "citations": None,
        },
    }
    messages["messages"].append(formatted_response)
    return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}


async def main():
    simulator = Simulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
    outputs = await simulator(
        target=callback,
        text=text,
        num_queries=2,
        max_conversation_turns=4,
        user_persona=[
            f"I am a student and I want to learn more about {wiki_search_term}",
            f"I am a teacher and I want to teach my students about {wiki_search_term}",
        ],
    )
    print(json.dumps(outputs))


if __name__ == "__main__":
    asyncio.run(main())
    print("done!")
```

Simulators allow users to generate synthetic data using their application. The simulator expects the user to provide a callback method that invokes their AI application. Here's a sample callback that invokes `AsyncAzureOpenAI`:
```python
import asyncio
from typing import Any, Dict, List, Optional

from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator, DirectAttackSimulator
from azure.identity import DefaultAzureCredential


azure_ai_project = {
    "subscription_id": <subscription_id>,
    "resource_group_name": <resource_group_name>,
    "project_name": <project_name>
}

async def callback(
    messages: List[Dict],
    stream: bool = False,
    session_state: Any = None,
    context: Optional[Dict[str, Any]] = None
) -> dict:
    messages_list = messages["messages"]
    # get the last message
    latest_message = messages_list[-1]
    query = latest_message["content"]
    context = None
    if "file_content" in messages["template_parameters"]:
        query += messages["template_parameters"]["file_content"]
    # The next few lines show how to use AsyncAzureOpenAI's chat.completions to
    # respond to the simulator. Replace them with a call to your
    # model/endpoint/application; make sure you pass the `query` in and format
    # the response as shown below.
    from openai import AsyncAzureOpenAI
    oai_client = AsyncAzureOpenAI(
        api_key=<api_key>,
        azure_endpoint=<endpoint>,
        api_version="2023-12-01-preview",
    )
    try:
        response_from_oai_chat_completions = await oai_client.chat.completions.create(
            messages=[{"content": query, "role": "user"}], model="gpt-4", max_tokens=300
        )
    except Exception as e:
        print(f"Error: {e}")
        # To continue the conversation, return the messages; otherwise you can
        # fail the adversarial simulation by raising the exception.
        message = {
            "content": "Something went wrong. Check the exception e for more details.",
            "role": "assistant",
            "context": None,
        }
        messages["messages"].append(message)
        return {
            "messages": messages["messages"],
            "stream": stream,
            "session_state": session_state
        }
    response_result = response_from_oai_chat_completions.choices[0].message.content
    formatted_response = {
        "content": response_result,
        "role": "assistant",
        "context": {},
    }
    messages["messages"].append(formatted_response)
    return {
        "messages": messages["messages"],
        "stream": stream,
        "session_state": session_state,
        "context": context
    }
```

### Adversarial QA

```python
scenario = AdversarialScenario.ADVERSARIAL_QA
simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())

outputs = asyncio.run(
    simulator(
        scenario=scenario,
        max_conversation_turns=1,
        max_simulation_results=3,
        target=callback
    )
)

print(outputs.to_eval_qa_json_lines())
```
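
The JSON lines printed above can be fed straight back into the `evaluate` API shown earlier. A minimal sketch, assuming `to_eval_qa_json_lines` emits one JSON object per line with `query` and `response` fields (check your version's output for the exact column names):

```python
from azure.ai.evaluation import ViolenceEvaluator, evaluate

# Persist the simulated conversations, then score them with an evaluator.
with open("adv_qa_outputs.jsonl", "w") as f:
    f.write(outputs.to_eval_qa_json_lines())

violence_eval = ViolenceEvaluator(azure_ai_project)
result = evaluate(
    data="adv_qa_outputs.jsonl",
    evaluators={"violence": violence_eval},
)
```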

### Direct Attack Simulator

```python
scenario = AdversarialScenario.ADVERSARIAL_QA
simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())

outputs = asyncio.run(
    simulator(
        scenario=scenario,
        max_conversation_turns=1,
        max_simulation_results=2,
        target=callback
    )
)

print(outputs)
```

## Troubleshooting

## Next steps

## Contributing

# Release History

## 1.0.0b1 (2024-09-20)

### Breaking Changes

- The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
- The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
- The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
- Model configuration classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
- Updated the parameter names `question` and `answer` in built-in evaluators to the more generic terms `query` and `response`.
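
A minimal before/after sketch of the last two changes, assuming the model configuration TypedDict keeps the `AzureOpenAIModelConfiguration` name from `promptflow.core` (endpoint, key, and deployment values are placeholders):

```python
# Before (promptflow-evals): inputs were named `question`/`answer`.
# relevance_eval(question="Which tent is the most waterproof?", answer="The Alpine Explorer Tent.")

# After (azure-ai-evaluation): the same inputs are `query`/`response`, and the
# model configuration is a plain TypedDict imported from the root namespace.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment>",
)
relevance_eval = RelevanceEvaluator(model_config)
relevance_eval(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From our product list, the Alpine Explorer Tent is the most waterproof.",
)
```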

### Features Added

- First preview
- This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
- Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information