azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
  3. azure/ai/evaluation/_common/constants.py +5 -0
  4. azure/ai/evaluation/_common/math.py +73 -2
  5. azure/ai/evaluation/_common/rai_service.py +250 -62
  6. azure/ai/evaluation/_common/utils.py +196 -23
  7. azure/ai/evaluation/_constants.py +7 -6
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
  10. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
  11. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
  13. azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
  15. azure/ai/evaluation/_evaluate/_utils.py +46 -11
  16. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
  18. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  31. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
  32. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  34. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  35. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
  36. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  44. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
  45. azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
  46. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
  47. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  48. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
  49. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  50. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
  51. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  52. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  53. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
  54. azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
  55. azure/ai/evaluation/_exceptions.py +19 -0
  56. azure/ai/evaluation/_model_configurations.py +83 -15
  57. azure/ai/evaluation/_version.py +1 -1
  58. azure/ai/evaluation/simulator/__init__.py +2 -1
  59. azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
  60. azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
  61. azure/ai/evaluation/simulator/_constants.py +11 -1
  62. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  63. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  64. azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
  65. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  66. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  67. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
  68. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
  69. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  70. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  71. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  72. azure/ai/evaluation/simulator/_simulator.py +165 -105
  73. azure/ai/evaluation/simulator/_utils.py +31 -13
  74. azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
  75. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
  76. azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
  77. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
  78. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  79. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  80. azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
  81. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  82. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  83. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,600 @@
+ Metadata-Version: 2.1
+ Name: azure-ai-evaluation
+ Version: 1.0.1
+ Summary: Microsoft Azure Evaluation Library for Python
+ Home-page: https://github.com/Azure/azure-sdk-for-python
+ Author: Microsoft Corporation
+ Author-email: azuresdkengsysadmins@microsoft.com
+ License: MIT License
+ Project-URL: Bug Reports, https://github.com/Azure/azure-sdk-for-python/issues
+ Project-URL: Source, https://github.com/Azure/azure-sdk-for-python
+ Keywords: azure,azure sdk
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: NOTICE.txt
+ Requires-Dist: promptflow-devkit >=1.15.0
+ Requires-Dist: promptflow-core >=1.15.0
+ Requires-Dist: pyjwt >=2.8.0
+ Requires-Dist: azure-identity >=1.16.0
+ Requires-Dist: azure-core >=1.30.2
+ Requires-Dist: nltk >=3.9.1
+ Provides-Extra: remote
+ Requires-Dist: promptflow-azure <2.0.0,>=1.15.0 ; extra == 'remote'
+
+ # Azure AI Evaluation client library for Python
+
+ Use the Azure AI Evaluation SDK to assess the performance of your generative AI applications. Application outputs are quantitatively measured with mathematics-based metrics as well as AI-assisted quality and safety metrics. Metrics are defined as `evaluators`. Built-in or custom evaluators can provide comprehensive insights into the application's capabilities and limitations.
+
+ Use the Azure AI Evaluation SDK to:
+ - Evaluate existing data from generative AI applications
+ - Evaluate generative AI applications
+ - Evaluate by generating mathematical, AI-assisted quality and safety metrics
+
+ The Azure AI Evaluation SDK provides the following to evaluate generative AI applications:
+ - [Evaluators][evaluators] - Generate scores individually or together with the `evaluate` API.
+ - [Evaluate API][evaluate_api] - Python API to evaluate a dataset or application using built-in or custom evaluators.
+
+ [Source code][source_code]
+ | [Package (PyPI)][evaluation_pypi]
+ | [API reference documentation][evaluation_ref_docs]
+ | [Product documentation][product_documentation]
+ | [Samples][evaluation_samples]
+
+
+ ## Getting started
+
+ ### Prerequisites
+
+ - Python 3.8 or later is required to use this package.
+ - [Optional] An [Azure AI project][ai_project] or [Azure OpenAI][azure_openai] resource is required to use AI-assisted evaluators.
+
+ ### Install the package
+
+ Install the Azure AI Evaluation SDK for Python with [pip][pip_link]:
+
+ ```bash
+ pip install azure-ai-evaluation
+ ```
+ If you want to track results in [AI Studio][ai_studio], install the `remote` extra:
+ ```bash
+ pip install azure-ai-evaluation[remote]
+ ```
+
+ ## Key concepts
+
+ ### Evaluators
+
+ Evaluators are custom or prebuilt classes or functions that are designed to measure the quality of the outputs from language models or generative AI applications.
+
+ #### Built-in evaluators
+
+ Built-in evaluators are out-of-the-box evaluators provided by Microsoft:
+ | Category | Evaluator class |
+ |-----------|------------------------------------------------------------------------------------------------------------------------------------|
+ | [Performance and quality][performance_and_quality_evaluators] (AI-assisted) | `GroundednessEvaluator`, `RelevanceEvaluator`, `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `RetrievalEvaluator` |
+ | [Performance and quality][performance_and_quality_evaluators] (NLP) | `F1ScoreEvaluator`, `RougeScoreEvaluator`, `GleuScoreEvaluator`, `BleuScoreEvaluator`, `MeteorScoreEvaluator`|
+ | [Risk and safety][risk_and_safety_evaluators] (AI-assisted) | `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `IndirectAttackEvaluator`, `ProtectedMaterialEvaluator` |
+ | [Composite][composite_evaluators] | `QAEvaluator`, `ContentSafetyEvaluator` |
+
+ For more in-depth information on each evaluator definition and how it's calculated, see [Evaluation and monitoring metrics for generative AI][evaluation_metrics].
+
+ ```python
+ import os
+
+ from azure.identity import DefaultAzureCredential
+ from azure.ai.evaluation import evaluate, RelevanceEvaluator, ViolenceEvaluator, BleuScoreEvaluator
+
+ # NLP bleu score evaluator
+ bleu_score_evaluator = BleuScoreEvaluator()
+ result = bleu_score_evaluator(
+     response="Tokyo is the capital of Japan.",
+     ground_truth="The capital of Japan is Tokyo."
+ )
+
+ # AI assisted quality evaluator
+ model_config = {
+     "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
+     "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
+     "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
+ }
+
+ relevance_evaluator = RelevanceEvaluator(model_config)
+ result = relevance_evaluator(
+     query="What is the capital of Japan?",
+     response="The capital of Japan is Tokyo."
+ )
+
+ # AI assisted safety evaluator; a credential is required for safety evaluators
+ azure_ai_project = {
+     "subscription_id": "<subscription_id>",
+     "resource_group_name": "<resource_group_name>",
+     "project_name": "<project_name>",
+ }
+
+ violence_evaluator = ViolenceEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)
+ result = violence_evaluator(
+     query="What is the capital of France?",
+     response="Paris."
+ )
+ ```
+
+ #### Custom evaluators
+
+ Built-in evaluators are great out of the box to start evaluating your application's generations. However, you can build your own code-based or prompt-based evaluator to cater to your specific evaluation needs.
+
+ ```python
+
+ # Custom evaluator as a function to calculate response length
+ def response_length(response, **kwargs):
+     return len(response)
+
+ # Custom class-based evaluator to check for blocked words
+ class BlocklistEvaluator:
+     def __init__(self, blocklist):
+         self._blocklist = blocklist
+
+     def __call__(self, *, response: str, **kwargs):
+         score = any(word in response for word in self._blocklist)
+         return {"score": score}
+
+ blocklist_evaluator = BlocklistEvaluator(blocklist=["bad", "worst", "terrible"])
+
+ result = response_length("The capital of Japan is Tokyo.")
+ result = blocklist_evaluator(response="The capital of Japan is Tokyo.")
+
+ ```
+
+ ### Evaluate API
+ The package provides an `evaluate` API which can be used to run multiple evaluators together to evaluate generative AI application responses.
+
+ #### Evaluate existing dataset
+
+ ```python
+ from azure.ai.evaluation import evaluate
+
+ result = evaluate(
+     data="data.jsonl", # provide your data here
+     evaluators={
+         "blocklist": blocklist_evaluator,
+         "relevance": relevance_evaluator
+     },
+     # column mapping
+     evaluator_config={
+         "relevance": {
+             "column_mapping": {
+                 "query": "${data.queries}",
+                 "ground_truth": "${data.ground_truth}",
+                 "response": "${data.response}"
+             }
+         }
+     },
+     # Optionally provide your AI Studio project information to track your evaluation results in your Azure AI Studio project
+     azure_ai_project=azure_ai_project,
+     # Optionally provide an output path to dump a JSON file with the metrics summary, row-level data, and studio URL
+     output_path="./evaluation_results.json"
+ )
+ ```
+ For more details, refer to [Evaluate on test dataset using evaluate()][evaluate_dataset].
+
+ #### Evaluate generative AI application
+ ```python
+ from azure.ai.evaluation import evaluate
+ from askwiki import askwiki
+
+ result = evaluate(
+     data="data.jsonl",
+     target=askwiki,
+     evaluators={
+         "relevance": relevance_evaluator
+     },
+     evaluator_config={
+         "default": {
+             "column_mapping": {
+                 "query": "${data.queries}",
+                 "context": "${outputs.context}",
+                 "response": "${outputs.response}"
+             }
+         }
+     }
+ )
+ ```
+ The above code snippet refers to the askwiki application in this [sample][evaluate_app].
+
+ For more details, refer to [Evaluate on a target][evaluate_target].
+
+ ### Simulator
+
+ Simulators allow users to generate synthetic data using their application. The simulator expects the user to have a callback method that invokes their AI application; the integration between your AI application and the simulator happens at this callback method. Here's what a sample callback looks like:
+
+ ```python
+ from typing import Any, Dict, List, Optional
+
+ async def callback(
+     messages: Dict[str, List[Dict]],
+     stream: bool = False,
+     session_state: Any = None,
+     context: Optional[Dict[str, Any]] = None,
+ ) -> dict:
+     messages_list = messages["messages"]
+     # Get the last message from the user
+     latest_message = messages_list[-1]
+     query = latest_message["content"]
+     # Call your endpoint or AI application here
+     # response should be a string
+     response = call_to_your_application(query, messages_list, context)
+     formatted_response = {
+         "content": response,
+         "role": "assistant",
+         "context": "",
+     }
+     messages["messages"].append(formatted_response)
+     return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}
+ ```
+
+ The simulator initialization and invocation look like this:
+ ```python
+ import asyncio
+ import os
+
+ from azure.ai.evaluation.simulator import Simulator
+
+ model_config = {
+     "azure_endpoint": os.environ.get("AZURE_ENDPOINT"),
+     "azure_deployment": os.environ.get("AZURE_DEPLOYMENT_NAME"),
+     "api_version": os.environ.get("AZURE_API_VERSION"),
+ }
+ custom_simulator = Simulator(model_config=model_config)
+ outputs = asyncio.run(custom_simulator(
+     target=callback,
+     conversation_turns=[
+         [
+             "What should I know about the public gardens in the US?",
+         ],
+         [
+             "How do I simulate data against LLMs",
+         ],
+     ],
+     max_conversation_turns=2,
+ ))
+ with open("simulator_output.jsonl", "w") as f:
+     for output in outputs:
+         f.write(output.to_eval_qr_json_lines())
+ ```
+
+ #### Adversarial Simulator
+
+ ```python
+ import asyncio
+
+ from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
+ from azure.identity import DefaultAzureCredential
+
+ azure_ai_project = {
+     "subscription_id": "<subscription_id>",
+     "resource_group_name": "<resource_group_name>",
+     "project_name": "<project_name>"
+ }
+ scenario = AdversarialScenario.ADVERSARIAL_QA
+ simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
+
+ outputs = asyncio.run(
+     simulator(
+         scenario=scenario,
+         max_conversation_turns=1,
+         max_simulation_results=3,
+         target=callback
+     )
+ )
+
+ print(outputs.to_eval_qr_json_lines())
+ ```
+
+ For more details about the simulator, visit the following links:
+ - [Adversarial Simulation docs][adversarial_simulation_docs]
+ - [Adversarial scenarios][adversarial_simulation_scenarios]
+ - [Simulating jailbreak attacks][adversarial_jailbreak]
+
+ ## Examples
+
+ In the following section you will find examples of:
+ - [Evaluate an application][evaluate_app]
+ - [Evaluate different models][evaluate_models]
+ - [Custom Evaluators][custom_evaluators]
+ - [Adversarial Simulation][adversarial_simulation]
+ - [Simulate with conversation starter][simulate_with_conversation_starter]
+
+ More examples can be found [here][evaluate_samples].
+
+ ## Troubleshooting
+
+ ### General
+
+ Please refer to [troubleshooting][evaluation_tsg] for common issues.
+
+ ### Logging
+
+ This library uses the standard [logging][python_logging] library for logging. Basic information about HTTP sessions (URLs, headers, etc.) is logged at INFO level.
+
+ Detailed DEBUG level logging, including request/response bodies and unredacted headers, can be enabled on a client with the `logging_enable` argument.
+
+ See full SDK logging documentation with examples [here][sdk_logging_docs].
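+
+ As a quick illustration, here is a minimal sketch using only the standard library; the `azure` logger name follows the general Azure SDK convention and is an assumption, not something specific to this package:
+
+ ```python
+ import logging
+ import sys
+
+ # Opt in to verbose SDK logging on the shared "azure" logger.
+ logger = logging.getLogger("azure")
+ logger.setLevel(logging.DEBUG)
+
+ # Send log output to stdout with a simple format.
+ handler = logging.StreamHandler(stream=sys.stdout)
+ handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
+ logger.addHandler(handler)
+ ```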
+
+ ## Next steps
+
+ - View our [samples][evaluation_samples].
+ - View our [documentation][product_documentation].
+
+ ## Contributing
+
+ This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit [cla.microsoft.com][cla].
+
+ When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
+
+ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_conduct]. For more information, see the [Code of Conduct FAQ][coc_faq] or contact [opencode@microsoft.com][coc_contact] with any additional questions or comments.
+
+ <!-- LINKS -->
+
+ [source_code]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/evaluation/azure-ai-evaluation
+ [evaluation_pypi]: https://pypi.org/project/azure-ai-evaluation/
+ [evaluation_ref_docs]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
+ [evaluation_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios
+ [product_documentation]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk
+ [python_logging]: https://docs.python.org/3/library/logging.html
+ [sdk_logging_docs]: https://docs.microsoft.com/azure/developer/python/azure-sdk-logging
+ [azure_core_readme]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+ [pip_link]: https://pypi.org/project/pip/
+ [azure_core_ref_docs]: https://aka.ms/azsdk-python-core-policies
+ [azure_core]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md
+ [azure_identity]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/identity/azure-identity
+ [cla]: https://cla.microsoft.com
+ [code_of_conduct]: https://opensource.microsoft.com/codeofconduct/
+ [coc_faq]: https://opensource.microsoft.com/codeofconduct/faq/
+ [coc_contact]: mailto:opencode@microsoft.com
+ [evaluate_target]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#evaluate-on-a-target
+ [evaluate_dataset]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#evaluate-on-test-dataset-using-evaluate
+ [evaluators]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
+ [evaluate_api]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview#azure-ai-evaluation-evaluate
+ [evaluate_app]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/evaluate_app
+ [evaluation_tsg]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md
+ [ai_studio]: https://learn.microsoft.com/azure/ai-studio/what-is-ai-studio
+ [ai_project]: https://learn.microsoft.com/azure/ai-studio/how-to/create-projects?tabs=ai-studio
+ [azure_openai]: https://learn.microsoft.com/azure/ai-services/openai/
+ [evaluate_models]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/evaluate_endpoints
+ [custom_evaluators]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/evaluate_custom
+ [evaluate_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate
+ [evaluation_metrics]: https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in
+ [performance_and_quality_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#performance-and-quality-evaluators
+ [risk_and_safety_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#risk-and-safety-evaluators
+ [composite_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#composite-evaluators
+ [adversarial_simulation_docs]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#generate-adversarial-simulations-for-safety-evaluation
+ [adversarial_simulation_scenarios]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#supported-adversarial-simulation-scenarios
+ [adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/simulate_adversarial
+ [simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/simulate_conversation_starter
+ [adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks
+
+
+ # Release History
+
+ ## 1.0.1 (2024-11-15)
+
+ ### Bugs Fixed
+ - Fixed the `[remote]` extra to be needed only when tracking results in Azure AI Studio.
+ - Removed `azure-ai-inference` as a dependency.
+
+ ## 1.0.0 (2024-11-13)
+
+ ### Breaking Changes
+ - The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.
+ - Parameters `query_response_generating_prompty_kwargs` and `user_simulator_prompty_kwargs` have been renamed to `query_response_generating_prompty_options` and `user_simulator_prompty_options` in the Simulator's `__call__` method.
+
+ ### Bugs Fixed
+ - Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative paths.
+ - Outputs of adversarial simulators are of type `JsonLineList`, and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation.
+ - Fixed an issue where, during long-running simulations, the API token would expire, causing a "Forbidden" error. Users can now set the environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently, preventing expiration and ensuring continuous operation of the simulation (see the sketch after this list).
+ - Fixed the `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]` would be 2, not 1.5 (also sketched below).
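+
+ For instance, the token refresh can be configured as in this minimal sketch; only the variable name comes from the note above, and the interpretation of the value as seconds is an assumption:
+
+ ```python
+ import os
+
+ # Refresh the auth token periodically during long-running simulations
+ # (illustrative value; assumed to be in seconds).
+ os.environ["AZURE_TOKEN_REFRESH_INTERVAL"] = "600"
+ ```
+
+ And the NaN-skipping aggregation described above behaves like this sketch; `mean_ignoring_nan` is a hypothetical helper for illustration, not the SDK's internal function:
+
+ ```python
+ import math
+
+ def mean_ignoring_nan(values):
+     """Aggregate like `evaluate` now does: drop values that cannot be averaged."""
+     usable = [v for v in values if isinstance(v, (int, float)) and not math.isnan(v)]
+     return sum(usable) / len(usable) if usable else math.nan
+
+ print(mean_ignoring_nan([1, 2, 3, math.nan]))  # 2.0, not 1.5
+ ```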
+
+ ### Other Changes
+ - Refined error messages for service-based evaluators and simulators.
+ - Tracing has been disabled due to a Cosmos DB initialization issue.
+ - Introduced the environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features (see the sketch after this list).
+ - Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, users will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
+ - For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
+ ```python
+ adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
+ outputs = asyncio.run(
+     adversarial_simulator(
+         scenario=scenario,
+         target=callback,
+         randomize_order=True
+     )
+ )
+ ```
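+
+ Disabling the experimental-feature warning is a one-liner; only the variable name comes from the note above, and `"1"` as a truthy value is an assumption:
+
+ ```python
+ import os
+
+ # Suppress the experimental-feature warning emitted by this package.
+ os.environ["AI_EVALS_DISABLE_EXPERIMENTAL_WARNING"] = "1"
+ ```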
+
+ ## 1.0.0b5 (2024-10-28)
+
+ ### Features Added
+ - Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness.
+ - Groundedness detection in the non-adversarial simulator via query/context pairs:
+ ```python
+ import asyncio
+ import importlib.resources as pkg_resources
+ import json
+
+ package = "azure.ai.evaluation.simulator._data_sources"
+ resource_name = "grounding.json"
+ custom_simulator = Simulator(model_config=model_config)
+ conversation_turns = []
+ with pkg_resources.path(package, resource_name) as grounding_file:
+     with open(grounding_file, "r") as file:
+         data = json.load(file)
+ for item in data:
+     conversation_turns.append([item])
+ outputs = asyncio.run(custom_simulator(
+     target=callback,
+     conversation_turns=conversation_turns,
+     max_conversation_turns=1,
+ ))
+ ```
+ - Added evaluators for multimodal use cases
+
+ ### Breaking Changes
+ - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
+ - `RetrievalEvaluator` now requires a `context` input in addition to `query` in single-turn evaluation.
+ - `RelevanceEvaluator` no longer takes `context` as an input. It now only takes `query` and `response` in single-turn evaluation.
+ - `FluencyEvaluator` no longer takes `query` as an input. It now only takes `response` in single-turn evaluation.
+ - The `AdversarialScenario` enum no longer includes `ADVERSARIAL_INDIRECT_JAILBREAK`; invoking indirect jailbreak (XPIA) simulations should be done with `IndirectAttackSimulator`.
+ - Outputs of `Simulator` and `AdversarialSimulator` previously had `to_eval_qa_json_lines` and now have `to_eval_qr_json_lines`. Where `to_eval_qa_json_lines` had:
+ ```json
+ {"question": <user_message>, "answer": <assistant_message>}
+ ```
+ `to_eval_qr_json_lines` now has:
+ ```json
+ {"query": <user_message>, "response": <assistant_message>}
+ ```
+
+ ### Bugs Fixed
+ - The non-adversarial simulator works with `gpt-4o` models using the `json_schema` response format.
+ - Fixed an issue where the `evaluate` API would fail with "[WinError 32] The process cannot access the file because it is being used by another process" when the venv folder and target function file are in the same directory.
+ - Fixed `evaluate` API failure when `trace.destination` is set to `none`.
+ - The non-adversarial simulator now accepts context from the callback.
+
+ ### Other Changes
+ - Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
+ - `GroundednessEvaluator` now supports `query` as an optional input in single-turn evaluation. If `query` is provided, a different prompt template will be used for the evaluation (see the sketch after this list).
+ - To align with our support of a diverse set of models, the following evaluators will now have a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output; however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+   - `CoherenceEvaluator`
+   - `RelevanceEvaluator`
+   - `FluencyEvaluator`
+   - `GroundednessEvaluator`
+   - `SimilarityEvaluator`
+   - `RetrievalEvaluator`
+ - The following evaluators will now have a new key in their result output that includes the LLM reasoning behind the score. The new key will follow the pattern `<metric_name>_reason`. The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
+
+ | Evaluator | New `max_token` for Generation |
+ | --- | --- |
+ | `CoherenceEvaluator` | 800 |
+ | `RelevanceEvaluator` | 800 |
+ | `FluencyEvaluator` | 800 |
+ | `GroundednessEvaluator` | 800 |
+ | `RetrievalEvaluator` | 1600 |
+ - Improved the error message for storage access permission issues to provide clearer guidance for users.
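+
+ As a sketch of the two changes above (reusing `model_config` from the README; the exact result payload shape is an assumption for illustration):
+
+ ```python
+ from azure.ai.evaluation import GroundednessEvaluator
+
+ groundedness = GroundednessEvaluator(model_config)
+
+ # Without a query, the query-less prompt template is used.
+ result = groundedness(
+     context="Tokyo is Japan's capital.",
+     response="The capital of Japan is Tokyo.",
+ )
+
+ # With a query, a different prompt template is used.
+ result = groundedness(
+     query="What is the capital of Japan?",
+     context="Tokyo is Japan's capital.",
+     response="The capital of Japan is Tokyo.",
+ )
+
+ # Results now carry both the new un-prefixed key and the legacy "gpt_"-prefixed
+ # key; prefer the new one, since the old one will be deprecated.
+ score = result.get("groundedness", result.get("gpt_groundedness"))
+ ```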
+
+ ## 1.0.0b4 (2024-10-16)
+
+ ### Breaking Changes
+
+ - Removed the `numpy` dependency. All NaN values returned by the SDK have been changed from `numpy.nan` to `math.nan`.
+ - `credential` is now required to be passed in for all content safety evaluators and `ProtectedMaterialEvaluator`; `DefaultAzureCredential` will no longer be chosen if a credential is not passed (see the sketch after this list).
+ - Changed the package extra name from "pf-azure" to "remote".
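+
+ A minimal sketch of the credential requirement, assuming the keyword-argument form shown in the README above:
+
+ ```python
+ from azure.identity import DefaultAzureCredential
+ from azure.ai.evaluation import ViolenceEvaluator
+
+ azure_ai_project = {
+     "subscription_id": "<subscription_id>",
+     "resource_group_name": "<resource_group_name>",
+     "project_name": "<project_name>",
+ }
+
+ # The credential must now be passed explicitly; it is no longer defaulted.
+ violence = ViolenceEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)
+ ```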
+
+ ### Bugs Fixed
+ - Adversarial conversation simulations would fail with `Forbidden`. Added logic to re-fetch the token in the exponential retry logic to retrieve the RAI service response.
+ - Fixed an issue where the `evaluate` API did not fail due to missing inputs when the target did not return columns required by the evaluators.
+
+ ### Other Changes
+ - Enhanced the error message to provide clearer instructions when required packages for the remote tracking feature are missing.
+ - Print the per-evaluator run summary at the end of the `evaluate` API call to make troubleshooting row-level failures easier.
+
+ ## 1.0.0b3 (2024-10-01)
+
+ ### Features Added
+
+ - Added a `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+ - The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs (see the sketch after this list):
+   - `ViolenceEvaluator`
+   - `SexualEvaluator`
+   - `SelfHarmEvaluator`
+   - `HateUnfairnessEvaluator`
+   - `ProtectedMaterialEvaluator`
+   - `IndirectAttackEvaluator`
+   - `CoherenceEvaluator`
+   - `RelevanceEvaluator`
+   - `FluencyEvaluator`
+   - `GroundednessEvaluator`
+ - Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
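+
+ A sketch of the `conversation` input mentioned above; the message schema is assumed to match the simulator examples earlier in this README:
+
+ ```python
+ conversation = {
+     "messages": [
+         {"role": "user", "content": "What is the capital of Japan?"},
+         {"role": "assistant", "content": "The capital of Japan is Tokyo.", "context": "Tokyo is Japan's capital."},
+     ]
+ }
+
+ # Any of the evaluators listed above can accept the conversation instead of
+ # single-turn inputs, e.g. reusing the relevance evaluator from the README:
+ result = relevance_evaluator(conversation=conversation)
+ ```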
+
+ ### Breaking Changes
+
+ - Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+ - The `evaluator_config` parameter of `evaluate` now maps an evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+ Before:
+ ```python
+ evaluate(
+     ...,
+     evaluator_config={
+         "hate_unfairness": {
+             "query": "${data.question}",
+             "response": "${data.answer}",
+         }
+     },
+     ...
+ )
+ ```
+
+ After:
+ ```python
+ evaluate(
+     ...,
+     evaluator_config={
+         "hate_unfairness": {
+             "column_mapping": {
+                 "query": "${data.question}",
+                 "response": "${data.answer}",
+             }
+         }
+     },
+     ...
+ )
+ ```
+
+ - The simulator now requires a model configuration to call the prompty instead of an Azure AI project scope. This enables the usage of the simulator with Entra ID based auth.
+ Before:
+ ```python
+ azure_ai_project = {
+     "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
+     "resource_group_name": os.environ.get("RESOURCE_GROUP"),
+     "project_name": os.environ.get("PROJECT_NAME"),
+ }
+ sim = Simulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
+ ```
+ After:
+ ```python
+ model_config = {
+     "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
+     "azure_deployment": os.environ.get("AZURE_DEPLOYMENT"),
+ }
+ sim = Simulator(model_config=model_config)
+ ```
+ If `api_key` is not included in the `model_config`, the prompty runtime in `promptflow-core` will pick up `DefaultAzureCredential`.
+
+ ### Bugs Fixed
+
+ - Fixed an issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
+ ## 1.0.0b2 (2024-09-24)
+
+ ### Breaking Changes
+
+ - `data` and `evaluators` are now required keywords in `evaluate`.
+
+ ## 1.0.0b1 (2024-09-20)
+
+ ### Breaking Changes
+
+ - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
+ - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
+ - The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
+ - Model configuration classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
+ - Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
+
+ ### Features Added
+
+ - First preview
+ - This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
+ - Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information (see the sketch below)
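+
+ For example, a minimal sketch of the typed project dictionary; `AzureAIProject` is exported from the root namespace, as noted above:
+
+ ```python
+ from azure.ai.evaluation import AzureAIProject
+
+ # A type checker can now validate the shape of the project info.
+ project: AzureAIProject = {
+     "subscription_id": "<subscription_id>",
+     "resource_group_name": "<resource_group_name>",
+     "project_name": "<project_name>",
+ }
+ ```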
@@ -48,3 +48,23 @@ distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
+
+
+ License notice for [Is GPT-4 a reliable rater? Evaluating consistency in GPT-4's text ratings](https://www.frontiersin.org/journals/education/articles/10.3389/feduc.2023.1272229/full)
+ ------------------------------------------------------------------------------------------------------------------
+ Copyright © 2023 Hackl, Müller, Granitzer and Sailer. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+ License notice for [Is ChatGPT a Good NLG Evaluator? A Preliminary Study](https://aclanthology.org/2023.newsum-1.1) (Wang et al., NewSum 2023)
+ ------------------------------------------------------------------------------------------------------------------
+ Copyright © 2023. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+ License notice for [SummEval: Re-evaluating Summarization Evaluation.](https://doi.org/10.1162/tacl_a_00373) (Fabbri et al.)
+ ------------------------------------------------------------------------------------------------------------------
+ © 2021 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+ License notice for [Evaluation Metrics in the Era of GPT-4: Reliably Evaluating Large Language Models on Sequence to Sequence Tasks](https://aclanthology.org/2023.emnlp-main.543) (Sottana et al., EMNLP 2023)
+ ------------------------------------------------------------------------------------------------------------------
+ © 2023 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).