azure-ai-evaluation 0.0.0b0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. azure_ai_evaluation-1.0.0/CHANGELOG.md +214 -0
  2. azure_ai_evaluation-1.0.0/MANIFEST.in +8 -0
  3. azure_ai_evaluation-1.0.0/NOTICE.txt +70 -0
  4. azure_ai_evaluation-1.0.0/PKG-INFO +595 -0
  5. azure_ai_evaluation-1.0.0/README.md +345 -0
  6. azure_ai_evaluation-1.0.0/TROUBLESHOOTING.md +61 -0
  7. azure_ai_evaluation-1.0.0/azure/__init__.py +5 -0
  8. azure_ai_evaluation-1.0.0/azure/ai/__init__.py +5 -0
  9. azure_ai_evaluation-1.0.0/azure/ai/evaluation/__init__.py +82 -0
  10. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/__init__.py +16 -0
  11. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/_experimental.py +172 -0
  12. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/constants.py +72 -0
  13. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/math.py +89 -0
  14. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/rai_service.py +632 -0
  15. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/utils.py +445 -0
  16. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_constants.py +72 -0
  17. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/__init__.py +3 -0
  18. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  19. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  20. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  21. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  22. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  23. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  24. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  25. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  26. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_utils.py +298 -0
  27. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/__init__.py +3 -0
  28. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  29. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  30. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  31. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  32. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  33. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  34. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  35. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  36. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  37. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  38. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  39. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  40. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  41. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  42. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  43. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  44. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  45. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  46. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  47. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  48. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  49. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  50. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  51. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  52. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  53. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  54. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  55. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  56. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  57. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  58. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  59. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  60. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  61. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  62. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  63. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  64. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  65. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  66. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  67. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  68. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  69. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  70. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  71. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  72. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  73. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  74. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  75. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  76. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  77. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  78. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  79. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  80. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  81. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  82. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  83. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  84. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  85. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_exceptions.py +128 -0
  86. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_http_utils.py +466 -0
  87. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_model_configurations.py +123 -0
  88. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_user_agent.py +6 -0
  89. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/__init__.py +3 -0
  90. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  91. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  92. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  93. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  94. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  95. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_version.py +5 -0
  96. azure_ai_evaluation-1.0.0/azure/ai/evaluation/py.typed +0 -0
  97. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/__init__.py +16 -0
  98. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  99. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  100. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_constants.py +27 -0
  101. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  102. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  103. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  104. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  105. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  106. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  107. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  108. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  109. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  110. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  111. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  112. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  113. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  114. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  115. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  116. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  117. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  118. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  119. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  120. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_simulator.py +716 -0
  121. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_tracing.py +89 -0
  122. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_utils.py +132 -0
  123. azure_ai_evaluation-1.0.0/azure_ai_evaluation.egg-info/PKG-INFO +595 -0
  124. azure_ai_evaluation-1.0.0/azure_ai_evaluation.egg-info/SOURCES.txt +162 -0
  125. azure_ai_evaluation-1.0.0/azure_ai_evaluation.egg-info/requires.txt +10 -0
  126. azure_ai_evaluation-1.0.0/azure_ai_evaluation.egg-info/top_level.txt +1 -0
  127. azure_ai_evaluation-1.0.0/pyproject.toml +19 -0
  128. azure_ai_evaluation-1.0.0/samples/README.md +57 -0
  129. azure_ai_evaluation-1.0.0/samples/data/evaluate_test_data.jsonl +3 -0
  130. azure_ai_evaluation-1.0.0/samples/evaluation_samples_common.py +60 -0
  131. azure_ai_evaluation-1.0.0/samples/evaluation_samples_evaluate.py +395 -0
  132. azure_ai_evaluation-1.0.0/samples/evaluation_samples_simulate.py +249 -0
  133. azure_ai_evaluation-1.0.0/setup.py +91 -0
  134. azure_ai_evaluation-1.0.0/tests/__init__.py +0 -0
  135. azure_ai_evaluation-1.0.0/tests/__openai_patcher.py +118 -0
  136. azure_ai_evaluation-1.0.0/tests/__pf_service_isolation.py +28 -0
  137. azure_ai_evaluation-1.0.0/tests/conftest.py +541 -0
  138. azure_ai_evaluation-1.0.0/tests/e2etests/__init__.py +0 -0
  139. azure_ai_evaluation-1.0.0/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +23 -0
  140. azure_ai_evaluation-1.0.0/tests/e2etests/target_fn.py +37 -0
  141. azure_ai_evaluation-1.0.0/tests/e2etests/test_adv_simulator.py +650 -0
  142. azure_ai_evaluation-1.0.0/tests/e2etests/test_builtin_evaluators.py +997 -0
  143. azure_ai_evaluation-1.0.0/tests/e2etests/test_evaluate.py +926 -0
  144. azure_ai_evaluation-1.0.0/tests/e2etests/test_metrics_upload.py +214 -0
  145. azure_ai_evaluation-1.0.0/tests/e2etests/test_sim_and_eval.py +129 -0
  146. azure_ai_evaluation-1.0.0/tests/unittests/test_batch_run_context.py +78 -0
  147. azure_ai_evaluation-1.0.0/tests/unittests/test_built_in_evaluator.py +128 -0
  148. azure_ai_evaluation-1.0.0/tests/unittests/test_content_safety_defect_rate.py +25 -0
  149. azure_ai_evaluation-1.0.0/tests/unittests/test_content_safety_rai_script.py +471 -0
  150. azure_ai_evaluation-1.0.0/tests/unittests/test_eval_run.py +503 -0
  151. azure_ai_evaluation-1.0.0/tests/unittests/test_evaluate.py +686 -0
  152. azure_ai_evaluation-1.0.0/tests/unittests/test_evaluate_telemetry.py +168 -0
  153. azure_ai_evaluation-1.0.0/tests/unittests/test_evaluators/apology_dag/apology.py +8 -0
  154. azure_ai_evaluation-1.0.0/tests/unittests/test_evaluators/test_inputs_evaluators.py +46 -0
  155. azure_ai_evaluation-1.0.0/tests/unittests/test_jailbreak_simulator.py +123 -0
  156. azure_ai_evaluation-1.0.0/tests/unittests/test_non_adv_simulator.py +362 -0
  157. azure_ai_evaluation-1.0.0/tests/unittests/test_save_eval.py +49 -0
  158. azure_ai_evaluation-1.0.0/tests/unittests/test_simulator.py +123 -0
  159. azure_ai_evaluation-1.0.0/tests/unittests/test_synthetic_callback_conv_bot.py +110 -0
  160. azure_ai_evaluation-1.0.0/tests/unittests/test_synthetic_conversation_bot.py +123 -0
  161. azure_ai_evaluation-1.0.0/tests/unittests/test_utils.py +258 -0
  162. azure_ai_evaluation-0.0.0b0/PKG-INFO +0 -6
  163. azure_ai_evaluation-0.0.0b0/azure_ai_evaluation.egg-info/PKG-INFO +0 -6
  164. azure_ai_evaluation-0.0.0b0/azure_ai_evaluation.egg-info/SOURCES.txt +0 -5
  165. azure_ai_evaluation-0.0.0b0/setup.py +0 -12
  166. {azure_ai_evaluation-0.0.0b0 → azure_ai_evaluation-1.0.0}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
  167. /azure_ai_evaluation-0.0.0b0/azure_ai_evaluation.egg-info/top_level.txt → /azure_ai_evaluation-1.0.0/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
  168. {azure_ai_evaluation-0.0.0b0 → azure_ai_evaluation-1.0.0}/setup.cfg +0 -0
@@ -0,0 +1,214 @@
1
+ # Release History
2
+
3
+ ## 1.0.0 (2024-11-13)
4
+
5
+ ### Breaking Changes
6
+ - The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.
7
+ - Parameters `query_response_generating_prompty_kwargs` and `user_simulator_prompty_kwargs` have been renamed to `query_response_generating_prompty_options` and `user_simulator_prompty_options` in the Simulator's __call__ method.
8
+
9
+ ### Bugs Fixed
10
+ - Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
11
+ - Outputs of adversarial simulators are of type `JsonLineList`, and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation.
12
+ - Fixed an issue where, during long-running simulations, the API token could expire and cause a "Forbidden" error. Users can now set the environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
13
+ - Fix `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or
14
+ otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]`
15
+ would be 2, not 1.5.
16
+
17
+ ### Other Changes
18
+ - Refined error messages for service-based evaluators and simulators.
19
+ - Tracing has been disabled due to Cosmos DB initialization issue.
20
+ - Introduced environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
21
+ - Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, a user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
22
+ - For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
23
+ ```python
24
+ adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
25
+ outputs = asyncio.run(
26
+ adversarial_simulator(
27
+ scenario=scenario,
28
+ target=callback,
29
+ randomize_order=True
30
+ )
31
+ )
32
+ ```
33
+
34
+ ## 1.0.0b5 (2024-10-28)
35
+
36
+ ### Features Added
37
+ - Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness.
38
+ - Groundedness detection in Non Adversarial Simulator via query/context pairs
39
+ ```python
40
+ import importlib.resources as pkg_resources
41
+ package = "azure.ai.evaluation.simulator._data_sources"
42
+ resource_name = "grounding.json"
43
+ custom_simulator = Simulator(model_config=model_config)
44
+ conversation_turns = []
45
+ with pkg_resources.path(package, resource_name) as grounding_file:
46
+ with open(grounding_file, "r") as file:
47
+ data = json.load(file)
48
+ for item in data:
49
+ conversation_turns.append([item])
50
+ outputs = asyncio.run(custom_simulator(
51
+ target=callback,
52
+ conversation_turns=conversation_turns,
53
+ max_conversation_turns=1,
54
+ ))
55
+ ```
56
+ - Adding evaluator for multimodal use cases
57
+
58
+ ### Breaking Changes
59
+ - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
60
+ - `RetrievalEvaluator` now requires a `context` input in addition to `query` in single-turn evaluation.
61
+ - `RelevanceEvaluator` no longer takes `context` as an input. It now only takes `query` and `response` in single-turn evaluation.
62
+ - `FluencyEvaluator` no longer takes `query` as an input. It now only takes `response` in single-turn evaluation.
63
+ - AdversarialScenario enum does not include `ADVERSARIAL_INDIRECT_JAILBREAK`, invoking IndirectJailbreak or XPIA should be done with `IndirectAttackSimulator`
64
+ - Outputs of `Simulator` and `AdversarialSimulator` previously had `to_eval_qa_json_lines` and now have `to_eval_qr_json_lines`. Where `to_eval_qa_json_lines` had:
65
+ ```json
66
+ {"question": <user_message>, "answer": <assistant_message>}
67
+ ```
68
+ `to_eval_qr_json_lines` now has:
69
+ ```json
70
+ {"query": <user_message>, "response": <assistant_message>}
71
+ ```
72
+
73
+ ### Bugs Fixed
74
+ - Non adversarial simulator works with `gpt-4o` models using the `json_schema` response format
75
+ - Fixed an issue where the `evaluate` API would fail with "[WinError 32] The process cannot access the file because it is being used by another process" when venv folder and target function file are in the same directory.
76
+ - Fix evaluate API failure when `trace.destination` is set to `none`
77
+ - Non adversarial simulator now accepts context from the callback
78
+
79
+ ### Other Changes
80
+ - Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
81
+ - `GroundednessEvaluator` now supports `query` as an optional input in single-turn evaluation. If `query` is provided, a different prompt template will be used for the evaluation.
82
+ - To align with our support of a diverse set of models, the following evaluators will now have a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output; however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
83
+ - `CoherenceEvaluator`
84
+ - `RelevanceEvaluator`
85
+ - `FluencyEvaluator`
86
+ - `GroundednessEvaluator`
87
+ - `SimilarityEvaluator`
88
+ - `RetrievalEvaluator`
89
+ - The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
90
+
91
+ | Evaluator | New `max_token` for Generation |
92
+ | --- | --- |
93
+ | `CoherenceEvaluator` | 800 |
94
+ | `RelevanceEvaluator` | 800 |
95
+ | `FluencyEvaluator` | 800 |
96
+ | `GroundednessEvaluator` | 800 |
97
+ | `RetrievalEvaluator` | 1600 |
98
+ - Improved the error message for storage access permission issues to provide clearer guidance for users.
99
+
100
+ ## 1.0.0b4 (2024-10-16)
101
+
102
+ ### Breaking Changes
103
+
104
+ - Removed `numpy` dependency. All NaN values returned by the SDK have been changed from `numpy.nan` to `math.nan`.
105
+ - `credential` is now required to be passed in for all content safety evaluators and `ProtectedMaterialEvaluator`. `DefaultAzureCredential` will no longer be chosen if a credential is not passed.
106
+ - Changed package extra name from "pf-azure" to "remote".
107
+
108
+ ### Bugs Fixed
109
+ - Adversarial Conversation simulations would fail with `Forbidden`. Added logic to re-fetch the token in the exponential retry logic to retrieve the RAI Service response.
110
+ - Fixed an issue where the Evaluate API did not fail due to missing inputs when the target did not return columns required by the evaluators.
111
+
112
+ ### Other Changes
113
+ - Enhance the error message to provide clearer instruction when required packages for the remote tracking feature are missing.
114
+ - Print the per-evaluator run summary at the end of the Evaluate API call to make troubleshooting row-level failures easier.
115
+
116
+ ## 1.0.0b3 (2024-10-01)
117
+
118
+ ### Features Added
119
+
120
+ - Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
121
+ - The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
122
+ - `ViolenceEvaluator`
123
+ - `SexualEvaluator`
124
+ - `SelfHarmEvaluator`
125
+ - `HateUnfairnessEvaluator`
126
+ - `ProtectedMaterialEvaluator`
127
+ - `IndirectAttackEvaluator`
128
+ - `CoherenceEvaluator`
129
+ - `RelevanceEvaluator`
130
+ - `FluencyEvaluator`
131
+ - `GroundednessEvaluator`
132
+ - Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
133
+
134
+ ### Breaking Changes
135
+
136
+ - Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
137
+ - The `evaluator_config` parameter of `evaluate` now maps an evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
138
+ `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
139
+
140
+ Before:
141
+ ```python
142
+ evaluate(
143
+ ...,
144
+ evaluator_config={
145
+ "hate_unfairness": {
146
+ "query": "${data.question}",
147
+ "response": "${data.answer}",
148
+ }
149
+ },
150
+ ...
151
+ )
152
+ ```
153
+
154
+ After:
155
+ ```python
156
+ evaluate(
157
+ ...,
158
+ evaluator_config={
159
+ "hate_unfairness": {
160
+ "column_mapping": {
161
+ "query": "${data.question}",
162
+ "response": "${data.answer}",
163
+ }
164
+ }
165
+ },
166
+ ...
167
+ )
168
+ ```
169
+
170
+ - Simulator now requires a model configuration to call the prompty instead of an Azure AI project scope. This enables the usage of simulator with Entra ID based auth.
171
+ Before:
172
+ ```python
173
+ azure_ai_project = {
174
+ "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
175
+ "resource_group_name": os.environ.get("RESOURCE_GROUP"),
176
+ "project_name": os.environ.get("PROJECT_NAME"),
177
+ }
178
+ sim = Simulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
179
+ ```
180
+ After:
181
+ ```python
182
+ model_config = {
183
+ "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
184
+ "azure_deployment": os.environ.get("AZURE_DEPLOYMENT"),
185
+ }
186
+ sim = Simulator(model_config=model_config)
187
+ ```
188
+ If `api_key` is not included in the `model_config`, the prompty runtime in `promptflow-core` will pick up `DefaultAzureCredential`.
189
+
190
+ ### Bugs Fixed
191
+
192
+ - Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
193
+
194
+ ## 1.0.0b2 (2024-09-24)
195
+
196
+ ### Breaking Changes
197
+
198
+ - `data` and `evaluators` are now required keywords in `evaluate`.
199
+
200
+ ## 1.0.0b1 (2024-09-20)
201
+
202
+ ### Breaking Changes
203
+
204
+ - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
205
+ - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
206
+ - The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with evaluate API and simulators.
207
+ - Model configurations classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
208
+ - Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
209
+
210
+ ### Features Added
211
+
212
+ - First preview
213
+ - This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
214
+ - Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information
@@ -0,0 +1,8 @@
1
+ recursive-include tests *.py
2
+ include *.md
3
+ include azure/__init__.py
4
+ include azure/ai/__init__.py
5
+ include azure/ai/evaluation/py.typed
6
+ recursive-include azure/ai/evaluation *.prompty
7
+ include azure/ai/evaluation/simulator/_data_sources/grounding.json
8
+ recursive-include samples *
@@ -0,0 +1,70 @@
1
+ NOTICES AND INFORMATION
2
+ Do Not Translate or Localize
3
+
4
+ This software incorporates material from third parties.
5
+ Microsoft makes certain open source code available at https://3rdpartysource.microsoft.com,
6
+ or you may send a check or money order for US $5.00, including the product name,
7
+ the open source component name, platform, and version number, to:
8
+
9
+ Source Code Compliance Team
10
+ Microsoft Corporation
11
+ One Microsoft Way
12
+ Redmond, WA 98052
13
+ USA
14
+
15
+ Notwithstanding any other terms, you may reverse engineer this software to the extent
16
+ required to debug changes to any libraries licensed under the GNU Lesser General Public License.
17
+
18
+ License notice for nltk
19
+ ---------------------------------------------------------
20
+
21
+ Copyright 2024 The NLTK Project
22
+
23
+ Licensed under the Apache License, Version 2.0 (the "License");
24
+ you may not use this file except in compliance with the License.
25
+ You may obtain a copy of the License at
26
+
27
+ http://www.apache.org/licenses/LICENSE-2.0
28
+
29
+ Unless required by applicable law or agreed to in writing, software
30
+ distributed under the License is distributed on an "AS IS" BASIS,
31
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32
+ See the License for the specific language governing permissions and
33
+ limitations under the License.
34
+
35
+ License notice for rouge-score
36
+ ---------------------------------------------------------
37
+
38
+ Copyright 2024 The Google Research Authors
39
+
40
+ Licensed under the Apache License, Version 2.0 (the "License");
41
+ you may not use this file except in compliance with the License.
42
+ You may obtain a copy of the License at
43
+
44
+ http://www.apache.org/licenses/LICENSE-2.0
45
+
46
+ Unless required by applicable law or agreed to in writing, software
47
+ distributed under the License is distributed on an "AS IS" BASIS,
48
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
49
+ See the License for the specific language governing permissions and
50
+ limitations under the License.
51
+
52
+
53
+ License notice for [Is GPT-4 a reliable rater? Evaluating consistency in GPT-4's text ratings](https://www.frontiersin.org/journals/education/articles/10.3389/feduc.2023.1272229/full)
54
+ ------------------------------------------------------------------------------------------------------------------
55
+ Copyright © 2023 Hackl, Müller, Granitzer and Sailer. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
56
+
57
+
58
+ License notice for [Is ChatGPT a Good NLG Evaluator? A Preliminary Study](https://aclanthology.org/2023.newsum-1.1) (Wang et al., NewSum 2023)
59
+ ------------------------------------------------------------------------------------------------------------------
60
+ Copyright © 2023. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
61
+
62
+
63
+ License notice for [SummEval: Re-evaluating Summarization Evaluation.](https://doi.org/10.1162/tacl_a_00373) (Fabbri et al.)
64
+ ------------------------------------------------------------------------------------------------------------------
65
+ © 2021 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
66
+
67
+
68
+ License notice for [Evaluation Metrics in the Era of GPT-4: Reliably Evaluating Large Language Models on Sequence to Sequence Tasks](https://aclanthology.org/2023.emnlp-main.543) (Sottana et al., EMNLP 2023)
69
+ ------------------------------------------------------------------------------------------------------------------
70
+ © 2023 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).