azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +43 -1
- azure/ai/evaluation/_azure/_models.py +6 -6
- azure/ai/evaluation/_common/constants.py +6 -2
- azure/ai/evaluation/_common/rai_service.py +38 -4
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +22 -2
- azure/ai/evaluation/_constants.py +7 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure/ai/evaluation/_converters/_models.py +302 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +31 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- azure/ai/evaluation/_exceptions.py +5 -0
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_red_team/__init__.py +3 -0
- azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
- azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
- azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
- azure/ai/evaluation/_red_team/_default_converter.py +21 -0
- azure/ai/evaluation/_red_team/_red_team.py +1858 -0
- azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
- azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
- azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
- azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
- azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +69 -15
- azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
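The headline additions visible in this file list are four new agentic quality evaluators (intent resolution, response completeness, task adherence, tool call accuracy), two new safety evaluators (code vulnerability, ungrounded attributes), an AI Services converter, a rewritten legacy batch engine and prompty stack, and a roughly 3,000-line _red_team module. The diff hunks below cover only a few of these files. As a hedged sketch of how one of the new evaluators is constructed and called, following the package's established evaluator pattern (the `__init__.py +43 -1` entry implies new top-level exports, but confirm the exact class name and keyword arguments against the 1.4.0 documentation; the model configuration values are placeholders):

from azure.ai.evaluation import IntentResolutionEvaluator  # assumed new export in 1.4.0

# Placeholder Azure OpenAI judge configuration; fill in real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

intent_resolution = IntentResolutionEvaluator(model_config=model_config)
result = intent_resolution(
    query="What are your store hours?",
    response="The store is open Monday through Friday, 9 AM to 6 PM.",
)
print(result)  # expected to carry a score and a reason, like the other prompt-based evaluators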
@@ -0,0 +1,20 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+"""Customize generated code here.
+
+Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
+"""
+from typing import List
+
+__all__: List[str] = []  # Add all objects you want publicly available to users at this package level
+
+
+def patch_sdk():
+    """Do not remove from this file.
+
+    `patch_sdk` is a last resort escape hatch that allows you to do customizations
+    you can't accomplish using the techniques described in
+    https://aka.ms/azsdk/python/dpcodegen/python/customize
+    """
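The 20-line hunk above is the stock Azure SDK code-generation customization template; per the file list it ships as the _patch.py of each new raiclient subpackage (+20 -0). A minimal sketch of how the template is meant to be used, with a hypothetical hand-written wrapper (RaiServiceHelper is illustrative and not part of this wheel):

# Hypothetical customized _patch.py; the shipped copies leave __all__ empty.
from typing import List


class RaiServiceHelper:
    """Hand-written convenience layer over the generated client."""

    def __init__(self, client):
        self._client = client


__all__: List[str] = ["RaiServiceHelper"]  # re-exported at the package level


def patch_sdk():
    # Invoked by the generated __init__.py after imports; reserved for
    # customizations that module-level code above cannot express.
    pass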
@@ -0,0 +1 @@
+# Marker file for PEP 561.
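This one-line hunk is the py.typed marker listed above (+1 -0), which signals under PEP 561 that the generated raiclient package ships inline type information for type checkers.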
@@ -274,8 +274,26 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
 
     return cast(T_TypedDict, o)
 
+def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
+    """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
+
+    :param score: The score to check.
+    :type score: Union[str, float]
+    :param min_score: The minimum score. Default is 1.
+    :type min_score: int
+    :param max_score: The maximum score. Default is 5.
+    :type max_score: int
+    :return: True if the score is valid, False otherwise.
+    :rtype: bool
+    """
+    try:
+        numeric_score = float(score)
+    except (ValueError, TypeError):
+        return False
+
+    return min_score <= numeric_score <= max_score
 
-def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
+def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
     """Parse the output of prompt-based quality evaluators that return a score and reason.
 
     Current supported evaluators:
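The _validate_typed_dict context in the hunk header places this change in azure/ai/evaluation/_common/utils.py (+22 -2 in the file list). The new helper is self-contained, so its behavior follows directly from the diff (illustrative usage, assuming that import path):

from azure.ai.evaluation._common.utils import check_score_is_valid

print(check_score_is_valid("4"))              # True: "4" converts to 4.0, inside the default [1, 5]
print(check_score_is_valid(7))                # False: outside the default range
print(check_score_is_valid("N/A"))            # False: float("N/A") raises ValueError
print(check_score_is_valid(8, max_score=10))  # True: widened range [1, 10]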
@@ -284,6 +302,8 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     - Retrieval
     - Groundedness
     - Coherence
+    - ResponseCompleteness
+    - TaskAdherence
 
     :param llm_output: The output of the prompt-based quality evaluator.
     :type llm_output: str
@@ -294,7 +314,7 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     reason = ""
     if llm_output:
         try:
-            score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
+            score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
             reason_pattern = r"<S1>(.*?)</S1>"
             score_match = re.findall(score_pattern, llm_output, re.DOTALL)
             reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
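The effect of the new valid_score_range parameter is easiest to see in isolation. The following is a standalone sketch of the parsing pattern the diff introduces, not the library function itself:

import re


def extract_score(llm_output: str, valid_score_range: str = "[1-5]") -> float:
    # The range is a regex character class interpolated into the pattern, so a
    # caller grading on a 0-9 scale could pass "[0-9]" instead of the old
    # hardcoded [1-5].
    score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
    match = re.findall(score_pattern, llm_output, re.DOTALL)
    return float(match[0]) if match else float("nan")


print(extract_score("<S1>Clear and complete.</S1><S2>4</S2>"))  # 4.0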
@@ -26,6 +26,8 @@ class EvaluationMetrics:
     FICTIONAL_CHARACTERS = "fictional_characters"
     LOGOS_AND_BRANDS = "logos_and_brands"
     XPIA = "xpia"
+    CODE_VULNERABILITY = "code_vulnerability"
+    UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
 
 
 class _InternalEvaluationMetrics:
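The class EvaluationMetrics context places this hunk in azure/ai/evaluation/_common/constants.py (+6 -2); the two new metric names line up with the new _code_vulnerability and _ungrounded_attributes evaluator packages in the file list.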
@@ -92,3 +94,8 @@ OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
 AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"
 
 OPENAI_TYPE: Literal["openai"] = "openai"
+
+EVALUATION_PASS_FAIL_MAPPING = {
+    True: "pass",
+    False: "fail",
+}
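The AZURE_OPENAI_TYPE/OPENAI_TYPE context places this final hunk in azure/ai/evaluation/_constants.py (+7 -0). The new mapping standardizes how boolean outcomes are rendered as result labels; a small sketch of the intended use (the thresholding below is hypothetical):

EVALUATION_PASS_FAIL_MAPPING = {
    True: "pass",
    False: "fail",
}

# Hypothetical post-processing: convert a thresholded score into the
# string label attached to evaluation results.
score, threshold = 4.0, 3.0
print(EVALUATION_PASS_FAIL_MAPPING[score >= threshold])  # "pass"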