azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +4 -4
- azure/ai/evaluation/_common/rai_service.py +4 -4
- azure/ai/evaluation/_common/utils.py +40 -25
- azure/ai/evaluation/_constants.py +13 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
- azure/ai/evaluation/_evaluate/_evaluate.py +88 -63
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
- azure/ai/evaluation/_evaluate/_utils.py +29 -22
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -86
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -79
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -85
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -88
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +17 -29
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -18
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -91
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_model_configurations.py +36 -8
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
- azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
- azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +166 -88
- azure/ai/evaluation/simulator/_tracing.py +21 -24
- azure/ai/evaluation/simulator/_utils.py +4 -1
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +144 -14
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +98 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -350
- azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -66
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +0 -97
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -1,82 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
 import os
-import
-from
-
-import numpy as np
-
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from promptflow.core import AsyncPrompty
-
-from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
-from ..._common.utils import (
-    check_and_add_api_version_for_aoai_model_config,
-    check_and_add_user_agent_for_aoai_model_config,
-)
+from typing import Optional
+from typing_extensions import override

-
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncGroundednessEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "groundedness.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-    def __init__(self, model_config: dict):
-        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        check_and_add_user_agent_for_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, "groundedness.prompty")
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

-
-        # Validate input parameters
-        response = str(response or "")
-        context = str(context or "")
-
-        if not response.strip() or not context.strip():
-            msg = "Both 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_groundedness": float(score)}
-
-
-class GroundednessEvaluator:
+class GroundednessEvaluator(PromptyEvaluatorBase):
     """
     Initialize a groundedness evaluator configured for a specific Azure OpenAI model.

@@ -103,21 +35,37 @@ class GroundednessEvaluator:
     }
     """

-
-
+    PROMPTY_FILE = "groundedness.prompty"
+    RESULT_KEY = "gpt_groundedness"

-
-
-
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs
+    ):
+        """Evaluate groundedless. Accepts either a response and context a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.

         :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword context: The context
-        :paramtype context: str
-        :
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
+        :return: The relevance score.
         :rtype: dict
         """
-        return
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(response=response, context=context, conversation=conversation, **kwargs)
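Both hunks above replace the hand-written async helper with the shared `PromptyEvaluatorBase`, and `__call__` now accepts either a single response/context pair or a whole conversation. A minimal sketch of the new call shape, assuming the top-level `GroundednessEvaluator` export; the endpoint, key, deployment, and example strings are placeholders, not taken from this diff:

```python
from azure.ai.evaluation import GroundednessEvaluator

# Placeholder Azure OpenAI model configuration; substitute real values.
model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

groundedness = GroundednessEvaluator(model_config)

# Single-turn: a response plus the context it should be grounded in.
single = groundedness(
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
)
print(single["gpt_groundedness"])  # RESULT_KEY defined in the hunk above

# Multi-turn: a conversation dict with "messages"; turns are dicts with
# "role", "content", and optionally "context" (shape taken from the new docstring).
multi = groundedness(
    conversation={
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris.", "context": "France's capital city is Paris."},
        ]
    }
)
```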
azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty

@@ -3,11 +3,6 @@ name: Groundedness
 description: Evaluates groundedness score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

@@ -1,55 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from
+from typing import Optional
+from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._common
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


-class
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The evaluation score computation based on the Content Safety metric (self.metric).
-        :rtype: Any
-        """
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
-        )
-        return result
-
-
-class ProtectedMaterialEvaluator:
+class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a protected material evaluator to detect whether protected material
     is present in your AI system's response. Outputs True or False with AI-generated reasoning.

@@ -58,7 +16,7 @@ class ProtectedMaterialEvaluator:
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
     :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
     :rtype: Dict[str, str]

@@ -84,21 +42,16 @@ class ProtectedMaterialEvaluator:
     }
     """

-
-
-
-
-
-
-
-
-
-
-
-
-
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        azure_ai_project: dict,
+        credential: Optional[dict] = None,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )
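With the async helper folded into `RaiServiceEvaluatorBase`, construction now takes the project scope plus an optional credential and the new `eval_last_turn` flag. A rough usage sketch; the `AzureAIProject` key names and the query/response call shape are assumed from the docstring and the removed `__call__`, not spelled out in this hunk:

```python
from azure.ai.evaluation import ProtectedMaterialEvaluator
from azure.identity import DefaultAzureCredential

# Placeholder project scope (subscription id, resource group, project name per the docstring).
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

protected_material = ProtectedMaterialEvaluator(
    azure_ai_project,
    credential=DefaultAzureCredential(),
    eval_last_turn=False,  # new knob forwarded to RaiServiceEvaluatorBase
)

result = protected_material(
    query="Write out the full lyrics of a famous song.",
    response="I can't share the full lyrics, but here is a short summary instead.",
)
print(result)  # a protected-material label plus AI-generated reasoning, per the docstring
```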
azure/ai/evaluation/_evaluators/_qa/_qa.py

@@ -3,7 +3,6 @@
 # ---------------------------------------------------------

 from concurrent.futures import as_completed
-from typing import Union

 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

@@ -11,7 +10,6 @@ from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
 from .._fluency import FluencyEvaluator
 from .._groundedness import GroundednessEvaluator
-from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator

@@ -52,9 +50,7 @@ class QAEvaluator:
     }
     """

-    def __init__(
-        self, model_config: dict, parallel: bool = True
-    ):
+    def __init__(self, model_config: dict, parallel: bool = True):
         self._parallel = parallel

         self._evaluators = [
@@ -88,12 +84,7 @@ class QAEvaluator:
             with ThreadPoolExecutor() as executor:
                 futures = {
                     executor.submit(
-                        evaluator,
-                        query=query,
-                        response=response,
-                        context=context,
-                        ground_truth=ground_truth,
-                        **kwargs
+                        evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
                     ): evaluator
                     for evaluator in self._evaluators
                 }
@@ -103,9 +94,7 @@ class QAEvaluator:
                     results.update(future.result())
         else:
             for evaluator in self._evaluators:
-                result = evaluator(
-                    query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                )
+                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
                 results.update(result)

         return results
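The composite QA evaluator keeps its keyword-only interface; the hunks above only tidy the constructor and the fan-out over the sub-evaluators. A short sketch of a call, with `model_config` the same placeholder Azure OpenAI dict as in the groundedness example and the result keys assumed from the sub-evaluators imported above:

```python
from azure.ai.evaluation import QAEvaluator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

# parallel=True fans the sub-evaluators out over the promptflow thread pool shown above.
qa = QAEvaluator(model_config, parallel=True)

result = qa(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)
# The merged dict carries one entry per sub-evaluator, e.g. gpt_groundedness,
# gpt_relevance, and the F1 score (exact keys are an assumption here).
```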
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -3,83 +3,13 @@
 # ---------------------------------------------------------

 import os
-import
-from
+from typing import Optional
+from typing_extensions import override

-
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from promptflow.core import AsyncPrompty

-
-from ..._common.utils import (
-    check_and_add_api_version_for_aoai_model_config,
-    check_and_add_user_agent_for_aoai_model_config,
-)
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncRelevanceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "relevance.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        check_and_add_user_agent_for_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        context = str(context or "")
-
-        if not (query.strip() and response.strip() and context.strip()):
-            msg = "'query', 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_relevance": float(score)}
-
-
-class RelevanceEvaluator:
+class RelevanceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a relevance evaluator configured for a specific Azure OpenAI model.

@@ -107,25 +37,41 @@ class RelevanceEvaluator:
     }
     """

-
-
+    # Constants must be defined within eval's directory to be save/loadable
+    PROMPTY_FILE = "relevance.prompty"
+    RESULT_KEY = "gpt_relevance"

-
-
-
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs
+    ):
+        """Evaluate relevance. Accepts either a response and context a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.

         :keyword query: The query to be evaluated.
-        :paramtype query: str
+        :paramtype query: Optional[str]
         :keyword response: The response to be evaluated.
-        :paramtype response: str
+        :paramtype response: Optional[str]
         :keyword context: The context to be evaluated.
-        :paramtype context: str
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
         :return: The relevance score.
         :rtype: dict
         """
-        return
-            self._async_evaluator, query=query, response=response, context=context, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
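RelevanceEvaluator gets the same PromptyEvaluatorBase treatment; the practical difference from the groundedness example is that single-turn calls also take a query. A sketch of both call styles, with the conversation shape lifted from the new docstring (content strings and model settings are placeholders):

```python
from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

relevance = RelevanceEvaluator(model_config)

# Single turn: query, response, and context.
single = relevance(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
)

# Multi-turn: "messages" plus an optional global "context" applied across turns.
multi = relevance(
    conversation={
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris.", "context": "France's capital city is Paris."},
        ],
        "context": "Geography FAQ excerpt.",
    }
)
print(single["gpt_relevance"], multi)
```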
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty

@@ -3,11 +3,6 @@ name: Relevance
 description: Evaluates relevance score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from .
+from ._retrieval import RetrievalEvaluator

 __all__ = [
-    "
+    "RetrievalEvaluator",
 ]
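Together with the file moves listed at the top ({_chat/retrieval → _retrieval}), this hunk renames the exported symbol to RetrievalEvaluator. A minimal import sketch against the new layout; whether the class is also re-exported from the package root is not shown in this diff:

```python
# 1.0.0b1: the retrieval evaluator lived under the chat evaluator package.
# 1.0.0b3: it has its own sub-package and is exported as RetrievalEvaluator.
from azure.ai.evaluation._evaluators._retrieval import RetrievalEvaluator
```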
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py

@@ -6,45 +6,32 @@ import json
 import logging
 import os
 import re
-from typing import Union

 import numpy as np
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

-
-from
-    check_and_add_api_version_for_aoai_model_config,
-    check_and_add_user_agent_for_aoai_model_config,
-)
+
+from ..._common.utils import construct_prompty_model_config

 logger = logging.getLogger(__name__)

 try:
-    from
+    from .._user_agent import USER_AGENT
 except ImportError:
     USER_AGENT = None


-class
+class _AsyncRetrievalScoreEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
     PROMPTY_FILE = "retrieval.prompty"
     LLM_CALL_TIMEOUT = 600
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

     def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        check_and_add_user_agent_for_aoai_model_config(
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )

@@ -92,7 +79,7 @@ class _AsyncRetrievalChatEvaluator:

             except Exception as e:  # pylint: disable=broad-exception-caught
                 logger.warning(
-
+                    "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
                 )

                 per_turn_scores.append(np.nan)
@@ -107,7 +94,7 @@ class _AsyncRetrievalChatEvaluator:
         }


-class
+class RetrievalEvaluator:
     """
     Initialize an evaluator configured for a specific Azure OpenAI model.

@@ -116,11 +103,12 @@ class RetrievalChatEvaluator:
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
+
     **Usage**

     .. code-block:: python

-        chat_eval =
+        chat_eval = RetrievalScoreEvaluator(model_config)
         conversation = [
             {"role": "user", "content": "What is the value of 2 + 2?"},
             {"role": "assistant", "content": "2 + 2 = 4", "context": {
@@ -136,18 +124,18 @@ class RetrievalChatEvaluator:

     .. code-block:: python

-
-
-
-
-
+        {
+            "gpt_retrieval": 3.0
+            "evaluation_per_turn": {
+                "gpt_retrieval": {
+                    "score": [1.0, 2.0, 3.0]
+                }
             }
         }
-        }
     """

     def __init__(self, model_config: dict):
-        self._async_evaluator =
+        self._async_evaluator = _AsyncRetrievalScoreEvaluator(model_config)

     def __call__(self, *, conversation, **kwargs):
         """Evaluates retrieval score chat scenario.
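The renamed RetrievalEvaluator keeps the list-of-turns conversation input and now documents an aggregate score plus a per-turn breakdown. A sketch based on the docstring fragments in the hunks above; the citations structure inside "context" is assumed, since the docstring is cut off at that point in this diff:

```python
from azure.ai.evaluation._evaluators._retrieval import RetrievalEvaluator

model_config = {  # placeholder Azure OpenAI settings
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

retrieval = RetrievalEvaluator(model_config)

conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {
        "role": "assistant",
        "content": "2 + 2 = 4",
        # Assumed retrieval-context shape; only the opening of this dict appears in the diff.
        "context": {"citations": [{"id": "math_doc.md", "content": "2 + 2 = 4"}]},
    },
]

result = retrieval(conversation=conversation)
# Per the updated docstring: an aggregate "gpt_retrieval" score plus an
# "evaluation_per_turn" breakdown with one score per turn.
print(result["gpt_retrieval"], result["evaluation_per_turn"])
```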
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty

@@ -3,11 +3,6 @@ name: Retrieval
 description: Evaluates retrieval score for Chat scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     top_p: 1.0
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -3,12 +3,13 @@
 # ---------------------------------------------------------
 from enum import Enum

+from promptflow._utils.async_utils import async_run_allowing_running_loop
 from rouge_score import rouge_scorer

-from
+from azure.core import CaseInsensitiveEnumMeta


-class RougeType(str, Enum):
+class RougeType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """
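Switching RougeType to azure.core's CaseInsensitiveEnumMeta makes name-based member lookups tolerant of casing. A small illustration of the metaclass behaviour; the member names and values below are assumed for the sketch, not copied from the hunk:

```python
from enum import Enum

from azure.core import CaseInsensitiveEnumMeta


class RougeType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    """Stand-in enum with assumed members, used only to show the metaclass behaviour."""

    ROUGE_1 = "rouge1"
    ROUGE_L = "rougeL"


# Name-based lookups now ignore case, which is the point of the metaclass.
assert RougeType["rouge_1"] is RougeType.ROUGE_1
assert RougeType["Rouge_L"] is RougeType.ROUGE_L
```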
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -4,19 +4,14 @@

 import os
 import re
-from typing import Union

 import numpy as np
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from promptflow.core import AsyncPrompty

-from
-
-
-    check_and_add_user_agent_for_aoai_model_config,
-)
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+from ..._common.utils import construct_prompty_model_config

 try:
     from ..._user_agent import USER_AGENT
@@ -31,17 +26,9 @@ class _AsyncSimilarityEvaluator:
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

     def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        check_and_add_user_agent_for_aoai_model_config(
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )

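Both this hunk and the retrieval one above replace the hand-built prompty configuration with the new construct_prompty_model_config helper from _common/utils.py, which this diff only shows at its call sites. A hypothetical reconstruction of the behaviour it centralizes, pieced together from the removed call-site code; this is not the library's actual implementation:

```python
def construct_prompty_model_config(model_config: dict, default_api_version: str, user_agent: str) -> dict:
    """Hypothetical sketch of the shared helper, inferred from the code it replaced."""
    # Old call sites defaulted the Azure OpenAI api_version before building the prompty config.
    model_config.setdefault("api_version", default_api_version)

    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

    # Work around "RuntimeError: Event loop is closed" from httpx AsyncClient
    # (https://github.com/encode/httpx/discussions/2959) by closing the connection per request.
    prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

    # Old call sites also attached the SDK user agent string (header name assumed here).
    if user_agent:
        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})

    return prompty_model_config
```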