azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of azure-ai-evaluation has been flagged as a potentially problematic release; details are available on the registry page.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +201 -16
- azure/ai/evaluation/_constants.py +12 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
- azure/ai/evaluation/_exceptions.py +9 -7
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +37 -9
- azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +127 -117
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py:

@@ -1,64 +1,22 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
 
-class _AsyncProtectedMaterialEvaluator:
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The evaluation score computation based on the Content Safety metric (self.metric).
-        :rtype: Any
-        """
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
-        )
-        return result
-
-
-class ProtectedMaterialEvaluator:
+class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a protected material evaluator to detect whether protected material
     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
     :rtype: Dict[str, str]
 
@@ -84,21 +42,16 @@ class ProtectedMaterialEvaluator:
        }
    """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )
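The protected material evaluator above now inherits from the shared RaiServiceEvaluatorBase instead of wrapping its own async helper, and credential becomes the first, required constructor argument. A minimal usage sketch of the new shape, assuming the class is exported from the package root and that the azure_ai_project dict uses the field names implied by its docstring; the result keys are not shown in this diff:

    # Sketch only: dict keys and result fields below are assumptions, not taken from this diff.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    protected_material = ProtectedMaterialEvaluator(
        credential=DefaultAzureCredential(),  # credential is now required and listed first
        azure_ai_project=azure_ai_project,
    )

    # The base class drives the call against the RAI service; the evaluator is invoked synchronously.
    result = protected_material(
        query="Print the full lyrics of a popular song.",
        response="Sorry, I can't reproduce copyrighted lyrics.",
    )
    print(result)  # expected: a label plus AI-generated reasoning, per the class docstring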
azure/ai/evaluation/_evaluators/_qa/_qa.py:

@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 
 from concurrent.futures import as_completed
+from typing import Callable, Dict, List
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
@@ -53,7 +54,7 @@ class QAEvaluator:
     def __init__(self, model_config: dict, parallel: bool = True):
         self._parallel = parallel
 
-        self._evaluators = [
+        self._evaluators: List[Callable[..., Dict[str, float]]] = [
             GroundednessEvaluator(model_config),
             RelevanceEvaluator(model_config),
             CoherenceEvaluator(model_config),
@@ -77,9 +78,9 @@ class QAEvaluator:
         :keyword parallel: Whether to evaluate in parallel. Defaults to True.
         :paramtype parallel: bool
         :return: The scores for QA scenario.
-        :rtype:
+        :rtype: Dict[str, float]
         """
-        results = {}
+        results: Dict[str, float] = {}
         if self._parallel:
             with ThreadPoolExecutor() as executor:
                 futures = {
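The QAEvaluator hunks above only touch typing, but for orientation: its __call__ fans the listed sub-evaluators out over a thread pool and merges their per-metric dictionaries. A self-contained sketch of that pattern, using the standard-library executor and stand-in evaluator functions rather than promptflow's ThreadPoolExecutorWithContext and the real evaluator classes:

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import Callable, Dict, List

    # Stand-ins for GroundednessEvaluator, RelevanceEvaluator, etc.
    def fake_groundedness(**kwargs) -> Dict[str, float]:
        return {"gpt_groundedness": 4.0}

    def fake_relevance(**kwargs) -> Dict[str, float]:
        return {"gpt_relevance": 5.0}

    evaluators: List[Callable[..., Dict[str, float]]] = [fake_groundedness, fake_relevance]

    results: Dict[str, float] = {}
    with ThreadPoolExecutor() as executor:
        # Submit every evaluator with the same inputs, then merge results as they finish.
        futures = {executor.submit(ev, query="q", response="r", context="c"): ev for ev in evaluators}
        for future in as_completed(futures):
            results.update(future.result())

    print(results)  # {'gpt_groundedness': 4.0, 'gpt_relevance': 5.0}, in completion order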
azure/ai/evaluation/_evaluators/_relevance/_relevance.py:

@@ -3,78 +3,14 @@
 # ---------------------------------------------------------
 
 import os
-import re
+from typing import Optional
 
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from typing_extensions import override
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
 
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncRelevanceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "relevance.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        context = str(context or "")
-
-        if not (query.strip() and response.strip() and context.strip()):
-            msg = "'query', 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_relevance": float(score)}
-
-
-class RelevanceEvaluator:
+class RelevanceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a relevance evaluator configured for a specific Azure OpenAI model.
 
@@ -102,25 +38,41 @@ class RelevanceEvaluator:
        }
    """
 
-
-
+    # Constants must be defined within eval's directory to be save/loadable
+    PROMPTY_FILE = "relevance.prompty"
+    RESULT_KEY = "gpt_relevance"
 
-
-
-
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Evaluate relevance. Accepts either a response and context a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
 
         :keyword query: The query to be evaluated.
-        :paramtype query: str
+        :paramtype query: Optional[str]
         :keyword response: The response to be evaluated.
-        :paramtype response: str
+        :paramtype response: Optional[str]
        :keyword context: The context to be evaluated.
-        :paramtype context: str
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
        :return: The relevance score.
-        :rtype:
+        :rtype: Dict[str, float]
        """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, context=context, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
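A hedged usage sketch of the reworked RelevanceEvaluator: per the new signature and docstring it accepts either query/response/context for a single turn, or a conversation whose turns sit under "messages", with per-turn results aggregated. The model_config keys below are placeholders in the Azure OpenAI configuration shape used elsewhere in the package:

    from azure.ai.evaluation import RelevanceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment>",
    }

    relevance = RelevanceEvaluator(model_config)

    # Single-turn evaluation: pass query, response, and context directly.
    single = relevance(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context="Tokyo is the capital of Japan.",
    )

    # Multi-turn evaluation: pass a conversation; turns live under "messages".
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of Japan?"},
            {"role": "assistant", "content": "Tokyo.", "context": "Tokyo is the capital of Japan."},
        ]
    }
    aggregated = relevance(conversation=conversation)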
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty:

@@ -3,11 +3,6 @@ name: Relevance
 description: Evaluates relevance score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
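With the configuration block gone, the prompty no longer resolves its Azure OpenAI connection from AZURE_* environment-variable placeholders; the connection is injected when the prompty is loaded, as the Python hunks above show. A rough sketch of that flow with placeholder values (the configuration shape mirrors the removed block):

    from promptflow.core import AsyncPrompty

    # Shape taken from the removed inline code: a "configuration" section plus extra headers.
    prompty_model_config = {
        "configuration": {
            "type": "azure_openai",
            "azure_deployment": "<deployment>",
            "api_key": "<api-key>",
            "azure_endpoint": "https://<resource>.openai.azure.com",
        },
        "parameters": {"extra_headers": {"Connection": "close"}},
    }

    # The evaluators load their .prompty next to the module and await the returned flow.
    flow = AsyncPrompty.load(source="relevance.prompty", model=prompty_model_config)
    # llm_output = await flow(query=..., response=..., context=..., timeout=600)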
azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py:

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._chat import ChatEvaluator
+from ._retrieval import RetrievalEvaluator
 
 __all__ = [
-    "ChatEvaluator",
+    "RetrievalEvaluator",
 ]
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py:

@@ -4,41 +4,37 @@
 
 import json
 import logging
+import math
 import os
 import re
+from typing import Union
 
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty
 
-from
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+
+from ..._common.math import list_mean_nan_safe
+from ..._common.utils import construct_prompty_model_config, validate_model_config
 
 logger = logging.getLogger(__name__)
 
 try:
-    from
+    from .._user_agent import USER_AGENT
 except ImportError:
-    USER_AGENT = None
+    USER_AGENT = "None"
 
 
-class _AsyncRetrievalChatEvaluator:
+class _AsyncRetrievalScoreEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
     PROMPTY_FILE = "retrieval.prompty"
     LLM_CALL_TIMEOUT = 600
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+    def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]):
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-            prompty_model_config,
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
@@ -76,7 +72,7 @@ class _AsyncRetrievalChatEvaluator:
             llm_output = await self._flow(
                 query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
             )
-            score = np.nan
+            score = math.nan
             if llm_output:
                 parsed_score_response = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
                 if len(parsed_score_response) > 0:
@@ -89,10 +85,10 @@ class _AsyncRetrievalChatEvaluator:
                 "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
             )
 
-            per_turn_scores.append(np.nan)
+            per_turn_scores.append(math.nan)
 
         return {
-            "gpt_retrieval": np.nanmean(per_turn_scores),
+            "gpt_retrieval": list_mean_nan_safe(per_turn_scores),
             "evaluation_per_turn": {
                 "gpt_retrieval": {
                     "score": per_turn_scores,
@@ -101,7 +97,7 @@ class _AsyncRetrievalChatEvaluator:
        }
 
 
-class RetrievalChatEvaluator:
+class RetrievalEvaluator:
     """
     Initialize an evaluator configured for a specific Azure OpenAI model.
 
@@ -110,11 +106,12 @@ class RetrievalChatEvaluator:
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
+
     **Usage**
 
     .. code-block:: python
 
-        chat_eval = RetrievalChatEvaluator(model_config)
+        chat_eval = RetrievalScoreEvaluator(model_config)
         conversation = [
             {"role": "user", "content": "What is the value of 2 + 2?"},
             {"role": "assistant", "content": "2 + 2 = 4", "context": {
@@ -130,18 +127,18 @@ class RetrievalChatEvaluator:
 
     .. code-block:: python
 
-
-
-
-
-
+        {
+            "gpt_retrieval": 3.0
+            "evaluation_per_turn": {
+                "gpt_retrieval": {
+                    "score": [1.0, 2.0, 3.0]
+                }
            }
        }
-    }
    """
 
     def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncRetrievalChatEvaluator(model_config)
+        self._async_evaluator = _AsyncRetrievalScoreEvaluator(validate_model_config(model_config))
 
     def __call__(self, *, conversation, **kwargs):
         """Evaluates retrieval score chat scenario.
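The per-turn aggregation now goes through list_mean_nan_safe from the new _common/math.py, whose body is not part of this diff. A sketch of what a NaN-safe list mean plausibly looks like; the empty-input behavior here is an assumption:

    import math
    from typing import List

    def list_mean_nan_safe(values: List[float]) -> float:
        """Average the non-NaN entries of a list; NaN when nothing usable remains (assumed behavior)."""
        usable = [v for v in values if not math.isnan(v)]
        if not usable:
            return math.nan
        return sum(usable) / len(usable)

    print(list_mean_nan_safe([1.0, 2.0, math.nan, 3.0]))  # 2.0
    print(list_mean_nan_safe([math.nan]))                 # nan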
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty:

@@ -3,11 +3,6 @@ name: Retrieval
 description: Evaluates retrieval score for Chat scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     top_p: 1.0
azure/ai/evaluation/_evaluators/_similarity/_similarity.py:

@@ -2,21 +2,23 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import math
 import os
 import re
+from typing import Union
 
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
+from ..._common.utils import construct_prompty_model_config, validate_model_config
 
 try:
     from ..._user_agent import USER_AGENT
 except ImportError:
-    USER_AGENT = None
+    USER_AGENT = "None"
 
 
 class _AsyncSimilarityEvaluator:
@@ -25,18 +27,10 @@ class _AsyncSimilarityEvaluator:
     LLM_CALL_TIMEOUT = 600
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+    def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]):
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-            prompty_model_config,
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
@@ -45,6 +39,18 @@ class _AsyncSimilarityEvaluator:
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
     async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+        """
+        Evaluate similarity.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The similarity score.
+        :rtype: Dict[str, float]
+        """
         # Validate input parameters
         query = str(query or "")
         response = str(response or "")
@@ -65,7 +71,7 @@ class _AsyncSimilarityEvaluator:
             query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
         )
 
-        score = np.nan
+        score = math.nan
         if llm_output:
             match = re.search(r"\d", llm_output)
             if match:
@@ -102,7 +108,7 @@ class SimilarityEvaluator:
     """
 
     def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
+        self._async_evaluator = _AsyncSimilarityEvaluator(validate_model_config(model_config))
 
     def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
         """
@@ -115,7 +121,7 @@ class SimilarityEvaluator:
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
         :return: The similarity score.
-        :rtype:
+        :rtype: Dict[str, float]
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
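Like the relevance and retrieval evaluators, this file now delegates its prompty setup to construct_prompty_model_config, whose implementation is not shown here. The sketch below simply reassembles the inline logic it replaces: default the API version, wrap the config, force Connection: close to dodge httpx's closed-event-loop error, and attach a user agent. Parameter names and the user-agent header key are assumptions:

    from typing import Any, Dict, Optional

    def construct_prompty_model_config(
        model_config: Dict[str, Any],
        default_api_version: str,
        user_agent: Optional[str],
    ) -> Dict[str, Any]:
        # Assumed equivalent of the removed ensure_api_version_in_aoai_model_config call.
        model_config.setdefault("api_version", default_api_version)

        # Wrap the config the way the removed inline code did, keeping "Connection: close"
        # as a workaround for httpx's "Event loop is closed" issue.
        prompty_model_config: Dict[str, Any] = {
            "configuration": model_config,
            "parameters": {"extra_headers": {"Connection": "close"}},
        }

        # Header name is an assumption; the old helper only applied it to Azure OpenAI configs.
        if user_agent:
            prompty_model_config["parameters"]["extra_headers"]["x-ms-useragent"] = user_agent
        return prompty_model_config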
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty:

@@ -3,11 +3,6 @@ name: Similarity
 description: Evaluates similarity score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
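To close out the similarity changes, a short usage sketch of SimilarityEvaluator following the keyword arguments documented above; the model_config values are placeholders and the exact result key is not shown in this diff:

    from azure.ai.evaluation import SimilarityEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment>",
    }

    similarity = SimilarityEvaluator(model_config)
    result = similarity(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is the capital of Japan.",
    )
    print(result)  # a Dict[str, float] with the similarity score, per the docstring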
|