azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/utils.py +24 -9
- azure/ai/evaluation/_constants.py +4 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -81
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +16 -22
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -90
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_model_configurations.py +36 -8
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
- azure/ai/evaluation/simulator/_simulator.py +19 -8
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +59 -1
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD +38 -39
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_relevance/_relevance.py CHANGED

@@ -3,78 +3,13 @@
 # ---------------------------------------------------------
 
 import os
-import
+from typing import Optional
+from typing_extensions import override
 
-
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
-
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncRelevanceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "relevance.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        context = str(context or "")
-
-        if not (query.strip() and response.strip() and context.strip()):
-            msg = "'query', 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_relevance": float(score)}
-
-
-class RelevanceEvaluator:
+class RelevanceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a relevance evaluator configured for a specific Azure OpenAI model.
 

@@ -102,25 +37,41 @@ class RelevanceEvaluator:
     }
     """
 
-
-
+    # Constants must be defined within eval's directory to be save/loadable
+    PROMPTY_FILE = "relevance.prompty"
+    RESULT_KEY = "gpt_relevance"
 
-
-
-
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs
+    ):
+        """Evaluate relevance. Accepts either a response and context a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
 
         :keyword query: The query to be evaluated.
-        :paramtype query: str
+        :paramtype query: Optional[str]
         :keyword response: The response to be evaluated.
-        :paramtype response: str
+        :paramtype response: Optional[str]
        :keyword context: The context to be evaluated.
-        :paramtype context: str
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
         :return: The relevance score.
         :rtype: dict
         """
-        return
-            self._async_evaluator, query=query, response=response, context=context, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
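For context, a minimal usage sketch of the refactored RelevanceEvaluator, based only on the new __init__ and __call__ signatures shown in the hunks above; the endpoint, deployment, key, and conversation values are placeholders, and the exact conversation/context shapes are assumptions drawn from the docstring added in this diff:

from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    "type": "azure_openai",
    "azure_deployment": "<deployment-name>",              # placeholder
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
}

relevance = RelevanceEvaluator(model_config)

# Single-turn call, unchanged from 1.0.0b2: pass query, response, and context.
single_turn = relevance(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
)

# New in 1.0.0b3: pass a conversation dict instead; per-turn results are aggregated.
multi_turn = relevance(
    conversation={
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris.", "context": "France's capital city is Paris."},
        ]
    }
)

print(single_turn, multi_turn)  # each returns a dict keyed by "gpt_relevance"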
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty CHANGED

@@ -3,11 +3,6 @@ name: Relevance
 description: Evaluates relevance score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py CHANGED

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from .
+from ._retrieval import RetrievalEvaluator
 
 __all__ = [
-    "
+    "RetrievalEvaluator",
 ]
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py CHANGED

@@ -11,34 +11,27 @@ import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty
 
-
+
+from ..._common.utils import construct_prompty_model_config
 
 logger = logging.getLogger(__name__)
 
 try:
-    from
+    from .._user_agent import USER_AGENT
 except ImportError:
     USER_AGENT = None
 
 
-class
+class _AsyncRetrievalScoreEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
     PROMPTY_FILE = "retrieval.prompty"
     LLM_CALL_TIMEOUT = 600
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
     def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 

@@ -101,7 +94,7 @@ class _AsyncRetrievalChatEvaluator:
     }
 
 
-class
+class RetrievalEvaluator:
     """
     Initialize an evaluator configured for a specific Azure OpenAI model.
 

@@ -110,11 +103,12 @@ class RetrievalChatEvaluator:
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
+
     **Usage**
 
     .. code-block:: python
 
-        chat_eval =
+        chat_eval = RetrievalScoreEvaluator(model_config)
         conversation = [
            {"role": "user", "content": "What is the value of 2 + 2?"},
            {"role": "assistant", "content": "2 + 2 = 4", "context": {

@@ -130,18 +124,18 @@ class RetrievalChatEvaluator:
 
     .. code-block:: python
 
-
-
-
-
-
+        {
+            "gpt_retrieval": 3.0
+            "evaluation_per_turn": {
+                "gpt_retrieval": {
+                    "score": [1.0, 2.0, 3.0]
+                }
             }
         }
-    }
     """
 
     def __init__(self, model_config: dict):
-        self._async_evaluator =
+        self._async_evaluator = _AsyncRetrievalScoreEvaluator(model_config)
 
     def __call__(self, *, conversation, **kwargs):
         """Evaluates retrieval score chat scenario.
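A usage sketch for the renamed evaluator, assembled from the truncated usage block above; the import path follows the new _retrieval package shown in this diff, the model_config shape matches the relevance example earlier, and the citation structure inside "context" is an assumption since the original example is cut off:

from azure.ai.evaluation._evaluators._retrieval import RetrievalEvaluator

retrieval = RetrievalEvaluator(model_config)  # same model_config shape as the relevance example above
conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {
        "role": "assistant",
        "content": "2 + 2 = 4",
        "context": {"citations": [{"id": "math_doc.md", "content": "2 + 2 = 4"}]},  # assumed shape
    },
]
result = retrieval(conversation=conversation)
# Per the docstring above, the result carries an aggregate "gpt_retrieval" score
# plus an "evaluation_per_turn" breakdown.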
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty CHANGED

@@ -3,11 +3,6 @@ name: Retrieval
 description: Evaluates retrieval score for Chat scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     top_p: 1.0
azure/ai/evaluation/_evaluators/_similarity/_similarity.py CHANGED

@@ -11,7 +11,7 @@ from promptflow.core import AsyncPrompty
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
-from ..._common.utils import
+from ..._common.utils import construct_prompty_model_config
 
 try:
     from ..._user_agent import USER_AGENT

@@ -26,17 +26,9 @@ class _AsyncSimilarityEvaluator:
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
     def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
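The similarity and retrieval hunks both replace the same inline plumbing with a shared construct_prompty_model_config helper from _common/utils.py (which also changed in this release). The following is only a sketch of what that helper plausibly consolidates, reconstructed from the inline code it replaces; the real implementation may differ, and the x-ms-useragent header name is an assumption:

from typing import Optional


def construct_prompty_model_config(model_config: dict, default_api_version: str, user_agent: Optional[str]) -> dict:
    # Sketch only: fall back to the evaluator's default API version when none is given.
    model_config.setdefault("api_version", default_api_version)

    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

    # Work around "RuntimeError: Event loop is closed" from httpx AsyncClient
    # (https://github.com/encode/httpx/discussions/2959), as the removed inline code did.
    prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

    # The removed ensure_user_agent_in_aoai_model_config call attached the SDK user agent
    # for Azure OpenAI configurations; the header name used here is assumed.
    if user_agent and model_config.get("type") == "azure_openai":
        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})

    return prompty_model_config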
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty CHANGED

@@ -3,11 +3,6 @@ name: Similarity
 description: Evaluates similarity score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/_xpia/xpia.py CHANGED

@@ -2,18 +2,17 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
-from typing import Optional
-
-from promptflow._utils.async_utils import async_run_allowing_running_loop
 
+from typing import Optional
+from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._common
-
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
 
 logger = logging.getLogger(__name__)
 
 
-class IndirectAttackEvaluator:
+class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator.
 
     Detect whether cross domain injected attacks are present in your AI system's response.

@@ -25,7 +24,7 @@ class IndirectAttackEvaluator:
         focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
     :type eval_last_turn: bool
     :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: A function that evaluates and generates metrics for XPIA chat scenario. Metrics include the overall
         evaluation label and reason for the Q/A Pair, as well as sub-labels for manipulated content, intrusion, and
         information.

@@ -51,89 +50,16 @@ class IndirectAttackEvaluator:
     }
     """
 
-
-
-        self._eval_last_turn = eval_last_turn
-
-    def __call__(
+    @override
+    def __init__(
         self,
-
-
-
-        **kwargs,
+        azure_ai_project: dict,
+        credential: Optional[dict] = None,
+        eval_last_turn: bool = False,
     ):
-
-
-
-
-
-        :paramtype query: Optional[str]
-        :keyword response: The response to be evaluated. Mutually exclusive with 'conversation'.
-        :paramtype response: Optional[str]
-        :return: The evaluation scores and reasoning.
-        :rtype: dict
-        """
-
-        return self._evaluator(query=query, response=response, **kwargs)
-
-
-class _AsyncIndirectAttackEvaluator:
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The evaluation score computation based on the metric (self.metric).
-        :rtype: Any
-        """
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.INDIRECT_ATTACK_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=EvaluationMetrics.XPIA,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
+        super().__init__(
+            eval_metric=EvaluationMetrics.XPIA,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
         )
-        return result
-
-
-class _IndirectAttackEvaluator:
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._async_evaluator = _AsyncIndirectAttackEvaluator(azure_ai_project, credential)
-
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates XPIA content.
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword context: The context to be evaluated.
-        :paramtype context: str
-        :return: The XPIA score.
-        :rtype: dict
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
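A constructor sketch matching the new __init__ signature above; the project values are placeholders, and the assumption is that the inherited __call__ from RaiServiceEvaluatorBase still accepts the query/response keywords the removed wrapper used:

from azure.ai.evaluation import IndirectAttackEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

xpia = IndirectAttackEvaluator(
    azure_ai_project,
    credential=DefaultAzureCredential(),
    eval_last_turn=True,
)

result = xpia(query="<user query>", response="<assistant response>")
# result carries the overall XPIA label and reason plus sub-labels for
# manipulated content, intrusion, and information gathering (per the docstring above).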
azure/ai/evaluation/_exceptions.py CHANGED

@@ -54,7 +54,6 @@ class ErrorTarget(Enum):
     EVAL_RUN = "EvalRun"
     CODE_CLIENT = "CodeClient"
     RAI_CLIENT = "RAIClient"
-    CHAT_EVALUATOR = "ChatEvaluator"
     COHERENCE_EVALUATOR = "CoherenceEvaluator"
     CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
     ECI_EVALUATOR = "ECIEvaluator"
azure/ai/evaluation/_model_configurations.py CHANGED

@@ -2,26 +2,54 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import TypedDict
+from typing import Dict, Literal, TypedDict
 
+from typing_extensions import NotRequired
 
-
+
+class AzureOpenAIModelConfiguration(TypedDict, total=False):
+    """Model Configuration for Azure OpenAI Model"""
+
+    type: Literal["azure_openai"]
+    """The type of the model configuration. Should be 'azure_openai' for AzureOpenAIModelConfiguration"""
     azure_deployment: str
+    """Name of Azure OpenAI deployment to make request to"""
     azure_endpoint: str
+    """Endpoint of Azure OpenAI resource to make request to"""
     api_key: str
+    """API key of Azure OpenAI resource"""
+    api_version: NotRequired[str]
+    """(Optional) API version to use in request to Azure OpenAI deployment"""
 
 
-class
-
+class OpenAIModelConfiguration(TypedDict, total=False):
+    """Model Configuration for OpenAI Model"""
 
-
-
+    type: Literal["openai"]
+    """The type of the model configuration. Should be 'openai' for OpenAIModelConfiguration"""
     api_key: str
-
-
+    "API key needed to make request to model"
+    model: str
+    """Name of model to be used in OpenAI request"""
+    base_url: NotRequired[str]
+    """(Optional) Base URL to be used in OpenAI request"""
+    organization: NotRequired[str]
+    """(Optional) OpenAI organization"""
 
 
 class AzureAIProject(TypedDict):
+    """Azure AI Project Information"""
+
     subscription_id: str
+    """Azure subscription id of the project"""
     resource_group_name: str
+    """Azure resource group name of the project"""
     project_name: str
+    """Azure project name"""
+
+
+class EvaluatorConfig(TypedDict, total=False):
+    """Configuration for an evaluator"""
+
+    column_mapping: Dict[str, str]
+    """Dictionary mapping evaluator input name to column in data"""
azure/ai/evaluation/_version.py CHANGED

azure/ai/evaluation/simulator/_prompty/task_query_response.prompty CHANGED

@@ -33,7 +33,8 @@ Answer must not be more than 5 words
 Answer must be picked from Text as is
 Question should be as descriptive as possible and must include as much context as possible from Text
 Output must always have the provided number of QnAs
-Output must be in JSON format
+Output must be in JSON format.
+Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
 Text:
 <|text_start|>
 On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
azure/ai/evaluation/simulator/_simulator.py CHANGED

@@ -41,7 +41,7 @@ class Simulator:
         """
         self._validate_project_config(azure_ai_project)
         self.azure_ai_project = azure_ai_project
-        self.azure_ai_project["api_version"] = "2024-
+        self.azure_ai_project["api_version"] = "2024-06-01"
         self.credential = credential
 
     @staticmethod

@@ -129,7 +129,6 @@ class Simulator:
         max_conversation_turns *= 2  # account for both user and assistant turns
 
         prompty_model_config = self._build_prompty_model_config()
-
         if conversation_turns:
             return await self._simulate_with_predefined_turns(
                 target=target,

@@ -234,8 +233,16 @@ class Simulator:
                 target=target,
                 progress_bar=progress_bar,
             )
-
-
+            simulated_conversations.append(
+                JsonLineChatProtocol(
+                    {
+                        "messages": current_simulation.to_list(),
+                        "finish_reason": ["stop"],
+                        "context": {},
+                        "$schema": "http://azureml/sdk-2-0/ChatConversation.json",
+                    }
+                )
+            )
 
         progress_bar.close()
         return simulated_conversations

@@ -280,7 +287,9 @@ class Simulator:
 
             while len(current_simulation) < max_conversation_turns:
                 user_response_content = user_flow(
-                    task="Continue the conversation",
+                    task="Continue the conversation",
+                    conversation_history=current_simulation.to_list(),
+                    **user_simulator_prompty_kwargs,
                 )
                 user_response = self._parse_prompty_response(response=user_response_content)
                 user_turn = Turn(role=ConversationRole.USER, content=user_response["content"])

@@ -317,6 +326,7 @@ class Simulator:
         resource_name = "task_simulate.prompty"
         try:
             # Access the resource as a file path
+            # pylint: disable=deprecated-method
             with pkg_resources.path(package, resource_name) as prompty_path:
                 return load_flow(source=str(prompty_path), model=prompty_model_config)
         except FileNotFoundError as e:

@@ -398,7 +408,6 @@ class Simulator:
             prompty_model_config=prompty_model_config,
             query_response_generating_prompty_kwargs=query_response_generating_prompty_kwargs,
         )
-
         try:
             query_responses = query_flow(text=text, num_queries=num_queries)
             if isinstance(query_responses, dict):

@@ -432,6 +441,7 @@ class Simulator:
         resource_name = "task_query_response.prompty"
         try:
             # Access the resource as a file path
+            # pylint: disable=deprecated-method
             with pkg_resources.path(package, resource_name) as prompty_path:
                 return load_flow(source=str(prompty_path), model=prompty_model_config)
         except FileNotFoundError as e:

@@ -612,9 +622,10 @@ class Simulator:
             prompty_model_config=self._build_prompty_model_config(),
             user_simulator_prompty_kwargs=user_simulator_prompty_kwargs,
        )
-
         try:
-            response_content = user_flow(
+            response_content = user_flow(
+                task=task, conversation_history=conversation_history, **user_simulator_prompty_kwargs
+            )
             user_response = self._parse_prompty_response(response=response_content)
             return user_response["content"]
         except Exception as e: