azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
# noqa: E402
|
|
5
|
+
|
|
6
|
+
import copy
|
|
7
|
+
import logging
|
|
8
|
+
import time
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
|
11
|
+
|
|
12
|
+
import jinja2
|
|
13
|
+
|
|
14
|
+
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
15
|
+
from azure.ai.evaluation._http_utils import AsyncHttpPipeline
|
|
16
|
+
|
|
17
|
+
from .._model_tools import LLMBase, OpenAIChatCompletionsModel
|
|
18
|
+
from .._model_tools._template_handler import TemplateParameters
|
|
19
|
+
from .constants import ConversationRole
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ConversationTurn:
    """Class to represent a turn in a conversation.

    A "turn" involves only one exchange between the user and the chatbot.

    :param role: The role of the participant in the conversation. Accepted values are
        "user" and "assistant".
    :type role: ~azure.ai.evaluation.simulator._conversation.constants.ConversationRole
    :param name: The name of the participant in the conversation.
    :type name: Optional[str]
    :param message: The message exchanged in the conversation. Defaults to an empty string.
    :type message: str
    :param full_response: The full response.
    :type full_response: Optional[Dict[str, Any]]
    :param request: The request.
    :type request: Optional[Any]
    """

    role: "ConversationRole"
    name: Optional[str] = None
    message: str = ""
    full_response: Optional[Dict[str, Any]] = None
    request: Optional[Any] = None

    def to_openai_chat_format(self, reverse: bool = False) -> Dict[str, str]:
        """Convert the conversation turn to the OpenAI chat format.

        OpenAI chat format is a dictionary with two keys: "role" and "content".

        :param reverse: Whether to swap the role of the turn (an assistant turn is emitted as a
            user turn and any other turn is emitted as an assistant turn). Defaults to False.
        :type reverse: bool
        :return: The conversation turn in the OpenAI chat format.
        :rtype: Dict[str, str]
        """
        # Truthiness (not identity with False) so that falsy values such as 0 or None
        # are not mistaken for a request to reverse the roles.
        if not reverse:
            return {"role": self.role.value, "content": self.message}

        if self.role == ConversationRole.ASSISTANT:
            swapped_role = ConversationRole.USER
        else:
            swapped_role = ConversationRole.ASSISTANT
        return {"role": swapped_role.value, "content": self.message}

    def to_annotation_format(self, turn_number: int) -> Dict[str, Any]:
        """Convert the conversation turn to an annotation format.

        Annotation format is a dictionary with the following keys:
        - "turn_number": The turn number.
        - "response": The message of this turn.
        - "actor": The participant name, or the role value when no name is set.
        - "request": The request.
        - "full_json_response": The full response.

        :param turn_number: The turn number.
        :type turn_number: int
        :return: The conversation turn in the annotation format.
        :rtype: Dict[str, Any]
        """
        actor = self.role.value if self.name is None else self.name
        return {
            "turn_number": turn_number,
            "response": self.message,
            "actor": actor,
            "request": self.request,
            "full_json_response": self.full_response,
        }

    def __str__(self) -> str:
        return f"({self.role.value}): {self.message}"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class ConversationBot:
    """
    A conversation chat bot with a specific name, persona and a sentence that can be used as a conversation starter.

    :param role: The role of the bot in the conversation, either "user" or "assistant".
    :type role: ~azure.ai.evaluation.simulator._conversation.constants.ConversationRole
    :param model: The LLM model to use for generating responses.
    :type model: Union[
        ~azure.ai.evaluation.simulator._model_tools.LLMBase,
        ~azure.ai.evaluation.simulator._model_tools.OpenAIChatCompletionsModel
    ]
    :param conversation_template: A Jinja2 template describing the conversation to generate the prompt for the LLM
    :type conversation_template: str
    :param instantiation_parameters: A dictionary of parameters used to instantiate the conversation template
    :type instantiation_parameters: ~azure.ai.evaluation.simulator._model_tools._template_handler.TemplateParameters
    """

    def __init__(
        self,
        *,
        role: ConversationRole,
        model: Union[LLMBase, OpenAIChatCompletionsModel],
        conversation_template: str,
        instantiation_parameters: TemplateParameters,
    ) -> None:
        self.role = role
        self.conversation_template_orig = conversation_template
        self.conversation_template: jinja2.Template = jinja2.Template(
            conversation_template, undefined=jinja2.StrictUndefined
        )
        self.persona_template_args = instantiation_parameters
        # The display name comes from the template parameters ("name" for the user bot,
        # "chatbot_name" otherwise), falling back to the role value.
        if self.role == ConversationRole.USER:
            self.name: str = cast(str, self.persona_template_args.get("name", role.value))
        else:
            self.name = cast(str, self.persona_template_args.get("chatbot_name", role.value)) or model.name
        self.model = model

        # repr(self) reads name, role and model, so the logger is created after they are set.
        self.logger = logging.getLogger(repr(self))
        self.conversation_starter: Optional[Union[str, jinja2.Template, Dict]] = None
        if role == ConversationRole.USER:
            if "conversation_starter" in self.persona_template_args:
                conversation_starter_content = self.persona_template_args["conversation_starter"]
                if isinstance(conversation_starter_content, dict):
                    # Dictionaries are kept as-is and returned verbatim on the first turn.
                    self.conversation_starter = conversation_starter_content
                else:
                    try:
                        self.conversation_starter = jinja2.Template(
                            conversation_starter_content, undefined=jinja2.StrictUndefined
                        )
                    except jinja2.exceptions.TemplateSyntaxError:  # noqa: F841
                        # Not valid Jinja2 syntax: use the raw text as the starter.
                        self.conversation_starter = conversation_starter_content
            else:
                self.logger.info(
                    "This simulated bot will generate the first turn as no conversation starter is provided"
                )

    async def generate_response(
        self,
        session: AsyncHttpPipeline,
        conversation_history: List[ConversationTurn],
        max_history: int,
        turn_number: int = 0,
    ) -> Tuple[dict, dict, float, dict]:
        """
        Prompt the ConversationBot for a response.

        :param session: AsyncHttpPipeline to use for the request.
        :type session: AsyncHttpPipeline
        :param conversation_history: The turns in the conversation so far.
        :type conversation_history: List[ConversationTurn]
        :param max_history: Number of most recent turns from ``conversation_history`` that are
            rendered into the prompt and sent to the model.
        :type max_history: int
        :param turn_number: Index of the turn being generated. On turn 0 a configured conversation
            starter is returned instead of calling the model.
        :type turn_number: int
        :return: A tuple of (parsed response, request, time taken, full response).
        :rtype: Tuple[dict, dict, float, dict]
        :raises ~azure.ai.evaluation._exceptions.EvaluationException: If the conversation template
            cannot be rendered.
        """

        # check if this is the first turn and the conversation_starter is not None,
        # return the conversations starter rather than generating turn using LLM
        if turn_number == 0 and self.conversation_starter is not None:
            samples: List[Union[str, jinja2.Template, Dict]]
            if isinstance(self.conversation_starter, jinja2.Template):
                samples = [self.conversation_starter.render(**self.persona_template_args)]
            else:
                # dictionaries and plain strings are passed into samples as is
                samples = [self.conversation_starter]
            time_taken = 0

            finish_reason = ["stop"]

            parsed_response = {"samples": samples, "finish_reason": finish_reason, "id": None}
            full_response = parsed_response
            return parsed_response, {}, time_taken, full_response

        # NOTE(review): with max_history == 0 this slice yields the whole history, not an
        # empty one -- confirm callers always pass a positive limit.
        recent_turns = conversation_history[-max_history:]

        try:
            prompt = self.conversation_template.render(
                conversation_turns=recent_turns,
                role=self.role.value,
                **self.persona_template_args,
            )
        except Exception as exc:  # pylint: disable=broad-except
            # Surface the failure to the caller instead of opening an interactive console.
            msg = f"Failed to render the conversation template for bot '{self.name}'."
            raise EvaluationException(
                message=msg,
                internal_message=msg,
                target=ErrorTarget.CONVERSATION,
                category=ErrorCategory.INVALID_VALUE,
                blame=ErrorBlame.USER_ERROR,
            ) from exc

        messages = [{"role": "system", "content": prompt}]

        # The ChatAPI must respond as ASSISTANT, so if this bot is USER, we need to reverse the messages
        if (self.role == ConversationRole.USER) and isinstance(self.model, OpenAIChatCompletionsModel):
            # in here we need to simulate the user, The chatapi only generate turn as assistant and
            # can't generate turn as user
            # thus we reverse all roles in history messages,
            # so that messages produced from the other bot passed here as user messages
            messages.extend(turn.to_openai_chat_format(reverse=True) for turn in recent_turns)
            prompt_role = ConversationRole.USER.value
        else:
            messages.extend(turn.to_openai_chat_format() for turn in recent_turns)
            prompt_role = self.role.value

        response = await self.model.get_conversation_completion(
            messages=messages,
            session=session,
            role=prompt_role,
        )

        return response["response"], response["request"], response["time_taken"], response["full_response"]

    def __repr__(self):
        return f"Bot(name={self.name}, role={self.role.name}, model={self.model.__class__.__name__})"
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class CallbackConversationBot(ConversationBot):
    """Conversation bot that uses a user provided callback to generate responses.

    :param callback: The callback function to use to generate responses.
    :type callback: Callable
    :param user_template: The template to use for the request.
    :type user_template: str
    :param user_template_parameters: The template parameters to use for the request.
    :type user_template_parameters: Dict
    :param args: Optional arguments to pass to the parent class.
    :type args: Any
    :param kwargs: Optional keyword arguments to pass to the parent class.
    :type kwargs: Any
    """

    def __init__(
        self,
        callback: Callable,
        user_template: str,
        user_template_parameters: TemplateParameters,
        *args,
        **kwargs,
    ) -> None:
        self.callback = callback
        self.user_template = user_template
        self.user_template_parameters = user_template_parameters

        super().__init__(*args, **kwargs)

    async def generate_response(
        self,
        session: AsyncHttpPipeline,
        conversation_history: List[Any],
        max_history: int,
        turn_number: int = 0,
    ) -> Tuple[dict, dict, float, dict]:
        """Generate a response by awaiting the user provided callback.

        The conversation history is converted to a chat protocol message, a deep copy of which
        is passed to the callback. ``session``, ``max_history`` and ``turn_number`` are accepted
        for signature compatibility with the parent class and are not used here.

        :param session: Unused.
        :type session: AsyncHttpPipeline
        :param conversation_history: The turns in the conversation so far.
        :type conversation_history: List[Any]
        :param max_history: Unused.
        :type max_history: int
        :param turn_number: Unused.
        :type turn_number: int
        :return: A tuple of (parsed response, empty request dict, time taken in seconds,
            raw callback result).
        :rtype: Tuple[dict, dict, float, dict]
        :raises ~azure.ai.evaluation._exceptions.EvaluationException: If the content of the last
            message cannot be read from the callback result.
        """
        chat_protocol_message = self._to_chat_protocol(
            self.user_template, conversation_history, self.user_template_parameters
        )
        # Hand the callback its own copy so it cannot mutate the message built here.
        msg_copy = copy.deepcopy(chat_protocol_message)
        start_time = time.time()
        result = await self.callback(msg_copy)
        end_time = time.time()
        if not result:
            # An empty/None result is replaced with a placeholder assistant message.
            result = {
                "messages": [{"content": "Callback did not return a response.", "role": "assistant"}],
                "finish_reason": ["stop"],
                "id": None,
                "template_parameters": {},
            }
        self.logger.info("Using user provided callback returning response.")

        time_taken = end_time - start_time
        try:
            # Only the content of the last message is used as the generated sample.
            response = {
                "samples": [result["messages"][-1]["content"]],
                "finish_reason": ["stop"],
                "id": None,
            }
        except Exception as exc:
            msg = "User provided callback does not conform to chat protocol standard."
            raise EvaluationException(
                message=msg,
                internal_message=msg,
                target=ErrorTarget.CALLBACK_CONVERSATION_BOT,
                category=ErrorCategory.INVALID_VALUE,
                blame=ErrorBlame.USER_ERROR,
            ) from exc

        self.logger.info("Parsed callback response")

        return response, {}, time_taken, result

    # Bug 3354264: template is unused in the method - is this intentional?
    def _to_chat_protocol(self, template, conversation_history, template_parameters):  # pylint: disable=unused-argument
        """Build the chat protocol message handed to the callback.

        :param template: Unused.
        :param conversation_history: Turns exposing ``message`` and ``role.value``.
        :param template_parameters: Passed through unchanged under "template_parameters".
        :return: A dictionary with "template_parameters", "messages" and "$schema" keys.
        :rtype: dict
        """
        messages = [{"content": turn.message, "role": turn.role.value} for turn in conversation_history]

        return {
            "template_parameters": template_parameters,
            "messages": messages,
            "$schema": "http://azureml/sdk-2-0/ChatConversation.json",
        }
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# Public names exported by this package.
__all__ = ["ConversationRole", "ConversationBot", "CallbackConversationBot", "ConversationTurn"]
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Callable, Dict, List, Optional, Tuple, Union
|
|
8
|
+
|
|
9
|
+
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
10
|
+
from azure.ai.evaluation.simulator._constants import SupportedLanguages
|
|
11
|
+
from azure.ai.evaluation.simulator._helpers._language_suffix_mapping import SUPPORTED_LANGUAGES_MAPPING
|
|
12
|
+
|
|
13
|
+
from ..._http_utils import AsyncHttpPipeline
|
|
14
|
+
from . import ConversationBot, ConversationTurn
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def is_closing_message(response: Union[Dict, str], recursion_depth: int = 0) -> bool:
    """Determine if a response indicates an end to the conversation.

    Strings are checked directly. Dictionaries are searched recursively through their values,
    and the response is a closing message as soon as any nested string is one. Values of any
    other type are never closing messages.

    :param response: The response to check.
    :type response: Union[Dict, str]
    :param recursion_depth: The current recursion depth. Defaults to 0.
    :type recursion_depth: int
    :return: True if the response indicates an end to the conversation, False otherwise.
    :rtype: bool
    :raises ~azure.ai.evaluation._exceptions.EvaluationException: If dictionaries are nested
        more than 10 levels deep.
    """
    if recursion_depth > 10:
        msg = "Exceeded max call depth in is_closing_message"
        # Keyword names match the other EvaluationException call sites (target/category/blame).
        raise EvaluationException(
            message=msg,
            internal_message=msg,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.CONVERSATION,
            blame=ErrorBlame.USER_ERROR,
        )

    # recursively go through each inner dictionary in the JSON dict
    # and check if any value entry contains a closing message
    if isinstance(response, dict):
        return any(is_closing_message(value, recursion_depth=recursion_depth + 1) for value in response.values())
    if isinstance(response, str):
        return is_closing_message_helper(response)

    return False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def is_closing_message_helper(response: str) -> bool:
    """Determine if a response indicates an end to the conversation.

    A message is a closing message when it contains no question mark and, after lower-casing
    and removing the characters ``. , ! ; :``, one of its whitespace-separated words is
    "bye" or "goodbye". Note that "thanks" / "thank" are not treated as closing words.

    :param response: The response to check.
    :type response: str
    :return: True if the response indicates an end to the conversation, False otherwise.
    :rtype: bool
    """
    message = response.lower()
    # A question means the speaker still expects an answer.
    if "?" in message:
        return False
    # Strip the punctuation in one pass so that "bye!" or "goodbye." match as whole words.
    words = message.translate(str.maketrans("", "", ".,!;:")).split()
    return "bye" in words or "goodbye" in words
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
async def simulate_conversation(
    *,
    bots: List[ConversationBot],
    session: AsyncHttpPipeline,
    language: SupportedLanguages,
    stopping_criteria: Callable[[str], bool] = is_closing_message,
    turn_limit: int = 10,
    history_limit: int = 5,
    api_call_delay_sec: float = 0,
    logger: logging.Logger = logging.getLogger(__name__),
) -> Tuple[Optional[str], List[ConversationTurn]]:
    """
    Simulate a conversation between the given bots.

    The first bot produces the opening turn; the bots then take turns in list order until the
    stopping criteria is met for the latest message or the turn limit is reached.

    :keyword bots: List of ConversationBot instances participating in the conversation.
    :paramtype bots: List[ConversationBot]
    :keyword session: The session to use for making API calls.
    :paramtype session: AsyncHttpPipeline
    :keyword language: The language of the conversation. For any language other than English,
        the mapped language suffix is appended to the first prompt.
    :paramtype language: SupportedLanguages
    :keyword stopping_criteria: A callable that determines when the conversation should stop.
    :paramtype stopping_criteria: Callable[[str], bool]
    :keyword turn_limit: The maximum number of turns in the conversation. Defaults to 10.
    :paramtype turn_limit: int
    :keyword history_limit: The maximum number of turns to keep in the conversation history. Defaults to 5.
    :paramtype history_limit: int
    :keyword api_call_delay_sec: Delay between API calls in seconds. Defaults to 0.
    :paramtype api_call_delay_sec: float
    :keyword logger: The logger to use for logging. Defaults to the logger named after the current module.
    :paramtype logger: logging.Logger
    :return: The conversation id (if any response provided one) and the list of generated turns.
    :rtype: Tuple[Optional[str], List[ConversationTurn]]
    :raises Exception: If ``language`` is not a supported language.
    """

    # Validate the language before any request is made so an unsupported value
    # does not cost a model call.
    needs_language_suffix = language != SupportedLanguages.English
    if needs_language_suffix and (
        not isinstance(language, SupportedLanguages) or language not in SupportedLanguages
    ):
        raise Exception(  # pylint: disable=broad-exception-raised
            f"Language option '{language}' isn't supported. Select a supported language option from "
            f"azure.ai.evaluation.simulator.SupportedLanguages: {[f'{e}' for e in SupportedLanguages]}"
        )

    # Read the first prompt.
    (first_response, request, _, full_response) = await bots[0].generate_response(
        session=session,
        conversation_history=[],
        max_history=history_limit,
        turn_number=0,
    )
    conversation_id: Optional[str] = first_response.get("id")
    first_prompt = first_response["samples"][0]
    if needs_language_suffix:
        # NOTE(review): this assumes the first sample is a string; a non-string sample
        # would fail on the concatenation below -- confirm against the bots in use.
        first_prompt += f" {SUPPORTED_LANGUAGES_MAPPING[language]}"
    # Add all generated turns into array to pass for each bot while generating
    # new responses. We add generated response and the person generating it.
    # in the case of the first turn, it is supposed to be the user search query
    conversation_history = [
        ConversationTurn(
            role=bots[0].role,
            name=bots[0].name,
            message=first_prompt,
            full_response=full_response,
            request=request,
        )
    ]

    # initialize the turn counter
    current_turn = 1

    # Keep iterating and alternate between bots until a stopping word is
    # generated or maximum number of turns is reached.
    while (not stopping_criteria(conversation_history[-1].message)) and (current_turn < turn_limit):
        try:
            current_bot = bots[current_turn % len(bots)]
            # invoke Bot to generate response given the input request
            response, request, _, full_response = await current_bot.generate_response(
                session=session,
                conversation_history=conversation_history,
                max_history=history_limit,
                turn_number=current_turn,
            )

            # check if conversation id is null, which means conversation starter was used. use id from next turn
            if conversation_id is None:
                conversation_id = response.get("id")
            # add the generated response to the list of generated responses
            conversation_history.append(
                ConversationTurn(
                    role=current_bot.role,
                    name=current_bot.name,
                    message=response["samples"][0],
                    full_response=full_response,
                    request=request,
                )
            )
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: a failed turn is logged and skipped rather than aborting the simulation.
            logger.warning("Error: %s", str(e))

        # Increment outside the try block so we don't get stuck if
        # an exception is thrown
        current_turn += 1

        # Sleep between consecutive requests to avoid rate limit
        await asyncio.sleep(api_call_delay_sec)

    return conversation_id, conversation_history
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
# Bot name lists (their consumers are not visible in this module).
BOT_NAMES = ["chat_bot", "other_bot"]
TASK_BOT_NAMES = ["system_bot", "simulated_bot"]

REQUESTS_BATCH_SIZE = 200  # Number of input lines to process at once, must fit into memory
OUTPUT_FILE = "openai_api_response.jsonl"

# Azure endpoint constants
AZUREML_TOKEN_SCOPE = "https://ml.azure.com"
COGNITIVE_SERVICES_TOKEN_SCOPE = "https://cognitiveservices.azure.com/"
AZURE_TOKEN_REFRESH_INTERVAL = 600  # seconds
# Matches a host name of 1-255 characters made of dot-separated labels (1-63 letters, digits
# or hyphens, not starting or ending with a hyphen) that ends in ".inference.ml.azure.com"
# or ".openai.azure.com".
AZURE_ENDPOINT_DOMAIN_VALID_PATTERN_RE = (
    r"^(?=.{1,255}$)(?!-)[a-zA-Z0-9-]{1,63}(?<!-)"
    r"(\.(?!-)[a-zA-Z0-9-]{1,63}(?<!-))*\."
    r"(inference\.ml|openai)\.azure\.com$"
)
# Start/end marker strings for chat messages (presumably delimiting messages in a raw
# prompt -- confirm against the code that uses them).
CHAT_START_TOKEN = "<|im_start|>"
CHAT_END_TOKEN = "<|im_end|>"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ConversationRole(Enum):
    """Participant roles available in a chatbot conversation."""

    # Each value is the role string used in chat messages.
    USER = "user"
    ASSISTANT = "assistant"
|