azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +1 -15
- azure/ai/evaluation/_common/utils.py +8 -8
- azure/ai/evaluation/_constants.py +3 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +5 -2
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +640 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +10 -3
- azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA +7 -1
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD +17 -23
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/simulator/_adversarial_simulator.py

@@ -7,6 +7,7 @@ import asyncio
 import logging
 import random
 from typing import Any, Callable, Dict, List, Optional, Union, cast
+import uuid

 from tqdm import tqdm

@@ -187,6 +188,8 @@ class AdversarialSimulator:
         )
         self._ensure_service_dependencies()
         templates = await self.adversarial_template_handler._get_content_harm_template_collections(scenario.value)
+        simulation_id = str(uuid.uuid4())
+        logger.warning("Use simulation_id to help debug the issue: %s", str(simulation_id))
         concurrent_async_task = min(concurrent_async_task, 1000)
         semaphore = asyncio.Semaphore(concurrent_async_task)
         sim_results = []
@@ -236,6 +239,7 @@ class AdversarialSimulator:
                     language=language,
                     semaphore=semaphore,
                     scenario=scenario,
+                    simulation_id=simulation_id,
                 )
             )
         )
@@ -298,9 +302,10 @@ class AdversarialSimulator:
         language: SupportedLanguages,
         semaphore: asyncio.Semaphore,
         scenario: Union[AdversarialScenario, AdversarialScenarioJailbreak],
+        simulation_id: str = "",
     ) -> List[Dict]:
         user_bot = self._setup_bot(
-            role=ConversationRole.USER, template=template, parameters=parameters, scenario=scenario
+            role=ConversationRole.USER, template=template, parameters=parameters, scenario=scenario, simulation_id=simulation_id
         )
         system_bot = self._setup_bot(
             target=target, role=ConversationRole.ASSISTANT, template=template, parameters=parameters, scenario=scenario
@@ -329,7 +334,7 @@ class AdversarialSimulator:
         )

     def _get_user_proxy_completion_model(
-        self, template_key: str, template_parameters: TemplateParameters
+        self, template_key: str, template_parameters: TemplateParameters, simulation_id: str = ""
     ) -> ProxyChatCompletionsModel:
         return ProxyChatCompletionsModel(
             name="raisvc_proxy_model",
@@ -340,6 +345,7 @@ class AdversarialSimulator:
             api_version="2023-07-01-preview",
             max_tokens=1200,
             temperature=0.0,
+            simulation_id=simulation_id,
         )

     def _setup_bot(
@@ -350,10 +356,11 @@ class AdversarialSimulator:
         parameters: TemplateParameters,
         target: Optional[Callable] = None,
         scenario: Union[AdversarialScenario, AdversarialScenarioJailbreak],
+        simulation_id: str = "",
     ) -> ConversationBot:
         if role is ConversationRole.USER:
             model = self._get_user_proxy_completion_model(
-                template_key=template.template_name, template_parameters=parameters
+                template_key=template.template_name, template_parameters=parameters, simulation_id=simulation_id,
             )
             return ConversationBot(
                 role=role,

azure/ai/evaluation/simulator/_conversation/__init__.py

@@ -128,19 +128,15 @@ class ConversationBot:
         self.conversation_starter: Optional[Union[str, jinja2.Template, Dict]] = None
         if role == ConversationRole.USER:
             if "conversation_starter" in self.persona_template_args:
-                print(self.persona_template_args)
                 conversation_starter_content = self.persona_template_args["conversation_starter"]
                 if isinstance(conversation_starter_content, dict):
                     self.conversation_starter = conversation_starter_content
-                    print(f"Conversation starter content: {conversation_starter_content}")
                 else:
                     try:
                         self.conversation_starter = jinja2.Template(
                             conversation_starter_content, undefined=jinja2.StrictUndefined
                         )
-                        print("Successfully created a Jinja2 template for the conversation starter.")
                     except jinja2.exceptions.TemplateSyntaxError as e:  # noqa: F841
-                        print(f"Template syntax error: {e}. Using raw content.")
                         self.conversation_starter = conversation_starter_content
             else:
                 self.logger.info(
@@ -153,6 +149,7 @@ class ConversationBot:
         conversation_history: List[ConversationTurn],
         max_history: int,
         turn_number: int = 0,
+        session_state: Optional[Dict[str, Any]] = None,
     ) -> Tuple[dict, dict, float, dict]:
         """
         Prompt the ConversationBot for a response.
@@ -262,6 +259,7 @@ class CallbackConversationBot(ConversationBot):
         conversation_history: List[Any],
         max_history: int,
         turn_number: int = 0,
+        session_state: Optional[Dict[str, Any]] = None,
     ) -> Tuple[dict, dict, float, dict]:
         chat_protocol_message = self._to_chat_protocol(
             self.user_template, conversation_history, self.user_template_parameters
@@ -269,7 +267,7 @@ class CallbackConversationBot(ConversationBot):
         msg_copy = copy.deepcopy(chat_protocol_message)
         result = {}
         start_time = time.time()
-        result = await self.callback(msg_copy)
+        result = await self.callback(msg_copy, session_state=session_state)
         end_time = time.time()
         if not result:
             result = {
@@ -348,6 +346,7 @@ class MultiModalConversationBot(ConversationBot):
         conversation_history: List[Any],
         max_history: int,
         turn_number: int = 0,
+        session_state: Optional[Dict[str, Any]] = None,
     ) -> Tuple[dict, dict, float, dict]:
         previous_prompt = conversation_history[-1]
         chat_protocol_message = await self._to_chat_protocol(conversation_history, self.user_template_parameters)

azure/ai/evaluation/simulator/_conversation/_conversation.py

@@ -101,6 +101,7 @@ async def simulate_conversation(
     :rtype: Tuple[Optional[str], List[ConversationTurn]]
     """

+    session_state = {}
     # Read the first prompt.
     (first_response, request, _, full_response) = await bots[0].generate_response(
         session=session,
@@ -149,7 +150,10 @@ async def simulate_conversation(
             conversation_history=conversation_history,
             max_history=history_limit,
             turn_number=current_turn,
+            session_state=session_state,
         )
+        if "session_state" in full_response and full_response["session_state"] is not None:
+            session_state.update(full_response["session_state"])

         # check if conversation id is null, which means conversation starter was used. use id from next turn
         if conversation_id is None and "id" in response:
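
The `session_state` changes above mean the callback-backed bot is now awaited as `callback(message, session_state=...)`, and any `session_state` returned by the target is merged back and handed to the next turn. A hedged sketch of a target callable compatible with that calling convention follows; the function name, the exact return shape, and the state keys are illustrative assumptions, not the package's documented contract.

from typing import Any, Dict, Optional

async def my_target(
    messages: Dict[str, Any],
    session_state: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    # Accept the session_state keyword the simulator now passes on every turn
    # (hypothetical target; adapt to the chat-protocol shape your app uses).
    session_state = dict(session_state or {})
    session_state["turns_seen"] = session_state.get("turns_seen", 0) + 1

    # Append a reply and hand the state back so the simulator can merge it
    # before the next turn.
    reply = {"role": "assistant", "content": "..."}
    return {
        "messages": messages.get("messages", []) + [reply],
        "session_state": session_state,
    }

A callable of this general shape would be passed as the simulator's `target`; the exact keywords the simulator sends are defined by the package, so treat the signature above as a sketch.
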

azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py

@@ -89,6 +89,7 @@ class ProxyChatCompletionsModel(OpenAIChatCompletionsModel):
         self.tkey = template_key
         self.tparam = template_parameters
         self.result_url: Optional[str] = None
+        self.simulation_id: Optional[str] = kwargs.pop("simulation_id", "")

         super().__init__(name=name, **kwargs)

@@ -169,6 +170,7 @@ class ProxyChatCompletionsModel(OpenAIChatCompletionsModel):
             "Content-Type": "application/json",
             "X-CV": f"{uuid.uuid4()}",
             "X-ModelType": self.model or "",
+            "x-ms-client-request-id": self.simulation_id,
         }
         # add all additional headers
         headers.update(self.additional_headers)  # type: ignore[arg-type]
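
Taken together, the simulator and proxy-model hunks thread a per-run correlation id through the stack: one `uuid4` per simulation run, logged as a warning, passed down through `_setup_bot` and `_get_user_proxy_completion_model`, and attached to each service call as an `x-ms-client-request-id` header. A minimal standalone sketch of the same correlation-id pattern is below; `start_run` and `build_request_headers` are hypothetical helper names, not part of the SDK.

import logging
import uuid

logger = logging.getLogger(__name__)

def start_run() -> str:
    # Mirror of the diff: one uuid4 per simulation run, surfaced in the logs
    # so it can be quoted when asking the service team to trace a failure.
    simulation_id = str(uuid.uuid4())
    logger.warning("Use simulation_id to help debug the issue: %s", simulation_id)
    return simulation_id

def build_request_headers(simulation_id: str = "") -> dict:
    # Attach the id the same way the diff does, as a client request id header
    # on every outgoing call made on behalf of that run.
    return {
        "Content-Type": "application/json",
        "x-ms-client-request-id": simulation_id,
    }

if __name__ == "__main__":
    run_id = start_run()
    print(build_request_headers(run_id))
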

{azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.
+Version: 1.3.0
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -377,6 +377,12 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con

 # Release History

+## 1.3.0 (2025-02-28)
+
+### Breaking Changes
+- Multimodal specific evaluators `ContentSafetyMultimodalEvaluator`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` has been removed. Please use `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator` instead.
+- Metric name in ProtectedMaterialEvaluator's output is changed from `protected_material.fictional_characters_label` to `protected_material.fictional_characters_defect_rate`. It's now consistent with other evaluator's metric names (ending with `_defect_rate`).
+
 ## 1.2.0 (2025-01-27)

 ### Features Added
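
For callers migrating off the removed multimodal evaluators, the changelog points at the existing non-multimodal evaluators. A sketch of the suggested swap is below; it assumes `ContentSafetyEvaluator` keeps the credential/azure_ai_project constructor and the `conversation` keyword shown in the removed classes' docstrings later in this diff, and the conversation payload is left as a placeholder.

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

azure_ai_project = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "project_name": "<project_name>",
}

# Before 1.3.0: ContentSafetyMultimodalEvaluator(credential, azure_ai_project)
# From 1.3.0 on, the changelog directs users to the non-"Multimodal" evaluator.
eval_fn = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)
result = eval_fn(conversation={"messages": [...]})  # fill in real messages
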

{azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD

@@ -1,10 +1,10 @@
-azure/ai/evaluation/__init__.py,sha256=
-azure/ai/evaluation/_constants.py,sha256=
-azure/ai/evaluation/_exceptions.py,sha256=
+azure/ai/evaluation/__init__.py,sha256=3lV2dhUqPzUz4lBo8DoEr5o4PEF_Lh5yWQtuLDUQZCM,2192
+azure/ai/evaluation/_constants.py,sha256=Avp26R4qPqaze024EJXzAX8A457DExF5rnIKMWcE8X4,2881
+azure/ai/evaluation/_exceptions.py,sha256=o3AjCyIAIBo_rkJAH_TsAT-hEZ4SMKd9Uo0eLmD9VHY,5085
 azure/ai/evaluation/_http_utils.py,sha256=1bGce6pKAL-vmaUGRPxVX7DVO05XVQ8YPIwIQ3q7mfA,17221
 azure/ai/evaluation/_model_configurations.py,sha256=MNN6cQlz7P9vNfHmfEKsUcly3j1FEOEFsA8WV7GPuKQ,4043
 azure/ai/evaluation/_user_agent.py,sha256=O2y-QPBAcw7w7qQ6M2aRPC3Vy3TKd789u5lcs2yuFaI,290
-azure/ai/evaluation/_version.py,sha256=
+azure/ai/evaluation/_version.py,sha256=xsol5X6WDVwo8FFtHWt28TPyzT0is0sJ3N5i9_ikPVI,229
 azure/ai/evaluation/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 azure/ai/evaluation/_azure/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/_azure/_clients.py,sha256=N1V-LyQkItPuoKl0aieypFPdGSRSld9lQqH1x-n3L7U,9119
@@ -15,10 +15,10 @@ azure/ai/evaluation/_common/_experimental.py,sha256=GVtSn9r1CeR_yEa578dJVNDJ3P24
 azure/ai/evaluation/_common/constants.py,sha256=OsExttFGLnTAyZa26jnY5_PCDTb7uJNFqtE2qsRZ1mg,1957
 azure/ai/evaluation/_common/math.py,sha256=d4bwWe35_RWDIZNcbV1BTBbHNx2QHQ4-I3EofDyyNE0,2863
 azure/ai/evaluation/_common/rai_service.py,sha256=DcakzdOour9qNdMXU-8UFfvLb12oexAoiJXG8XFTRBs,26462
-azure/ai/evaluation/_common/utils.py,sha256=
+azure/ai/evaluation/_common/utils.py,sha256=wssBc9i0JrWwsrvtp45P5_z5ceB7kVGf-KKzpfVQQH4,17594
 azure/ai/evaluation/_evaluate/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/_evaluate/_eval_run.py,sha256=QBtNBwUxqxsIVmKPU-_H8MDFkF4s_bW7arQYXAniRpo,21965
-azure/ai/evaluation/_evaluate/_evaluate.py,sha256=
+azure/ai/evaluation/_evaluate/_evaluate.py,sha256=pyl0HscBAPllpq385p9lMqR2sqip3RZ5BGfcBa4X334,39068
 azure/ai/evaluation/_evaluate/_utils.py,sha256=sKj_4iN-QjrRlEkiZwA9UNiWozS4LgJcUZ6AWdHrTY4,14231
 azure/ai/evaluation/_evaluate/_batch_run/__init__.py,sha256=Z-TQdSxKTn0bjsF0YosIJMbQFQHDUv_b9zCBu1TeogQ,474
 azure/ai/evaluation/_evaluate/_batch_run/code_client.py,sha256=XQLaXfswF6ReHLpQthHLuLLa65Pts8uawGp7kRqmMDs,8260
@@ -59,14 +59,6 @@ azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty,sh
 azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty,sha256=8kNShdfxQvkII7GnqjmdqQ5TNelA2B6cjnqWZk8FFe4,5296
 azure/ai/evaluation/_evaluators/_meteor/__init__.py,sha256=209na3pPsdmcuYpYHUYtqQybCpc3yZkc93HnRdicSlI,266
 azure/ai/evaluation/_evaluators/_meteor/_meteor.py,sha256=OpugAjIgcTcNQ6g6Rks_8GVhcRiH524PbmBKH3bTefs,4369
-azure/ai/evaluation/_evaluators/_multimodal/__init__.py,sha256=tPvsY0nv8T3VtiiAwJM6wT5A9FhKP2XXwUlCH994xl4,906
-azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py,sha256=x0l6eLQhxVP85jEyGfFCl27C2okMgD0S3aJ_qrgB3Q8,5219
-azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py,sha256=X2IVw0YvymDD3e4Vx-TfjqgqtYiAKVhUumjBowCpOmA,2441
-azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py,sha256=ral1AAbP5pfsygDe30MtuwajuydiXoXzzCeuLBzIkWc,3779
-azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py,sha256=gMrfyn3KHcV6SoowuEjR7Fon9vVLN7GOPM4rkJRK6xU,4906
-azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py,sha256=QwOCBb618ZXSs-OoVXyNM65N4ZEL7IZt-S1Nqd8xNbY,3703
-azure/ai/evaluation/_evaluators/_multimodal/_sexual.py,sha256=6zz89yzr_SdldqBVv-3wOErz3H5sBO6wYgNh39aHXmY,3668
-azure/ai/evaluation/_evaluators/_multimodal/_violence.py,sha256=t1h3bY6N7SwlSgP_1P-90KGTsq1oWvTYDJpy_uMvzjA,3694
 azure/ai/evaluation/_evaluators/_protected_material/__init__.py,sha256=eRAQIU9diVXfO5bp6aLWxZoYUvOsrDIfy1gnDOeNTiI,109
 azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py,sha256=IABs1YMBZdIi1u57dPi-aQpSiPWIGxEZ4hyt97jvdNA,4604
 azure/ai/evaluation/_evaluators/_qa/__init__.py,sha256=bcXfT--C0hjym2haqd1B2-u9bDciyM0ThOFtU1Q69sk,244
@@ -86,6 +78,8 @@ azure/ai/evaluation/_evaluators/_similarity/_similarity.py,sha256=AeqJ_OJUAsdu9C
 azure/ai/evaluation/_evaluators/_similarity/similarity.prompty,sha256=eoludASychZoGL625bFCaZai-OY7DIAg90ZLax_o4XE,4594
 azure/ai/evaluation/_evaluators/_xpia/__init__.py,sha256=VMEL8WrpJQeh4sQiOLzP7hRFPnjzsvwfvTzaGCVJPCM,88
 azure/ai/evaluation/_evaluators/_xpia/xpia.py,sha256=Nv14lU7jN0yXKbHgHRXMHEy6pn1rXmesBOYI2Ge9ewk,5849
+azure/ai/evaluation/_safety_evaluation/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
+azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py,sha256=HtT6aKdZ4EP6Dx-JuExdysO_CBERFMwRWmGTWeHQ_NU,33890
 azure/ai/evaluation/_vendor/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/_vendor/rouge_score/__init__.py,sha256=03OkyfS_UmzRnHv6-z9juTaJ6OXJoEJM989hgifIZbc,607
 azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py,sha256=DtNSeshHipzc6vFnvx7kbs5viXe4LNq-ZrgllFvfR4U,11299
@@ -94,15 +88,15 @@ azure/ai/evaluation/_vendor/rouge_score/tokenize.py,sha256=IyHVsWY6IFFZdB23cLiJs
 azure/ai/evaluation/_vendor/rouge_score/tokenizers.py,sha256=3_-y1TyvyluHuERhSJ5CdXSwnpcMA7aAKU6PCz9wH_Q,1745
 azure/ai/evaluation/simulator/__init__.py,sha256=JbrPZ8pvTBalyX94SvZ9btHNoovX8rbZV03KmzxxWys,552
 azure/ai/evaluation/simulator/_adversarial_scenario.py,sha256=9rpAPz594tYjxzM3XMeDq6CZSc2yvf5YaNaGC7nzYhM,1710
-azure/ai/evaluation/simulator/_adversarial_simulator.py,sha256=
+azure/ai/evaluation/simulator/_adversarial_simulator.py,sha256=2QEt5hHOKnNqMySt3SNsr-LY22DvFyusVnPkvAyur1I,22100
 azure/ai/evaluation/simulator/_constants.py,sha256=nCL7_1BnYh6k0XvxudxsDVMbiG9MMEvYw5wO9FZHHZ8,857
 azure/ai/evaluation/simulator/_direct_attack_simulator.py,sha256=FTtWf655dHJF5FLJi0xGSBgIlGWNiVWyqaLDJSud9XA,10199
 azure/ai/evaluation/simulator/_indirect_attack_simulator.py,sha256=nweIU_AkUIR50qLQpjmljf_OkpsCPth2Ebf4vusygCA,10226
 azure/ai/evaluation/simulator/_simulator.py,sha256=LBzez7qvObpVjTwmlGS_PfhDLo8pRknh5epra2yo9X8,36484
 azure/ai/evaluation/simulator/_tracing.py,sha256=frZ4-usrzINast9F4-ONRzEGGox71y8bYw0UHNufL1Y,3069
 azure/ai/evaluation/simulator/_utils.py,sha256=16NltlywpbMtoFtULwTKqeURguIS1kSKSo3g8uKV8TA,5181
-azure/ai/evaluation/simulator/_conversation/__init__.py,sha256=
-azure/ai/evaluation/simulator/_conversation/_conversation.py,sha256=
+azure/ai/evaluation/simulator/_conversation/__init__.py,sha256=LOR5h7vSACrv_cmaS6r7KUqjJcHYg7PAgkDbdvOmd4g,17726
+azure/ai/evaluation/simulator/_conversation/_conversation.py,sha256=h8OHq0sWKiTH821tC5zF44CJ-QgutTgDnEYsFEA7Cw0,7635
 azure/ai/evaluation/simulator/_conversation/constants.py,sha256=3v7zkjPwJAPbSpJYIK6VOZZy70bJXMo_QTVqSFGlq9A,984
 azure/ai/evaluation/simulator/_data_sources/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/simulator/_data_sources/grounding.json,sha256=jqdqHrCgS7hN7K2kXSEcPCmzFjV4cv_qcCSR-Hutwx4,1257075
@@ -111,15 +105,15 @@ azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py,sha256=7BBLH7
 azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py,sha256=BOttMTec3muMiA4OzwD_iW08GTrhja7PL9XVjRCN3jM,3029
 azure/ai/evaluation/simulator/_model_tools/__init__.py,sha256=aMv5apb7uVjuhMF9ohhA5kQmo652hrGIJlhdl3y2R1I,835
 azure/ai/evaluation/simulator/_model_tools/_identity_manager.py,sha256=-hptp2vpJIcfjvtd0E2c7ry00LVh23LxuYGevsNFfgs,6385
-azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py,sha256=
+azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py,sha256=D2Am07l2gq_v-vztqDBTX2MxZTUumm5ysaVSL2L0rxQ,9118
 azure/ai/evaluation/simulator/_model_tools/_rai_client.py,sha256=40MGzIXGv7oVshWH7AbOPLCigI4HlMrqbF2Rq5jFMGo,8755
 azure/ai/evaluation/simulator/_model_tools/_template_handler.py,sha256=NQWqjE7csSzkhb2XdW82AoCA-DxixpTrfBxAnOt2Wlc,7075
 azure/ai/evaluation/simulator/_model_tools/models.py,sha256=bfVm0PV3vfH_8DkdmTMZqYVN-G51hZ6Y0TOO-NiysJY,21811
 azure/ai/evaluation/simulator/_prompty/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 azure/ai/evaluation/simulator/_prompty/task_query_response.prompty,sha256=2BzSqDDYilDushvR56vMRDmqFIaIYAewdUlUZg_elMg,2182
 azure/ai/evaluation/simulator/_prompty/task_simulate.prompty,sha256=NE6lH4bfmibgMn4NgJtm9_l3PMoHSFrfjjosDJEKM0g,939
-azure_ai_evaluation-1.
-azure_ai_evaluation-1.
-azure_ai_evaluation-1.
-azure_ai_evaluation-1.
-azure_ai_evaluation-1.
+azure_ai_evaluation-1.3.0.dist-info/METADATA,sha256=msxfA0EnNspmcqbsGuYtdEESA3kLPnEfWBpzf7yFHEw,32885
+azure_ai_evaluation-1.3.0.dist-info/NOTICE.txt,sha256=4tzi_Yq4-eBGhBvveobWHCgUIVF-ZeouGN0m7hVq5Mk,3592
+azure_ai_evaluation-1.3.0.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
+azure_ai_evaluation-1.3.0.dist-info/top_level.txt,sha256=S7DhWV9m80TBzAhOFjxDUiNbKszzoThbnrSz5MpbHSQ,6
+azure_ai_evaluation-1.3.0.dist-info/RECORD,,

azure/ai/evaluation/_evaluators/_multimodal/__init__.py

@@ -1,20 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-from ._content_safety_multimodal import ContentSafetyMultimodalEvaluator
-from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase
-from ._hate_unfairness import HateUnfairnessMultimodalEvaluator
-from ._self_harm import SelfHarmMultimodalEvaluator
-from ._sexual import SexualMultimodalEvaluator
-from ._violence import ViolenceMultimodalEvaluator
-from ._protected_material import ProtectedMaterialMultimodalEvaluator
-
-__all__ = [
-    "ContentSafetyMultimodalEvaluator",
-    "ContentSafetyMultimodalEvaluatorBase",
-    "ViolenceMultimodalEvaluator",
-    "SexualMultimodalEvaluator",
-    "SelfHarmMultimodalEvaluator",
-    "HateUnfairnessMultimodalEvaluator",
-    "ProtectedMaterialMultimodalEvaluator",
-]

azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py

@@ -1,132 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-import logging
-from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
-
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-from azure.ai.evaluation._common._experimental import experimental
-from azure.ai.evaluation._common.utils import validate_conversation
-from ._hate_unfairness import HateUnfairnessMultimodalEvaluator
-from ._self_harm import SelfHarmMultimodalEvaluator
-from ._sexual import SexualMultimodalEvaluator
-from ._violence import ViolenceMultimodalEvaluator
-
-
-logger = logging.getLogger(__name__)
-
-
-@experimental
-class ContentSafetyMultimodalEvaluator:
-    """
-    Initialize a content safety multimodal evaluator configured to evaluate content safety metrics
-    in a multimodal scenario.
-
-    :param credential: The credential required for connecting to the Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
-        resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param kwargs: Additional arguments to pass to the evaluator.
-    :type kwargs: Any
-
-    :return: A function that evaluates multimodal chat messages and generates content safety metrics.
-
-    **Usage Example**
-
-    .. code-block:: python
-
-        azure_ai_project = {
-            "subscription_id": "<subscription_id>",
-            "resource_group_name": "<resource_group_name>",
-            "project_name": "<project_name>",
-        }
-        eval_fn = ContentSafetyMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(conversation=
-            {
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": "What's in this image?"
-                            },
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": "<image url or base64 encoded image>"
-                                }
-                            }
-                        ]
-                    },
-                    {
-                        "role": "assistant",
-                        "content": "This picture shows an astronaut standing in the desert."
-                    }
-                ]
-            }
-        )
-
-    **Output Format**
-
-    .. code-block:: json
-
-        {
-            "violence": "Medium",
-            "violence_score": 5.0,
-            "violence_reason": "Some reason",
-            "sexual": "Medium",
-            "sexual_score": 5.0,
-            "sexual_reason": "Some reason",
-            "self_harm": "Medium",
-            "self_harm_score": 5.0,
-            "self_harm_reason": "Some reason",
-            "hate_unfairness": "Medium",
-            "hate_unfairness_score": 5.0,
-            "hate_unfairness_reason": "Some reason"
-        }
-
-    """
-
-    id = "content_safety_multimodal"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
-
-    def __init__(self, credential, azure_ai_project, **kwargs):
-        self._parallel = kwargs.pop("_parallel", False)
-        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
-            ViolenceMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
-            SexualMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
-            SelfHarmMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
-            HateUnfairnessMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
-        ]
-
-    def __call__(self, *, conversation, **kwargs):
-        """
-        Evaluates content-safety metrics for list of messages.
-
-        :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys. It supports single turn only.
-        :paramtype conversation: ~azure.ai.evaluation.Conversation
-        :return: The evaluation score based on the Content Safety Metrics.
-        :rtype: Dict[str, Union[float, str]]
-        """
-        # validate inputs
-        validate_conversation(conversation)
-        results: Dict[str, Union[str, float]] = {}
-        if self._parallel:
-            with ThreadPoolExecutor() as executor:
-                futures = {
-                    executor.submit(evaluator, conversation=conversation, **kwargs): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(conversation=conversation, **kwargs)
-                results.update(result)
-
-        return results

azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py

@@ -1,55 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-from abc import ABC
-from typing import Union
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_multimodal
-from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
-from azure.ai.evaluation._common.utils import validate_conversation
-from azure.core.credentials import TokenCredential
-
-
-class ContentSafetyMultimodalEvaluatorBase(ABC):
-    """
-    Initialize a evaluator for a specified Evaluation Metric. Base class that is not
-    meant to be instantiated by users.
-
-    :param metric: The metric to be evaluated.
-    :type metric: ~azure.ai.evaluation._evaluators._content_safety.flow.constants.EvaluationMetrics
-    :param credential: The credential for connecting to Azure AI project. Required
-    :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    """
-
-    def __init__(
-        self,
-        metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
-        credential: TokenCredential,
-        azure_ai_project,
-    ):
-        self._metric = metric
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, conversation, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
-        :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys.
-        :paramtype conversation: ~azure.ai.evaluation.Conversation
-        :return: The evaluation score computation based on the Content Safety metric (self.metric).
-        :rtype: Dict[str, Union[float, str]]
-        """
-        # validate inputs
-        validate_conversation(conversation)
-        messages = conversation["messages"]
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service_multimodal(
-            messages=messages,
-            metric_name=self._metric,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
-        )
-        return result

azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py

@@ -1,100 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._common._experimental import experimental
-from azure.ai.evaluation._common.constants import EvaluationMetrics
-from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase
-
-
-@experimental
-class HateUnfairnessMultimodalEvaluator:
-    """
-    Initialize a hate-unfairness multimodal evaluator for hate unfairness score.
-
-    :param credential: The credential for connecting to the Azure AI project. This is required.
-    :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project,
-        which includes subscription ID, resource group name, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-
-    **Usage Example**
-
-    .. code-block:: python
-
-        azure_ai_project = {
-            "subscription_id": "<subscription_id>",
-            "resource_group_name": "<resource_group_name>",
-            "project_name": "<project_name>",
-        }
-        eval_fn = HateUnfairnessMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(conversation=
-            {
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": "What's in this image?"
-                            },
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": "<image url or base64 encoded image>"
-                                }
-                            }
-                        ]
-                    },
-                    {
-                        "role": "assistant",
-                        "content": "This picture shows an astronaut standing in the desert."
-                    }
-                ]
-            }
-        )
-
-    **Output Format**
-
-    .. code-block:: json
-
-        {
-            "hate_unfairness": "High",
-            "hate_unfairness_score": 6.5,
-            "hate_unfairness_reason": "Some reason"
-        }
-
-    """
-
-    id = "hate_unfairness_multimodal"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
-
-    def __init__(self, credential, azure_ai_project):
-        self._async_evaluator = _AsyncHateUnfairnessMultimodalEvaluator(credential, azure_ai_project)
-
-    def __call__(self, *, conversation, **kwargs):
-        """
-        Evaluates hate unfairness content.
-
-        :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys. It supports single turn only.
-        :paramtype conversation: ~azure.ai.evaluation.Conversation
-        :return: The hate unfairness score.
-        :rtype: Dict[str, Union[float, str]]
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
-
-
-class _AsyncHateUnfairnessMultimodalEvaluator(ContentSafetyMultimodalEvaluatorBase):
-    def __init__(self, credential, azure_ai_project):
-        super().__init__(
-            metric=EvaluationMetrics.HATE_FAIRNESS,
-            credential=credential,
-            azure_ai_project=azure_ai_project,
-        )
-
-    async def __call__(self, *, conversation, **kwargs):
-        return await super().__call__(conversation=conversation, **kwargs)