azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,89 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+# pylint: disable=C0103,C0114,C0116,E0401,E0611
+
+import functools
+from typing import Callable, TypeVar
+
+from promptflow._sdk._telemetry.activity import ActivityType, monitor_operation
+from typing_extensions import ParamSpec
+
+P = ParamSpec("P")
+R = TypeVar("R")
+
+
+def monitor_adversarial_scenario(activity_name: str = "adversarial.simulator.call"):
+    """
+    Monitor an adversarial scenario.
+
+    :param activity_name: The name of the activity to monitor.
+    :type activity_name: str
+    :returns: A decorator
+    :rtype: Callable[[Callable], Callable]
+    """
+
+    def decorator(func: Callable[P, R]) -> Callable[P, R]:
+        """
+        Decorator for monitoring an adversarial scenario.
+
+        :param func: The function to be decorated.
+        :type func: Callable[P, R]
+        :returns: The decorated function
+        :rtype: Callable[P, R]
+        """
+
+        @functools.wraps(func)
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+            scenario = str(kwargs.get("scenario", None))
+            max_conversation_turns = kwargs.get("max_conversation_turns", None)
+            max_simulation_results = kwargs.get("max_simulation_results", None)
+            jailbreak = kwargs.get("jailbreak", None)
+            decorated_func = monitor_operation(
+                activity_name=activity_name,
+                activity_type=ActivityType.PUBLICAPI,
+                custom_dimensions={
+                    "scenario": scenario,
+                    "max_conversation_turns": max_conversation_turns,
+                    "max_simulation_results": max_simulation_results,
+                    "jailbreak": jailbreak,
+                },
+            )(func)
+
+            return decorated_func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
+def monitor_task_simulator(func: Callable[P, R]) -> Callable[P, R]:
+    """
+    Monitor a task simulator.
+
+    :param func: The function to be decorated.
+    :type func: Callable[P, R]
+    :returns: The decorated function
+    :rtype: Callable[P, R]
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+        text = kwargs.get("text")
+        user_persona = kwargs.get("user_persona")
+        num_queries = kwargs.get("num_queries", 0)
+        max_conversation_turns = kwargs.get("max_conversation_turns", 0)
+        decorated_func = monitor_operation(
+            activity_name="task.simulator.call",
+            activity_type=ActivityType.PUBLICAPI,
+            custom_dimensions={
+                "text_length": len(text) if isinstance(text, str) else 0,
+                "user_persona_length": len(user_persona) if isinstance(user_persona, list) else 0,
+                "number_of_queries": num_queries,
+                "max_conversation_turns": max_conversation_turns,
+            },
+        )(func)
+
+        return decorated_func(*args, **kwargs)
+
+    return wrapper

@@ -0,0 +1,132 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""
+This module contains a utility class for managing a list of JSON lines.
+"""
+import json
+
+
+class JsonLineList(list):
+    """
+    A util to manage a list of JSON lines.
+    """
+
+    def to_json_lines(self):
+        """
+        Converts the list to a string of JSON lines.
+        Each item in the list is converted to a JSON string
+        and appended to the result string with a newline.
+
+        :returns: A string of JSON lines, where each line is a JSON representation of an item in the list.
+        :rtype: str
+        """
+        json_lines = ""
+        for item in self:
+            json_lines += json.dumps(item) + "\n"
+        return json_lines
+
+    def to_eval_qr_json_lines(self):
+        """
+        Converts the list to a string of JSON lines suitable for evaluation in a query & response format.
+        Each item in the list is expected to be a dictionary with
+        'messages' key. The 'messages' value is a list of
+        dictionaries, each with a 'role' key and a 'content' key.
+        The 'role' value should be either 'user' or 'assistant',
+        and the 'content' value should be a string.
+        If a 'context' key is present in the message, its value is also included
+        in the output.
+
+        :returns: A string of JSON lines.
+        :rtype: str
+        """
+        json_lines = ""
+        for item in self:
+            user_message = None
+            assistant_message = None
+            user_context = None
+            assistant_context = None
+            template_parameters = item.get("template_parameters", {})
+            category = template_parameters.get("category", None)
+            for message in item["messages"]:
+                if message["role"] == "user":
+                    user_message = message["content"]
+                    user_context = message.get("context", "")
+                elif message["role"] == "assistant":
+                    assistant_message = message["content"]
+                    assistant_context = message.get("context", "")
+                if user_message and assistant_message:
+                    if user_context or assistant_context:
+                        json_lines += (
+                            json.dumps(
+                                {
+                                    "query": user_message,
+                                    "response": assistant_message,
+                                    "context": str(
+                                        {
+                                            "user_context": user_context,
+                                            "assistant_context": assistant_context,
+                                        }
+                                    ),
+                                    "category": category,
+                                }
+                            )
+                            + "\n"
+                        )
+                        user_message = assistant_message = None
+                    else:
+                        json_lines += (
+                            json.dumps({"query": user_message, "response": assistant_message, "category": category})
+                            + "\n"
+                        )
+                        user_message = assistant_message = None
+
+        return json_lines
+
+
+class JsonLineChatProtocol(dict):
+    """
+    A util to manage a JSON object that follows the chat protocol.
+    """
+
+    def to_json(self):
+        """
+        Converts the object to a JSON string.
+
+        :returns: A JSON representation of the object.
+        :rtype: str
+        """
+        return json.dumps(self)
+
+    def to_eval_qr_json_lines(self) -> str:
+        """
+        Converts the object to a string of JSON lines suitable for evaluation in a query and response format.
+        The object is expected to be a dictionary with 'messages' key.
+
+        :returns: A json lines document
+        :rtype: str
+        """
+        user_message = None
+        assistant_message = None
+        if "context" in self:
+            context = self["context"]
+        else:
+            context = None
+        json_lines = ""
+        for message in self["messages"]:
+            if message["role"] == "user":
+                user_message = message["content"]
+            elif message["role"] == "assistant":
+                assistant_message = message["content"]
+            if "context" in message and message["context"] is not None:
+                context = message.get("context", context)
+            if user_message and assistant_message:
+                if context:
+                    json_lines += (
+                        json.dumps({"query": user_message, "response": assistant_message, "context": context}) + "\n"
+                    )
+                    user_message = assistant_message = None
+                else:
+                    json_lines += json.dumps({"query": user_message, "response": assistant_message}) + "\n"
+                    user_message = assistant_message = None
+        return json_lines