azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
|
@@ -4,37 +4,35 @@
|
|
|
4
4
|
# pylint: disable=C0103,C0114,C0116,E0401,E0611
|
|
5
5
|
|
|
6
6
|
import functools
|
|
7
|
-
from typing import Callable, TypeVar
|
|
8
7
|
|
|
9
8
|
from promptflow._sdk._telemetry.activity import ActivityType, monitor_operation
|
|
10
|
-
from typing_extensions import ParamSpec
|
|
11
|
-
|
|
12
|
-
P = ParamSpec("P")
|
|
13
|
-
R = TypeVar("R")
|
|
14
9
|
|
|
15
10
|
|
|
16
11
|
def monitor_adversarial_scenario(activity_name: str = "adversarial.simulator.call"):
|
|
17
12
|
"""
|
|
18
13
|
Monitor an adversarial scenario.
|
|
19
14
|
|
|
20
|
-
:
|
|
21
|
-
|
|
22
|
-
:returns: A decorator
|
|
23
|
-
:rtype: Callable[[Callable], Callable]
|
|
15
|
+
Parameters:
|
|
16
|
+
activity_name (str): The name of the activity to monitor.
|
|
24
17
|
"""
|
|
25
18
|
|
|
26
|
-
def decorator(func
|
|
19
|
+
def decorator(func):
|
|
27
20
|
"""
|
|
28
21
|
Decorator for monitoring an adversarial scenario.
|
|
29
22
|
|
|
30
|
-
:
|
|
31
|
-
|
|
32
|
-
:returns: The decorated function
|
|
33
|
-
:rtype: Callable[P, R]
|
|
23
|
+
Parameters:
|
|
24
|
+
func (function): The function to be decorated.
|
|
34
25
|
"""
|
|
35
26
|
|
|
36
27
|
@functools.wraps(func)
|
|
37
|
-
def wrapper(*args
|
|
28
|
+
def wrapper(*args, **kwargs):
|
|
29
|
+
"""
|
|
30
|
+
Wrapper for monitoring an adversarial scenario.
|
|
31
|
+
|
|
32
|
+
Parameters:
|
|
33
|
+
*args: Variable length argument list.
|
|
34
|
+
**kwargs: Arbitrary keyword arguments.
|
|
35
|
+
"""
|
|
38
36
|
scenario = str(kwargs.get("scenario", None))
|
|
39
37
|
max_conversation_turns = kwargs.get("max_conversation_turns", None)
|
|
40
38
|
max_simulation_results = kwargs.get("max_simulation_results", None)
|
|
@@ -57,28 +55,33 @@ def monitor_adversarial_scenario(activity_name: str = "adversarial.simulator.cal
|
|
|
57
55
|
return decorator
|
|
58
56
|
|
|
59
57
|
|
|
60
|
-
def monitor_task_simulator(func
|
|
58
|
+
def monitor_task_simulator(func):
|
|
61
59
|
"""
|
|
62
60
|
Monitor a task simulator.
|
|
63
61
|
|
|
64
|
-
:
|
|
65
|
-
|
|
66
|
-
:returns: The decorated function
|
|
67
|
-
:rtype: Callable[P, R]
|
|
62
|
+
Parameters:
|
|
63
|
+
func (function): The function to be decorated.
|
|
68
64
|
"""
|
|
69
65
|
|
|
70
66
|
@functools.wraps(func)
|
|
71
|
-
def wrapper(*args
|
|
72
|
-
|
|
73
|
-
|
|
67
|
+
def wrapper(*args, **kwargs):
|
|
68
|
+
"""
|
|
69
|
+
Wrapper for monitoring a task simulator.
|
|
70
|
+
|
|
71
|
+
Parameters:
|
|
72
|
+
*args: Variable length argument list.
|
|
73
|
+
**kwargs: Arbitrary keyword arguments.
|
|
74
|
+
"""
|
|
75
|
+
text_length = len(kwargs.get("text", ""))
|
|
76
|
+
user_persona_length = len(kwargs.get("user_persona", []))
|
|
74
77
|
num_queries = kwargs.get("num_queries", 0)
|
|
75
78
|
max_conversation_turns = kwargs.get("max_conversation_turns", 0)
|
|
76
79
|
decorated_func = monitor_operation(
|
|
77
80
|
activity_name="task.simulator.call",
|
|
78
81
|
activity_type=ActivityType.PUBLICAPI,
|
|
79
82
|
custom_dimensions={
|
|
80
|
-
"text_length":
|
|
81
|
-
"user_persona_length":
|
|
83
|
+
"text_length": text_length,
|
|
84
|
+
"user_persona_length": user_persona_length,
|
|
82
85
|
"number_of_queries": num_queries,
|
|
83
86
|
"max_conversation_turns": max_conversation_turns,
|
|
84
87
|
},
|
|
@@ -26,9 +26,9 @@ class JsonLineList(list):
|
|
|
26
26
|
json_lines += json.dumps(item) + "\n"
|
|
27
27
|
return json_lines
|
|
28
28
|
|
|
29
|
-
def
|
|
29
|
+
def to_eval_qa_json_lines(self):
|
|
30
30
|
"""
|
|
31
|
-
Converts the list to a string of JSON lines suitable for evaluation in a
|
|
31
|
+
Converts the list to a string of JSON lines suitable for evaluation in a Q&A format.
|
|
32
32
|
Each item in the list is expected to be a dictionary with
|
|
33
33
|
'messages' key. The 'messages' value is a list of
|
|
34
34
|
dictionaries, each with a 'role' key and a 'content' key.
|
|
@@ -44,41 +44,23 @@ class JsonLineList(list):
|
|
|
44
44
|
for item in self:
|
|
45
45
|
user_message = None
|
|
46
46
|
assistant_message = None
|
|
47
|
-
|
|
48
|
-
assistant_context = None
|
|
49
|
-
template_parameters = item.get("template_parameters", {})
|
|
50
|
-
category = template_parameters.get("category", None)
|
|
47
|
+
context = None
|
|
51
48
|
for message in item["messages"]:
|
|
52
49
|
if message["role"] == "user":
|
|
53
50
|
user_message = message["content"]
|
|
54
|
-
user_context = message.get("context", "")
|
|
55
51
|
elif message["role"] == "assistant":
|
|
56
52
|
assistant_message = message["content"]
|
|
57
|
-
|
|
53
|
+
if "context" in message:
|
|
54
|
+
context = message.get("context", None)
|
|
58
55
|
if user_message and assistant_message:
|
|
59
|
-
if
|
|
56
|
+
if context:
|
|
60
57
|
json_lines += (
|
|
61
|
-
json.dumps(
|
|
62
|
-
{
|
|
63
|
-
"query": user_message,
|
|
64
|
-
"response": assistant_message,
|
|
65
|
-
"context": str(
|
|
66
|
-
{
|
|
67
|
-
"user_context": user_context,
|
|
68
|
-
"assistant_context": assistant_context,
|
|
69
|
-
}
|
|
70
|
-
),
|
|
71
|
-
"category": category,
|
|
72
|
-
}
|
|
73
|
-
)
|
|
58
|
+
json.dumps({"query": user_message, "response": assistant_message, "context": context})
|
|
74
59
|
+ "\n"
|
|
75
60
|
)
|
|
76
|
-
user_message = assistant_message = None
|
|
61
|
+
user_message = assistant_message = context = None
|
|
77
62
|
else:
|
|
78
|
-
json_lines += (
|
|
79
|
-
json.dumps({"query": user_message, "response": assistant_message, "category": category})
|
|
80
|
-
+ "\n"
|
|
81
|
-
)
|
|
63
|
+
json_lines += json.dumps({"query": user_message, "response": assistant_message}) + "\n"
|
|
82
64
|
user_message = assistant_message = None
|
|
83
65
|
|
|
84
66
|
return json_lines
|
|
@@ -98,13 +80,10 @@ class JsonLineChatProtocol(dict):
|
|
|
98
80
|
"""
|
|
99
81
|
return json.dumps(self)
|
|
100
82
|
|
|
101
|
-
def
|
|
83
|
+
def to_eval_qa_json_lines(self):
|
|
102
84
|
"""
|
|
103
|
-
Converts the object to a string of JSON lines suitable for evaluation in a
|
|
85
|
+
Converts the object to a string of JSON lines suitable for evaluation in a Q&A format.
|
|
104
86
|
The object is expected to be a dictionary with 'messages' key.
|
|
105
|
-
|
|
106
|
-
:returns: A json lines document
|
|
107
|
-
:rtype: str
|
|
108
87
|
"""
|
|
109
88
|
user_message = None
|
|
110
89
|
assistant_message = None
|
|
@@ -123,10 +102,10 @@ class JsonLineChatProtocol(dict):
|
|
|
123
102
|
if user_message and assistant_message:
|
|
124
103
|
if context:
|
|
125
104
|
json_lines += (
|
|
126
|
-
json.dumps({"
|
|
105
|
+
json.dumps({"question": user_message, "answer": assistant_message, "context": context}) + "\n"
|
|
127
106
|
)
|
|
128
107
|
user_message = assistant_message = None
|
|
129
108
|
else:
|
|
130
|
-
json_lines += json.dumps({"
|
|
109
|
+
json_lines += json.dumps({"question": user_message, "answer": assistant_message}) + "\n"
|
|
131
110
|
user_message = assistant_message = None
|
|
132
111
|
return json_lines
|