azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List, Optional, TypedDict, cast
|
|
6
|
+
|
|
7
|
+
from typing_extensions import NotRequired
|
|
8
|
+
|
|
9
|
+
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
10
|
+
|
|
11
|
+
from ._rai_client import RAIClient
|
|
12
|
+
|
|
13
|
+
# Names of the adversarial content harm template collections.
# AdversarialTemplateHandler.get_template only returns a template for a name
# that is a member of this set; any other name yields None.
CONTENT_HARM_TEMPLATES_COLLECTION_KEY = {
    "adv_qa",
    "adv_conversation",
    "adv_summarization",
    "adv_search",
    "adv_rewrite",
    "adv_content_gen_ungrounded",
    "adv_content_gen_grounded",
    "adv_content_protected_material",
    "adv_politics",
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TemplateParameters(TypedDict):
    """Parameters used in Templates

    .. note::

        This type is good enough to type check, but is incorrect. It's meant to represent a dictionary with a known
        `metadata` key (Dict[str, str]), a known `ch_template_placeholder` key (str), and an unknown number of keys
        that map to `str` values.

        In TypeScript, this type would be spelled:

        .. code-block:: typescript

            type AdversarialTemplateParameters = {
                [key: string]: string
                ch_template_placeholder: string
                metadata: {[index: string]: string} # Doesn't typecheck but gets the point across
            }

        At time of writing, this isn't possible to express with a TypedDict. TypedDicts must be "closed" in that
        they fully specify all the keys they can contain.

        `PEP 728 – TypedDict with Typed Extra Items <https://peps.python.org/pep-0728/>`_ is a proposal to support
        this, but would only be available in Python 3.13 at the earliest.
    """

    # Keys that every parameter set is declared to carry.
    metadata: Dict[str, str]
    conversation_starter: str
    # AdversarialTemplateHandler overwrites this with the literal "{{ch_template_placeholder}}"
    # on every parameter set it hands to an AdversarialTemplate.
    ch_template_placeholder: str
    # Keys declared NotRequired: a given parameter set may omit any of them.
    group_of_people: NotRequired[str]
    category: NotRequired[str]
    target_population: NotRequired[str]
    topic: NotRequired[str]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class _CategorizedParameter(TypedDict):
    """One cache entry built by AdversarialTemplateHandler for a single content harm dataset.

    Entries are stored in a dictionary keyed by the template key that
    ``ContentHarmTemplatesUtils.get_template_key`` derives from the dataset key.
    """

    # Parameter sets the RAI client returned for this dataset.
    parameters: List[TemplateParameters]
    # Result of ContentHarmTemplatesUtils.get_template_category for the dataset key.
    category: str
    # The dataset key exactly as returned by the RAI client.
    parameters_key: str
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ContentHarmTemplatesUtils:
    """Content harm templates utility functions."""

    @staticmethod
    def get_template_category(key: str) -> str:
        """Parse category from template key

        :param key: The template key
        :type key: str
        :return: The category
        :rtype: str
        """
        # Check for datasets whose names do not align with the normal
        # naming convention where the first segment of the name is the category.
        if key == "conversation/public/ip/bing_ip.json":
            return "content_protected_material"
        return key.split("/")[0]

    @staticmethod
    def get_template_key(key: str) -> str:
        """Given a template dataset name (which looks like a .json file name) convert it into
        the corresponding template key (which looks like a .md file name). This allows us to
        properly link datasets to the LLM that must be used to simulate them.

        Only a trailing ``.json`` extension is removed; any ``.json`` that appears earlier in
        the key (for example inside a directory name) is left untouched.

        :param key: The dataset key.
        :type key: str
        :return: The template key.
        :rtype: str
        """
        json_suffix = ".json"
        # Strip the extension by suffix rather than by splitting: splitting on ".json" cuts
        # the key at its first occurrence and silently drops the rest of the path.
        filepath = key[: -len(json_suffix)] if key.endswith(json_suffix) else key

        # Only the final path segment is a file name; the directory part is kept verbatim.
        directory, separator, filename = filepath.rpartition("/")
        return directory + separator + ContentHarmTemplatesUtils.json_name_to_md_name(filename)

    @staticmethod
    def json_name_to_md_name(name: str) -> str:
        """Convert JSON filename to Markdown filename

        Every occurrence of ``_aml`` is removed from the name before ``.md`` is appended.

        :param name: The JSON filename, without its ``.json`` extension
        :type name: str
        :return: The Markdown filename
        :rtype: str
        """
        return name.replace("_aml", "") + ".md"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class AdversarialTemplate:
    """Container describing a single adversarial scenario template.

    :param template_name: The name of the template.
    :type template_name: str
    :param text: The template text.
    :type text: Optional[str]
    :param context_key: The context key.
    :type context_key: List
    :param template_parameters: The template parameters. A missing or empty value is
        replaced by a new empty list.
    :type template_parameters: Optional[List[TemplateParameters]]
    """

    def __init__(
        self,
        template_name: str,
        text: Optional[str],
        context_key: List,
        template_parameters: Optional[List[TemplateParameters]] = None,
    ) -> None:
        self.template_name = template_name
        self.text = text
        self.context_key = context_key
        # Any falsy argument (None or an empty list) ends up as a fresh empty list.
        self.template_parameters = template_parameters if template_parameters else []

    def __str__(self) -> str:
        # The string form is always the literal placeholder, whatever the fields hold.
        placeholder = "{{ch_template_placeholder}}"
        return placeholder
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class AdversarialTemplateHandler:
    """
    Builds adversarial templates from the content harm parameters served by the RAI client.

    :param azure_ai_project: The Azure AI project.
    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
    :param rai_client: The RAI client.
    :type rai_client: ~azure.ai.evaluation.simulator._model_tools.RAIClient
    """

    def __init__(self, azure_ai_project: AzureAIProject, rai_client: RAIClient) -> None:
        self.rai_client = rai_client
        self.azure_ai_project = azure_ai_project
        # Stays None until the first call to _get_content_harm_template_collections,
        # which fills it once and reuses it on later calls.
        self.categorized_ch_parameters: Optional[Dict[str, _CategorizedParameter]] = None

    async def _get_content_harm_template_collections(self, collection_key: str) -> List[AdversarialTemplate]:
        """Build one template per cached dataset whose category matches the collection key.

        On the first call the content harm parameters are fetched from the RAI client and
        cached on the handler, keyed by template key. Each matching parameter set has its
        ``ch_template_placeholder`` entry set to the literal placeholder string in place.

        :param collection_key: The collection key; the text after its last ``adv_`` is
            compared against each dataset's category.
        :type collection_key: str
        :return: The templates for the matching datasets, each with no text and an empty context key.
        :rtype: List[~azure.ai.evaluation.simulator._model_tools.AdversarialTemplate]
        """
        if self.categorized_ch_parameters is None:
            helpers = ContentHarmTemplatesUtils
            raw_parameters = await self.rai_client.get_contentharm_parameters()

            by_template: Dict[str, _CategorizedParameter] = {}
            for dataset_key in raw_parameters.keys():
                template_key = helpers.get_template_key(dataset_key)
                entry: _CategorizedParameter = {
                    "parameters": cast(List[TemplateParameters], raw_parameters[dataset_key]),
                    "category": helpers.get_template_category(dataset_key),
                    "parameters_key": dataset_key,
                }
                by_template[template_key] = entry
            self.categorized_ch_parameters = by_template

        wanted_category = collection_key.split("adv_")[-1]

        templates = []
        for template_name, cached in self.categorized_ch_parameters.items():
            if cached["category"] != wanted_category:
                continue

            parameter_sets = cached["parameters"]
            # The cached parameter sets themselves are modified, not copies of them.
            for parameter_set in parameter_sets:
                parameter_set["ch_template_placeholder"] = "{{ch_template_placeholder}}"

            templates.append(
                AdversarialTemplate(
                    template_name=template_name,
                    text=None,
                    context_key=[],
                    template_parameters=parameter_sets,
                )
            )
        return templates

    def get_template(self, template_name: str) -> Optional[AdversarialTemplate]:
        """Generate content harm template.

        :param template_name: The name of the template.
        :type template_name: str
        :return: The generated content harm template, or None when the name is not one of
            the known content harm collection keys.
        :rtype: Optional[~azure.ai.evaluation.simulator._model_tools.AdversarialTemplate]
        """
        if template_name not in CONTENT_HARM_TEMPLATES_COLLECTION_KEY:
            return None
        return AdversarialTemplate(template_name=template_name, text=None, context_key=[], template_parameters=None)
|