azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +10 -0
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +7 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +165 -34
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +79 -1
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +73 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
- azure/ai/evaluation/_evaluate/_utils.py +117 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +976 -546
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_utils.py

@@ -10,6 +10,7 @@ from pathlib import Path
 from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64
+import math
 
 import pandas as pd
 from azure.ai.evaluation._legacy._adapters.entities import Run
@@ -126,6 +127,82 @@ def process_message_content(content, images_folder_path):
             f.write(image_data_binary)
     return None
 
+def _log_metrics_and_instance_results_onedp(
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    project_url: str,
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    **kwargs,
+) -> Optional[str]:
+
+    # One RP Client
+    from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+    from azure.ai.evaluation._constants import TokenScope
+    from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
+
+    credentials = AzureMLTokenManager(
+        TokenScope.COGNITIVE_SERVICES_MANAGEMENT.value, LOGGER, credential=kwargs.get("credential")
+    )
+    client = EvaluationServiceOneDPClient(
+        endpoint=project_url,
+        credential=credentials
+    )
+
+    # Massaging before artifacts are put on disk
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    artifact_name = "instance_results.jsonl"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # storing multi_modal images if exists
+        col_name = "inputs.conversation"
+        if col_name in instance_results.columns:
+            for item in instance_results[col_name].items():
+                value = item[1]
+                if "messages" in value:
+                    _store_multimodal_content(value["messages"], tmpdir)
+
+        # storing artifact result
+        tmp_path = os.path.join(tmpdir, artifact_name)
+
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+            f.write(instance_results.to_json(orient="records", lines=True))
+
+        properties = {
+            EvaluationRunProperties.RUN_TYPE: "eval_run",
+            EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+            EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+            "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+        }
+        properties.update(_convert_name_map_into_property_entries(name_map))
+
+        create_evaluation_result_response = client.create_evaluation_result(
+            name=uuid.uuid4(),
+            path=tmpdir,
+            metrics=metrics
+        )
+
+        upload_run_response = client.start_evaluation_run(
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+            )
+        )
+
+        update_run_response = client.update_evaluation_run(
+            name=upload_run_response.id,
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                status="Completed",
+                outputs={
+                    'evaluationResultId': create_evaluation_result_response.id,
+                },
+                properties=properties,
+            )
+        )
+
+        return update_run_response.properties.get("AiStudioEvaluationUri")
 
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
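The new OneDP logging path serializes the instance-results DataFrame to JSONL (one record per line) and registers it through EvaluationServiceOneDPClient. A minimal sketch of just the serialization step, with an illustrative column name (real frames carry inputs.* and outputs.* columns):

    import pandas as pd

    instance_results = pd.DataFrame({"outputs.coherence.coherence": [4, 2]})
    # Mirrors the helper above: line_number lets the UI link to individual rows.
    instance_results["line_number"] = instance_results.index.values
    print(instance_results.to_json(orient="records", lines=True))
    # {"outputs.coherence.coherence":4,"line_number":0}
    # {"outputs.coherence.coherence":2,"line_number":1}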
@@ -133,6 +210,7 @@ def _log_metrics_and_instance_results(
     trace_destination: Optional[str],
     run: Optional[Run],
     evaluation_name: Optional[str],
+    name_map: Dict[str, str],
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -187,14 +265,14 @@ def _log_metrics_and_instance_results(
         # adding these properties to avoid showing traces if a dummy run is created.
         # We are doing that only for the pure evaluation runs.
         if run is None:
-            ev_run.write_properties_to_run_history(
-                properties={
+            properties = {
                 EvaluationRunProperties.RUN_TYPE: "eval_run",
                 EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                 EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
             }
-            )
+            properties.update(_convert_name_map_into_property_entries(name_map))
+            ev_run.write_properties_to_run_history(properties=properties)
         else:
             ev_run.write_properties_to_run_history(
                 properties={
@@ -241,7 +319,7 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-        json.dump(data_dict, f)
+        json.dump(data_dict, f, ensure_ascii=False)
 
     print(f'Evaluation results saved to "{p.resolve()}".\n')
 
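The ensure_ascii=False change keeps non-ASCII characters in the saved results file readable instead of \uXXXX-escaped. A quick stdlib illustration:

    import json

    data = {"response": "café"}
    print(json.dumps(data))                      # {"response": "caf\u00e9"}
    print(json.dumps(data, ensure_ascii=False))  # {"response": "café"}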
@@ -329,6 +407,41 @@ def set_event_loop_policy() -> None:
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
 
+# textwrap.wrap tries to do fancy nonsense that we don't want
+def _wrap(s, w):
+    return [s[i:i + w] for i in range(0, len(s), w)]
+
+def _convert_name_map_into_property_entries(
+    name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
+) -> Dict[str, Any]:
+    """
+    Convert the name map into property entries.
+
+    :param name_map: The name map to be converted.
+    :type name_map: Dict[str, str]
+    :param segment_length: The max length of each individual segment,
+        which will each have their own dictionary entry
+    :type segment_length: str
+    :param max_segments: The max number of segments we can have. If the stringified
+        name map is too long, we just return a length entry with a value
+        of -1 to indicate that the map was too long.
+    :type max_segments: str
+    :return: The converted name map.
+    :rtype: Dict[str, Any]
+    """
+    name_map_string = json.dumps(name_map)
+    num_segments = math.ceil(len(name_map_string) / segment_length)
+    # Property map is somehow still too long to encode within the space
+    # we allow, so give up, but make sure the service knows we gave up
+    if (num_segments > max_segments):
+        return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
+
+    result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
+    segments_list = _wrap(name_map_string, segment_length)
+    for i in range(0, num_segments):
+        segment_key = f"{EvaluationRunProperties.NAME_MAP}_{i}"
+        result[segment_key] = segments_list[i]
+    return result
 
 class JSONLDataFileLoader:
     def __init__(self, filename: Union[os.PathLike, str]):
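The segmentation exists because run-history property values are length-capped, so the evaluator name map is JSON-encoded and chunked into numbered properties, with a count property recording how many segments were written (or -1 when the map was too large to store at all). A sketch of how a consumer could reassemble it; the two key arguments stand in for the EvaluationRunProperties.NAME_MAP and NAME_MAP_LENGTH constants, whose string values are defined in _constants.py and not shown in this diff:

    import json

    def reassemble_name_map(properties: dict, name_map_key: str, length_key: str) -> dict:
        """Rejoin the segmented name-map properties written by the helper above."""
        num_segments = properties[length_key]
        if num_segments == -1:
            return {}  # the map was too long to encode and was dropped
        joined = "".join(properties[f"{name_map_key}_{i}"] for i in range(num_segments))
        return json.loads(joined)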
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -33,7 +33,17 @@ class BleuScoreEvaluator(EvaluatorBase):
             :end-before: [END bleu_score_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call an BleuScoreEvaluator.
+            :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
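For reference, a minimal call matching the shape of the examples the captions point at; BLEU is computed locally and needs no Azure project, and the exact result keys are an assumption since the sample files are not part of this diff:

    from azure.ai.evaluation import BleuScoreEvaluator

    bleu = BleuScoreEvaluator()
    result = bleu(response="Tokyo is the capital of Japan.",
                  ground_truth="The capital of Japan is Tokyo.")
    print(result)  # e.g. {"bleu_score": ...}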
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -62,7 +62,15 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :end-before: [END code_vulnerability_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START code_vulnerability_evaluator]
+            :end-before: [END code_vulnerability_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
 
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -31,7 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :end-before: [END coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
 
@@ -40,7 +50,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :end-before: [END threshold_coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and
+            :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
 
     .. note::
 
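A hedged sketch of the threshold usage the new caption describes; the model_config shape assumes an Azure OpenAI judge deployment:

    import os
    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
    }
    coherence = CoherenceEvaluator(model_config=model_config, threshold=2)
    result = coherence(query="What is the capital of Japan?",
                       response="The capital of Japan is Tokyo.")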
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -86,6 +86,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
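These class constants standardize the strings evaluators emit for binary results. A hedged sketch of the kind of score-to-result mapping they support; the base class's actual aggregation logic is not shown in this diff, and the import path is the private module from the file list:

    from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase

    def score_to_result(score: float, threshold: float, higher_is_better: bool) -> str:
        # Uses the constants added above on EvaluatorBase.
        passed = score >= threshold if higher_is_better else score <= threshold
        return EvaluatorBase._PASS_RESULT if passed else EvaluatorBase._FAIL_RESULT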
@@ -495,7 +499,8 @@ class AsyncEvaluatorBase:
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
     async def __call__(
         self, *, query=None, response=None, context=None, conversation=None, ground_truth=None,
-
+        tool_calls=None, tool_definitions=None, messages=None, retrieval_ground_truth=None,
+        retrieved_documents=None,**kwargs
     ):
         if conversation is not None:
             kwargs["conversation"] = conversation
@@ -509,11 +514,15 @@ class AsyncEvaluatorBase:
             kwargs["context"] = context
         if ground_truth is not None:
             kwargs["ground_truth"] = ground_truth
-        if
-        kwargs["
+        if tool_calls is not None:
+            kwargs["tool_calls"] = tool_calls
         if tool_definitions is not None:
             kwargs["tool_definitions"] = tool_definitions
         if messages is not None:
             kwargs["messages"] = messages
+        if retrieval_ground_truth is not None:
+            kwargs["retrieval_ground_truth"] = retrieval_ground_truth
+        if retrieved_documents is not None:
+            kwargs["retrieved_documents"] = retrieved_documents
 
         return await self._real_call(**kwargs)
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -4,9 +4,13 @@
 
 import math
 import re
+import os
 from typing import Dict, TypeVar, Union
 
-from promptflow.core._flow import AsyncPrompty
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
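This import toggle makes the in-package legacy prompty engine the default, with promptflow's AsyncPrompty available behind an opt-in environment variable. Because the branch runs at import time, the variable must be set before azure.ai.evaluation evaluator modules are imported:

    import os

    # Opt back into the promptflow implementation (the default is now the
    # legacy in-package engine).
    os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"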
@@ -39,13 +43,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
         Useful since some evaluators of this format are response-only.
     :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
     """
 
     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+                 threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
         self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
         self._threshold = threshold
         self._higher_is_better = _higher_is_better
@@ -59,7 +67,8 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
             user_agent,
         )
 
-        self._flow = AsyncPrompty.load(source=
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
+                                       is_reasoning_model=self._is_reasoning_model)
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
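The is_reasoning_model flag introduced above is read from kwargs and forwarded to AsyncPrompty.load, which per the docstring adjusts the prompty config parameters for reasoning models. A hedged sketch of opting in from a concrete prompty-based evaluator, assuming the subclass forwards **kwargs to PromptyEvaluatorBase:

    from azure.ai.evaluation import CoherenceEvaluator

    # is_reasoning_model is described as in preview; model_config is assumed
    # to point at a reasoning-capable deployment (same shape as the earlier sketch).
    coherence = CoherenceEvaluator(model_config=model_config, is_reasoning_model=True)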
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -12,7 +12,7 @@ from azure.ai.evaluation._common.constants import (
     _InternalAnnotationTasks,
 )
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import EvaluationException
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.ai.evaluation._constants import _AggregationType
@@ -50,7 +50,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     def __init__(
         self,
         eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
-        azure_ai_project: dict,
+        azure_ai_project: Union[dict, str],
         credential: TokenCredential,
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
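Widening azure_ai_project to Union[dict, str] is what lets every RaiServiceEvaluatorBase subclass accept either the classic scoped-project dict or the new project URL that the updated docstrings advertise. A hedged sketch of both forms, using ViolenceEvaluator as a representative subclass:

    from azure.ai.evaluation import ViolenceEvaluator
    from azure.identity import DefaultAzureCredential

    credential = DefaultAzureCredential()

    # Classic AzureAIProject dict
    violence = ViolenceEvaluator(credential=credential, azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    })

    # New project URL, in the format the docstrings in this diff describe
    violence = ViolenceEvaluator(
        credential=credential,
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    )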
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -44,7 +44,17 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :end-before: [END content_safety_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START content_safety_evaluator]
+            :end-before: [END content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
     .. admonition:: Example with Threshold:
 
@@ -53,7 +63,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :end-before: [END threshold_content_safety_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and call a ContentSafetyEvaluator.
+            :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
     id = "content_safety"
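A hedged end-to-end sketch matching the new docstring example; the project URL is a placeholder in the format shown above:

    from azure.ai.evaluation import ContentSafetyEvaluator
    from azure.identity import DefaultAzureCredential

    safety = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    )
    result = safety(query="What is the capital of France?", response="Paris.")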
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -58,16 +58,26 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :end-before: [END hate_unfairness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
-
-    .. admonition:: Example
+            :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START hate_unfairness_evaluator]
+            :end-before: [END hate_unfairness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
+    .. admonition:: Example with Threshold:
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_hate_unfairness_evaluator]
             :end-before: [END threshold_hate_unfairness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and call a HateUnfairnessEvaluator.
+            :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
     id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -52,16 +52,17 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :end-before: [END self_harm_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
-
-    .. admonition:: Example
-
-        .. literalinclude:: ../samples/
-            :start-after: [START
-            :end-before: [END
+            :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START self_harm_evaluator]
+            :end-before: [END self_harm_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize
+            :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
     id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -56,6 +56,16 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a SexualEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START sexual_evaluator]
+            :end-before: [END sexual_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -56,6 +56,16 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a ViolenceEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START violence_evaluator]
+            :end-before: [END violence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py (new file)

@@ -0,0 +1,11 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._document_retrieval import DocumentRetrievalEvaluator, RetrievalGroundTruthDocument, RetrievedDocument
+
+__all__ = [
+    "DocumentRetrievalEvaluator",
+    "RetrievalGroundTruthDocument",
+    "RetrievedDocument"
+]
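A hedged usage sketch for the newly exported evaluator, assuming it is re-exported from the package root (the azure/ai/evaluation/__init__.py change in the file list suggests so). The input names retrieval_ground_truth and retrieved_documents come from the AsyncEvaluatorBase.__call__ changes earlier in this diff; the per-document field names are assumptions, since _document_retrieval.py itself is not reproduced here:

    from azure.ai.evaluation import DocumentRetrievalEvaluator

    doc_retrieval = DocumentRetrievalEvaluator()
    result = doc_retrieval(
        retrieval_ground_truth=[{"document_id": "doc-1", "query_relevance_label": 3}],
        retrieved_documents=[{"document_id": "doc-1", "relevance_score": 0.9}],
    )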