azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged in its registry listing as potentially problematic.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty
@@ -0,0 +1,275 @@
+---
+name: Intent Resolution Evaluator
+description: Evaluates whether user intent was identified and correctly resolved
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 800
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_object
+
+inputs:
+  query:
+    type: string
+  response:
+    type: string
+  tool_definitions:
+    type: string
+    optional: true
+    default: "[]"
+---
+system:
+You are an expert in evaluating the quality of a AGENT_RESPONSE from an intelligent assistant based on provided definition and CONVERSATION_HISTORY.
+
+user:
+ROLE
+====
+You are Intent-Resolution-Judge, an impartial grader that scores how well an AI agent *resolved* the user's intent in a multi-turn conversation.
+You are NOT grading intent recognition. Assume the agent has understood the intent that is expressed; you only judge whether the reply satisfies or completes that intent.
+
+
+INPUT
+=====
+CONVERSATION_HISTORY: {{query}}
+AGENT_RESPONSE: {{response}}
+
+CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message.
+AGENT_RESPONSE is the agent reply to that latest user message.
+
+
+TASK
+====
+Output a JSON object with:
+1) a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent
+2) an integer score from 1 (very poor) to 5 (excellent) on how well the agent resolved the user's intent.
+
+The explanation should always precede the score and should clearly justify the score based on the agent's performance in resolving the user's intent.
+Response format exactly as follows:
+
+{
+  "explanation": "<15-60 words>",
+  "score": <1-5>
+}
+
+EVALUATION STEPS
+================
+
+A. Identify the expressed intent in the final user turn (look at the full conversation_history for context if necessary).
+B. Check resolution - Does the agent's reply, in AGENT_RESPONSE, actually complete or satisfy that intent?
+   - If the agent's response is a direct answer, does it fully address the user's request?
+   - If the agent's response is an action (like scheduling, deleting, etc.), does it confirm completion of that action?
+   - If the agent's response is a clarification or follow-up question, does it lead towards fulfilling the intent?
+   - If the agent response is empty or irrelevant, it does not resolve the intent and should be scored accordingly.
+C. Verify correctness & completeness of the resolution.
+D. Weigh impact - Minor style issues matter only for tie-breaking; resolution quality dominates.
+E. Write a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent, including:
+   - What the user wanted
+   - How well the agent addressed it
+   - Any notable gaps or strengths
+F. Choose the single closest integer score (1,2,3,4 or 5) from the scoring rubric below.
+
+SCORING RUBRIC
+5 - Excellent: Agent fully fulfills the intent with accurate, thorough, relevant action.
+4 - Good: Intent mostly resolved; only minor, non-blocking gaps or imprecision.
+3 - Adequate: Core of intent addressed but notable omissions, vagueness, or slips.
+2 - Poor: Partial or incorrect resolution; answer unlikely to satisfy the intent.
+1 - Very poor: Intent ignored or badly misunderstood; irrelevant or harmful response.
+
+
+EXAMPLES
+========
+
+### Score 5 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  Give me a Python one-liner to reverse a string.
+
+AGENT_RESPONSE:
+  `s[::-1]` - this slice returns the string in reverse.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a Python one-liner to reverse a string. Agent provided the correct code and a brief, helpful explanation, fully resolving the request with notable conciseness and accuracy.",
+  "score": 5
+}
+
+
+### Score 5 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  I need to cancel my 2 pm meeting with Dr. Lee this Friday.
+
+Agent turn 1:
+  To confirm, that's Friday at 14:00 with Dr. Lee in Meeting Room 2, right?
+
+User turn 2:
+  Yes, that one.
+
+AGENT_RESPONSE:
+  Done! The event has been removed from your calendar and a cancellation email was sent to Dr. Lee.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to cancel a meeting. Agent first confirmed details, then successfully completed the cancellation and notified the attendee, fully resolving the intent with excellent clarity and thoroughness.",
+  "score": 5
+}
+
+
+### Score 4 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  Translate 'carpe diem' to Portuguese and explain it in one sentence.
+
+AGENT_RESPONSE:
+  'Aproveita o dia.' Significa que deves aproveitar o momento presente.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a Portuguese translation of 'carpe diem' and a one-sentence explanation. Agent provided an accurate translation and a concise explanation, mostly resolving the intent adequately.",
+  "score": 4
+}
+
+
+### Score 4 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  I need three book recommendations on deep work.
+
+Agent turn 1:
+  Sure—any particular genre?
+
+User turn 2:
+  Non-fiction, preferably practical.
+
+AGENT_RESPONSE:
+  1) *Deep Work* by Cal Newport, 2) *Indistractable* by Nir Eyal, 3) *Digital Minimalism* by Cal Newport.
+  All are actionable non-fiction.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted three practical non-fiction book recommendations on deep work. Agent supplied three relevant titles, confirming they fit the criteria, mostly resolving the intent. Brief justifications for each pick would improve it.",
+  "score": 4
+}
+
+
+### Score 3 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  Summarise the key plot points of *The Hobbit* in 5 bullet points.
+
+AGENT_RESPONSE:
+  • Bilbo meets Gandalf • They find a ring • They fight a dragon.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a 5-bullet summary of *The Hobbit*'s plot. Agent provided only three bullets, capturing some main events but omitting several pivotal ones and failing to meet the requested count, thus only adequately addressing the intent.",
+  "score": 3
+}
+
+
+### Score 3 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  What's the weather in Tokyo tomorrow?
+
+Agent turn 1:
+  Would you like it in Celsius or Fahrenheit?
+
+User turn 2:
+  Celsius, please.
+
+AGENT_RESPONSE:
+  Looks mostly sunny with a high around 24 °C and a 10 % chance of light showers.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted tomorrow's weather in Tokyo (Celsius). Agent provided temperature and rain chance but omitted wind information, offering an adequate but incomplete resolution of the intent.",
+  "score": 3
+}
+
+
+### Score 2 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  "Delete my last email draft."
+
+AGENT_RESPONSE:
+  "Here's how to compose a new email…"
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to delete their last email draft. Agent misunderstood the request, offering irrelevant guidance on composing a new email instead of performing the deletion, resulting in a poor resolution.",
+  "score": 2
+}
+
+
+### Score 2 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  "Calculate 15 % of 80."
+
+Agent turn 1:
+  "Do you need the exact number or a rounded estimate?"
+
+User turn 2:
+  "Exact, please."
+
+AGENT_RESPONSE:
+  "It's 10.5."
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted an exact calculation of 15% of 80. Agent provided an incorrect result (10.5 instead of 12), leading to a poor resolution of the user's intent.",
+  "score": 2
+}
+
+
+### Score 1 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  "Mute notifications for the next hour."
+
+AGENT_RESPONSE:
+  "Here's a cake recipe! 🍰"
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to mute notifications for an hour. Agent responded with a completely irrelevant cake recipe, making no attempt to address the user's intent, resulting in a very poor resolution.",
+  "score": 1
+}
+
+
+### Score 1 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  "Set an alarm for 6 am tomorrow."
+
+Agent turn 1:
+  "Why not sleep in? Alarms are overrated."
+
+User turn 2:
+  "I really need the alarm—please set it."
+
+AGENT_RESPONSE:
+  "Alarms are pointless; wake up whenever."
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to set an alarm for 6 am. Agent was dismissive and refused to perform the requested action, completely failing to resolve the user's intent, leading to a very poor resolution.",
+  "score": 1
+}
+
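A minimal usage sketch of the evaluator that this prompty backs. The IntentResolutionEvaluator class itself is added elsewhere in this release (_intent_resolution.py in the listing above), and its constructor is not shown in this hunk, so the model_config keyword and dictionary shape below are assumptions based on the calling convention of the package's other prompty-based evaluators; the placeholder endpoint, deployment, and key are hypothetical.

# Assumed API sketch; constructor keywords and config shape are not confirmed by this diff.
from azure.ai.evaluation import IntentResolutionEvaluator

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = IntentResolutionEvaluator(model_config=model_config)
result = evaluator(
    query="I need to cancel my 2 pm meeting with Dr. Lee this Friday.",
    response="Done! The event has been removed from your calendar.",
)
# Per the prompty's TASK section, the judge model returns a concise explanation
# plus an integer score from 1 (very poor) to 5 (excellent).
print(result)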
--- azure/ai/evaluation/_evaluators/_meteor/_meteor.py (1.0.0b2)
+++ azure/ai/evaluation/_evaluators/_meteor/_meteor.py (1.13.3)
@@ -1,44 +1,19 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import
-from nltk.translate.meteor_score import meteor_score
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-
-from azure.ai.evaluation._common.utils import nltk_tokenize
-
-
-class _AsyncMeteorScoreEvaluator:
-    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._alpha = alpha
-        self._beta = beta
-        self._gamma = gamma
+from typing import Dict
 
-
-
-    except LookupError:
-        nltk.download("wordnet")
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = meteor_score(
-            [reference_tokens],
-            hypothesis_tokens,
-            alpha=self._alpha,
-            beta=self._beta,
-            gamma=self._gamma,
-        )
+from nltk.translate.meteor_score import meteor_score
+from typing_extensions import overload, override
 
-
-
-
+from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 
-class MeteorScoreEvaluator:
+class MeteorScoreEvaluator(EvaluatorBase):
     """
-
+    Calculates the METEOR score for a given response and ground truth.
 
     The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text by
     comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations of
@@ -46,39 +21,98 @@ class MeteorScoreEvaluator:
     word stems to more accurately capture meaning and language variations. In addition to machine translation and
     text summarization, paraphrase detection is an optimal use case for the METEOR score.
 
+    Use the METEOR score when you want a more linguistically informed evaluation metric that captures not only
+    n-gram overlap but also accounts for synonyms, stemming, and word order. This is particularly useful for evaluating
+    tasks like machine translation, text summarization, and text generation.
+
+    The METEOR score ranges from 0 to 1, with 1 indicating a perfect match.
+
     :param alpha: The METEOR score alpha parameter. Default is 0.9.
     :type alpha: float
     :param beta: The METEOR score beta parameter. Default is 3.0.
     :type beta: float
     :param gamma: The METEOR score gamma parameter. Default is 0.5.
     :type gamma: float
+    :param threshold: The threshold for the METEOR score evaluator. Default is 0.5.
+    :type threshold: float
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_meteor_score_evaluator]
+            :end-before: [END threshold_meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a MeteorScoreEvaluator.
+    """
 
-
-
-    .. code-block:: python
+    id = "azureai://built-in/evaluators/meteor_score"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
-
-
-
+    @override
+    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5, *, threshold: float = 0.5):
+        self._alpha = alpha
+        self._beta = beta
+        self._gamma = gamma
+        ensure_nltk_data_downloaded()
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a meteor score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+        score = meteor_score(
+            [reference_tokens],
+            hypothesis_tokens,
+            alpha=self._alpha,
+            beta=self._beta,
+            gamma=self._gamma,
         )
-
-
-
-
-
-
-
-
-
-        "
+        binary_result = False
+        if self._higher_is_better:
+            if score >= self._threshold:
+                binary_result = True
+        else:
+            if score <= self._threshold:
+                binary_result = True
+        return {
+            "meteor_score": score,
+            "meteor_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "meteor_threshold": self._threshold,
         }
-        """
 
-
-
-
-    def __call__(self, *, ground_truth: str, response: str, **kwargs):
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the METEOR score between the response and the ground truth.
 
@@ -87,11 +121,23 @@ class MeteorScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
        :return: The METEOR score.
-        :rtype:
+        :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the METEOR score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The METEOR score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
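A short usage sketch based on the signatures added above: __init__ now takes a keyword-only threshold, and calling the evaluator returns a dictionary with the score, a pass/fail label, and the threshold. The example strings are illustrative only, and running it assumes nltk and its wordnet/punkt data are available locally.

# Sketch grounded in the new __init__ and _do_eval shown in the hunk above.
from azure.ai.evaluation import MeteorScoreEvaluator

meteor = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5, threshold=0.5)
result = meteor(
    ground_truth="The capital of Japan is Tokyo.",
    response="Tokyo is the capital of Japan.",
)
# Keys produced by _do_eval in the diff:
#   result["meteor_score"]     - float in [0, 1]
#   result["meteor_result"]    - pass/fail label from EVALUATION_PASS_FAIL_MAPPING
#   result["meteor_threshold"] - the configured threshold (0.5 here)
print(result)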
--- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py (1.0.0b2)
+++ azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py (1.13.3)
@@ -1,104 +1,131 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
 
-from
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from typing import Dict, List, Optional, Union
 
+from typing_extensions import overload, override
 
-
-
-
-
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
 
-
-
-
-
-        :return: The evaluation score computation based on the Content Safety metric (self.metric).
-        :rtype: Any
-        """
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
-        )
-        return result
+@experimental
+class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+    """
+    Evaluates the protected material score for a given query and response or a multi-turn conversation, with reasoning.
 
+    Protected material is any text that is under copyright, including song lyrics, recipes, and articles. Protected
+    material evaluation leverages the Azure AI Content Safety Protected Material for Text service to perform the
+    classification.
 
-
-    """
-    Initialize a protected material evaluator to detect whether protected material
-    is present in your AI system's response. Outputs True or False with AI-generated reasoning.
+    The protected material score is a boolean value, where True indicates that protected material was detected.
 
-    :param
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
+    :param credential: The credential required for connecting to the Azure AI project.
     :type credential: ~azure.core.credentials.TokenCredential
-    :
-
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
 
-
+    .. admonition:: Example:
 
-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START protected_material_evaluator]
+            :end-before: [END protected_material_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a ProtectedMaterialEvaluator.
 
-
-        "subscription_id": "<subscription_id>",
-        "resource_group_name": "<resource_group_name>",
-        "project_name": "<project_name>",
-    }
-    eval_fn = ProtectedMaterialEvaluator(azure_ai_project)
-    result = eval_fn(query="What is the capital of France?", response="Paris.")
+    .. admonition:: Example using Azure AI Project URL:
 
-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START protected_material_evaluator]
+            :end-before: [END protected_material_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
-    .. code-block:: python
-
-        {
-            "protected_material_label": "False",
-            "protected_material_reason": "This query does not contain any protected material."
-        }
     """
 
-
-
+    id = "azureai://built-in/evaluators/protected_material"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
+
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        **kwargs,
+    ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
+        super().__init__(
+            eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            **kwargs,
+        )
 
-
-
-
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, bool]]:
+        """Evaluate a given query/response pair for protected material
 
         :keyword query: The query to be evaluated.
         :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: str
-        :return:
-        :rtype:
+        :return: The protected material score.
+        :rtype: Dict[str, Union[str, bool]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+        """Evaluate a conversation for protected material
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The protected material score.
+        :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]
         """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
 
-
-
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """
+        Evaluate if protected material is present in your AI system's response.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected
+            to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The fluency score.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
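A usage sketch based on the new __init__ and __call__ signatures above. The credential object and the project endpoint URL are placeholders; the new docstring states azure_ai_project may be either a project endpoint string or an AzureAIProject instance. The exact output keys are not spelled out in the new code, so the protected_material_label/protected_material_reason naming shown in the removed docstring is an assumption here.

# Sketch grounded in the hunk above; placeholders and output keys are assumptions.
from azure.ai.evaluation import ProtectedMaterialEvaluator
from azure.identity import DefaultAzureCredential

evaluator = ProtectedMaterialEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
)

# Single query/response pair (first overload)
result = evaluator(
    query="What is the capital of France?",
    response="Paris.",
)
print(result)

# Multi-turn conversation (second overload), following the Conversation shape
# described in the docstring: a list of turns under the "messages" key.
conversation = {
    "messages": [
        {"role": "user", "content": "Write out the full lyrics of a popular song."},
        {"role": "assistant", "content": "I can't reproduce copyrighted lyrics, but here is a short summary instead."},
    ]
}
print(evaluator(conversation=conversation))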