azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
|
@@ -1,25 +1,36 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
+
import enum
|
|
5
|
+
from typing import Literal
|
|
6
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
4
7
|
|
|
5
8
|
|
|
6
9
|
class EvaluationMetrics:
|
|
7
10
|
"""Metrics for model evaluation."""
|
|
8
11
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
GROUNDEDNESS = "groundedness"
|
|
13
|
+
RELEVANCE = "relevance"
|
|
14
|
+
COHERENCE = "coherence"
|
|
15
|
+
FLUENCY = "fluency"
|
|
16
|
+
SIMILARITY = "similarity"
|
|
14
17
|
F1_SCORE = "f1_score"
|
|
15
|
-
|
|
18
|
+
RETRIEVAL_SCORE = "retrieval_score"
|
|
16
19
|
HATE_FAIRNESS = "hate_fairness"
|
|
17
20
|
HATE_UNFAIRNESS = "hate_unfairness"
|
|
18
21
|
VIOLENCE = "violence"
|
|
19
22
|
SELF_HARM = "self_harm"
|
|
20
23
|
SEXUAL = "sexual"
|
|
21
24
|
PROTECTED_MATERIAL = "protected_material"
|
|
25
|
+
ARTWORK = "artwork"
|
|
26
|
+
FICTIONAL_CHARACTERS = "fictional_characters"
|
|
27
|
+
LOGOS_AND_BRANDS = "logos_and_brands"
|
|
22
28
|
XPIA = "xpia"
|
|
29
|
+
CODE_VULNERABILITY = "code_vulnerability"
|
|
30
|
+
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
|
|
31
|
+
SENSITIVE_DATA_LEAKAGE = "sensitive_data_leakage"
|
|
32
|
+
TASK_ADHERENCE = "task_adherence"
|
|
33
|
+
PROHIBITED_ACTIONS = "prohibited_actions"
|
|
23
34
|
|
|
24
35
|
|
|
25
36
|
class _InternalEvaluationMetrics:
|
|
@@ -48,12 +59,162 @@ class DefaultOpenEncoding:
|
|
|
48
59
|
"""SDK Default Encoding when writing a file"""
|
|
49
60
|
|
|
50
61
|
|
|
62
|
+
class EvaluationRunProperties:
|
|
63
|
+
"""Defines properties used to identify an evaluation run by UI"""
|
|
64
|
+
|
|
65
|
+
RUN_TYPE = "runType"
|
|
66
|
+
EVALUATION_RUN = "_azureml.evaluation_run"
|
|
67
|
+
EVALUATION_SDK = "_azureml.evaluation_sdk_name"
|
|
68
|
+
NAME_MAP = "_azureml.evaluation_name_map"
|
|
69
|
+
NAME_MAP_LENGTH = "_azureml.evaluation_name_map_length"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@experimental
|
|
73
|
+
class _AggregationType(enum.Enum):
|
|
74
|
+
"""Defines how numeric evaluation results should be aggregated
|
|
75
|
+
to produce a single value. Used by individual evaluators to combine per-turn results for
|
|
76
|
+
a conversation-based input. In general, wherever this enum is used, it is also possible
|
|
77
|
+
to directly assign the underlying aggregation function for more complex use cases.
|
|
78
|
+
The 'custom' value is generally not an acceptable input, and should only be used as an output
|
|
79
|
+
to indicate that a custom aggregation function has been injected."""
|
|
80
|
+
|
|
81
|
+
MEAN = "mean"
|
|
82
|
+
MAX = "max"
|
|
83
|
+
MIN = "min"
|
|
84
|
+
SUM = "sum"
|
|
85
|
+
CUSTOM = "custom"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TokenScope(str, enum.Enum):
|
|
89
|
+
"""Defines the scope of the token used to access Azure resources."""
|
|
90
|
+
|
|
91
|
+
DEFAULT_AZURE_MANAGEMENT = "https://management.azure.com/.default"
|
|
92
|
+
COGNITIVE_SERVICES_MANAGEMENT = "https://ai.azure.com/.default"
|
|
93
|
+
AZURE_ML = "https://ml.azure.com/.default"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class _EvaluatorMetricMapping:
|
|
97
|
+
"""
|
|
98
|
+
Static mapping of evaluator names to their metric names, based on assets.json.
|
|
99
|
+
The 'builtin.' prefix is removed from the evaluator name keys.
|
|
100
|
+
If an evaluator maps to multiple metrics, all metric names are included in the list, and the first one is considered the primary metric.
|
|
101
|
+
"""
|
|
102
|
+
|
|
103
|
+
EVALUATOR_NAME_METRICS_MAPPINGS = {
|
|
104
|
+
"bleu_score": ["bleu"],
|
|
105
|
+
"coherence": ["coherence"],
|
|
106
|
+
"document_retrieval": [
|
|
107
|
+
"xdcg@3",
|
|
108
|
+
"ndcg@3",
|
|
109
|
+
"fidelity",
|
|
110
|
+
"top1_relevance",
|
|
111
|
+
"top3_max_relevance",
|
|
112
|
+
"holes",
|
|
113
|
+
"holes_ratio",
|
|
114
|
+
"total_retrieved_documents",
|
|
115
|
+
"total_ground_truth_documents",
|
|
116
|
+
],
|
|
117
|
+
"f1_score": ["f1_score"],
|
|
118
|
+
"fluency": ["fluency"],
|
|
119
|
+
"gleu_score": ["gleu"],
|
|
120
|
+
"meteor_score": ["meteor"],
|
|
121
|
+
"relevance": ["relevance"],
|
|
122
|
+
"response_completeness": ["response_completeness"],
|
|
123
|
+
"rouge_score": ["rouge_f1_score", "rouge_precision", "rouge_recall"],
|
|
124
|
+
"groundedness_pro": ["groundedness_pro"],
|
|
125
|
+
"similarity": ["similarity"],
|
|
126
|
+
"intent_resolution": ["intent_resolution"],
|
|
127
|
+
"retrieval": ["retrieval"],
|
|
128
|
+
"task_adherence": ["task_adherence"],
|
|
129
|
+
"tool_call_accuracy": ["tool_call_accuracy"],
|
|
130
|
+
"groundedness": ["groundedness"],
|
|
131
|
+
"code_vulnerability": ["code_vulnerability"],
|
|
132
|
+
"eci": ["eci"],
|
|
133
|
+
"protected_material": ["protected_material"],
|
|
134
|
+
"ungrounded_attributes": ["ungrounded_attributes"],
|
|
135
|
+
"indirect_attack": ["xpia", "xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"],
|
|
136
|
+
"label_grader": ["label_model"],
|
|
137
|
+
"string_check_grader": ["string_check"],
|
|
138
|
+
"text_similarity_grader": ["similarity"],
|
|
139
|
+
"score_model_grader": ["score_model"],
|
|
140
|
+
"sexual": ["sexual"],
|
|
141
|
+
"self_harm": ["self_harm"],
|
|
142
|
+
"violence": ["violence"],
|
|
143
|
+
"hate_unfairness": ["hate_unfairness"],
|
|
144
|
+
"tool_input_accuracy": ["tool_input_accuracy"],
|
|
145
|
+
"task_completion": ["task_completion"],
|
|
146
|
+
"tool_success": ["tool_success"],
|
|
147
|
+
"tool_call_success": ["tool_call_success"],
|
|
148
|
+
"tool_selection": ["tool_selection"],
|
|
149
|
+
"tool_output_utilization": ["tool_output_utilization"],
|
|
150
|
+
"task_navigation_efficiency": ["task_navigation_efficiency"],
|
|
151
|
+
"text_similarity": ["similarity"],
|
|
152
|
+
"string_check": ["string_check"],
|
|
153
|
+
"sensitive_data_leakage": ["sensitive_data_leakage"],
|
|
154
|
+
"score_model": ["score_model"],
|
|
155
|
+
"label_model": ["label_model"],
|
|
156
|
+
"prohibited_actions": ["prohibited_actions"],
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
EVAL_CLASS_NAME_MAP = {
|
|
160
|
+
"BleuScoreEvaluator": "bleu_score",
|
|
161
|
+
"CodeVulnerabilityEvaluator": "code_vulnerability",
|
|
162
|
+
"CoherenceEvaluator": "coherence",
|
|
163
|
+
"ContentSafetyEvaluator": "content_safety",
|
|
164
|
+
"DocumentRetrievalEvaluator": "document_retrieval",
|
|
165
|
+
"ECIEvaluator": "eci",
|
|
166
|
+
"F1ScoreEvaluator": "f1_score",
|
|
167
|
+
"FluencyEvaluator": "fluency",
|
|
168
|
+
"GleuScoreEvaluator": "gleu_score",
|
|
169
|
+
"GroundednessEvaluator": "groundedness",
|
|
170
|
+
"GroundednessProEvaluator": "groundedness_pro",
|
|
171
|
+
"HateUnfairnessEvaluator": "hate_unfairness",
|
|
172
|
+
"IndirectAttackEvaluator": "indirect_attack",
|
|
173
|
+
"IntentResolutionEvaluator": "intent_resolution",
|
|
174
|
+
"MeteorScoreEvaluator": "meteor_score",
|
|
175
|
+
"ProtectedMaterialEvaluator": "protected_material",
|
|
176
|
+
"QAEvaluator": "qa",
|
|
177
|
+
"RelevanceEvaluator": "relevance",
|
|
178
|
+
"ResponseCompletenessEvaluator": "response_completeness",
|
|
179
|
+
"RetrievalEvaluator": "retrieval",
|
|
180
|
+
"RougeScoreEvaluator": "rouge_score",
|
|
181
|
+
"SelfHarmEvaluator": "self_harm",
|
|
182
|
+
"SexualEvaluator": "sexual",
|
|
183
|
+
"SimilarityEvaluator": "similarity",
|
|
184
|
+
"TaskAdherenceEvaluator": "task_adherence",
|
|
185
|
+
"TaskCompletionEvaluator": "task_completion",
|
|
186
|
+
"ToolCallAccuracyEvaluator": "tool_call_accuracy",
|
|
187
|
+
"UngroundedAttributesEvaluator": "ungrounded_attributes",
|
|
188
|
+
"ViolenceEvaluator": "violence",
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
|
|
51
192
|
DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
|
|
52
193
|
|
|
53
194
|
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
|
|
54
195
|
|
|
55
196
|
PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
|
|
56
197
|
PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
|
|
198
|
+
PF_DISABLE_TRACING = "PF_DISABLE_TRACING"
|
|
57
199
|
|
|
58
200
|
OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
|
|
59
201
|
OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
|
|
202
|
+
|
|
203
|
+
AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"
|
|
204
|
+
|
|
205
|
+
OPENAI_TYPE: Literal["openai"] = "openai"
|
|
206
|
+
|
|
207
|
+
EVALUATION_PASS_FAIL_MAPPING = {
|
|
208
|
+
True: "pass",
|
|
209
|
+
False: "fail",
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS = 60000
|
|
213
|
+
BINARY_AGGREGATE_SUFFIX = "binary_aggregate"
|
|
214
|
+
|
|
215
|
+
AOAI_COLUMN_NAME = "aoai"
|
|
216
|
+
DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
|
|
217
|
+
DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
|
|
218
|
+
|
|
219
|
+
# OpenTelemetry event names
|
|
220
|
+
EVALUATION_EVENT_NAME = "gen_ai.evaluation.result"
|