azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +83 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +148 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -1,30 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
 
-
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }
-
-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.
 
@@ -36,6 +23,8 @@ class BleuScoreEvaluator:
     indicator of quality.
 
     The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+    :param threshold: The threshold for the evaluation. Default is 0.5.
+    :type threshold: float
 
     .. admonition:: Example:
 
@@ -44,16 +33,67 @@ class BleuScoreEvaluator:
             :end-before: [END bleu_score_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call an BleuScoreEvaluator.
+            :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_bleu_score_evaluator]
+            :end-before: [END threshold_bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/bleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self):
-        self.
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a bleu score evaluation result.
 
-
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+        binary_result = False
+        if self._higher_is_better:
+            binary_result = score >= self._threshold
+        else:
+            binary_result = score <= self._threshold
+
+        return {
+            "bleu_score": score,
+            "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "bleu_threshold": self._threshold,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.
 
@@ -64,9 +104,21 @@ class BleuScoreEvaluator:
         :return: The BLEU score.
        :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
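The rewritten evaluator keeps the same BLEU computation but routes it through EvaluatorBase and adds a pass/fail verdict against the new keyword-only threshold. A minimal usage sketch based on the signatures in the diff above (the input strings are illustrative):

from azure.ai.evaluation import BleuScoreEvaluator

# threshold is keyword-only per the new __init__; 0.5 is the documented default
bleu = BleuScoreEvaluator(threshold=0.5)
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
# Per _do_eval above, the result carries the score, a pass/fail verdict mapped
# through EVALUATION_PASS_FAIL_MAPPING, and the threshold that was applied:
# {"bleu_score": ..., "bleu_result": ..., "bleu_threshold": 0.5}
print(result)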
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py (new file)

@@ -0,0 +1,119 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing_extensions import overload, override
+from typing import Dict, Union
+
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
+
+# cspell:ignore ssrf, vuln
+@experimental
+class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+    """
+    Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
+    where query represents the user query or code before the completion, and response represents the code recommended by the assistant.
+
+    The code vulnerability evaluation checks for vulnerabilities in the following coding languages:
+
+    - Python
+    - Java
+    - C++
+    - C#
+    - Go
+    - Javascript
+    - SQL
+
+    The code vulnerability evaluation identifies the following vulnerabilities:
+
+    - path-injection
+    - sql-injection
+    - code-injection
+    - stack-trace-exposure
+    - incomplete-url-substring-sanitization
+    - flask-debug
+    - clear-text-logging-sensitive-data
+    - incomplete-hostname-regexp
+    - server-side-unvalidated-url-redirection
+    - weak-cryptographic-algorithm
+    - full-ssrf
+    - bind-socket-all-network-interfaces
+    - client-side-unvalidated-url-redirection
+    - likely-bugs
+    - reflected-xss
+    - clear-text-storage-sensitive-data
+    - tarslip
+    - hardcoded-credentials
+    - insecure-randomness
+
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    .. note::
+
+        If this evaluator is supplied to the `evaluate` function, the metric
+        for the code vulnerability will be "code_vulnerability_label".
+    """
+
+    id = "azureai://built-in/evaluators/code_vulnerability"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
+
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        **kwargs,
+    ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
+        super().__init__(
+            eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate a given query/response pair for code vulnerability
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The code vulnerability label.
+        :rtype: Dict[str, Union[str, bool]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :rtype: Dict[str, Union[str, bool]]
+        """
+
+        return super().__call__(*args, **kwargs)
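CodeVulnerabilityEvaluator is service-backed (it extends RaiServiceEvaluatorBase), so it is constructed with a credential and a project rather than a model configuration. A hedged sketch, assuming the class is exported from the package root like the other evaluators; the endpoint value is a placeholder in the format the docstrings reference, and the query/response strings are illustrative:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

evaluator = CodeVulnerabilityEvaluator(
    credential=DefaultAzureCredential(),
    # Placeholder project endpoint, per the URL format in the docstrings:
    azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
)
result = evaluator(
    query="Write a Flask route that reads a file named in the request.",
    response="return open(request.args['f']).read()",
)
# Per the class docstring, when run through evaluate() this metric surfaces
# as "code_vulnerability_label".
print(result)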
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -21,6 +21,13 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the coherence evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -29,7 +36,26 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :end-before: [END coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_coherence_evaluator]
+            :end-before: [END threshold_coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
 
     .. note::
 
@@ -41,14 +67,24 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
-    id = "
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=self._higher_is_better,
+            **kwargs,
+        )
 
     @overload
     def __call__(
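CoherenceEvaluator is prompt-based, so it takes a model configuration rather than a project, with threshold and credential now keyword-only. A minimal sketch, assuming an Azure OpenAI deployment; every endpoint, deployment, and key value below is a placeholder:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<your-deployment>",  # placeholder
    api_key="<your-api-key>",  # placeholder
)
# threshold defaults to 3 per the new __init__ signature
coherence = CoherenceEvaluator(model_config, threshold=3)
result = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
print(result)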
azure/ai/evaluation/_evaluators/_common/__init__.py

@@ -5,9 +5,11 @@
 from ._base_eval import EvaluatorBase
 from ._base_prompty_eval import PromptyEvaluatorBase
 from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+from ._base_multi_eval import MultiEvaluatorBase
 
 __all__ = [
     "EvaluatorBase",
     "PromptyEvaluatorBase",
     "RaiServiceEvaluatorBase",
+    "MultiEvaluatorBase",
 ]
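MultiEvaluatorBase, added alongside _base_multi_eval.py in the file list above, appears to be the new base for composite evaluators such as the reworked ContentSafetyEvaluator. A hypothetical subclass sketch; the evaluators= constructor keyword is an assumption about the base class, not something shown in this diff:

from azure.ai.evaluation import BleuScoreEvaluator, GleuScoreEvaluator
from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase

class TextOverlapEvaluator(MultiEvaluatorBase):
    """Hypothetical composite that runs several overlap metrics in one call."""

    def __init__(self, **kwargs):
        # Assumed pattern: hand the base class a list of child evaluators
        # and let it fan out the inputs and merge the per-metric results.
        evaluators = [BleuScoreEvaluator(), GleuScoreEvaluator()]
        super().__init__(evaluators=evaluators, **kwargs)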