azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +85 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +147 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +87 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +430 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
- azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/METADATA +378 -27
- azure_ai_evaluation-1.13.5.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
|
@@ -12,27 +12,28 @@ from ._evaluators._content_safety import (
|
|
|
12
12
|
SexualEvaluator,
|
|
13
13
|
ViolenceEvaluator,
|
|
14
14
|
)
|
|
15
|
-
from ._evaluators._multimodal._content_safety_multimodal import (
|
|
16
|
-
ContentSafetyMultimodalEvaluator,
|
|
17
|
-
HateUnfairnessMultimodalEvaluator,
|
|
18
|
-
SelfHarmMultimodalEvaluator,
|
|
19
|
-
SexualMultimodalEvaluator,
|
|
20
|
-
ViolenceMultimodalEvaluator,
|
|
21
|
-
)
|
|
22
|
-
from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
|
|
23
15
|
from ._evaluators._f1_score import F1ScoreEvaluator
|
|
24
16
|
from ._evaluators._fluency import FluencyEvaluator
|
|
25
17
|
from ._evaluators._gleu import GleuScoreEvaluator
|
|
26
18
|
from ._evaluators._groundedness import GroundednessEvaluator
|
|
27
19
|
from ._evaluators._service_groundedness import GroundednessProEvaluator
|
|
20
|
+
from ._evaluators._intent_resolution import IntentResolutionEvaluator
|
|
28
21
|
from ._evaluators._meteor import MeteorScoreEvaluator
|
|
29
22
|
from ._evaluators._protected_material import ProtectedMaterialEvaluator
|
|
30
23
|
from ._evaluators._qa import QAEvaluator
|
|
24
|
+
from ._evaluators._response_completeness import ResponseCompletenessEvaluator
|
|
25
|
+
from ._evaluators._task_adherence import TaskAdherenceEvaluator
|
|
31
26
|
from ._evaluators._relevance import RelevanceEvaluator
|
|
32
27
|
from ._evaluators._retrieval import RetrievalEvaluator
|
|
33
28
|
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
|
|
34
29
|
from ._evaluators._similarity import SimilarityEvaluator
|
|
35
30
|
from ._evaluators._xpia import IndirectAttackEvaluator
|
|
31
|
+
from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
|
|
32
|
+
from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
|
|
33
|
+
from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
|
|
34
|
+
from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
|
|
35
|
+
from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
|
|
36
|
+
from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
|
|
36
37
|
from ._model_configurations import (
|
|
37
38
|
AzureAIProject,
|
|
38
39
|
AzureOpenAIModelConfiguration,
|
|
@@ -42,6 +43,59 @@ from ._model_configurations import (
|
|
|
42
43
|
Message,
|
|
43
44
|
OpenAIModelConfiguration,
|
|
44
45
|
)
|
|
46
|
+
from ._aoai.aoai_grader import AzureOpenAIGrader
|
|
47
|
+
from ._aoai.label_grader import AzureOpenAILabelGrader
|
|
48
|
+
from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
|
|
49
|
+
from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
|
|
50
|
+
from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
|
|
51
|
+
from ._aoai.python_grader import AzureOpenAIPythonGrader
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
_patch_all = []
|
|
55
|
+
|
|
56
|
+
# The converter from the AI service to the evaluator schema requires a dependency on
|
|
57
|
+
# ai.projects, but we also don't want to force users installing ai.evaluations to pull
|
|
58
|
+
# in ai.projects. So we only import it if it's available and the user has ai.projects.
|
|
59
|
+
# We use lazy loading to avoid printing messages during import unless the classes are actually used.
|
|
60
|
+
_lazy_imports = {}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _create_lazy_import(class_name, module_path, dependency_name):
|
|
64
|
+
"""Create a lazy import function for optional dependencies.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
class_name: Name of the class to import
|
|
68
|
+
module_path: Module path to import from
|
|
69
|
+
dependency_name: Name of the dependency package for error message
|
|
70
|
+
|
|
71
|
+
Returns:
|
|
72
|
+
A function that performs the lazy import when called
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
def lazy_import():
|
|
76
|
+
try:
|
|
77
|
+
module = __import__(module_path, fromlist=[class_name])
|
|
78
|
+
cls = getattr(module, class_name)
|
|
79
|
+
_patch_all.append(class_name)
|
|
80
|
+
return cls
|
|
81
|
+
except ImportError:
|
|
82
|
+
raise ImportError(
|
|
83
|
+
f"Could not import {class_name}. Please install the dependency with `pip install {dependency_name}`."
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
return lazy_import
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
_lazy_imports["AIAgentConverter"] = _create_lazy_import(
|
|
90
|
+
"AIAgentConverter",
|
|
91
|
+
"azure.ai.evaluation._converters._ai_services",
|
|
92
|
+
"azure-ai-projects",
|
|
93
|
+
)
|
|
94
|
+
_lazy_imports["SKAgentConverter"] = _create_lazy_import(
|
|
95
|
+
"SKAgentConverter",
|
|
96
|
+
"azure.ai.evaluation._converters._sk_services",
|
|
97
|
+
"semantic-kernel",
|
|
98
|
+
)
|
|
45
99
|
|
|
46
100
|
__all__ = [
|
|
47
101
|
"evaluate",
|
|
@@ -50,6 +104,9 @@ __all__ = [
|
|
|
50
104
|
"FluencyEvaluator",
|
|
51
105
|
"GroundednessEvaluator",
|
|
52
106
|
"GroundednessProEvaluator",
|
|
107
|
+
"ResponseCompletenessEvaluator",
|
|
108
|
+
"TaskAdherenceEvaluator",
|
|
109
|
+
"IntentResolutionEvaluator",
|
|
53
110
|
"RelevanceEvaluator",
|
|
54
111
|
"SimilarityEvaluator",
|
|
55
112
|
"QAEvaluator",
|
|
@@ -73,10 +130,24 @@ __all__ = [
|
|
|
73
130
|
"Conversation",
|
|
74
131
|
"Message",
|
|
75
132
|
"EvaluationResult",
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"
|
|
80
|
-
"
|
|
81
|
-
"
|
|
133
|
+
"CodeVulnerabilityEvaluator",
|
|
134
|
+
"UngroundedAttributesEvaluator",
|
|
135
|
+
"ToolCallAccuracyEvaluator",
|
|
136
|
+
"_ToolOutputUtilizationEvaluator",
|
|
137
|
+
"_ToolCallSuccessEvaluator",
|
|
138
|
+
"AzureOpenAIGrader",
|
|
139
|
+
"AzureOpenAILabelGrader",
|
|
140
|
+
"AzureOpenAIStringCheckGrader",
|
|
141
|
+
"AzureOpenAITextSimilarityGrader",
|
|
142
|
+
"AzureOpenAIScoreModelGrader",
|
|
143
|
+
"AzureOpenAIPythonGrader",
|
|
82
144
|
]
|
|
145
|
+
|
|
146
|
+
__all__.extend([p for p in _patch_all if p not in __all__])
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def __getattr__(name):
|
|
150
|
+
"""Handle lazy imports for optional dependencies."""
|
|
151
|
+
if name in _lazy_imports:
|
|
152
|
+
return _lazy_imports[name]()
|
|
153
|
+
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"AzureOpenAIGrader",
|
|
10
|
+
]
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
|
|
5
|
+
|
|
6
|
+
from typing_extensions import TypeIs
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION, TokenScope
|
|
10
|
+
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
11
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
12
|
+
from azure.ai.evaluation._user_agent import UserAgentSingleton
|
|
13
|
+
from azure.core.credentials import TokenCredential
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from openai.lib.azure import AzureADTokenProvider
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@experimental
|
|
20
|
+
class AzureOpenAIGrader:
|
|
21
|
+
"""Base class for Azure OpenAI grader wrappers.
|
|
22
|
+
|
|
23
|
+
Recommended only for use by experienced OpenAI API users.
|
|
24
|
+
Combines a model configuration and any grader configuration
|
|
25
|
+
into a singular object that can be used in evaluations.
|
|
26
|
+
|
|
27
|
+
Supplying an AzureOpenAIGrader to the `evaluate` method will cause an asynchronous request to evaluate
|
|
28
|
+
the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
|
|
29
|
+
evaluation results.
|
|
30
|
+
|
|
31
|
+
:param model_config: The model configuration to use for the grader.
|
|
32
|
+
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
33
|
+
~azure.ai.evaluation.OpenAIModelConfiguration]
|
|
34
|
+
:param grader_config: The grader configuration to use for the grader. This is expected
|
|
35
|
+
to be formatted as a dictionary that matches the specifications of the sub-types of
|
|
36
|
+
the TestingCriterion alias specified in `OpenAI's SDK <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151>`_.
|
|
37
|
+
:type grader_config: Dict[str, Any]
|
|
38
|
+
:param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
|
|
39
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
40
|
+
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
41
|
+
:type kwargs: Any
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
id = "azureai://built-in/evaluators/azure-openai/custom_grader"
|
|
45
|
+
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
*,
|
|
49
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
50
|
+
grader_config: Dict[str, Any],
|
|
51
|
+
credential: Optional[TokenCredential] = None,
|
|
52
|
+
**kwargs: Any,
|
|
53
|
+
):
|
|
54
|
+
self._model_config = model_config
|
|
55
|
+
self._grader_config = grader_config
|
|
56
|
+
self._credential = credential
|
|
57
|
+
|
|
58
|
+
if kwargs.get("validate", True):
|
|
59
|
+
self._validate_model_config()
|
|
60
|
+
self._validate_grader_config()
|
|
61
|
+
|
|
62
|
+
def _validate_model_config(self) -> None:
|
|
63
|
+
"""Validate the model configuration that this grader wrapper is using."""
|
|
64
|
+
msg = None
|
|
65
|
+
if self._is_azure_model_config(self._model_config):
|
|
66
|
+
if not any(auth for auth in (self._model_config.get("api_key"), self._credential)):
|
|
67
|
+
msg = (
|
|
68
|
+
f"{type(self).__name__}: Requires an api_key in the supplied model_config, "
|
|
69
|
+
+ "or providing a credential to the grader's __init__ method. "
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
else:
|
|
73
|
+
if "api_key" not in self._model_config or not self._model_config.get("api_key"):
|
|
74
|
+
msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
|
|
75
|
+
|
|
76
|
+
if msg is None:
|
|
77
|
+
return
|
|
78
|
+
|
|
79
|
+
raise EvaluationException(
|
|
80
|
+
message=msg,
|
|
81
|
+
blame=ErrorBlame.USER_ERROR,
|
|
82
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
83
|
+
target=ErrorTarget.AOAI_GRADER,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
def _validate_grader_config(self) -> None:
|
|
87
|
+
"""Validate the grader configuration that this grader wrapper is using."""
|
|
88
|
+
|
|
89
|
+
return
|
|
90
|
+
|
|
91
|
+
@staticmethod
|
|
92
|
+
def _is_azure_model_config(
|
|
93
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
94
|
+
) -> TypeIs[AzureOpenAIModelConfiguration]:
|
|
95
|
+
return "azure_endpoint" in model_config
|
|
96
|
+
|
|
97
|
+
def get_client(self) -> Any:
|
|
98
|
+
"""Construct an appropriate OpenAI client using this grader's model configuration.
|
|
99
|
+
Returns a slightly different client depending on whether or not this grader's model
|
|
100
|
+
configuration is for Azure OpenAI or OpenAI.
|
|
101
|
+
|
|
102
|
+
:return: The OpenAI client.
|
|
103
|
+
:rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
|
|
104
|
+
"""
|
|
105
|
+
default_headers = {"User-Agent": UserAgentSingleton().value}
|
|
106
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration] = self._model_config
|
|
107
|
+
api_key: Optional[str] = model_config.get("api_key")
|
|
108
|
+
|
|
109
|
+
if self._is_azure_model_config(model_config):
|
|
110
|
+
from openai import AzureOpenAI
|
|
111
|
+
|
|
112
|
+
# TODO set default values?
|
|
113
|
+
return AzureOpenAI(
|
|
114
|
+
azure_endpoint=model_config["azure_endpoint"],
|
|
115
|
+
api_key=api_key, # Default-style access to appease linters.
|
|
116
|
+
api_version=DEFAULT_AOAI_API_VERSION, # Force a known working version
|
|
117
|
+
azure_deployment=model_config.get("azure_deployment", ""),
|
|
118
|
+
azure_ad_token_provider=self._get_token_provider(self._credential) if not api_key else None,
|
|
119
|
+
default_headers=default_headers,
|
|
120
|
+
)
|
|
121
|
+
from openai import OpenAI
|
|
122
|
+
|
|
123
|
+
# TODO add default values for base_url and organization?
|
|
124
|
+
return OpenAI(
|
|
125
|
+
api_key=api_key,
|
|
126
|
+
base_url=model_config.get("base_url", ""),
|
|
127
|
+
organization=model_config.get("organization", ""),
|
|
128
|
+
default_headers=default_headers,
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
@staticmethod
def _get_token_provider(cred: TokenCredential) -> "AzureADTokenProvider":
    """Produce a bearer-token provider callable for the AzureOpenAI client.

    :param TokenCredential cred: The Azure authentication credential.
    :return: A zero-argument callable that fetches a fresh Cognitive Services
        token from the credential each time it is invoked.
    :rtype: openai.lib.azure.AzureADTokenProvider
    """

    def _provider() -> str:
        return cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT).token

    return _provider
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
from openai.types.graders import LabelModelGrader
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
10
|
+
from azure.core.credentials import TokenCredential
|
|
11
|
+
|
|
12
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@experimental
class AzureOpenAILabelGrader(AzureOpenAIGrader):
    """Wrapper class for OpenAI's label model graders.

    Passing a LabelGrader to the `evaluate` method triggers an asynchronous
    request that evaluates the grader via the OpenAI API; the results are then
    merged into the standard evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param input: The list of label-based testing criterion for this grader. Individual
        values of this list are expected to be dictionaries that match the format of any of the valid
        `TestingCriterionLabelModelInput <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L125C1-L125C32>`_
        subtypes.
    :type input: List[Dict[str, str]]
    :param labels: A list of strings representing the classification labels of this grader.
    :type labels: List[str]
    :param model: The model to use for the evaluation. Must support structured outputs.
    :type model: str
    :param name: The name of the grader.
    :type name: str
    :param passing_labels: The labels that indicate a passing result. Must be a subset of labels.
    :type passing_labels: List[str]
    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
    :type credential: ~azure.core.credentials.TokenCredential
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any
    """

    id = "azureai://built-in/evaluators/azure-openai/label_grader"
    _type = "label_model"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        input: List[Dict[str, str]],
        labels: List[str],
        model: str,
        name: str,
        passing_labels: List[str],
        credential: Optional[TokenCredential] = None,
        **kwargs: Any
    ):
        # Assemble the underlying OpenAI grader definition, then hand
        # everything to the shared AzureOpenAIGrader base.
        grader_config = LabelModelGrader(
            type=AzureOpenAILabelGrader._type,
            name=name,
            model=model,
            input=input,
            labels=labels,
            passing_labels=passing_labels,
        )
        super().__init__(
            model_config=model_config,
            grader_config=grader_config,
            credential=credential,
            **kwargs,
        )
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, Optional, Union
|
|
5
|
+
|
|
6
|
+
from openai.types.graders import PythonGrader
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
10
|
+
from azure.core.credentials import TokenCredential
|
|
11
|
+
|
|
12
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@experimental
class AzureOpenAIPythonGrader(AzureOpenAIGrader):
    """Wrapper class for OpenAI's Python code graders.

    Enables custom Python-based evaluation logic with flexible scoring and
    pass/fail thresholds. The grader executes user-provided Python code to
    evaluate outputs against custom criteria.

    Passing a PythonGrader to the `evaluate` method triggers an asynchronous
    request that evaluates the grader via the OpenAI API; the results are then
    merged into the standard evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param name: The name of the grader.
    :type name: str
    :param image_tag: The image tag for the Python execution environment.
    :type image_tag: str
    :param pass_threshold: Score threshold for pass/fail classification. Scores >= threshold are considered passing.
    :type pass_threshold: float
    :param source: Python source code containing the grade function.
        Must define: def grade(sample: dict, item: dict) -> float
    :type source: str
    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
    :type credential: ~azure.core.credentials.TokenCredential
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any


    .. admonition:: Example:

        .. literalinclude:: ../samples/evaluation_samples_common.py
            :start-after: [START python_grader_example]
            :end-before: [END python_grader_example]
            :language: python
            :dedent: 8
            :caption: Using AzureOpenAIPythonGrader for custom evaluation logic.
    """

    id = "azureai://built-in/evaluators/azure-openai/python_grader"
    _type = "python"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        name: str,
        pass_threshold: float,
        source: str,
        image_tag: Optional[str] = None,
        credential: Optional[TokenCredential] = None,
        **kwargs: Any,
    ):
        # Reject thresholds outside the normalized [0, 1] scoring range.
        if not 0.0 <= pass_threshold <= 1.0:
            raise ValueError("pass_threshold must be between 0.0 and 1.0")

        # Kept on the instance for potential future use by callers.
        self.pass_threshold = pass_threshold

        # Assemble the underlying OpenAI grader definition, then hand
        # everything to the shared AzureOpenAIGrader base.
        grader_config = PythonGrader(
            type=AzureOpenAIPythonGrader._type,
            name=name,
            image_tag=image_tag,
            pass_threshold=pass_threshold,
            source=source,
        )
        super().__init__(
            model_config=model_config,
            grader_config=grader_config,
            credential=credential,
            **kwargs,
        )
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
from openai.types.graders import ScoreModelGrader
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
10
|
+
from azure.core.credentials import TokenCredential
|
|
11
|
+
|
|
12
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@experimental
class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
    """Wrapper class for OpenAI's score model graders.

    Enables continuous scoring evaluation with custom prompts and flexible
    conversation-style inputs. Supports configurable score ranges and
    pass thresholds for binary classification.

    Supplying a ScoreModelGrader to the `evaluate` method will cause an
    asynchronous request to evaluate the grader via the OpenAI API. The
    results of the evaluation will then be merged into the standard
    evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param input: The input messages for the grader. List of conversation
        messages with role and content.
    :type input: List[Dict[str, str]]
    :param model: The model to use for the evaluation.
    :type model: str
    :param name: The name of the grader.
    :type name: str
    :param range: The range of the score. Defaults to [0, 1].
    :type range: Optional[List[float]]
    :param pass_threshold: Score threshold for pass/fail classification.
        Defaults to midpoint of range.
    :type pass_threshold: Optional[float]
    :param sampling_params: The sampling parameters for the model.
    :type sampling_params: Optional[Dict[str, Any]]
    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
    :type credential: ~azure.core.credentials.TokenCredential
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any
    """

    id = "azureai://built-in/evaluators/azure-openai/score_model_grader"
    _type = "score_model"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        input: List[Dict[str, str]],
        model: str,
        name: str,
        range: Optional[List[float]] = None,
        pass_threshold: Optional[float] = None,
        sampling_params: Optional[Dict[str, Any]] = None,
        credential: Optional[TokenCredential] = None,
        **kwargs: Any,
    ):
        # Validate the score range, defaulting to [0, 1] when omitted.
        # (After this point `range` is always a valid two-element list, so no
        # further None checks are needed.)
        if range is None:
            range = [0.0, 1.0]  # Default range
        elif len(range) != 2 or range[0] >= range[1]:
            raise ValueError("range must be a list of two numbers [min, max] where min < max")

        # Validate pass_threshold, defaulting to the midpoint of the range.
        if pass_threshold is None:
            pass_threshold = (range[0] + range[1]) / 2  # Default to midpoint
        elif pass_threshold < range[0] or pass_threshold > range[1]:
            raise ValueError(f"pass_threshold {pass_threshold} must be within range {range}")

        # Store pass_threshold as instance attribute
        self.pass_threshold = pass_threshold

        # Create OpenAI ScoreModelGrader instance
        grader_kwargs = {"input": input, "model": model, "name": name, "type": AzureOpenAIScoreModelGrader._type}
        grader_kwargs["range"] = range
        if sampling_params is not None:
            grader_kwargs["sampling_params"] = sampling_params
        grader_kwargs["pass_threshold"] = self.pass_threshold

        grader = ScoreModelGrader(**grader_kwargs)

        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, Optional, Union
|
|
5
|
+
|
|
6
|
+
from openai.types.graders import StringCheckGrader
|
|
7
|
+
from typing_extensions import Literal
|
|
8
|
+
|
|
9
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
10
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
11
|
+
from azure.core.credentials import TokenCredential
|
|
12
|
+
|
|
13
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@experimental
class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
    """Wrapper class for OpenAI's string check graders.

    Passing a StringCheckGrader to the `evaluate` method triggers an
    asynchronous request that evaluates the grader via the OpenAI API; the
    results are then merged into the standard evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,~azure.ai.evaluation.OpenAIModelConfiguration]
    :param input: The input text. This may include template strings.
    :type input: str
    :param name: The name of the grader.
    :type name: str
    :param operation: The string check operation to perform. One of `eq`, `ne`, `like`, or `ilike`.
    :type operation: Literal["eq", "ne", "like", "ilike"]
    :param reference: The reference text. This may include template strings.
    :type reference: str
    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
    :type credential: ~azure.core.credentials.TokenCredential
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any
    """

    id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
    _type = "string_check"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        input: str,
        name: str,
        operation: Literal["eq", "ne", "like", "ilike"],
        reference: str,
        credential: Optional[TokenCredential] = None,
        **kwargs: Any
    ):
        # Assemble the underlying OpenAI grader definition, then hand
        # everything to the shared AzureOpenAIGrader base.
        grader_config = StringCheckGrader(
            type=AzureOpenAIStringCheckGrader._type,
            name=name,
            operation=operation,
            input=input,
            reference=reference,
        )
        super().__init__(
            model_config=model_config,
            grader_config=grader_config,
            credential=credential,
            **kwargs,
        )
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, Optional, Union
|
|
5
|
+
|
|
6
|
+
from openai.types.graders import TextSimilarityGrader
|
|
7
|
+
from typing_extensions import Literal
|
|
8
|
+
|
|
9
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
10
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
11
|
+
from azure.core.credentials import TokenCredential
|
|
12
|
+
|
|
13
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@experimental
class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
    """Wrapper class for OpenAI's text similarity graders.

    Supplying a TextSimilarityGrader to the `evaluate` method will cause an asynchronous request to evaluate
    the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
    evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[
        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param evaluation_metric: The evaluation metric to use.
    :type evaluation_metric: Literal["fuzzy_match", "bleu", "gleu", "meteor", "rouge_1", "rouge_2", "rouge_3",
        "rouge_4", "rouge_5", "rouge_l", "cosine"]
    :param input: The text being graded.
    :type input: str
    :param pass_threshold: A float score where a value greater than or equal indicates a passing grade.
    :type pass_threshold: float
    :param reference: The text being graded against.
    :type reference: str
    :param name: The name of the grader.
    :type name: str
    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
    :type credential: ~azure.core.credentials.TokenCredential
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any
    """

    id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
    _type = "text_similarity"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        evaluation_metric: Literal[
            "fuzzy_match",
            "bleu",
            "gleu",
            "meteor",
            "rouge_1",
            "rouge_2",
            "rouge_3",
            "rouge_4",
            "rouge_5",
            "rouge_l",
            "cosine",
        ],
        input: str,
        pass_threshold: float,
        reference: str,
        name: str,
        credential: Optional[TokenCredential] = None,
        **kwargs: Any
    ):
        # Build the underlying OpenAI grader definition and defer shared
        # client/config handling to the AzureOpenAIGrader base class.
        grader = TextSimilarityGrader(
            evaluation_metric=evaluation_metric,
            input=input,
            pass_threshold=pass_threshold,
            name=name,
            reference=reference,
            type=AzureOpenAITextSimilarityGrader._type,
        )
        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
|